In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
import statsmodels.formula.api as smf
import time

from prediction_ML_pipeline import train_and_evaluate_model, concatenate_csv_files, add_date_ticker, save_dataframe_to_folder
from order_imbalance import order_imbalance, combined_order_imbalance, diagnostic_plots, lm_results, iceberg_order_imbalance

In [2]:
# --- Stage 1: load the train and test CSV folders -----------------------------
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it cannot be skewed by system clock adjustments.
start = time.perf_counter()
print("Starting load data", flush=True)
# Load data
# NOTE(review): these are absolute, machine-specific paths; point them at your
# own data directories before re-running the notebook elsewhere.
folder_path = "/Users/jinghuitan/Desktop/Dissertation/data/test_data"
folder_path_train = "/Users/jinghuitan/Desktop/Dissertation/data/train_data"
# concatenate_csv_files returns a pair of objects per folder; judging by the
# names these are presumably message data and order-book data -- TODO confirm.
concatenated_df_m, concatenated_df_ob = concatenate_csv_files(folder_path)
concatenated_df_m_train, concatenated_df_ob_train = concatenate_csv_files(folder_path_train)

# The pipeline call further down takes lists of frames alongside lists of
# tickers (presumably one entry per ticker, in matching order -- verify against
# train_and_evaluate_model). Only SPY is used here.
df_m_labelled_lst = [concatenated_df_m_train]
df_ob_labelled_lst = [concatenated_df_ob_train]
tickers_train = ['SPY']
df_m_predict_lst = [concatenated_df_m]
df_ob_predict_lst = [concatenated_df_ob]
tickers_pred = ['SPY']

print('Load data done', flush=True)
print(f"{time.perf_counter() - start:.3f} seconds elapsed", flush=True)
# --- Stage 2: fit the classifier and predict on the test set ------------------
# Monotonic timer for this stage (see time.perf_counter documentation).
start = time.perf_counter()

# Run ML prediction on data
# A fixed seed makes the forest (bootstrap samples and feature sub-sampling)
# reproducible across "Restart Kernel -> Run All"; without it the accuracies
# and predictions below change on every run.
# NOTE(review): any additional randomness inside train_and_evaluate_model
# (e.g. data shuffling/splitting) is not controlled from here -- TODO confirm.
RANDOM_SEED = 42
classifier = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED)
# Hyper-parameter grid handed to train_and_evaluate_model; a single candidate.
param_grid = {
    'n_estimators': [50]
}


results = train_and_evaluate_model(classifier, param_grid, df_ob_labelled_lst,
                                   df_m_labelled_lst, tickers_train,
                                   df_ob_predict_lst, df_m_predict_lst, tickers_pred)
# The pipeline returns a 5-tuple; unpack it into named pieces for later stages.
df_labelled_dict, df_predict_dict, features_dict, prediction_dict, best_classifier = results

print('Prediction time done', flush=True)
print(f"{time.perf_counter() - start:.3f} seconds elapsed", flush=True)

# --- Stage 3: order-imbalance regressions and result export -------------------
# Monotonic timer for this stage (see time.perf_counter documentation).
start = time.perf_counter()

# Optional step, currently disabled: persist the prediction inputs/outputs.
# # Save predicted results to folder
# file_names = ['predict_df_m.csv', 'predict_df_ob.csv', 'predict_df_pred.csv']
# folder_path = '/Users/jinghuitan/Desktop/Dissertation/data/predicted_data'
# dfs = [df_predict_dict[tickers_pred[0]][0], df_predict_dict[tickers_pred[0]][1], prediction_dict['pred']]
# for df, file_name in zip(dfs, file_names):
#     save_dataframe_to_folder(df, folder_path=folder_path, file_name=file_name)


# Run OI analysis
# Take the ticker from tickers_pred instead of repeating the literal 'SPY', so
# the lookups and the output file name stay in sync if the ticker list changes.
pred_ticker = tickers_pred[0]
df_results = lm_results(df_predict_dict[pred_ticker][0], prediction_dict['pred'], df_predict_dict[pred_ticker][1],
                        delta_lst=['5s', '10s', '15s', '30s', '1min', '5min', '10min'],
                        order_type='size', predictive=True, weighted_mp=False,
                        momentum=True)

print(df_results)

# NOTE(review): absolute, machine-specific output location; the file name
# mirrors the lm_results settings above (predictive, momentum, size,
# unweighted) and must be updated by hand if those settings or the date change.
folder_path = "/Users/jinghuitan/Desktop/Dissertation/data/output_folders"
file_name = f"Predictive_Momentum_size_unweighted_{pred_ticker}_2019_01_02"
save_dataframe_to_folder(df_results, folder_path=folder_path, file_name=file_name)

print('Data storing done', flush=True)
print(f"{time.perf_counter() - start:.3f} seconds elapsed", flush=True)

Starting load data
Load data done
20.596 seconds elapsed
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ....................................n_estimators=50; total time=   0.2s
[CV] END ....................................n_estimators=50; total time=   0.1s
[CV] END ....................................n_estimators=50; total time=   0.1s
[CV] END ....................................n_estimators=50; total time=   0.1s
[CV] END ....................................n_estimators=50; total time=   0.1s
Best parameters found:  {'n_estimators': 50}
Best accuracy found:  0.9105651414280983
Accuracy on the train data: 1.0
Accuracy on the test data: 0.9296987087517934
Prediction time done
25.754 seconds elapsed
  timeframe  params_vis  tvalues_vis  params_small  tvalues_small  \
0        5s   -0.000055    -2.248186 -1.966577e-06      -0.380687   
1       10s    0.000012     0.370779  4.743620e-07       0.082732   
2       15s    0.000002     0.046040  5.680514e-06       0.84690