In [12]:
# basic
import numpy as np
import pandas as pd

# diagnostics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

# models
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# cosmetics
import warnings
# Installs a global 'ignore' filter with no category/message/module restriction,
# so every warning raised after this point is suppressed for the whole kernel.
# NOTE(review): this also hides deprecation and numerical/convergence warnings
# from the modelling libraries imported above; consider narrowing it to the
# specific categories that are known to be noise.
warnings.filterwarnings('ignore')

# Read Data

Preprocessing is done in `ML_get_data.py`.

In [17]:
# Read the raw CSV and convert its `date` column to datetimes in one chained
# step, so re-running the cell always rebuilds `data_raw` from the file.
data_raw = pd.read_csv('../data/ML_data_raw.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Number of missing values in each column.
data_raw.isna().sum()

date                0
permno              0
weight              0
weight_diff      1496
ret                 0
ret_exc            49
dolvol             45
div12m_me        1360
ret_1_0            49
ret_6_1           392
ret_12_1         1525
ret_60_12        9482
at_gr1            499
ebit_sale         974
ebit_bev         2663
sale_bev         2651
niq_be           2777
niq_su           3269
ni_ar1           7594
beta_60m         6016
resff3_12_1      4030
resff3_6_1       4039
ivol_ff3_21d       47
iskew_ff3_21d      47
dolvol_126d        51
ami_126d           51
betadown_252d     685
rvolhl_21d         46
dtype: int64

# Data Cleaning

In [18]:
# Keep only the rows of `data_raw` that have a value in every column
# (a row with at least one missing entry is discarded).
data = data_raw.loc[data_raw.notna().all(axis='columns')]

# Show the cleaned frame.
data

Unnamed: 0,date,permno,weight,weight_diff,ret,ret_exc,dolvol,div12m_me,ret_1_0,ret_6_1,...,ni_ar1,beta_60m,resff3_12_1,resff3_6_1,ivol_ff3_21d,iskew_ff3_21d,dolvol_126d,ami_126d,betadown_252d,rvolhl_21d
1495,2015-09-30,10026,0.005153,0.000405,0.007803,0.010951,1.570063e+08,0.012255,0.010946,0.128117,...,-0.012856,0.614872,0.013828,0.332833,0.009394,0.047453,6.306525e+06,0.001528,0.741778,0.012353
1496,2015-09-30,10032,0.003450,0.002825,0.035149,-0.039745,1.566307e+08,0.000000,-0.039750,0.013147,...,-0.099030,1.816931,0.319809,0.214736,0.014715,-0.546169,7.071727e+06,0.001474,1.034485,0.015583
1497,2015-09-30,10104,0.004525,0.002126,0.017178,-0.034235,1.247125e+10,0.014873,-0.034240,-0.083623,...,0.223797,1.296457,0.129342,-0.648162,0.007818,-0.761677,5.998794e+08,0.000016,0.823032,0.010550
1498,2015-09-30,10107,0.011309,0.006029,0.018877,0.004440,3.251213e+10,0.028017,0.004435,0.088365,...,0.688890,0.796584,0.123791,0.320664,0.008896,-0.129808,1.559764e+09,0.000006,1.234306,0.014360
1499,2015-09-30,10138,0.005447,0.002529,0.018017,-0.033882,2.552828e+09,0.055543,-0.033887,-0.029998,...,0.516445,1.241626,0.323954,0.082674,0.005766,-0.140538,1.136964e+08,0.000066,1.009653,0.010671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53319,2022-09-30,93380,0.001168,-0.000532,0.063965,-0.072730,1.535784e+09,0.000000,-0.071182,0.379521,...,-0.338523,0.837692,0.103327,0.238461,0.034592,0.529944,7.238722e+07,0.000549,1.243914,0.030146
53320,2022-09-30,93419,0.000913,0.000349,0.000914,-0.088333,6.592271e+08,0.081465,-0.086785,-0.429028,...,-0.277155,0.968798,-0.130240,-0.850293,0.018514,0.347340,3.597777e+07,0.000578,0.960695,0.022964
53322,2022-09-30,93427,0.005992,0.003661,0.005266,0.059514,3.725360e+08,0.000000,0.061062,-0.115248,...,-0.101019,0.920422,0.023427,-0.198731,0.018748,1.068643,1.741167e+07,0.001244,0.921809,0.018366
53323,2022-09-30,93429,0.005812,0.001656,-0.003058,0.013447,1.533133e+09,0.016113,0.014994,0.019673,...,-0.319949,0.581937,0.300529,0.587520,0.012735,-0.207982,7.388176e+07,0.000178,0.622542,0.012583


# Machine Learning

In [10]:
# NOTE(review): the output saved under this cell comes from execution count 10,
# i.e. from before the load (17) and dropna (18) cells were last run; it still
# shows rows with a missing weight_diff. Re-run with Restart Kernel -> Run All
# so the displayed frame matches the cleaned `data`.
data

Unnamed: 0,date,permno,weight,weight_diff,ret,ret_exc,dolvol,div12m_me,ret_1_0,ret_6_1,...,ni_ar1,beta_60m,resff3_12_1,resff3_6_1,ivol_ff3_21d,iskew_ff3_21d,dolvol_126d,ami_126d,betadown_252d,rvolhl_21d
0,2015-06-30,10026,0.004940,,-0.001714,0.013661,1.077341e+08,0.012754,0.013670,0.012258,...,0.108591,0.639249,-0.045735,-0.314639,0.007934,0.162090,6.656049e+06,0.001587,0.751343,0.009442
1,2015-06-30,10032,0.003270,,0.002284,0.025720,1.402114e+08,0.000000,0.025730,0.064384,...,-0.282517,2.003287,0.188331,0.364191,0.007195,0.005530,6.809497e+06,0.001537,1.289511,0.010380
2,2015-06-30,10104,0.005695,,-0.002969,-0.020663,1.252593e+10,0.012155,-0.020654,0.038793,...,0.250553,1.344712,0.073771,0.030064,0.009298,-0.585758,6.035824e+08,0.000016,0.819437,0.009231
3,2015-06-30,10107,0.010017,,-0.004958,0.036109,3.385554e+10,0.026274,0.036118,-0.025024,...,0.537525,0.900599,0.219538,-0.025676,0.012656,0.904831,1.627370e+09,0.000006,1.263078,0.011568
4,2015-06-30,10138,0.005963,,0.003356,-0.003045,2.375793e+09,0.048478,-0.003036,-0.008304,...,0.374431,1.303486,0.187043,-0.067774,0.005177,-0.021765,1.127723e+08,0.000071,1.110027,0.006170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53320,2022-09-30,93419,0.000913,0.000349,0.000914,-0.088333,6.592271e+08,0.081465,-0.086785,-0.429028,...,-0.277155,0.968798,-0.130240,-0.850293,0.018514,0.347340,3.597777e+07,0.000578,0.960695,0.022964
53321,2022-09-30,93423,0.008836,0.005538,-0.023718,-0.061261,1.629095e+09,0.000000,-0.059713,-0.474020,...,-0.040343,2.154664,0.056040,-0.281275,0.033111,-0.726882,7.419395e+07,0.000430,1.582715,0.040376
53322,2022-09-30,93427,0.005992,0.003661,0.005266,0.059514,3.725360e+08,0.000000,0.061062,-0.115248,...,-0.101019,0.920422,0.023427,-0.198731,0.018748,1.068643,1.741167e+07,0.001244,0.921809,0.018366
53323,2022-09-30,93429,0.005812,0.001656,-0.003058,0.013447,1.533133e+09,0.016113,0.014994,0.019673,...,-0.319949,0.581937,0.300529,0.587520,0.012735,-0.207982,7.388176e+07,0.000178,0.622542,0.012583
