In [6]:
import datetime

import numpy as np
import pandas as pd
import scipy.stats
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor

from utils.ml_data_provider import SectoralDataProvider

In [28]:
data_provider = SectoralDataProvider(cache_path='/Users/j4yzer/PycharmProjects/VKR/data/sectoral_ml')
data : pd.DataFrame = data_provider.load_data()

data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d', utc=False)

# data = data.set_index('date')
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20675 entries, 0 to 20674
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   ticker                                   20675 non-null  object        
 1   date                                     20675 non-null  datetime64[ns]
 2   sector                                   20675 non-null  object        
 3   sectoralIndex                            20675 non-null  object        
 4   absoluteReturn                           19209 non-null  float64       
 5   relativeToSPReturn                       19209 non-null  float64       
 6   relativeToSectoralIndexReturn            19209 non-null  float64       
 7   nextPeriodRelativeToSectoralIndexReturn  19209 non-null  float64       
 8   closePrice                               20675 non-null  float64       
 9   dilutedEarningsPerShare                

Unnamed: 0,absoluteReturn,relativeToSPReturn,relativeToSectoralIndexReturn,nextPeriodRelativeToSectoralIndexReturn,closePrice,dilutedEarningsPerShare,freeCashFlowPerShare,bookValuePerShare,equityRatio,marketCap,...,priceToFreeCashFlow,priceToBookRatio,enterpriseValueToSalesRatio,currentRatio,roic,netCurrentAssetValuePerShare,ebitdaMargin,netIncomeMargin,returnOnAssets,returnOnEquity
count,19209.0,19209.0,19209.0,19209.0,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0,...,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0,20675.0
mean,inf,inf,inf,inf,48.993744,,0.547413,28.182981,0.432702,19226980000.0,...,31.681448,2.615234,7.88878,2.452396,0.084762,-inf,0.064115,-0.186277,0.051213,0.117414
std,,,,,63.675534,,42.691063,178.631231,0.217576,39170580000.0,...,1498.330549,64.835246,379.846695,10.321225,0.77219,,12.290531,17.683512,0.205015,2.513563
min,-1.0,-0.9505327,-0.9726245,-0.9726245,0.0,-inf,-3049.3913,-1021.3642,-2.0407,0.0,...,-16096.3679,-8717.45,-338.0,0.0,-60.0,-inf,-1395.0,-2014.0,-3.2546,-257.9736
25%,-0.06722966,-0.08005409,-0.07233059,-0.07233059,15.295,0.08712618,-0.0356,7.10645,0.3255,1906255000.0,...,0.0,1.1869,0.8793,1.1046,0.0218,-21.17023,0.115,0.0221,0.01645,0.03785
50%,0.03159665,0.005274651,0.005118437,0.005118437,32.28,0.4273292,0.3442,14.5949,0.4342,6308645000.0,...,13.1136,2.0846,1.7533,1.5929,0.0743,-8.463073,0.194,0.0762,0.0525,0.1258
75%,0.1392366,0.09811018,0.09195576,0.09195576,60.05,1.000837,1.0316,25.95245,0.55215,19768680000.0,...,24.3325,3.5527,3.1774,2.3719,0.1332,-1.483547,0.3452,0.1434,0.0907,0.2177
max,inf,inf,inf,inf,1777.23,inf,2217.0212,5743.3043,1.0,1041448000000.0,...,188750.58,926.2768,42707.4,945.062,45.8461,97.49637,219.0,136.6666,17.8544,97.2727


In [29]:
data = data.replace([-np.Inf, np.Inf], np.nan)
data = data.dropna()
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18791 entries, 1 to 20383
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   ticker                                   18791 non-null  object        
 1   date                                     18791 non-null  datetime64[ns]
 2   sector                                   18791 non-null  object        
 3   sectoralIndex                            18791 non-null  object        
 4   absoluteReturn                           18791 non-null  float64       
 5   relativeToSPReturn                       18791 non-null  float64       
 6   relativeToSectoralIndexReturn            18791 non-null  float64       
 7   nextPeriodRelativeToSectoralIndexReturn  18791 non-null  float64       
 8   closePrice                               18791 non-null  float64       
 9   dilutedEarningsPerShare                

Unnamed: 0,absoluteReturn,relativeToSPReturn,relativeToSectoralIndexReturn,nextPeriodRelativeToSectoralIndexReturn,closePrice,dilutedEarningsPerShare,freeCashFlowPerShare,bookValuePerShare,equityRatio,marketCap,...,priceToFreeCashFlow,priceToBookRatio,enterpriseValueToSalesRatio,currentRatio,roic,netCurrentAssetValuePerShare,ebitdaMargin,netIncomeMargin,returnOnAssets,returnOnEquity
count,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,...,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0,18791.0
mean,0.043209,0.019861,0.019986,0.019528,52.236485,0.6435,0.557155,21.262092,0.442125,20569560000.0,...,34.225321,2.752549,7.11436,2.369996,0.086677,-15.923386,0.145815,-0.10395,0.053325,0.123919
std,0.224688,0.199733,0.183917,0.182372,63.062626,3.378358,1.895245,36.291237,0.199597,39629720000.0,...,1570.554497,67.7371,348.464641,4.995833,0.54403,25.657805,7.562779,15.755655,0.140794,2.541372
min,-0.943074,-0.868436,-0.955658,-0.972624,0.01,-155.800325,-43.3746,-74.8954,-2.0259,4046290.0,...,-16096.3679,-8717.45,-323.1871,0.0,-9.7149,-294.656175,-732.0,-2014.0,-3.2546,-257.9736
25%,-0.067228,-0.081466,-0.073122,-0.072742,18.515,0.111079,-0.0276,7.7675,0.3339,2466253000.0,...,5.00085,1.3472,0.9877,1.1348,0.0265,-21.331817,0.1184,0.027,0.0202,0.0443
50%,0.031616,0.003875,0.004017,0.003972,35.05,0.453408,0.3738,15.2289,0.4381,7351087000.0,...,14.4214,2.2168,1.8809,1.6273,0.0769,-8.740189,0.1949,0.0778,0.0544,0.1297
75%,0.137967,0.09584,0.089772,0.089375,63.385,1.023903,1.053,26.36095,0.55275,21558740000.0,...,25.49635,3.70065,3.3117,2.41175,0.1346,-1.625384,0.3404,0.1427,0.0917,0.2188
max,2.849057,2.732176,2.62758,2.62758,1599.74,283.036184,31.563,2327.7253,1.0,505713600000.0,...,188750.58,926.2768,42707.4,301.0833,45.8461,77.369916,219.0,136.6666,8.8927,97.2727


In [30]:
data_by_sector = data.groupby('sector')
print(data_by_sector.groups.keys())
# for sector, sector_data in data_by_sector:
#     print(sector)
#     sector_data.info()

dict_keys(['Energy', 'Industrials', 'Materials'])


In [69]:
def drop_outliers_iqr(df, iqr_bound=(0.25, 0.75), IQR_k=7):
    Q1 = df.quantile(iqr_bound[0])
    Q3 = df.quantile(iqr_bound[1])
    IQR = Q3 - Q1
    return df[~((df < (Q1 - IQR_k * IQR)) | (df > (Q3 + IQR_k * IQR))).any(axis=1)]
def drop_outliers_quantile(df, keep_inside_quantiles_bounds = (0.01, 0.99)):
    q_l = df.quantile(keep_inside_quantiles_bounds[0])
    q_u = df.quantile(keep_inside_quantiles_bounds[1])
    print(keep_inside_quantiles_bounds)
    return df[~((df < q_l) | (df > q_u)).any(axis=1)]

def visualise_data(df: pd.DataFrame, remove_outliers=True, remove_outliers_function=drop_outliers_quantile, remove_outliers_kwargs = None):
    df_to_visualize = df.copy()
    if remove_outliers:
        df_to_visualize = remove_outliers_function(df_to_visualize, **remove_outliers_kwargs)
        print(df.index.size - df_to_visualize.index.size)
    df_to_visualize.hist(bins=50, figsize=(20, 15))
    fig, axes = plt.subplots(5, int(np.ceil(len(df_to_visualize.columns) / 5)), figsize=(20, 15))
    for i,el in enumerate(list(df_to_visualize.columns.values)):
        df_to_visualize.boxplot([el], ax=axes.flatten()[i])
    plt.tight_layout()
    plt.show()
def eval(model, features, labels):
    predictions = model.predict(features)
    errors = abs(predictions - labels)
    mape = 100 * np.mean(errors / labels)
    accuracy = 100 - mape
    print('Model Performance')
    stats = pd.DataFrame([('RMSE', mean_squared_error(labels, predictions, squared=False)),(
                  'MAE', mean_absolute_error(labels, predictions)),(
                  'R2', r2_score(labels, predictions))], columns=['Статистика', 'Значение'])
    print(stats)
    return stats

In [32]:
split_data_by_sector = {sector: train_test_split(sector_data.drop(['ticker', 'sector', 'sectoralIndex', 'date'], axis=1), test_size=0.2, shuffle=False) for sector, sector_data in data_by_sector}
train_data_by_sector = {k: v[0] for k, v in split_data_by_sector.items()}
test_data_by_sector = {k: v[1] for k, v in split_data_by_sector.items()}

# for sector, train_data in train_data_by_sector.items():
#     print(sector)
#     visualise_data(train_data, remove_outliers=False)

In [48]:
# for sector, train_data in train_data_by_sector.items():
#     print(sector)
#     visualise_data(train_data, remove_outliers=True, remove_outliers_function=drop_outliers_iqr, remove_outliers_kwargs={'IQR_k': 30})
len(data_by_sector.get_group('Materials')
    [(data_by_sector.get_group('Materials')['date'] >= '2003-1-19') & (data_by_sector.get_group('Materials')['date'] <= '2009-1-19')])

1738

In [50]:
log_transform = FunctionTransformer(np.log)
r_scaler = RobustScaler()
p_scaler = PowerTransformer()

long_tail_scaler = make_pipeline(p_scaler)
normal_scaler = make_pipeline(r_scaler)


train_data_by_sector = {k: drop_outliers_iqr(v, IQR_k=30) for k, v in train_data_by_sector.items()}
y_train_by_sector = {k: v['nextPeriodRelativeToSectoralIndexReturn'] for k, v in train_data_by_sector.items()}
X_train_by_sector = {k: v.drop(['nextPeriodRelativeToSectoralIndexReturn'], axis=1) for k, v in train_data_by_sector.items()}

long_tail_columns = ['closePrice', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'enterpriseValueToSalesRatio', 'currentRatio', 'netCurrentAssetValuePerShare']
noraml_columns = [k for k in X_train_by_sector[list(X_train_by_sector.keys())[0]].columns if k not in long_tail_columns]
prep_pipeline = ColumnTransformer([('lt', long_tail_scaler, long_tail_columns), ('normal', normal_scaler, noraml_columns)])

# X_train_by_sector_scaled = {k: pd.DataFrame(prep_pipeline.fit_transform(X=v), columns=v.columns, index=v.index) for k, v in X_train_by_sector.items()}
# for sector, X_train_scaled in X_train_by_sector_scaled.items():
#     print(sector)
#     visualise_data(X_train_scaled, remove_outliers=False)

In [60]:
from sklearn.model_selection import TimeSeriesSplit
def get_cross_val_row(cross_val_res, stat, algo, stat_metrics=['mean', 'std']):
    stat_map = {'r2': 'R2', 'neg_mean_absolute_error': 'MAE', 'neg_root_mean_squared_error': 'RMSE'}
    stat = stat_map[stat]
    stats = pd.Series(cross_val_res).describe().to_frame('Значение').reset_index().rename({'index': 'Показатель статистики'}, axis=1)
    stats.insert(0, 'Статистика', value=stat)
    stats.insert(0, 'Алгоритм', value=algo)
    return stats[stats['Показатель статистики'].isin(stat_metrics)]
scorings = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']

test_data_by_sector = {k: drop_outliers_iqr(v, IQR_k=30) for k, v in test_data_by_sector.items()}
y_test_by_sector = {k: v['nextPeriodRelativeToSectoralIndexReturn'] for k, v in test_data_by_sector.items()}
X_test_by_sector = {k: v.drop(['nextPeriodRelativeToSectoralIndexReturn'], axis=1) for k, v in test_data_by_sector.items()}


linear_regr_pipeline = make_pipeline(prep_pipeline, LinearRegression())

svr_pipeline = make_pipeline(prep_pipeline, SVR(C=1.2, gamma=1e-5))

random_forest_pipeline = make_pipeline(prep_pipeline, RandomForestRegressor(max_features=8, max_depth=5, n_estimators=300, bootstrap=True,random_state=42))

pipelines = [('Linear regr', linear_regr_pipeline), ('SVR', svr_pipeline)
    , ('Random forest', random_forest_pipeline)]


out_table = None
for sector, y_train_data in y_train_by_sector.items():
    print(sector)
    sector_data_size = len(data_by_sector.get_group(sector))
    X_train_data = X_train_by_sector[sector]
    X_test_data = X_test_by_sector[sector]
    y_test_data = y_test_by_sector[sector]
    methods_table_for_sector = pd.DataFrame(columns=['Алгоритм', 'Статистика','Показатель статистики', 'Значение'])
    for name, pipeline in pipelines:
        print(name)
        cv_num = 10
        for scoring in scorings:
            # TimeSeriesSplit(n_splits=5, test_size=int(sector_data_size * 0.035), max_train_size=int(sector_data_size * 0.15))
            scrs = -cross_val_score(pipeline, X_train_data, y_train_data,
                                      scoring=scoring, cv=TimeSeriesSplit(n_splits=5, test_size=int(sector_data_size * 0.035)))
            print(pd.Series(scrs).describe())
            methods_table_for_sector = pd.concat([methods_table_for_sector, get_cross_val_row(scrs, scoring, name)])

        est = pipeline.fit(X_train_data, y_train_data)

        print('train: ')
        eval(est, X_train_data, y_train_data)
        print('test: ')
        X_tst_h  = X_test_data.head(int(sector_data_size * 0.035))
        y_tst_h = y_test_data.head(int(sector_data_size * 0.035))
        eval(est, X_tst_h, y_tst_h)

    out = pd.Series(methods_table_for_sector['Значение'].array, index=pd.MultiIndex.from_frame(methods_table_for_sector.drop('Значение', axis=1))).to_frame(sector).transpose()
    if out_table is None:
        out_table = out
    else:
        out_table = pd.concat([out_table, out])
print(out_table)
out_table.to_excel('out_table_init_ts.xlsx')

pydev debugger: Unable to find real location for: C:\Users\j4yze\AppData\Local\Temp\ipykernel_7520\1743288561.py
pydev debugger: Unable to find real location for: C:\Users\j4yze\AppData\Local\Temp\ipykernel_7520\3668116294.py
pydev debugger: Unable to find real location for: C:\Users\j4yze\AppData\Local\Temp\ipykernel_7520\453956364.py


Energy
Linear regr
count    5.000000
mean     0.079419
std      0.053968
min      0.015884
25%      0.032126
50%      0.087126
75%      0.130674
max      0.131286
dtype: float64
count    5.000000
mean     0.100113
std      0.008394
min      0.089320
25%      0.096158
50%      0.097860
75%      0.108339
max      0.108885
dtype: float64
count    5.000000
mean     0.133694
std      0.009826
min      0.124751
25%      0.126508
50%      0.131232
75%      0.136837
max      0.149142
dtype: float64
train: 
Model Performance
  Статистика             Значение
0        MSE  0.14437982789863424
1        MAE  0.10308785862649283
2         R2  0.03649232026656679
test: 


  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iteritems():
  s = s[:max_items]
  for item in s.iter

KeyboardInterrupt: 

In [175]:
svr_pipeline

In [14]:
svm_param_grid = [{'svr__C': [0.1, 1, 1.2, 1.5, 5],
              'svr__gamma': [0.01, 0.001, 0.0001],
              'svr__kernel': ['rbf']},]

y_train_data = y_train_by_sector['Materials']
X_train_data = X_train_by_sector['Materials']
grid = GridSearchCV(svr_pipeline, svm_param_grid, scoring='r2', refit = True, verbose = 3, cv=3)
res = grid.fit(X_train_data, y_train_data)
print(res.best_params_)
print(res.best_score_)

Fitting 4 folds for each of 15 candidates, totalling 60 fits
[CV 1/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=1, svr__kernel=rbf;, score=-0.201 total time=   2.0s
[CV 2/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=1, svr__kernel=rbf;, score=-0.225 total time=   1.2s
[CV 3/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=1, svr__kernel=rbf;, score=-0.203 total time=   1.1s
[CV 4/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=1, svr__kernel=rbf;, score=-0.174 total time=   1.0s
[CV 1/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=0.1, svr__kernel=rbf;, score=-0.202 total time=   0.8s
[CV 2/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=0.1, svr__kernel=rbf;, score=-0.223 total time=   0.7s
[CV 3/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=0.1, svr__kernel=rbf;, score=-0.201 total time=   0.8s
[CV 4/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=0.1, svr__kernel=rbf;, score=-0.174 total time=   0.8s
[CV 1/4] END svr__C=0.1, svr__epsilon=0.1, svr__gamma=0.01, svr__kernel=rbf

In [61]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

from scipy.stats import expon

param_distribs_svr = {
        'svr__kernel': ['rbf'],
        'svr__C': uniform(0.01, 5),
        'svr__gamma': reciprocal(0.0001, 1)
    }
sector_data_size = len(data_by_sector.get_group('Energy'))

rnd_search_svr = RandomizedSearchCV(svr_pipeline,
                                param_distributions=param_distribs_svr,
                                n_iter=23, cv=TimeSeriesSplit(n_splits=7, test_size=int(sector_data_size * 0.035), max_train_size=int(sector_data_size * 0.15)),
                                scoring='neg_mean_absolute_error',
                                verbose=3,
                                random_state=42)
# for sector, y_train_data in y_train_by_sector.items():
#     print(sector)

y_train_data = y_train_by_sector['Energy']
X_train_data = X_train_by_sector['Energy']
res = rnd_search_svr.fit(X_train_data, y_train_data)
print(res.best_params_)
print(res.best_score_)
# C=1.2 gamma = 1e-05

Fitting 5 folds for each of 23 candidates, totalling 115 fits
[CV 1/5] END svr__C=1.8827005942368125, svr__gamma=0.6351221010640693, svr__kernel=rbf;, score=-0.083 total time=   0.0s
[CV 2/5] END svr__C=1.8827005942368125, svr__gamma=0.6351221010640693, svr__kernel=rbf;, score=-0.092 total time=   0.0s
[CV 3/5] END svr__C=1.8827005942368125, svr__gamma=0.6351221010640693, svr__kernel=rbf;, score=-0.093 total time=   0.0s
[CV 4/5] END svr__C=1.8827005942368125, svr__gamma=0.6351221010640693, svr__kernel=rbf;, score=-0.110 total time=   0.0s
[CV 5/5] END svr__C=1.8827005942368125, svr__gamma=0.6351221010640693, svr__kernel=rbf;, score=-0.105 total time=   0.0s
[CV 1/5] END svr__C=3.669969709057025, svr__gamma=0.024810409748678097, svr__kernel=rbf;, score=-0.100 total time=   0.0s
[CV 2/5] END svr__C=3.669969709057025, svr__gamma=0.024810409748678097, svr__kernel=rbf;, score=-0.110 total time=   0.0s
[CV 3/5] END svr__C=3.669969709057025, svr__gamma=0.024810409748678097, svr__kernel=rbf;,

In [62]:
from scipy.stats import randint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]
# Number of features to consider at every split
max_features = randint(2, 12)
# Maximum number of levels in tree
max_depth = randint(1, 12)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
param_distribs_rf = {
    'randomforestregressor__bootstrap': bootstrap,
    'randomforestregressor__max_depth': max_depth,
    'randomforestregressor__max_features': randint(low=1, high=10),
    'randomforestregressor__min_samples_leaf': min_samples_leaf,
    'randomforestregressor__min_samples_split': min_samples_split,
    'randomforestregressor__n_estimators': n_estimators
}

selected_features = ['absoluteReturn', 'relativeToSPReturn','relativeToSectoralIndexReturn', 'closePrice',
'freeCashFlowPerShare', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
X_train_by_sector_selected = {k: v[selected_features] for k, v in X_train_by_sector.items()}
X_test_by_sector_selected = {k: v[selected_features] for k, v in X_test_by_sector.items()}
long_tail_columns = ['closePrice', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
noraml_columns = [k for k in X_train_by_sector_selected[list(X_train_by_sector_selected.keys())[0]].columns if k not in long_tail_columns]

prep_pipeline = ColumnTransformer([('lt', long_tail_scaler, long_tail_columns), ('normal', normal_scaler, noraml_columns)])

random_forest_pipeline = make_pipeline(prep_pipeline, RandomForestRegressor())

rnd_searc_rf = RandomizedSearchCV(
    random_forest_pipeline, param_distributions=param_distribs_rf, n_iter=14, cv=TimeSeriesSplit(n_splits=7, test_size=int(sector_data_size * 0.035), max_train_size=int(sector_data_size * 0.15)),
    scoring='neg_mean_absolute_error', random_state=42, verbose=3)

y_train_data = y_train_by_sector['Energy']
X_train_data = X_train_by_sector['Energy']
res = rnd_searc_rf.fit(X_train_data, y_train_data)
print(res.best_params_)
print(res.best_score_)
# max_feat low, bootstrap?, n_of_est big, max_depth?

Fitting 7 folds for each of 14 candidates, totalling 98 fits
[CV 1/7] END randomforestregressor__bootstrap=True, randomforestregressor__max_depth=4, randomforestregressor__max_features=8, randomforestregressor__min_samples_leaf=1, randomforestregressor__min_samples_split=2, randomforestregressor__n_estimators=336;, score=-0.140 total time=   1.6s
[CV 2/7] END randomforestregressor__bootstrap=True, randomforestregressor__max_depth=4, randomforestregressor__max_features=8, randomforestregressor__min_samples_leaf=1, randomforestregressor__min_samples_split=2, randomforestregressor__n_estimators=336;, score=-0.114 total time=   1.7s
[CV 3/7] END randomforestregressor__bootstrap=True, randomforestregressor__max_depth=4, randomforestregressor__max_features=8, randomforestregressor__min_samples_leaf=1, randomforestregressor__min_samples_split=2, randomforestregressor__n_estimators=336;, score=-0.091 total time=   0.8s
[CV 4/7] END randomforestregressor__bootstrap=True, randomforestregressor__

In [21]:

param_distribs_rf = {
    'lasso__alpha': uniform(0.01, 3)
}

lasso_pipeline = make_pipeline(prep_pipeline, Lasso())
rnd_searc_lasso = RandomizedSearchCV(
    lasso_pipeline, param_distributions=param_distribs_rf, n_iter=20, cv=4,
    scoring='neg_root_mean_squared_error', random_state=42, verbose=3)

y_train_data = y_train_by_sector['Materials']
X_train_data = X_train_by_sector['Materials']
res = rnd_searc_lasso.fit(X_train_data, y_train_data)
print(res.best_params_)
print(res.best_score_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV 1/4] END ..lasso__alpha=1.1336203565420875;, score=-0.199 total time=   0.2s
[CV 2/4] END ..lasso__alpha=1.1336203565420875;, score=-0.224 total time=   0.1s
[CV 3/4] END ..lasso__alpha=1.1336203565420875;, score=-0.204 total time=   0.1s
[CV 4/4] END ..lasso__alpha=1.1336203565420875;, score=-0.175 total time=   0.0s
[CV 1/4] END ..lasso__alpha=2.8621429192297483;, score=-0.199 total time=   0.0s
[CV 2/4] END ..lasso__alpha=2.8621429192297483;, score=-0.224 total time=   0.0s
[CV 3/4] END ..lasso__alpha=2.8621429192297483;, score=-0.204 total time=   0.0s
[CV 4/4] END ..lasso__alpha=2.8621429192297483;, score=-0.175 total time=   0.0s
[CV 1/4] END ...lasso__alpha=2.205981825434215;, score=-0.199 total time=   0.0s
[CV 2/4] END ...lasso__alpha=2.205981825434215;, score=-0.224 total time=   0.0s
[CV 3/4] END ...lasso__alpha=2.205981825434215;, score=-0.204 total time=   0.0s
[CV 4/4] END ...lasso__alpha=2.205981825434215;,

In [25]:
lasso_feat_selection = SelectFromModel(estimator=Lasso(alpha=1.13))
rf_feat_selection = SelectFromModel(estimator=RandomForestRegressor(max_features=4, n_estimators=800, bootstrap=True))

for sector, y_train_data in y_train_by_sector.items():
    X_train_data = X_train_by_sector[sector]
    print(lasso_feat_selection.fit(X=X_train_data, y=y_train_data).get_feature_names_out())
    print(rf_feat_selection.fit(X=X_train_data, y=y_train_data).get_feature_names_out())

[]
['absoluteReturn' 'relativeToSPReturn' 'relativeToSectoralIndexReturn'
 'closePrice' 'marketCap' 'currentRatio' 'netCurrentAssetValuePerShare']
[]
['absoluteReturn' 'relativeToSPReturn' 'relativeToSectoralIndexReturn'
 'closePrice' 'marketCap']
[]
['absoluteReturn' 'relativeToSPReturn' 'closePrice'
 'dilutedEarningsPerShare' 'freeCashFlowPerShare' 'bookValuePerShare'
 'marketCap' 'priceToSalesRatio']


In [65]:
selected_features = ['closePrice',
'freeCashFlowPerShare', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
X_train_by_sector_selected = {k: v[selected_features] for k, v in X_train_by_sector.items()}
X_test_by_sector_selected = {k: v[selected_features] for k, v in X_test_by_sector.items()}
long_tail_columns = ['closePrice', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
noraml_columns = [k for k in X_train_by_sector_selected[list(X_train_by_sector_selected.keys())[0]].columns if k not in long_tail_columns]

prep_pipeline = ColumnTransformer([('lt', long_tail_scaler, long_tail_columns), ('normal', normal_scaler, noraml_columns)])


linear_regr_pipeline = make_pipeline(prep_pipeline, LinearRegression())
svr_pipeline = make_pipeline(prep_pipeline, SVR(C=1.2, gamma=1e-5))

random_forest_pipeline = make_pipeline(prep_pipeline, RandomForestRegressor(max_features=8, max_depth=4, n_estimators=300, bootstrap=True,random_state=42))

pipelines = [('Linear regr', linear_regr_pipeline), ('SVR', svr_pipeline)
    , ('Random forest', random_forest_pipeline)]

out_table = None
for sector, y_train_data in y_train_by_sector.items():
    print(sector)
    sector_data_size = len(data_by_sector.get_group(sector))
    X_train_data = X_train_by_sector_selected[sector]
    X_test_data = X_test_by_sector_selected[sector]
    y_test_data = y_test_by_sector[sector]
    methods_table_for_sector = pd.DataFrame(columns=['Алгоритм', 'Статистика','Показатель статистики', 'Значение'])
    for name, pipeline in pipelines:
        print(name)
        cv_num = 10
        # rmses = -cross_val_score(pipeline, X_train_data, y_train_data,
        #                           scoring="r2", cv=TimeSeriesSplit(n_splits=5, test_size=int(X_train_data.index.size * 0.05), max_train_size=int(X_train_data.index.size / 6)))
        # print(pd.Series(rmses).describe())
        #
        # maes = -cross_val_score(pipeline, X_train_data, y_train_data,
        #                           scoring="neg_mean_absolute_error", cv=TimeSeriesSplit(n_splits=5, test_size=int(X_train_data.index.size * 0.05), max_train_size=int(X_train_data.index.size / 6)))
        # print(pd.Series(maes).describe())
        #
        #  mses = -cross_val_score(pipeline, X_train_data, y_train_data,
        #                           scoring="neg_root_mean_squared_error", cv=TimeSeriesSplit(n_splits=5, test_size=int(X_train_data.index.size * 0.05), max_train_size=int(X_train_data.index.size / 6)))
        # print(pd.Series(mses).describe())

        for scoring in scorings:
            scrs = -cross_val_score(pipeline, X_train_data, y_train_data,
                                      scoring=scoring, cv=5)
            print(pd.Series(scrs).describe())
            methods_table_for_sector = pd.concat([methods_table_for_sector, get_cross_val_row(scrs, scoring, name)])

        est = pipeline.fit(X_train_data, y_train_data)

        print('train: ')
        eval(est, X_train_data, y_train_data)
        print('test: ')
        eval(est, X_test_data, y_test_data)

    out = pd.Series(methods_table_for_sector['Значение'].array, index=pd.MultiIndex.from_frame(methods_table_for_sector.drop('Значение', axis=1))).to_frame(sector).transpose()
    if out_table is None:
        out_table = out
    else:
        out_table = pd.concat([out_table, out])
print(out_table)
out_table.to_excel('out_table_tuned_selected_cs.xlsx')


Energy
Linear regr
count    5.000000
mean    -0.002033
std      0.045217
min     -0.045329
25%     -0.013856
50%     -0.012972
75%     -0.012947
max      0.074941
dtype: float64
count    5.000000
mean     0.104187
std      0.012603
min      0.088687
25%      0.099260
50%      0.100822
75%      0.109905
max      0.122263
dtype: float64
count    5.000000
mean     0.145369
std      0.020332
min      0.121072
25%      0.134037
50%      0.140139
75%      0.159769
max      0.171829
dtype: float64
train: 
Model Performance
  Статистика             Значение
0        MSE  0.14522948460322932
1        MAE  0.10328220618572871
2         R2  0.02511871497636753
test: 
Model Performance
  Статистика              Значение
0        MSE   0.31179340142010314
1        MAE   0.18191767794534552
2         R2  0.037008013550086316
SVR
count    5.000000
mean    -0.001820
std      0.007309
min     -0.008066
25%     -0.004854
50%     -0.004594
75%     -0.002318
max      0.010730
dtype: float64
count    5.000

In [70]:
selected_features = ['absoluteReturn', 'relativeToSPReturn', 'relativeToSectoralIndexReturn', 'closePrice',
                     'freeCashFlowPerShare', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
X_train_by_sector_selected = {k: v[selected_features] for k, v in X_train_by_sector.items()}
X_test_by_sector_selected = {k: v[selected_features] for k, v in X_test_by_sector.items()}
long_tail_columns = ['closePrice', 'bookValuePerShare', 'marketCap', 'priceToSalesRatio', 'currentRatio']
noraml_columns = [k for k in X_train_by_sector_selected[list(X_train_by_sector_selected.keys())[0]].columns if
                  k not in long_tail_columns]

prep_pipeline = ColumnTransformer(
    [('lt', long_tail_scaler, long_tail_columns), ('normal', normal_scaler, noraml_columns)])

linear_regr_pipeline = make_pipeline(prep_pipeline, LinearRegression())
svr_pipeline = make_pipeline(prep_pipeline, SVR(C=1.2, gamma=1e-5))

random_forest_pipeline = make_pipeline(prep_pipeline,
                                       RandomForestRegressor(max_features=8, max_depth=4, n_estimators=300,
                                                             bootstrap=True, random_state=42))

pipelines = [('Linear regr', linear_regr_pipeline), ('SVR', svr_pipeline)
    , ('Random forest', random_forest_pipeline)]

out_table = None
for sector, y_train_data in y_train_by_sector.items():
    print(sector)
    X_train_data = X_train_by_sector_selected[sector]
    X_test_data = X_test_by_sector_selected[sector]
    y_test_data = y_test_by_sector[sector]
    methods_table_for_sector = pd.DataFrame(columns=['Алгоритм', 'Сет', 'Статистика', 'Значение'])
    for name, pipeline in pipelines:
        print(name)

        est = pipeline.fit(X_train_data, y_train_data)

        print('train: ')
        stats_train = eval(est, X_train_data, y_train_data)
        stats_train.insert(0, 'Сет', 'train')
        print('test: ')
        stats_test = eval(est, X_test_data, y_test_data)
        stats_test.insert(0, 'Сет', 'test')

        stats_for_algo = pd.concat([stats_train, stats_test])
        stats_for_algo.insert(0, 'Алгоритм', name)
        methods_table_for_sector = pd.concat([methods_table_for_sector, stats_for_algo])


    out = pd.Series(methods_table_for_sector['Значение'].array,
                    index=pd.MultiIndex.from_frame(methods_table_for_sector.drop('Значение', axis=1))).to_frame(
        sector).transpose()
    if out_table is None:
        out_table = out
    else:
        out_table = pd.concat([out_table, out])
print(out_table)
out_table.to_excel('out_table_tuned_selected_train_test.xlsx')

Energy
Linear regr
train: 
Model Performance
  Статистика  Значение
0       RMSE  0.144696
1        MAE  0.103085
2         R2  0.032268
test: 
Model Performance
  Статистика  Значение
0       RMSE  0.315610
1        MAE  0.185252
2         R2  0.013285
SVR
train: 
Model Performance
  Статистика  Значение
0       RMSE  0.145924
1        MAE  0.103411
2         R2  0.015767
test: 
Model Performance
  Статистика  Значение
0       RMSE  0.318998
1        MAE  0.183976
2         R2 -0.008010
Random forest
train: 
Model Performance
  Статистика  Значение
0       RMSE  0.137937
1        MAE  0.100445
2         R2  0.120564
test: 
Model Performance
  Статистика  Значение
0       RMSE  0.295719
1        MAE  0.178578
2         R2  0.133743
Industrials
Linear regr
train: 
Model Performance
  Статистика  Значение
0       RMSE  0.126771
1        MAE  0.085646
2         R2  0.034593
test: 
Model Performance
  Статистика  Значение
0       RMSE  0.117086
1        MAE  0.087518
2         R2 -0.005379