In [21]:
import typing
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

In [3]:
data = pd.read_csv('data/financial_data.csv', index_col=0)
data = data.drop(['price_to_sales', 'target', 'SP_target'], axis=1)
data = data.dropna()

data_groups = pd.read_csv('data/S&P500-Info.csv', index_col=0)
merged = data.merge(data_groups[['Symbol', 'GICS Sector']], left_index=True, right_on='Symbol')
X = merged.drop('Symbol', axis=1).drop('regression_target', axis=1)
target = merged.regression_target

In [4]:
X_train_df, X_test_df, y_train, y_test = train_test_split(X, target.values, test_size=0.3, random_state=42)

In [5]:
X_train = X_train_df.groupby('GICS Sector').transform(lambda x: (x - x.mean()) / x.std()).values
X_test_mean = X_test_df.merge(X_train_df.groupby('GICS Sector').mean(), left_on='GICS Sector', right_index=True)
X_test_mean_std = X_test_mean.merge(X_train_df.groupby('GICS Sector').std(), left_on='GICS Sector', right_index=True)
X_test_mean_std = X_test_mean_std.drop('GICS Sector', axis=1)
X_test = (X_test_mean_std.iloc[:, :9].values - X_test_mean_std.iloc[:, 9:18].values)/X_test_mean_std.iloc[:, 18:].values

# Regression 

#### LinearRegression

In [6]:
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
pred = reg_model.predict(X_test)

In [7]:
print(f'Train MSE: {mean_squared_error(reg_model.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.028117
Test MSE: 0.052553


#### Ridge

In [8]:
ridge_model = Ridge(alpha=2.0)
ridge_model.fit(X_train, y_train)
pred = ridge_model.predict(X_test)

In [9]:
print(f'Train MSE: {mean_squared_error(ridge_model.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.028117
Test MSE: 0.052999


#### SGDRegressor - l2

In [10]:
SGD_model_l2 = SGDRegressor(loss='squared_loss', penalty='l2', alpha=20.0, random_state=42)
SGD_model_l2.fit(X_train, y_train)
pred = SGD_model_l2.predict(X_test)

In [11]:
print(f'Train MSE: {mean_squared_error(SGD_model_l2.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.031143
Test MSE: 0.03138


#### SGDRegressor - l1

In [12]:
SGD_model_l1 = SGDRegressor(loss='squared_loss', penalty='l1', alpha=2.0, random_state=42)
SGD_model_l1.fit(X_train, y_train)
pred = SGD_model_l1.predict(X_test)

In [13]:
print(f'Train MSE: {mean_squared_error(SGD_model_l1.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.031935
Test MSE: 0.031524


#### RandomForestRegressor

In [14]:
RF = RandomForestRegressor(criterion='mae', n_estimators=100, min_samples_leaf=1, max_depth=2)
RF.fit(X_train, y_train)
pred = RF.predict(X_test)

In [15]:
print(f'Train MAE: {mean_squared_error(RF.predict(X_train), y_train):.5}')
print(f'Test MAE: {mean_squared_error(pred, y_test):.5}')

Train MAE: 0.023891
Test MAE: 0.033057


In [16]:
RF = RandomForestRegressor(criterion='mse', n_estimators=150, min_samples_leaf=1, max_depth=2)
RF.fit(X_train, y_train)
pred = RF.predict(X_test)

In [17]:
print(f'Train MSE: {mean_squared_error(RF.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.023479
Test MSE: 0.032268


#### XGBoost

In [18]:
XGB_model = xg.XGBRegressor(objective ='reg:squarederror', n_estimators=50, max_depth=1, learning_rate=0.05, seed=42)
XGB_model.fit(X_train, y_train)
pred = XGB_model.predict(X_test)

In [19]:
print(f'Train MSE: {mean_squared_error(XGB_model.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.025938
Test MSE: 0.033233


#### CatBoost

In [72]:
CB_model = CatBoostRegressor(learning_rate=1e-5, loss_function='MAE', depth=2, random_seed=42)
CB_model.fit(X_train, y_train)
pred = CB_model.predict(X_test)

0:	learn: 0.1448018	total: 318us	remaining: 318ms
1:	learn: 0.1448016	total: 662us	remaining: 330ms
2:	learn: 0.1448015	total: 959us	remaining: 319ms
3:	learn: 0.1448013	total: 1.21ms	remaining: 302ms
4:	learn: 0.1448011	total: 1.45ms	remaining: 289ms
5:	learn: 0.1448009	total: 1.72ms	remaining: 286ms
6:	learn: 0.1448007	total: 2.04ms	remaining: 290ms
7:	learn: 0.1448006	total: 2.35ms	remaining: 291ms
8:	learn: 0.1448004	total: 2.63ms	remaining: 290ms
9:	learn: 0.1448003	total: 2.9ms	remaining: 287ms
10:	learn: 0.1448001	total: 3.14ms	remaining: 283ms
11:	learn: 0.1447999	total: 3.41ms	remaining: 281ms
12:	learn: 0.1447997	total: 3.69ms	remaining: 281ms
13:	learn: 0.1447995	total: 3.96ms	remaining: 279ms
14:	learn: 0.1447994	total: 4.25ms	remaining: 279ms
15:	learn: 0.1447992	total: 4.5ms	remaining: 277ms
16:	learn: 0.1447991	total: 4.81ms	remaining: 278ms
17:	learn: 0.1447989	total: 5.07ms	remaining: 276ms
18:	learn: 0.1447987	total: 5.37ms	remaining: 278ms
19:	learn: 0.1447985	total:

583:	learn: 0.1447083	total: 145ms	remaining: 103ms
584:	learn: 0.1447082	total: 145ms	remaining: 103ms
585:	learn: 0.1447080	total: 146ms	remaining: 103ms
586:	learn: 0.1447078	total: 146ms	remaining: 103ms
587:	learn: 0.1447076	total: 146ms	remaining: 102ms
588:	learn: 0.1447074	total: 146ms	remaining: 102ms
589:	learn: 0.1447073	total: 147ms	remaining: 102ms
590:	learn: 0.1447071	total: 147ms	remaining: 102ms
591:	learn: 0.1447069	total: 147ms	remaining: 101ms
592:	learn: 0.1447067	total: 147ms	remaining: 101ms
593:	learn: 0.1447065	total: 148ms	remaining: 101ms
594:	learn: 0.1447064	total: 148ms	remaining: 101ms
595:	learn: 0.1447062	total: 148ms	remaining: 100ms
596:	learn: 0.1447061	total: 148ms	remaining: 100ms
597:	learn: 0.1447059	total: 149ms	remaining: 100ms
598:	learn: 0.1447058	total: 149ms	remaining: 99.7ms
599:	learn: 0.1447056	total: 149ms	remaining: 99.5ms
600:	learn: 0.1447055	total: 149ms	remaining: 99.2ms
601:	learn: 0.1447054	total: 150ms	remaining: 99ms
602:	learn

In [73]:
print(f'Train MSE: {mean_squared_error(CB_model.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.031839
Test MSE: 0.031351


In [74]:
CB_model = CatBoostRegressor(learning_rate=1e-5, loss_function='RMSE', depth=2, random_seed=42)
CB_model.fit(X_train, y_train)
pred = CB_model.predict(X_test)

0:	learn: 0.1785870	total: 311us	remaining: 311ms
1:	learn: 0.1785868	total: 829us	remaining: 414ms
2:	learn: 0.1785866	total: 1.12ms	remaining: 373ms
3:	learn: 0.1785864	total: 1.48ms	remaining: 369ms
4:	learn: 0.1785863	total: 1.81ms	remaining: 360ms
5:	learn: 0.1785860	total: 2.14ms	remaining: 354ms
6:	learn: 0.1785859	total: 2.49ms	remaining: 353ms
7:	learn: 0.1785858	total: 2.75ms	remaining: 341ms
8:	learn: 0.1785855	total: 3.02ms	remaining: 333ms
9:	learn: 0.1785853	total: 3.29ms	remaining: 326ms
10:	learn: 0.1785851	total: 3.62ms	remaining: 326ms
11:	learn: 0.1785850	total: 3.92ms	remaining: 323ms
12:	learn: 0.1785848	total: 4.24ms	remaining: 322ms
13:	learn: 0.1785846	total: 4.55ms	remaining: 320ms
14:	learn: 0.1785845	total: 4.8ms	remaining: 315ms
15:	learn: 0.1785843	total: 5.05ms	remaining: 310ms
16:	learn: 0.1785841	total: 5.29ms	remaining: 306ms
17:	learn: 0.1785839	total: 5.53ms	remaining: 302ms
18:	learn: 0.1785837	total: 5.79ms	remaining: 299ms
19:	learn: 0.1785834	tota

639:	learn: 0.1784733	total: 150ms	remaining: 84.3ms
640:	learn: 0.1784731	total: 150ms	remaining: 84.1ms
641:	learn: 0.1784729	total: 150ms	remaining: 83.9ms
642:	learn: 0.1784727	total: 151ms	remaining: 83.6ms
643:	learn: 0.1784725	total: 151ms	remaining: 83.4ms
644:	learn: 0.1784722	total: 151ms	remaining: 83.2ms
645:	learn: 0.1784721	total: 151ms	remaining: 82.9ms
646:	learn: 0.1784719	total: 152ms	remaining: 82.7ms
647:	learn: 0.1784717	total: 152ms	remaining: 82.5ms
648:	learn: 0.1784716	total: 152ms	remaining: 82.2ms
649:	learn: 0.1784713	total: 152ms	remaining: 82ms
650:	learn: 0.1784711	total: 153ms	remaining: 81.8ms
651:	learn: 0.1784709	total: 153ms	remaining: 81.5ms
652:	learn: 0.1784708	total: 153ms	remaining: 81.3ms
653:	learn: 0.1784706	total: 153ms	remaining: 81.1ms
654:	learn: 0.1784704	total: 153ms	remaining: 80.8ms
655:	learn: 0.1784703	total: 154ms	remaining: 80.6ms
656:	learn: 0.1784701	total: 154ms	remaining: 80.4ms
657:	learn: 0.1784699	total: 154ms	remaining: 80

In [71]:
print(f'Train MSE: {mean_squared_error(CB_model.predict(X_train), y_train):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test):.5}')

Train MSE: 0.03183
Test MSE: 0.031375
