In [75]:
import typing
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor

In [76]:
# Load the per-company feature table (first CSV column becomes the index),
# drop the columns that are not used as model inputs, then discard every row
# that still has a missing value.
data = (
    pd.read_csv('data/financial_data.csv', index_col=0)
    .drop(columns=['price_to_sales', 'target', 'SP_target'])
    .dropna()
)

# Attach each company's GICS sector by matching the feature index to 'Symbol'.
data_groups = pd.read_csv('data/S&P500-Info.csv', index_col=0)
merged = data.merge(
    data_groups[['Symbol', 'GICS Sector']],
    left_index=True,
    right_on='Symbol',
)

# Model inputs keep 'GICS Sector' (it is needed for the per-sector scaling
# further down); the regression label is split off into `target`.
X = merged.drop(columns=['Symbol', 'regression_target'])
target = merged['regression_target']

In [77]:
# 70/30 split with a fixed seed; the labels are passed as a plain array via
# `target.values`, so y_train / y_test carry no index.
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X,
    target.values,
    test_size=0.3,
    random_state=42,
)

In [78]:
# Per-sector z-scoring. Means and standard deviations are estimated on the
# training rows only and then applied to both splits, so no test information
# leaks into the scaling.
#
# The statistics are looked up by each row's sector label and aligned
# positionally. This keeps every X_test row in the same order as y_test,
# which a chain of DataFrame.merge calls does not guarantee on every pandas
# version, and it avoids hard-coding the number of feature columns.
SECTOR_COL = 'GICS Sector'
feature_cols = [col for col in X_train_df.columns if col != SECTOR_COL]

sector_groups = X_train_df.groupby(SECTOR_COL)[feature_cols]
sector_mean = sector_groups.mean()
sector_std = sector_groups.std()


def standardize_by_sector(frame):
    """Z-score the feature columns of `frame` with training-set sector stats.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain `SECTOR_COL` and every column in `feature_cols`.

    Returns
    -------
    numpy.ndarray
        One row per row of `frame`, in the same order. Rows whose sector was
        not seen in the training split come out as NaN.
    """
    sectors = frame[SECTOR_COL].values
    # reindex repeats the per-sector statistics once per row, in row order.
    row_mean = sector_mean.reindex(sectors).values
    row_std = sector_std.reindex(sectors).values
    return (frame[feature_cols].values - row_mean) / row_std


X_train = standardize_by_sector(X_train_df)
X_test = standardize_by_sector(X_test_df)

# Guard against features and labels drifting out of step.
assert X_train.shape[0] == len(y_train)
assert X_test.shape[0] == len(y_test)

# Regression 

#### LinearRegression

In [6]:
# Ordinary least squares baseline; `fit` hands back the fitted estimator.
reg_model = LinearRegression().fit(X_train, y_train)
pred = reg_model.predict(X_test)

In [7]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, reg_model.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.028117
Test MSE: 0.052553


#### Ridge

In [8]:
# L2-regularised linear regression with penalty strength alpha=2.0.
ridge_model = Ridge(alpha=2.0).fit(X_train, y_train)
pred = ridge_model.predict(X_test)

In [9]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, ridge_model.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.028117
Test MSE: 0.052999


#### SGDRegressor - l2

In [10]:
# Linear model fitted by stochastic gradient descent with an L2 penalty.
# The loss is left at SGDRegressor's default (squared error): the explicit
# name 'squared_loss' was renamed to 'squared_error' in scikit-learn 1.0 and
# removed in 1.2, so relying on the default works on every version.
SGD_model_l2 = SGDRegressor(penalty='l2', alpha=20.0, random_state=42)
SGD_model_l2.fit(X_train, y_train)
pred = SGD_model_l2.predict(X_test)

In [11]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, SGD_model_l2.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.031143
Test MSE: 0.03138


#### SGDRegressor - l1

In [12]:
# Linear model fitted by stochastic gradient descent with an L1 penalty.
# The loss is left at SGDRegressor's default (squared error): the explicit
# name 'squared_loss' was renamed to 'squared_error' in scikit-learn 1.0 and
# removed in 1.2, so relying on the default works on every version.
SGD_model_l1 = SGDRegressor(penalty='l1', alpha=2.0, random_state=42)
SGD_model_l1.fit(X_train, y_train)
pred = SGD_model_l1.predict(X_test)

In [13]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, SGD_model_l1.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.031935
Test MSE: 0.031524


#### RandomForestRegressor

In [14]:
# Shallow random forest whose splits minimise absolute error.
# random_state is fixed (42, as for the other models) so that the bootstrap
# samples, and therefore the reported scores, are reproducible on re-run.
# NOTE(review): scikit-learn 1.0 renamed criterion 'mae' to 'absolute_error'
# and 1.2 removed the old spelling; update the string when upgrading.
RF = RandomForestRegressor(
    criterion='mae',
    n_estimators=100,
    min_samples_leaf=1,
    max_depth=2,
    random_state=42,
)
RF.fit(X_train, y_train)
pred = RF.predict(X_test)

In [15]:
# The forest above is trained with an absolute-error split criterion, but the
# value reported here is the mean squared error (same metric as every other
# model in this notebook), so the labels say MSE rather than MAE.
print(f'Train MSE: {mean_squared_error(y_train, RF.predict(X_train)):.5}')
print(f'Test MSE: {mean_squared_error(y_test, pred):.5}')

Train MAE: 0.023891
Test MAE: 0.033057


In [16]:
# Shallow random forest with the default squared-error split criterion.
# The criterion is left at its default because the explicit name 'mse' was
# renamed to 'squared_error' in scikit-learn 1.0 and removed in 1.2; the
# default selects the same criterion on every version.
# random_state is fixed (42, as for the other models) so that the bootstrap
# samples, and therefore the reported scores, are reproducible on re-run.
RF = RandomForestRegressor(
    n_estimators=150,
    min_samples_leaf=1,
    max_depth=2,
    random_state=42,
)
RF.fit(X_train, y_train)
pred = RF.predict(X_test)

In [17]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, RF.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.023479
Test MSE: 0.032268


#### XGBoost

In [18]:
# Gradient-boosted depth-1 trees (stumps) on a squared-error objective.
XGB_model = xg.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    max_depth=1,
    learning_rate=0.05,
    seed=42,
).fit(X_train, y_train)
pred = XGB_model.predict(X_test)

In [19]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, XGB_model.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.025938
Test MSE: 0.033233


#### CatBoost

In [80]:
# CatBoost with depth-2 trees optimising absolute error; training log muted.
CB_model = CatBoostRegressor(
    learning_rate=1e-5,
    loss_function='MAE',
    depth=2,
    random_seed=42,
).fit(X_train, y_train, verbose=False)
pred = CB_model.predict(X_test)

In [81]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, CB_model.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.031839
Test MSE: 0.031351


In [82]:
# CatBoost with depth-2 trees optimising RMSE; training log muted.
CB_model = CatBoostRegressor(
    learning_rate=1e-5,
    loss_function='RMSE',
    depth=2,
    random_seed=42,
).fit(X_train, y_train, verbose=False)
pred = CB_model.predict(X_test)

In [83]:
# MSE is symmetric in its two arguments; they are passed as (y_true, y_pred).
train_mse = mean_squared_error(y_train, CB_model.predict(X_train))
test_mse = mean_squared_error(y_test, pred)
print(f'Train MSE: {train_mse:.5}')
print(f'Test MSE: {test_mse:.5}')

Train MSE: 0.03183
Test MSE: 0.031375
