In [282]:
import pandas as pd
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor, CatBoostClassifier

In [2]:
data = pd.read_csv('data/financial_data.csv', index_col=0)
data = data.drop(['price_to_sales', 'target'], axis=1)
data = data.dropna()

data_groups = pd.read_csv('data/S&P500-Info.csv', index_col=0)
merged = data.merge(data_groups[['Symbol', 'GICS Sector']], left_index=True, right_on='Symbol')
X = merged.drop('Symbol', axis=1).drop(['regression_target', 'SP_target'], axis=1)
target_reg = merged.regression_target
target_class = merged.SP_target * 1

In [3]:
X_train_df, X_test_df, y_train_reg, y_test_reg = train_test_split(X, target_reg.values, test_size=0.3, random_state=42)
_, _, y_train_class, y_test_class = train_test_split(X, target_class.values, test_size=0.3, random_state=42)

In [4]:
X_train = X_train_df.groupby('GICS Sector').transform(lambda x: (x - x.mean()) / x.std()).values
X_test_mean = X_test_df.merge(X_train_df.groupby('GICS Sector').mean(), left_on='GICS Sector', right_index=True)
X_test_mean_std = X_test_mean.merge(X_train_df.groupby('GICS Sector').std(), left_on='GICS Sector', right_index=True)
X_test_mean_std = X_test_mean_std.drop('GICS Sector', axis=1)
X_test = (X_test_mean_std.iloc[:, :9].values - X_test_mean_std.iloc[:, 9:18].values)/X_test_mean_std.iloc[:, 18:].values

# Regression 

#### LinearRegression

In [5]:
reg_model = LinearRegression()
reg_model.fit(X_train, y_train_reg)
pred = reg_model.predict(X_test)

In [6]:
print(f'Train MSE: {mean_squared_error(reg_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.028117
Test MSE: 0.052553


#### Ridge

In [7]:
ridge_model = Ridge(alpha=2.0, random_state=42)
ridge_model.fit(X_train, y_train_reg)
pred = ridge_model.predict(X_test)

In [8]:
print(f'Train MSE: {mean_squared_error(ridge_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.028117
Test MSE: 0.052999


#### Lasso

In [9]:
lasso_model = Lasso(alpha=2.0, random_state=42)
lasso_model.fit(X_train, y_train_reg)
pred = lasso_model.predict(X_test)

In [10]:
print(f'Train MSE: {mean_squared_error(lasso_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.031893
Test MSE: 0.031381


#### ElasticNet

In [11]:
elastic_model = ElasticNet(alpha=2.0, random_state=42)
elastic_model.fit(X_train, y_train_reg)
pred = elastic_model.predict(X_test)

In [12]:
print(f'Train MSE: {mean_squared_error(elastic_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.031893
Test MSE: 0.031381


#### SGDRegressor - l2

In [13]:
SGD_model_l2 = SGDRegressor(loss='squared_loss', penalty='l2', alpha=20.0, random_state=42)
SGD_model_l2.fit(X_train, y_train_reg)
pred = SGD_model_l2.predict(X_test)

In [14]:
print(f'Train MSE: {mean_squared_error(SGD_model_l2.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.031143
Test MSE: 0.03138


#### SGDRegressor - l1

In [15]:
SGD_model_l1 = SGDRegressor(loss='squared_loss', penalty='l1', alpha=2.0, random_state=42)
SGD_model_l1.fit(X_train, y_train_reg)
pred = SGD_model_l1.predict(X_test)

In [16]:
print(f'Train MSE: {mean_squared_error(SGD_model_l1.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.031935
Test MSE: 0.031524


#### RandomForestRegressor

In [17]:
RF = RandomForestRegressor(criterion='mae', n_estimators=100, min_samples_leaf=1, max_depth=2, random_state=42)
RF.fit(X_train, y_train_reg)
pred = RF.predict(X_test)

In [18]:
print(f'Train MSE: {mean_squared_error(RF.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.023802
Test MSE: 0.032969


In [19]:
RF = RandomForestRegressor(criterion='mse', n_estimators=150, min_samples_leaf=1, max_depth=2, random_state=42)
RF.fit(X_train, y_train_reg)
pred = RF.predict(X_test)

In [20]:
print(f'Train MSE: {mean_squared_error(RF.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.023353
Test MSE: 0.032984


#### XGBoost

In [21]:
XGB_model = xg.XGBRegressor(objective ='reg:squarederror', n_estimators=50, max_depth=1, learning_rate=0.05, seed=42)
XGB_model.fit(X_train, y_train_reg)
pred = XGB_model.predict(X_test)

In [22]:
print(f'Train MSE: {mean_squared_error(XGB_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.025938
Test MSE: 0.033233


#### CatBoost

In [23]:
CB_model = CatBoostRegressor(learning_rate=1e-5, loss_function='MAE', depth=2, random_seed=42)
CB_model.fit(X_train, y_train_reg, verbose=False)
pred = CB_model.predict(X_test)

In [24]:
print(f'Train MSE: {mean_squared_error(CB_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.031839
Test MSE: 0.031351


In [25]:
CB_model = CatBoostRegressor(learning_rate=1e-5, loss_function='RMSE', depth=2, random_seed=42)
CB_model.fit(X_train, y_train_reg, verbose=False)
pred = CB_model.predict(X_test)

In [26]:
print(f'Train MSE: {mean_squared_error(CB_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.03183
Test MSE: 0.031375


#### NN

In [27]:
NN_model = MLPRegressor(hidden_layer_sizes=(50, 100, 50), learning_rate_init=0.1, solver='adam', random_state=42)
NN_model.fit(X_train, y_train_reg)
pred = NN_model.predict(X_test)

In [28]:
print(f'Train MSE: {mean_squared_error(NN_model.predict(X_train), y_train_reg):.5}')
print(f'Test MSE: {mean_squared_error(pred, y_test_reg):.5}')

Train MSE: 0.030467
Test MSE: 0.03192


# Classification

#### LogisticRegression

In [65]:
LR_model = LogisticRegression(penalty='l2', random_state=42)
LR_model.fit(X_train, y_train_class)
pred = LR_model.predict(X_test)

In [66]:
print(f'Train Accuracy: {accuracy_score(LR_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 64.75
Test Accuracy: 58.1


#### SGDClassifier

In [107]:
SGDClassifier_model = SGDClassifier(loss='perceptron', penalty='l2', random_state=42)
SGDClassifier_model.fit(X_train, y_train_class)
pred = SGDClassifier_model.predict(X_test)

In [108]:
print(f'Train Accuracy: {accuracy_score(SGDClassifier_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 58.61
Test Accuracy: 60.0


#### RandomForestClassifier

In [198]:
RF_model = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_leaf=7, max_depth=2, random_state=42)
RF_model.fit(X_train, y_train_class)
pred = RF_model.predict(X_test)

In [199]:
print(f'Train Accuracy: {accuracy_score(RF_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 70.9
Test Accuracy: 65.71


#### XGBoost

In [255]:
XGB_model = xg.XGBClassifier(n_estimators=100, max_depth=2, learning_rate=0.0005, use_label_encoder=False, random_state=42)
XGB_model.fit(X_train, y_train_class)
pred = XGB_model.predict(X_test)



In [256]:
print(f'Train Accuracy: {accuracy_score(XGB_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 69.26
Test Accuracy: 61.9


#### CatBoost

In [280]:
CB_model = CatBoostClassifier(learning_rate=1e-4, depth=3, random_seed=42)
CB_model.fit(X_train, y_train_class, verbose=False)
pred = CB_model.predict(X_test)

In [281]:
print(f'Train Accuracy: {accuracy_score(CB_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 70.08
Test Accuracy: 63.81


#### NN

In [596]:
NN_model = MLPClassifier(hidden_layer_sizes=(50, 100, 50), alpha=0, learning_rate_init=0.5,
                         max_iter=1500, solver='adam')
NN_model.fit(X_train, y_train_class)
pred = NN_model.predict(X_test)

In [597]:
print(f'Train Accuracy: {accuracy_score(NN_model.predict(X_train), y_train_class) * 100:.4}')
print(f'Test Accuracy: {accuracy_score(pred, y_test_class) * 100:.4}')

Train Accuracy: 60.66
Test Accuracy: 60.95
