In [40]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, normalize

In [41]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
macro = pd.read_csv('../data/again1.csv')

In [43]:
macro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6086 entries, 0 to 6085
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   target           6086 non-null   float64
 1   DTB3             6086 non-null   float64
 2   MCUMFN           6086 non-null   float64
 3   CPIAUCSL         6086 non-null   float64
 4   GEPU_current     6086 non-null   float64
 5   IPG211111CN      6086 non-null   float64
 6   Killian          6086 non-null   float64
 7   IRLTLT01USM156N  6086 non-null   float64
 8   M2SL             6086 non-null   float64
 9   Close            6086 non-null   float64
 10  CFNAI            6086 non-null   float64
 11  UNRATE           6086 non-null   float64
 12  Imports          6086 non-null   float64
 13  Production       6086 non-null   float64
 14  Stocks           6086 non-null   float64
 15  target_data      6086 non-null   object 
 16  mean_by_month    6086 non-null   float64
dtypes: float64(16)

In [44]:
tech = pd.read_csv('../data/technical_dataset.csv')

In [45]:
#macro_date = macro['date']
hs = macro['mean_by_month']
y = macro['target']
macro.drop(columns=['target_data', 'target',  'mean_by_month'], inplace=True)

In [46]:
tech_date = tech['date']
tech.drop(columns=['date'], inplace=True)

# Fitting and validation, nothing different from the default parameters

In [47]:
data = pd.concat([macro, tech], axis=1)

In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6086 entries, 0 to 6085
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DTB3             6086 non-null   float64
 1   MCUMFN           6086 non-null   float64
 2   CPIAUCSL         6086 non-null   float64
 3   GEPU_current     6086 non-null   float64
 4   IPG211111CN      6086 non-null   float64
 5   Killian          6086 non-null   float64
 6   IRLTLT01USM156N  6086 non-null   float64
 7   M2SL             6086 non-null   float64
 8   Close            6086 non-null   float64
 9   CFNAI            6086 non-null   float64
 10  UNRATE           6086 non-null   float64
 11  Imports          6086 non-null   float64
 12  Production       6086 non-null   float64
 13  Stocks           6086 non-null   float64
 14  MA1_9            6086 non-null   int64  
 15  MA2_9            6086 non-null   int64  
 16  MA3_9            6086 non-null   int64  
 17  MA1_12        

In [49]:
x_train = data.iloc[:5813]
y_train = y.iloc[:5813]
x_test = data.iloc[5813:]
y_test = y[5813:]
hs_test = hs[5813:]

In [50]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)
x_test_sc = scaler.transform(x_test)

In [51]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

In [117]:
model = ElasticNet()

In [118]:
model.fit(x_train, y_train)
pred = model.predict(x_test)
print("R2", r2_score(y_test, pred))
print("R2OS", 1 - mean_squared_error(y_test, pred) / mean_squared_error(y_test, hs_test))
print("MSE", mean_squared_error(y_test, pred))

R2 -20.185265246209823
R2OS -212.63894253494874
MSE 3712.7160533122697


# Fitting with feature selection, ElasticNet selects features for ElasticNet, Lasso selects features for Lasso, we look at non-zero coefficients (we first fit and look at coefficients, and then we fit using features with non-zero coefficients)

In [119]:
coef = model.coef_
cols = x_train.columns
zipped = list(zip(cols, coef))

In [120]:
res = sorted(zipped, key = lambda x: x[1])

In [121]:
res

[('GEPU_current', -0.07907756800421391),
 ('Production', -0.009644844048295942),
 ('Stocks', -0.00016312582431190722),
 ('Killian', 0.03219377431685701),
 ('CPIAUCSL', 1.7063990547214258),
 ('MCUMFN', 1.8238739761140597),
 ('UNRATE', 3.1392394457361945)]

In [122]:
features = []
for i in res:
    if i[1] != 0.0:
        features.append(i[0])

In [123]:
x_train = data[features].iloc[:5813]
y_train = y.iloc[:5813]
x_test = data[features].iloc[5813:]
y_test = y.iloc[5813:]
hs_test = hs[5813:]

In [124]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)
x_test_sc = scaler.transform(x_test)

In [125]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

In [127]:
model.fit(x_train_sc, y_train)
pred = model.predict(x_test_sc)
print("R2", r2_score(y_test, pred))
print("R2OS", 1 - mean_squared_error(y_test, pred) / mean_squared_error(y_test, hs_test))
print("MSE", mean_squared_error(y_test, pred))

R2 -11.860609688144537
R2OS -128.69047223147066
MSE 2253.8208273365804


# Fitting when two sets of features(macro and technical) are used separately

In [149]:
data = tech

In [150]:
x_train = data.iloc[:5813]
y_train = y.iloc[:5813]
x_test = data.iloc[5813:]
y_test = y[5813:]
hs_test = hs[5813:]

In [151]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)
x_test_sc = scaler.transform(x_test)

In [152]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

In [153]:
model = Lasso()

In [155]:
model.fit(x_train, y_train)
pred = model.predict(x_test)
print("R2", r2_score(y_test, pred))
print("R2OS", 1 - mean_squared_error(y_test, pred) / mean_squared_error(y_test, hs_test))
print("MSE", mean_squared_error(y_test, pred))

R2 -0.8718815395177377
R2OS -17.876647896811544
MSE 328.0470912635728


# Fitting with economic constraints. An economic constraint that a rational investor will rule out a negative stock return forecast and therefore set the forecast to zero whenever it is negative

In [33]:
data = pd.concat([macro, tech], axis=1)

In [156]:
data['return'] = -0.035

In [157]:
data['return'].iloc[1:] = y.pct_change().iloc[1:]

In [158]:
y = data['return']
data.drop(columns = ['return'], inplace=True)

In [159]:
x_train = data.iloc[:5813]
y_train = y.iloc[:5813]
x_test = data.iloc[5813:]
y_test = y[5813:]
hs_test = hs[5813:]

In [160]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train_sc = scaler.transform(x_train)
x_test_sc = scaler.transform(x_test)

In [161]:
x_train_norm = normalize(x_train)
x_test_norm = normalize(x_test)

In [164]:
model = Lasso()

In [165]:
model.fit(x_train, y_train)
pred = model.predict(x_test)
pred = [num if num > 0 else 0 for num in pred]
print("R2", r2_score(y_test, pred))
print("R2OS", 1 - mean_squared_error(y_test, pred) / mean_squared_error(y_test, hs_test))
print("MSE", mean_squared_error(y_test, pred))

R2 -0.0014418603380992323
R2OS 0.9999797882504812
MSE 0.041996275086847626
