In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

from scipy.stats import uniform,randint
from sklearn.linear_model import LinearRegression,Lasso,LassoCV,ElasticNet,ElasticNetCV
from sklearn.metrics import mean_squared_error
from group_lasso import GroupLasso

# Load Data

In [7]:
path = os.path.join("datasets","dow_jones_index")
df_feature_train = pd.read_csv(path+"/features_train_grouped.csv",index_col=0)
df_label_train = pd.read_csv(path+"/label_train.csv",index_col=0)
df_feature_test = pd.read_csv(path+"/features_test_standardized.csv",index_col=0)
df_label_test = pd.read_csv(path+"/label_test.csv",index_col=0)

df_feature_train = df_feature_train.drop(["next_weeks_open","next_weeks_close"],axis=1)
df_feature_test = df_feature_test.drop(["next_weeks_open","next_weeks_close"],axis=1)

print("features:")
print(df_feature_train)
print("label:")
print(df_label_train)

features:
                 open      high       low     close  percent_change_price  \
0           -1.147634 -1.154258 -1.167717 -1.165912             -1.837968   
1           -1.164122 -1.164488 -1.169004 -1.171561             -1.066790   
2           -1.174269 -1.156738 -1.161925 -1.160890              0.551709   
3           -1.164439 -1.133179 -1.150341 -1.129190              2.243461   
4           -1.127975 -1.130389 -1.124921 -1.121971             -0.002695   
...               ...       ...       ...       ...                   ...   
326          1.045288  1.011047  1.017434  1.003212             -0.455992   
327          1.044337  0.996167  0.906423  0.910308             -1.800607   
328          0.912115  0.889529  0.864592  0.870447             -0.489097   
329          0.918457  0.929519  0.963054  0.957387              0.748353   
group_index  1.000000  1.000000  1.000000  1.000000              1.000000   

               volume  percent_change_volume_over_last_wk  \
0   

### Extract numerical values from the tables

In [9]:
features = list(df_feature_train.columns)
group_index = df_feature_train.loc["group_index"]
coeff_df = pd.DataFrame([group_index]).T
mse_df = pd.DataFrame([],index=["mse"])

group_ls = group_index.values
X_train = df_feature_train.iloc[:-1,:].values
y_train = df_label_train.values
X_test = df_feature_test.values
y_test = df_label_test.values

# Perform OLS

In [10]:
ols_reg = LinearRegression()
ols_reg.fit(X_train,y_train)
y_pred_ols = ols_reg.predict(X_test)
mse_ols = mean_squared_error(y_test,y_pred_ols) # calculate MSE

# save the coefficients and mse #
coeff_df["OLS"] = ols_reg.coef_.T
mse_df["OLS"] = mse_ols

# Perform Lasso

In [11]:
lasso_reg = LassoCV(max_iter=15000)
lasso_reg.fit(X_train,y_train)
y_pred_lasso = lasso_reg.predict(X_test)
mse_lasso = mean_squared_error(y_test,y_pred_lasso) # calculate MSE

# save the coefficients and mse #
coeff_df["LassoCV"] = lasso_reg.coef_.T
mse_df["LassoCV"] = mse_lasso

  y = column_or_1d(y, warn=True)


# Perform Elastic Net

In [12]:
elastic_net_reg = ElasticNetCV(max_iter=15000)
elastic_net_reg.fit(X_train,y_train)
y_pred_net = elastic_net_reg.predict(X_test)
mse_net = mean_squared_error(y_test,y_pred_net) # calculate MSE

# save the coefficients and mse #
coeff_df["ElasticNetCV"] = elastic_net_reg.coef_.T
mse_df["ElasticNetCV"] = mse_net

  y = column_or_1d(y, warn=True)


# Perform group lasso

In [13]:
gl = GroupLasso(groups=group_ls,group_reg=0.1,l1_reg=0,n_iter=15000,tol=1e-3,supress_warning=True)
gl.fit(X_train,y_train)
y_pred_gl = gl.predict(X_test)
mse_gl = mean_squared_error(y_test,y_pred_gl) # calculate MSE

# save the coefficients and mse #
coeff_df["gLasso(lambda=0.1)"] = gl.coef_
mse_df["gLasso(lambda=0.1)"] = mse_gl

# Perform sparse group lasso

In [14]:
sgl = GroupLasso(groups=group_ls,group_reg=0.1,l1_reg=0.01,n_iter=15000,tol=1e-3,supress_warning=True)
sgl.fit(X_train,y_train)
y_pred_sgl = sgl.predict(X_test)
mse_sgl = mean_squared_error(y_test,y_pred_sgl) # calculate MSE

# save the coefficients and mse #
coeff_df["sgLasso(lambda=0.01,lambda2=0.1)"] = sgl.coef_
mse_df["sgLasso(lambda1=0.01,lambda2=0.1)"] = mse_sgl

# Results

In [15]:
coeff_df

Unnamed: 0,group_index,OLS,LassoCV,ElasticNetCV,gLasso(lambda=0.1),"sgLasso(lambda=0.01,lambda2=0.1)"
open,1.0,-11.0342,0.0,0.014496,0.004982,0.002616
high,1.0,17.469382,0.134822,0.084465,0.005117,0.002687
low,1.0,-6.604163,0.0,0.004636,0.004969,0.002609
close,1.0,0.443227,0.0,0.041621,0.005035,0.002643
percent_change_price,1.0,-0.206961,0.0,0.0,0.00036,0.0
volume,2.0,0.026984,-0.0,-0.0,-0.050454,-0.041255
percent_change_volume_over_last_wk,2.0,-0.110681,0.0,0.0,0.021175,0.014498
previous_weeks_volume,2.0,-0.226586,-0.166192,-0.158924,-0.061905,-0.050962
days_to_next_dividend,3.0,-0.047213,-0.0,-0.0,-0.0,-0.0
percent_return_next_dividend,3.0,0.331217,0.13384,0.138335,0.0,0.0


### correlation coefficients

In [13]:
print(df_feature_train.corr())

                                        open      high       low     close  \
open                                1.000000  0.999589  0.999403  0.999084   
high                                0.999589  1.000000  0.999540  0.999606   
low                                 0.999403  0.999540  1.000000  0.999681   
close                               0.999084  0.999606  0.999681  1.000000   
percent_change_price                0.111681  0.128472  0.134045  0.146949   
volume                             -0.496023 -0.494240 -0.499041 -0.497700   
percent_change_volume_over_last_wk  0.025136  0.026946  0.012895  0.013177   
previous_weeks_volume              -0.485692 -0.484115 -0.485296 -0.484026   
days_to_next_dividend              -0.047445 -0.047564 -0.049196 -0.047662   
percent_return_next_dividend       -0.136370 -0.140011 -0.134523 -0.137836   

                                    percent_change_price    volume  \
open                                            0.111681 -0.496023   
h