# ML model linear regression, Lasso, ...

In [None]:
import pandas as pd, numpy as np, sklearn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Select the model inputs and the prediction target from `df`
# (assumes `df` is a pandas DataFrame created earlier in the notebook).

# explanatory variables
X = df[['sentiment', 'repo change']]

# value to be predicted; the list selector keeps it as a one-column frame
y = df[['stock market index']]

In [1]:
# Linear regression on polynomially expanded, standardised features.
#
# For every polynomial degree a pipeline (expansion -> scaling -> OLS) is
# fitted on the training split; its MSE on the training and on the test split
# and its fitted coefficients are collected in `train_mse`, `test_mse` and
# `parameters` (one entry per degree, in the order of `degrees`).

# split into training (70%) and test (30%); random_state fixed to get the same output
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

test_mse = []
train_mse = []
parameters = []
# Start at degree 1: degree 0 combined with include_bias=False would leave
# no feature column at all for the scaler and the regression.
degrees = range(1, 4)

for p in degrees:
    # A fresh pipeline per degree. The pipeline is fitted as a whole: a
    # pipeline ending in a regressor exposes fit/predict, not fit_transform
    # or transform, so the expansion and scaling cannot be called separately
    # through it.
    pipe_p = make_pipeline(PolynomialFeatures(degree=p, include_bias=False),  # including/excluding the bias is our choice
                           StandardScaler(),
                           LinearRegression())
    pipe_p.fit(X_train, y_train)  # scaler statistics are learned from the training data only
    train_mse.append(mse(y_train, pipe_p.predict(X_train)))
    test_mse.append(mse(y_test, pipe_p.predict(X_test)))
    parameters.append(pipe_p.named_steps['linearregression'].coef_)

# Baseline model referenced by the later comparison cells: default polynomial
# degree, fitted here so that `pipe_lr.predict` can be called.
pipe_lr = make_pipeline(PolynomialFeatures(include_bias=False),
                        StandardScaler(),
                        LinearRegression())
pipe_lr.fit(X_train, y_train)

print(parameters)

NameError: name 'X' is not defined

In [None]:
# Three-way split: the test data is set aside first, then the remaining
# development data is halved into a training and a validation set.

# test = 1/3 of all observations, development = the remaining 2/3
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y,
    test_size=1/3,
    random_state=1,
)

# half of the development data each, i.e. training and validation are 1/3 of the total each
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev,
    test_size=1/2,
    random_state=1,
)

In [None]:
# Lasso model: the polynomial expansion leaves out the bias column
# (include_bias=False) and the regularisation strength lambda (called `alpha`
# in scikit-learn) is tuned on the validation set.

perform = []
lambdas = np.logspace(-4, 4, 20)
for lambda_ in lambdas:
    pipe_lasso = make_pipeline(PolynomialFeatures(include_bias=False),  # one pipeline per candidate lambda
                               StandardScaler(),
                               Lasso(alpha=lambda_, random_state=1))
    pipe_lasso.fit(X_train, y_train)  # fit on training data
    y_pred = pipe_lasso.predict(X_val)  # predict on the validation set
    perform.append(mse(y_val, y_pred))  # validation MSE for this lambda

# validation MSE indexed by the lambda that produced it
hyperparam_perform = pd.Series(perform, index=lambdas)

# the lambda with the smallest validation MSE (not the smallest lambda)
optimal = hyperparam_perform.nsmallest(1)
print('Optimal alpha:', optimal.index[0])
print('Validation MSE: %.3f' % optimal.values[0])

# insert optimal lambda in new model

pipe_lasso = make_pipeline(PolynomialFeatures(include_bias=False),
                           StandardScaler(),
                           Lasso(alpha=optimal.index[0], random_state=1))  # alpha is lambda!
# fit new model on all development data, fit = estimate!
pipe_lasso.fit(X_dev, y_dev)

# The linear baseline has to be fitted before it can predict, and for a fair
# comparison it is fitted on the same development data: a model trained on a
# different random split could already have seen rows of this X_test.
pipe_lr.fit(X_dev, y_dev)

# compare model performance on test data (a smaller MSE means better performance)
print('Lasso', round(mse(y_test, pipe_lasso.predict(X_test)), 3))
print('LinReg', round(mse(y_test, pipe_lr.predict(X_test)), 3))

In [None]:
# other evaluation method
# CROSS VALIDATION

# instead of using only one part of the data for validation, we use all the
# development data, rotating the validation and training sets.
# with 10 folds: 10% validation, 90% training; cross-validation rotates which 10% is held out.

# leave-one-out cross validation
# each observation is held out once: all other data are used to fit a model,
# and that one observation is used to test it. (noted as the most robust approach)

# K-fold method: data divided into k bins; k-1 bins are used to train, the remaining bin to validate.

# K as large as possible (10 bins usually). this does not cause leakage
# because a new model is fitted for every fold.

from sklearn.model_selection import KFold
kfolds = KFold(n_splits=10)  # number of bins
folds = list(kfolds.split(X_dev, y_dev))
# outer loop: one entry per candidate lambda
mseCV = []
for lambda_ in lambdas:
    # inner loop: fit and score one model per fold
    mseCV_ = []
    for train_idx, val_idx in folds:
        # Same pipeline configuration as the model that is refitted with the
        # selected lambda afterwards, so the lambda is tuned for the model
        # that actually gets used.
        pipe_lassoCV = make_pipeline(PolynomialFeatures(include_bias=False),
                                     StandardScaler(),
                                     Lasso(alpha=lambda_, random_state=1))
        # KFold yields positional row numbers, so the frames are sliced with
        # .iloc (plain [] on a DataFrame would look the numbers up as column
        # labels). Fold-local names keep the global X_train / X_val intact.
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_fold_val, y_fold_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]
        pipe_lassoCV.fit(X_fold_train, y_fold_train)  # fit the model on the fold's training part
        mseCV_.append(mse(y_fold_val, pipe_lassoCV.predict(X_fold_val)))  # MSE on the held-out fold

    # store result
    mseCV.append(mseCV_)  # list of lists, becomes a dataframe next

# convert to DataFrame: one row per lambda, one column per fold
lambdaCV = pd.DataFrame(mseCV, index=lambdas)

In [None]:
print(lambdaCV.head(3))

# mean validation MSE per lambda, averaged over the folds (computed once, used twice)
cv_mean_mse = lambdaCV.mean(axis=1)
print(cv_mean_mse)

# choose optimal hyperparameters: the lambda with the smallest mean validation MSE
optimal_lambda = cv_mean_mse.nsmallest(1)

# retrain model using optimal hyperparameters
pipe_lassoCV = make_pipeline(PolynomialFeatures(include_bias=False),
                             StandardScaler(),
                             Lasso(alpha=optimal_lambda.index[0], random_state=1))
pipe_lassoCV.fit(X_dev, y_dev)  # fit the pipeline

# The linear baseline must be fitted before predict can be called; it is
# fitted on the same development data as the Lasso models it is compared with.
pipe_lr.fit(X_dev, y_dev)

# compare performance on the test data
models = {'Lasso': pipe_lasso, 'Lasso CV': pipe_lassoCV, 'LinReg': pipe_lr}
for name, model in models.items():
    score = mse(y_test, model.predict(X_test))
    print(name, round(score, 2))

# author's note from the original run: the lambda from the validation set and
# the lambda from cross validation are just the same.

In [None]:
from sklearn.model_selection import validation_curve

# Score the Lasso pipeline for every candidate lambda with 3-fold cross
# validation on the training data.
train_scores, test_scores = validation_curve(
    estimator=pipe_lasso,
    X=X_train,
    y=y_train,
    param_name='lasso__alpha',
    param_range=lambdas,
    scoring='neg_mean_squared_error',
    cv=3,
)

# The scorer reports negated MSEs: flip the sign and average over the folds
# to get one training and one validation MSE per lambda.
mse_score = pd.DataFrame(
    {'Train': -train_scores.mean(axis=1),
     'Validation': -test_scores.mean(axis=1)},
    index=pd.Index(lambdas, name='lambda'),
)
print(mse_score['Validation'].nsmallest(1))

# both axes on a log scale; the validation curve shows the optimal hyperparameter
mse_score.plot(logx=True, logy=True)

In [None]:
# ELASTIC NET
# more than one hyperparameter (L1, L2, number of features for the polynomial
# expansion (also chosen, can be optimized))
# grid search = search in multiple dimensions.

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

pipe_el = make_pipeline(PolynomialFeatures(include_bias=False),  # same as before
                        StandardScaler(),
                        ElasticNet())
# The former `iid` argument is no longer passed: it has been removed from
# GridSearchCV, and supplying it makes the constructor fail.
gs = GridSearchCV(estimator=pipe_el,
                  param_grid={'elasticnet__alpha': np.logspace(-4, 4, 10)*2,  # two-dimensional grid 10*10 = 100 combinations
                              'elasticnet__l1_ratio': np.linspace(0, 1, 10)},  # mix between the L1 and the L2 penalty
                  scoring='neg_mean_squared_error',  # - MSE, so that larger is better
                  n_jobs=4,  # parallelize :) make computation faster
                  cv=10)

# The grid search cross-validates internally, so it is fitted on the whole
# development data like the other models in `models`; no separate validation
# set needs to be held back.
models['ElasicNetCV'] = gs.fit(X_dev, y_dev)
for name, model in models.items():
    score = mse(y_test, model.predict(X_test))  # finally using the test data.
    print(name, round(score, 2))
print()
print('CV params:', gs.best_params_)  # author's note: the elastic net outperformed the lasso

In [None]:
# Measures for classification accuracy

# accuracy = true/(true + false)

from sklearn.metrics import precision_score, recall_score, f1_score

# recall = TP / (TP + FN): share of the actual positives that the model finds
# precision = TP / (TP + FP): conditions on the model's own predictions (could be 100% even though only a small share of the positives is found)
# high recall = low false negative rate

# nested cross-validation 
# test the model 5 times (fig.), in the inner we make the validation to get optimal hyperparameters