In [303]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import time

from sklearn.impute import SimpleImputer, MissingIndicator, KNNImputer
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.metrics import confusion_matrix, make_scorer, precision_recall_fscore_support, roc_curve, roc_auc_score, auc, accuracy_score
from sklearn.model_selection import cross_val_score, cross_validate
import scipy.stats as stats

pd.set_option('display.max_colwidth',280)
pd.set_option('display.html.use_mathjax', False)
pd.set_option('display.max_rows', 1000)

# Load Data

In [195]:
src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\05_PreProcessed'

filenames = glob(src_path+'\*.pkl')

for file in filenames:
    if file.find('fin') > 0 :
        stock_prices = pd.read_pickle(file)

In [196]:
stock_prices.shape

(1353, 38)

In [198]:
stock_prices['date'] = pd.to_datetime(stock_prices.index)
stock_prices.drop(['Dividends', 'Stock Splits'], axis=1, inplace=True)

In [199]:
stock_prices.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ticker,Adj close,class,MA_3D_mean,MA_5D_mean,...,lag_8,lag_9,lag_10,lag_15,lag_20,lag_25,lag_30,lag_50,lag_75,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,294.21,298.54,293.17,298.29,33870100,AAPL,298.29,0.0,298.29,298.29,...,,,,,,,,,,2020-01-02
2020-01-03,295.11,298.52,294.47,295.39,36580700,AAPL,295.39,1.0,296.84,296.84,...,,,,,,,,,,2020-01-03
2020-01-06,291.78,297.9,290.74,297.75,29596800,AAPL,297.75,0.0,297.75,297.143333,...,,,,,,,,,,2020-01-06


In [200]:
src_path = r'c:\Users\jaromir\OneDrive\UoM\100_Disertation\02_SrcData\06_SentLabelled'
filenames = glob(src_path+'\*.pkl')

for file in filenames:
    if file.find('Labels4') > 0 :
        tweets = pd.read_pickle(file)

In [201]:
tweets.shape

(417476, 29)

In [202]:
tweets.columns

Index(['text', 'ticker', 'cashtags', 'emoji', 'filtered_text',
       'spacy_lemma_pos', 'spacy_lemma', 'nltk_lemma', 'nltk_lemma_pos',
       'username', 'to', 'retweets', 'favorites', 'replies', 'id', 'author_id',
       'date', 'hashtags', 'mentions', 'urls', 'sentiment_collection_date',
       'spacy_text', 'nltk_text', 'spacy_lex', 'nltk_lex',
       'spacy_NB_sentiment', 'spacy_VADER_polarity', 'VADER_spacy_score',
       'spacy_movie_NB_sentiment'],
      dtype='object')

In [203]:
tweets = tweets[['text','ticker','cashtags','id','sentiment_collection_date','spacy_lex', 'nltk_lex',
                 'spacy_NB_sentiment','VADER_spacy_score', 'spacy_movie_NB_sentiment']]

# Merge Sentiment with Financial data

<font color='red'> For extract_ts I could calcualte the mean score for the date by assigning higher weight to smart users instead fo arithemtic mean </font>

In [204]:
score_features = ['spacy_lex'] # 'nltk_lex', 'spacy_NB_sentiment', 'VADER_spacy_score', 'spacy_movie_NB_sentiment']
tweets_g = tweets.groupby(['ticker','sentiment_collection_date'])[score_features].mean()
tweets_g.columns = ['sentiment_score']

In [205]:
stock_prices_g = stock_prices.groupby(['ticker','date']).mean()

In [206]:
df = pd.merge(left=stock_prices, right=tweets_g, how='left', left_on=['ticker','date'], right_on=['ticker','sentiment_collection_date'])

In [207]:
df.set_index('date', drop=True, inplace=True)

In [208]:
df.shape

(1353, 37)

In [209]:
df.loc[:,df.dtypes == 'float64']

Unnamed: 0_level_0,Open,High,Low,Close,Adj close,class,MA_3D_mean,MA_5D_mean,MA_7D_mean,MA_3D_std,...,lag_8,lag_9,lag_10,lag_15,lag_20,lag_25,lag_30,lag_50,lag_75,sentiment_score
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,294.21,298.54,293.17,298.29,298.29,0.0,298.290000,298.290000,298.290000,,...,,,,,,,,,,0.151190
2020-01-03,295.11,298.52,294.47,295.39,295.39,1.0,296.840000,296.840000,296.840000,2.050610,...,,,,,,,,,,0.202665
2020-01-06,291.78,297.90,290.74,297.75,297.75,0.0,297.750000,297.143333,297.143333,,...,,,,,,,,,,0.215064
2020-01-07,297.79,298.84,295.44,296.35,296.35,1.0,297.050000,296.496667,296.945000,0.989949,...,,,,,,,,,,0.196891
2020-01-08,295.12,302.35,295.12,301.11,301.11,1.0,298.403333,298.403333,297.778000,2.446331,...,,,,,,,,,,0.175183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-22,35.18,37.10,35.00,36.54,36.54,0.0,36.540000,37.573333,38.436000,,...,39.72,44.64,48.69,29.46,25.40,19.92,25.42,31.50,51.59,0.125205
2020-06-23,36.56,36.88,34.37,36.08,36.08,0.0,36.310000,36.486667,37.610000,0.325269,...,33.32,39.72,44.64,29.91,29.54,24.13,23.97,28.91,52.10,0.118380
2020-06-24,34.68,35.23,32.62,33.07,33.07,1.0,35.230000,35.230000,36.374000,1.884702,...,39.66,33.32,39.72,33.65,30.69,23.68,22.76,30.90,46.78,0.139630
2020-06-25,31.35,34.76,31.03,34.69,34.69,0.0,34.613333,35.095000,35.444000,1.506464,...,39.00,39.66,33.32,39.10,28.89,24.91,20.71,31.86,52.56,0.136422


## Missing values - no tweets for that day

In [17]:
df[df.nltk_lex.isna()]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj close,class,...,lag_20,lag_25,lag_30,lag_50,lag_75,spacy_lex,nltk_lex,spacy_NB_sentiment,VADER_spacy_score,spacy_movie_NB_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-16,110.15,111.48,110.02,111.43,1217100,0.0,0,HLT,111.43,1.0,...,,,,,,,,,,
2020-01-16,147.39,148.44,146.91,148.42,804500,0.0,0,MAR,148.42,1.0,...,,,,,,,,,,
2020-05-18,178.87,179.95,176.93,178.64,4653000,0.0,0,MCD,178.64,0.0,...,180.45,178.93,159.27,197.54,213.41,,,,,
2020-05-19,178.93,180.26,176.89,178.38,3365000,0.0,0,MCD,178.38,1.0,...,176.4,182.77,175.87,185.62,211.23,,,,,
2020-05-20,180.79,183.38,179.37,182.88,4920800,0.0,0,MCD,182.88,1.0,...,185.24,176.66,174.43,198.54,212.43,,,,,
2020-05-21,182.96,184.25,180.9,183.85,2913600,0.0,0,MCD,183.85,0.0,...,180.83,178.31,176.31,187.0,211.87,,,,,
2020-05-22,183.26,184.12,181.97,183.19,2475200,0.0,0,MCD,183.19,1.0,...,182.8,184.87,182.48,169.0,211.63,,,,,
2020-05-26,188.33,189.4,183.02,183.62,3990900,0.0,0,MCD,183.62,1.0,...,184.66,180.45,178.93,175.96,210.15,,,,,
2020-05-27,187.49,187.5,184.04,186.48,3993400,0.0,0,MCD,186.48,1.0,...,184.7,176.4,182.77,148.02,208.9,,,,,
2020-05-28,188.17,189.22,185.48,187.48,3567600,0.0,0,MCD,187.48,0.0,...,186.58,185.24,176.66,146.64,210.48,,,,,


# Utility functions

In [286]:
def find_best_score(rgs, main_scoring_metric):
    # max score metric
    scoring_attr = "mean_test_" + main_scoring_metric
    
    rgs = rgs.loc[(rgs[scoring_attr] == rgs[scoring_attr].max()),:]
    # min fitting time
    rgs = rgs.loc[(rgs['mean_fit_time'] == rgs['mean_fit_time'].min()),:]
    
    rgs = rgs.loc[:,(rgs.columns.isin(['params'])) | (rgs.columns.str.contains('mean_test'))]

    return rgs

In [287]:
def holdout_eval(model, X_train, X_test, y_train, y_test):
    try:
        # Teach model onto train data
        model.fit(X_train,y_train)

        # Get prediction on test set
        y_pred = model.predict(X_test)


        # Calculate performance metrics  
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred,
                                                                             average='weighted',
                                                                             zero_division = 0,
                                                                             labels = [1,0])
        # Get ROC AUC
        #ydf = model.decision_function(X_test)
        #auc_score = roc_auc_score(y_test, ydf)               

        return (accuracy, precision, recall, fscore)
        
    except:
        return np.nan

In [288]:
def hold_out_eval_dt(model, X_train, X_test, y_train, y_test):
  
    # Teach model onto train data
    model.fit(X_train,y_train)

    # Get prediction on test set
    y_pred = model.predict(X_test)

    # Calculate performance metrics  
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred,
                                                                         average='weighted',
                                                                         zero_division = 0,
                                                                             labels = [1,0])
    # Get ROC AUC
    #predict_proba = model.predict_proba(X_test)
    #auc_score = roc_auc_score(y_test,  predict_proba[:,1])               

    return (accuracy, precision, recall, fscore)

In [289]:
def repeated_retrain(model, X, y, test_size = 20, no_days = 1):
    score = []
    # calcualte number of test windows
    test_window = np.int(np.ceil(test_size/no_days))
    
    # for each window
    for test_window in range(1, test_window + 1):
        
        # get number of elements of X
        remaining_test_days  = X.shape[0] - test_size + (test_window * no_days)     
                
        X_new = X.iloc[0:remaining_test_days, :]
        y_new = y[0: remaining_test_days]
    
        X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size = no_days ,
                                                            random_state = 42)
        
        #print("train:", X_train.shape[0], " test: ", X_test.shape[0])
            
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    
        score.append(y_pred)
        
    return (y[-test_size:], np.array(score).ravel())

In [290]:
def score_repeated_retrain(y_test, y_pred):
    
    # Calculate performance metrics  
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred,
                                                                         average='weighted',
                                                                         zero_division = 0,
                                                                             labels = [1,0])
    return (accuracy, precision, recall, fscore )

In [291]:
def extract_clf_params(params):
    short_keys = {}
    for key, value in params.items():
        short_keys[key[key.find('clf__')+5:]] = value
    return short_keys

In [292]:
def get_result_summary(df):
    df = df.groupby(['ticker'])[['test_accuracy','test_precision','test_recall','test_fscore','test_auc_score']].mean()
    df.loc['Total'] = df.mean(axis=0)
    df['Total'] = df.mean(axis=1)
    return df

In [293]:
impede_ffill = FunctionTransformer(lambda x: x[['sentiment_score']].fillna(method='ffill'), validate=False)

In [294]:
get_other_data = FunctionTransformer(lambda x: x.loc[:,~x.columns.isin(['sentiment_score','ticker'])], validate=False)

In [295]:
#differenciate =  FunctionTransformer(lambda x: x[['Adj close']].diff(1), validate=False)

In [296]:
resize_dataset = FunctionTransformer(lambda x: x[~np.isnan(x).any(axis=1)], validate=False)

In [297]:
sentiment_pipeline = Pipeline([
                              ('impede', impede_ffill)       
                          ])

other_data_pipeline = Pipeline([
                              ('get_other_data', get_other_data)
                              ])
    

# full_set_pipeline = Pipeline([
#         ('union', FeatureUnion([
#                                 ('sentiment', sentiment_pipeline),
#                                 ('other', other_data_pipeline)            
#                               ])
#          )
    
#          ,('resize_input', resize_dataset)

#                             ])

In [251]:
# test_df = df
# print(test_df.shape)
# test = full_set_pipeline.fit_transform(test_df)
# print(test.shape)
# test

# SVC

In [263]:
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)

X_train:  (23, 8)
y_train:  (23,)


In [311]:
final_results_svc = pd.DataFrame() 
model_type = 'SVC'
n_splits_outter = 5
n_splits_inner = 5
test_size = 20
no_days = 1


X_features = ['Adj close','sentiment_score','MA_3D_mean', 'MA_5D_mean', 'MA_7D_mean', 'lag_3', 'lag_5', 'lag_7']

params = {'clf__kernel' : ['rbf','linear','sigmoid'],
               'clf__C' : stats.uniform(0.02, 0.02),
           'clf__gamma' : stats.uniform(5, 10)
          }

scoring = {'accuracy' : 'balanced_accuracy',
            'precision_weighted' : 'precision_weighted',
            'recall_weighted' : 'recall_weighted',
            'f1_weighted': 'f1_weighted'}
             #'roc_auc' : make_scorer(roc_auc_score) }


pipeline = Pipeline([
        ('union', FeatureUnion([
                                ('sentiment', sentiment_pipeline),
                                ('other', other_data_pipeline)            
                              ])
         )
         ,('clf', SVC(random_state=42))

                            ])

for tick in df.ticker.unique():
    
    start = time.time()
    print(tick, time.asctime(time.localtime()))
    
    company = tick
    
    X = df.loc[df.ticker == company, X_features].dropna(axis=0)
    y = df.loc[df.ticker == company,'class'].iloc[-X.shape[0]:]
    
    tscv_outter = TimeSeriesSplit(n_splits=n_splits_outter)

    # envolepe for parameter tunning
    for idx, (tr, tt) in enumerate(tscv_outter.split(X)):

        X_train = X.iloc[tr,:]
        y_train = y.iloc[tr]

        X_test = X.iloc[tt,:]
        y_test = y.iloc[tt]

        # cross validation split for parameter tuning
        tscv_inner = TimeSeriesSplit(n_splits=n_splits_inner)

        model = RandomizedSearchCV(
            estimator = pipeline,
            param_distributions = params,
            n_iter = 50,
            n_jobs = -1,
            cv = tscv_inner,
            pre_dispatch='2*n_jobs',
            random_state = 7,
            scoring = scoring,
            refit = 'f1_weighted',
            error_score = 0)
        # fit randomized search 
        model.fit(X_train, y_train)
        
        # get the best parameters from the validation set
        rs_df = pd.DataFrame(model.cv_results_)
        rs = find_best_score(rs_df, main_scoring_metric = 'f1_weighted')
        model_params = rs.params.values[0]
        model_params = extract_clf_params(model_params)

        # initaite a new model with parameters obtained from previous step
        tunned_model = SVC(**model_params, random_state=42)

        # fit this new model on the full train data and test on the test set
        tunned_model.fit(X_train, y_train)

        # get test resuls
        accuracy, precision, recall, fscore = holdout_eval(tunned_model, X_train, X_test, y_train, y_test)
        
        
        

        # TEST results for CV  
        rs['test_accuracy'] = accuracy
        rs['test_precision'] = precision
        rs['test_recall'] = recall
        rs['test_fscore'] = fscore    
        
        rs['ticker'] = company
        rs['model'] = model_type
        rs['fold'] = idx
        
        if idx == n_splits_outter:
            # TEST results for daily retrain
            rs['test_accuracy'] = accuracy_dr
            rs['test_precision'] = precision_dr
            rs['test_recall'] = recall_dr
            rs['test_fscore'] = fscore_dr  

        final_results_svc = final_results_svc.append(rs)
        
    y_test, y_pred = repeated_retrain(model, X, y, test_size = len(tt) , no_days = no_days)
    accuracy_dr, precision_dr, recall_dr, fscore_dr = score_repeated_retrain(y_test, y_pred)
    
    end = time.time()
    print(company, end-start)  

AAPL Fri Aug 28 23:12:52 2020
AAPL 243.91693210601807
AMZN Fri Aug 28 23:16:56 2020
AMZN 739.992607831955
BABA Fri Aug 28 23:29:16 2020
BABA 316.3848958015442
GILD Fri Aug 28 23:34:33 2020
GILD 253.1838195323944
HLT Fri Aug 28 23:38:46 2020
HLT 252.28986191749573
JNJ Fri Aug 28 23:42:58 2020
JNJ 252.59974455833435
MAR Fri Aug 28 23:47:11 2020
MAR 255.1143069267273
MCD Fri Aug 28 23:51:26 2020
MCD 193.47777366638184
MSFT Fri Aug 28 23:54:39 2020
MSFT 250.68594026565552
QSR Fri Aug 28 23:58:50 2020
QSR 200.5988163948059
UAL Sat Aug 29 00:02:11 2020
UAL 219.0328507423401


In [316]:
final_results_svc

Unnamed: 0,params,mean_test_accuracy,mean_test_precision_weighted,mean_test_recall_weighted,mean_test_f1_weighted,test_accuracy,test_precision,test_recall,test_fscore,test_accuracy_dr,test_precision_dr,test_recall_dr,test_fscore_dr,ticker,model,fold
14,"{'clf__C': 0.03145250665287908, 'clf__gamma': 7.760490483306951, 'clf__kernel': 'linear'}",0.75,0.611111,0.666667,0.6,0.526316,0.691633,0.526316,0.526316,0.578947,0.578947,0.578947,0.578947,AAPL,SVC,0
35,"{'clf__C': 0.0343448446455046, 'clf__gamma': 6.47147571941693, 'clf__kernel': 'rbf'}",0.525,0.537778,0.566667,0.499048,0.631579,0.398892,0.631579,0.488964,0.578947,0.578947,0.578947,0.578947,AAPL,SVC,1
14,"{'clf__C': 0.03145250665287908, 'clf__gamma': 7.760490483306951, 'clf__kernel': 'linear'}",0.561905,0.617125,0.533333,0.495926,0.421053,0.559211,0.421053,0.360641,0.578947,0.578947,0.578947,0.578947,AAPL,SVC,2
12,"{'clf__C': 0.03902496687731164, 'clf__gamma': 8.487563756547152, 'clf__kernel': 'linear'}",0.590278,0.678242,0.523077,0.498488,0.368421,0.135734,0.368421,0.198381,0.578947,0.578947,0.578947,0.578947,AAPL,SVC,3
1,"{'clf__C': 0.026160255303754786, 'clf__gamma': 7.638708390378986, 'clf__kernel': 'linear'}",0.468341,0.422929,0.425,0.361189,0.578947,0.718045,0.578947,0.585965,0.578947,0.578947,0.578947,0.578947,AAPL,SVC,4
10,"{'clf__C': 0.027149504820855444, 'clf__gamma': 6.9335562339240955, 'clf__kernel': 'rbf'}",0.6,0.444444,0.533333,0.466667,0.315789,0.099723,0.315789,0.151579,0.526316,0.427632,0.526316,0.471869,AMZN,SVC,0
1,"{'clf__C': 0.026160255303754786, 'clf__gamma': 7.638708390378986, 'clf__kernel': 'linear'}",0.7,0.605556,0.666667,0.617143,0.526316,0.51817,0.526316,0.520824,0.526316,0.427632,0.526316,0.471869,AMZN,SVC,1
36,"{'clf__C': 0.02138504140019671, 'clf__gamma': 8.57070628352879, 'clf__kernel': 'linear'}",0.583333,0.504586,0.577778,0.507228,0.368421,0.554825,0.368421,0.314593,0.526316,0.427632,0.526316,0.471869,AMZN,SVC,2
46,"{'clf__C': 0.03593902714887129, 'clf__gamma': 14.751396811571157, 'clf__kernel': 'linear'}",0.495317,0.43802,0.507692,0.445399,0.473684,0.550877,0.473684,0.491627,0.526316,0.427632,0.526316,0.471869,AMZN,SVC,3
1,"{'clf__C': 0.026160255303754786, 'clf__gamma': 7.638708390378986, 'clf__kernel': 'linear'}",0.530682,0.591723,0.5375,0.506487,0.684211,0.468144,0.684211,0.555921,0.526316,0.427632,0.526316,0.471869,AMZN,SVC,4


In [309]:
get_result_summary(final_results_svc)

KeyError: "Columns not found: 'test_auc_score'"

# Decision Tree

In [215]:
from sklearn.tree import DecisionTreeClassifier

In [251]:
final_results_dt = pd.DataFrame() 
model_type = 'DT'
n_splits_outter = 5
n_splits_inner = 5
X_features = ['Adj close','MA_3D_mean', 'MA_5D_mean', 'MA_7D_mean']

params = {'clf__criterion' : ['gini', 'entropy'],
          'clf__splitter' : ['best','random'],
          'clf__max_depth' : [4,6,8,12]
          }

scoring = {'accuracy' : 'balanced_accuracy',
            'precision_weighted' : 'precision_weighted',
            'recall_weighted' : 'recall_weighted',
            'f1_weighted': 'f1_weighted'}
            #'roc_auc' : make_scorer(roc_auc_score) }


transformer = FeatureUnion(
    transformer_list=[
        ('imputer',  KNNImputer(n_neighbors=4, weights="uniform")),
        ('indicators', MissingIndicator())])  # flag of imeded values X[np.where(results[:,2] == 1 ,True, False)]



pipeline = Pipeline([
                     ('transformer', transformer),
                     ('clf', DecisionTreeClassifier(random_state=42))
                    ])


for tick in df.ticker.unique():

    company = tick
    
    X = df.loc[df.ticker == company, X_features] #.diff().dropna()
    y = df.loc[df.ticker == company,'class'] #.iloc[1:]

    tscv_outter = TimeSeriesSplit(n_splits=n_splits_outter)

    # envole for parameter tunning
    for idx, (tr, tt) in enumerate(tscv_outter.split(X)):

        X_train = X.iloc[tr,:]
        y_train = y.iloc[tr]

        X_test = X.iloc[tt,:]
        y_test = y.iloc[tt]

        # cross validation split for parameter tuning
        tscv_inner = TimeSeriesSplit(n_splits=n_splits_inner)

        model = GridSearchCV(
            estimator = pipeline,
            param_grid = params,
            #n_iter = 50,
            n_jobs = -1,
            cv = tscv_inner,
            #verbose=5,
            pre_dispatch='2*n_jobs',
            # random_state = 7,
            return_train_score = True,
            scoring = scoring,
            refit = 'f1_weighted')
            #error_score = 0)
        # fit randomized search 
        model.fit(X_train, y_train)
        # get the best parameters from the validation set
        rs_df = pd.DataFrame(model.cv_results_)
        rs = find_best_score(rs_df, main_scoring_metric = 'f1_weighted')
        model_params = rs.params.values[0]
        model_params = extract_clf_params(model_params)

        # initaite a new model with parameters obtained from previous step
        tunned_model = DecisionTreeClassifier(**model_params, random_state=42)

        # fit this new model on the full train data and test on the test set
        tunned_model.fit(X_train, y_train)

        # get test resuls

        accuracy, precision, recall, fscore, auc_score = hold_out_eval_dt(tunned_model, X_train, X_test, y_train, y_test)

        rs['test_accuracy'] = accuracy
        rs['test_precision'] = precision
        rs['test_recall'] = recall
        rs['test_fscore'] = fscore
        rs['test_auc_score'] = auc_score
        rs['ticker'] = company
        rs['model'] = model_type
        rs['fold'] = idx
        

        final_results_dt = final_results_dt.append(rs)
        
    print(company)
    

AAPL
AMZN
BABA
GILD
HLT
JNJ
MAR
MCD
MSFT
QSR
UAL


In [252]:
final_results_dt

Unnamed: 0,params,mean_test_accuracy,mean_test_precision_weighted,mean_test_recall_weighted,mean_test_f1_weighted,test_accuracy,test_precision,test_recall,test_fscore,test_auc_score,ticker,model,fold
9,"{'clf__criterion': 'entropy', 'clf__max_depth': 4, 'clf__splitter': 'random'}",0.65,0.566667,0.666667,0.58,0.6,0.6,0.6,0.6,0.583333,AAPL,DT,0
13,"{'clf__criterion': 'entropy', 'clf__max_depth': 8, 'clf__splitter': 'random'}",0.63,0.745238,0.628571,0.583896,0.65,0.4225,0.65,0.512121,0.423077,AAPL,DT,1
7,"{'clf__criterion': 'gini', 'clf__max_depth': 12, 'clf__splitter': 'random'}",0.534524,0.558952,0.56,0.493902,0.45,0.471875,0.45,0.398667,0.479798,AAPL,DT,2
15,"{'clf__criterion': 'entropy', 'clf__max_depth': 12, 'clf__splitter': 'random'}",0.586905,0.651409,0.584615,0.543452,0.5,0.530303,0.5,0.510417,0.450549,AAPL,DT,3
11,"{'clf__criterion': 'entropy', 'clf__max_depth': 6, 'clf__splitter': 'random'}",0.570393,0.578017,0.623529,0.580133,0.3,0.09,0.3,0.138462,0.5,AAPL,DT,4
1,"{'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__splitter': 'random'}",0.683333,0.688889,0.666667,0.66,0.5,0.25,0.5,0.333333,0.5,AMZN,DT,0
10,"{'clf__criterion': 'entropy', 'clf__max_depth': 6, 'clf__splitter': 'best'}",0.55,0.561293,0.628571,0.553593,0.55,0.550505,0.55,0.548872,0.545,AMZN,DT,1
0,"{'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__splitter': 'best'}",0.525714,0.538952,0.6,0.552617,0.45,0.786111,0.45,0.369333,0.571429,AMZN,DT,2
0,"{'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__splitter': 'best'}",0.568651,0.590464,0.584615,0.518852,0.6,0.828571,0.6,0.6,0.714286,AMZN,DT,3
4,"{'clf__criterion': 'gini', 'clf__max_depth': 8, 'clf__splitter': 'best'}",0.624177,0.747687,0.552941,0.510169,0.3,0.09,0.3,0.138462,0.5,AMZN,DT,4


In [253]:
get_result_summary(final_results_dt)

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall,test_fscore,test_auc_score,Total
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,0.5,0.422936,0.5,0.431933,0.487352,0.468444
AMZN,0.48,0.501038,0.48,0.398,0.566143,0.485036
BABA,0.49,0.515913,0.49,0.469324,0.492404,0.491528
GILD,0.5,0.516667,0.5,0.402341,0.526301,0.489062
HLT,0.44,0.346083,0.44,0.367885,0.463983,0.41159
JNJ,0.38,0.412198,0.38,0.363035,0.407586,0.388564
MAR,0.52,0.499928,0.52,0.424298,0.501864,0.493218
MCD,0.52,0.408059,0.52,0.435585,0.514905,0.47971
MSFT,0.48,0.655829,0.48,0.43088,0.562319,0.521806
QSR,0.51,0.503873,0.51,0.42368,0.53966,0.497443


# Random Forest

In [254]:
from sklearn.ensemble import RandomForestClassifier

In [259]:
final_results_rf = pd.DataFrame() 
model_type = 'RF'
n_splits_outter = 5
n_splits_inner = 5
X_features = ['Adj close','MA_3D_mean', 'MA_5D_mean', 'MA_7D_mean']

params = {'clf__criterion' : ['gini', 'entropy'],
          'clf__max_depth' : [4,6,8,12],
          'clf__n_estimators' : [15,20,25,30,40,50,75,100]
          }

scoring = {'accuracy' : 'balanced_accuracy',
            'precision_weighted' : 'precision_weighted',
            'recall_weighted' : 'recall_weighted',
            'f1_weighted': 'f1_weighted'}
            #'roc_auc' : make_scorer(roc_auc_score) }


transformer = FeatureUnion(
    transformer_list=[
        ('imputer',  KNNImputer(n_neighbors=4, weights="uniform")),
        ('indicators', MissingIndicator())])  # flag of imeded values X[np.where(results[:,2] == 1 ,True, False)]



pipeline = Pipeline([
                     ('transformer', transformer),
                     ('clf', RandomForestClassifier(random_state=42))
                    ])


for tick in df.ticker.unique():

    company = tick
    
    X = df.loc[df.ticker == company, X_features] #.diff().dropna()
    y = df.loc[df.ticker == company,'class'] #.iloc[1:]

    tscv_outter = TimeSeriesSplit(n_splits=n_splits_outter)

    # envole for parameter tunning
    for idx, (tr, tt) in enumerate(tscv_outter.split(X)):

        X_train = X.iloc[tr,:]
        y_train = y.iloc[tr]

        X_test = X.iloc[tt,:]
        y_test = y.iloc[tt]

        # cross validation split for parameter tuning
        tscv_inner = TimeSeriesSplit(n_splits=n_splits_inner)

        model = GridSearchCV(
            estimator = pipeline,
            param_grid = params,
            #n_iter = 50,
            n_jobs = -1,
            cv = tscv_inner,
            #verbose=5,
            pre_dispatch='2*n_jobs',
            # random_state = 7,
            return_train_score = True,
            scoring = scoring,
            refit = 'f1_weighted')
            #error_score = 0)
        # fit randomized search 
        model.fit(X_train, y_train)
        # get the best parameters from the validation set
        rs_df = pd.DataFrame(model.cv_results_)
        rs = find_best_score(rs_df, main_scoring_metric = 'f1_weighted')
        model_params = rs.params.values[0]
        model_params = extract_clf_params(model_params)

        # initaite a new model with parameters obtained from previous step
        tunned_model = RandomForestClassifier(**model_params, random_state=42)

        # fit this new model on the full train data and test on the test set
        tunned_model.fit(X_train, y_train)

        # get test resuls

        accuracy, precision, recall, fscore, auc_score = hold_out_eval_dt(tunned_model, X_train, X_test, y_train, y_test)

        rs['test_accuracy'] = accuracy
        rs['test_precision'] = precision
        rs['test_recall'] = recall
        rs['test_fscore'] = fscore
        rs['test_auc_score'] = auc_score
        rs['ticker'] = company
        rs['model'] = model_type
        rs['fold'] = idx
        

        final_results_rf = final_results_rf.append(rs)
        
    print(company)
    

AAPL
AMZN
BABA
GILD
HLT
JNJ
MAR
MCD
MSFT
QSR
UAL


In [260]:
final_results_rf

Unnamed: 0,params,mean_test_accuracy,mean_test_precision_weighted,mean_test_recall_weighted,mean_test_f1_weighted,test_accuracy,test_precision,test_recall,test_fscore,test_auc_score,ticker,model,fold
2,"{'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__n_estimators': 25}",0.55,0.4,0.533333,0.44,0.55,0.559596,0.55,0.553453,0.5,AAPL,RF,0
29,"{'clf__criterion': 'gini', 'clf__max_depth': 12, 'clf__n_estimators': 50}",0.573333,0.628095,0.542857,0.526667,0.45,0.786111,0.45,0.369333,0.615385,AAPL,RF,1
18,"{'clf__criterion': 'gini', 'clf__max_depth': 8, 'clf__n_estimators': 25}",0.602857,0.691722,0.56,0.494105,0.4,0.368627,0.4,0.320879,0.328283,AAPL,RF,2
34,"{'clf__criterion': 'entropy', 'clf__max_depth': 4, 'clf__n_estimators': 25}",0.583135,0.708974,0.507692,0.462613,0.45,0.522917,0.45,0.456892,0.456044,AAPL,RF,3
23,"{'clf__criterion': 'gini', 'clf__max_depth': 8, 'clf__n_estimators': 100}",0.58123,0.562046,0.541176,0.483287,0.3,0.09,0.3,0.138462,0.529762,AAPL,RF,4
63,"{'clf__criterion': 'entropy', 'clf__max_depth': 12, 'clf__n_estimators': 100}",0.8,0.711111,0.8,0.74,0.45,0.401961,0.45,0.373219,0.395,AMZN,RF,0
42,"{'clf__criterion': 'entropy', 'clf__max_depth': 6, 'clf__n_estimators': 25}",0.588333,0.651565,0.628571,0.582309,0.65,0.664835,0.65,0.641944,0.665,AMZN,RF,1
19,"{'clf__criterion': 'gini', 'clf__max_depth': 8, 'clf__n_estimators': 30}",0.635238,0.604032,0.58,0.532454,0.5,0.794118,0.5,0.447917,0.571429,AMZN,RF,2
32,"{'clf__criterion': 'entropy', 'clf__max_depth': 4, 'clf__n_estimators': 15}",0.57877,0.593432,0.553846,0.494018,0.7,0.85,0.7,0.709091,0.833333,AMZN,RF,3
11,"{'clf__criterion': 'gini', 'clf__max_depth': 6, 'clf__n_estimators': 30}",0.607035,0.744504,0.517647,0.467431,0.3,0.09,0.3,0.138462,0.535714,AMZN,RF,4


In [262]:
get_result_summary(final_results_rf)

Unnamed: 0_level_0,test_accuracy,test_precision,test_recall,test_fscore,test_auc_score,Total
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,0.43,0.46545,0.43,0.367804,0.485895,0.43583
AMZN,0.52,0.560183,0.52,0.462126,0.600095,0.532481
BABA,0.54,0.605187,0.54,0.482505,0.590495,0.551637
GILD,0.51,0.368519,0.51,0.411411,0.560476,0.472081
HLT,0.44,0.363136,0.44,0.36033,0.473373,0.415368
JNJ,0.5,0.549787,0.5,0.475928,0.523861,0.509915
MAR,0.52,0.529708,0.52,0.446556,0.487838,0.50082
MCD,0.49,0.520132,0.49,0.424434,0.474866,0.479886
MSFT,0.5,0.677448,0.5,0.442327,0.574763,0.538908
QSR,0.41,0.270042,0.41,0.291447,0.521656,0.380629


# LSTM

In [458]:
from sklearn.preprocessing import MinMaxScaler

In [560]:
company = 'MSFT'
X_features = ['Adj close','MA_3D_mean', 'MA_5D_mean', 'MA_7D_mean']
X = df.loc[df.ticker == company, X_features] #.diff().dropna()
y = df.loc[df.ticker == company,'class'] #.iloc[1:]

In [561]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 23 , random_state =42, shuffle = False)

In [562]:
sc = MinMaxScaler(feature_range=(0,1))
X_train_sc = sc.fit_transform(np.array(X_train))
X_test_sc  = sc.transform(np.array(X_test))

In [563]:
print(X_train_sc.shape)
print(X_test_sc.shape)

(100, 4)
(23, 4)


In [564]:
# samples x timestamp, features

In [565]:
input_window = 10
X_train_l = []
y_train_l = []

for i in range(input_window, X_train_sc.shape[0]):
    X_train_l.append(X_train_sc[i-input_window:i, :])
    y_train_l.append(y_train[i-1])

X_train_lstm, y_train_lstm = np.array(X_train_l), np.array(y_train_l)
print("test samples:", X_train_lstm.shape[0])

test samples: 90


In [566]:
# np.concatenate((X_train_sc[:7], np.array(y_train[:7]).reshape(-1,1)), axis=1)
# X_train_lstm[:3]
# y_train_lstm[:3]

In [567]:
X_test_l = []
y_test_l = []

for i in range(input_window, X_test_sc.shape[0]):
    X_test_l.append(X_test_sc[i-input_window:i, :])
    y_test_l.append(y_test[i-1])

X_test_lstm, y_test_lstm = np.array(X_test_l), np.array(y_test_l)
print("test samples:", X_test_lstm.shape[0])

test samples: 13


In [568]:
# samples x timestamp, features
print("X tran shape:", X_train_lstm.shape)
print("y tran shape:", y_train_lstm.shape)

print("X test shape:", X_test_lstm.shape)
print("y test shape:", y_test_lstm.shape)

X tran shape: (90, 10, 4)
y tran shape: (90,)
X test shape: (13, 10, 4)
y test shape: (13,)


In [557]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense

In [569]:
model = Sequential()
model.add(LSTM(units=50,return_sequences=True,input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50,return_sequences=True)) # recurrent_dropout=0.2
model.add(Dropout(0.2))
model.add(LSTM(units=50,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_lstm,y_train_lstm,epochs=100,batch_size=32)
#model.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1155f0c3988>

In [570]:
# Final evaluation of the model
scores = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 76.92%


In [585]:
predicted_stock_price = model.predict(X_test_lstm)
predicted_stock_price = np.where(predicted_stock_price >= 0.5 , 1, 0 ).reshape(1,-1)[0]
true_stock_price = y_test_lstm.astype(int)
print("predicted: ", predicted_stock_price)
print("true: ", true_stock_price)

predicted:  [1 1 1 1 1 1 1 1 1 1 0 0 0]
true:  [1 0 1 1 1 1 1 0 1 1 0 1 0]


In [587]:
10/13

0.7692307692307693

In [586]:
confusion_matrix(true_stock_price, predicted_stock_price)

array([[2, 2],
       [1, 8]], dtype=int64)

In [633]:
X

Unnamed: 0_level_0,Adj close,MA_3D_mean,MA_5D_mean,MA_7D_mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,159.35,159.35,159.35,159.35
2020-01-03,157.37,158.36,158.36,158.36
2020-01-06,157.77,157.77,158.163333,158.163333
2020-01-07,156.34,157.055,157.16,157.7075
2020-01-08,158.83,157.646667,157.646667,157.932
2020-01-09,160.81,158.66,158.4375,158.224
2020-01-10,160.07,159.903333,158.764,158.764
2020-01-13,161.99,161.99,160.956667,159.608
2020-01-14,160.85,161.42,160.97,160.51
2020-01-15,161.89,161.576667,161.576667,161.122


# Bagging 

In [94]:
import time
import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [107]:
start = time.time()
svc = SVC(C = 0.031301037345420765, gamma = 8.038220970385176, kernel =  'linear', probability=True)
clf = OneVsRestClassifier(svc, n_jobs=-1)
clf.fit(X, y)
end = time.time()
print("Single SVC", end - start, clf.score(X,y))
proba = clf.predict_proba(X)

Single SVC 0.058841705322265625 0.7008547008547008


In [108]:
n_estimators = 10
start = time.time()
clf = OneVsRestClassifier(BaggingClassifier(svc, max_samples=1.0 / n_estimators, n_estimators=n_estimators))
clf.fit(X, y)
end = time.time()
print("Bagging SVC", end - start, clf.score(X,y))
proba = clf.predict_proba(X)

Bagging SVC 0.03091597557067871 0.6239316239316239


In [109]:
start = time.time()
clf = RandomForestClassifier(min_samples_leaf=20)
clf.fit(X, y)
end = time.time()
print("Random Forest", end - start, clf.score(X,y))
proba = clf.predict_proba(X)

Random Forest 0.31952834129333496 0.6666666666666666
