## Задание Basic code.
## По мотивам статьи: 2014 - On the Importance of Text Analysis for Stock Price Prediction

In [1]:
# load modules
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
from scipy.sparse import hstack, vstack
import pandas as pd
import re
import datetime
import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation


RANDOM_STATE = 42

  from numpy.core.umath_tests import inner1d


In [2]:
time_ = time.time()

tickers = ["AAPL", "ADBE", "AMZN", "GOOG", "HPQ", "IBM", "INTC", "MSFT", "NVDA"] # tickers for {Apple,Adobe,Amazon,Google,HP,IBM,Intel,MicroSoft,NVidia}
sp500_ticker = "gspc" # S&P500 ticker from paper

reports_dataset = pd.DataFrame(columns=['ticker', 'date', 'time', 'text', 'movement', 'movement_normalized', 'label'])

for iter_, ticker in enumerate(tickers):
    ############################################
    ################## PART 1 ##################
    ############################################
    
    print('iteration %i of %i | ticker: %s' % (iter_+1, len(tickers), ticker))
    
    # load data
    
    # 1. stock quotes
    price = pd.read_csv('data/price_history/'+ticker+'.csv')
    price = price[price['Date'] > '2001-12-31']
    price = price.sort_values('Date').reset_index(drop=True)
    sp500 = pd.read_csv('data/price_history/'+sp500_ticker+'.csv')
    sp500 = sp500[sp500['Date'] > '2001-12-31']
    sp500 = sp500.sort_values('Date').reset_index(drop=True)

    # 2. 8K reports
    with open('data/8K/'+ticker, 'r') as f:
        f_lines = f.readlines()
    raw_8k_reports = pd.Series(' '.join(f_lines).split('</DOCUMENT>')[:-1])

    def transform_8k_report_to_dataframe(report):
        # transform 8k report to pd.DataFrame row
        #
        # report: string
        # result: pd.DataFrame

        result = pd.DataFrame(columns=['ticker', 'date', 'time', 'text'], index=[0])

        result['ticker'] = report.split('FILE:')[1].split('/')[0]

        datetime_ = report.split('TIME:')[1].split('\n')[0]
        datetime_ = datetime.datetime.strptime(datetime_, '%Y%m%d%H%M%S')
        result['date'] = str(datetime_.date())
        result['time'] = str(datetime_.time())

        text = report.split('ITEM:')[-1]
        text = text.replace('QuickLinks', '').replace('Click here to rapidly navigate through this document', '')
        text = ' '.join(text.split())
        result['text'] = text

        return result

    reports = pd.DataFrame(columns=[])
    for report in raw_8k_reports:
        row = transform_8k_report_to_dataframe(report)
        reports = pd.concat([reports, row], axis=0)
    reports.reset_index(drop=True, inplace=True)

    del f_lines, raw_8k_reports, report
    
    
    ############################################
    ################## PART 2 ##################
    ############################################
    
    # create binary markup {MOVE, STAY} for price data
    # aggregate Up and DOWN labels to MOVE label

    price_movement = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'])

    for i, j in price.iloc[:-1,:].iterrows():
        row = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'], index=[0])
        row['date'] = j['Date']

        column_next =  "Open"
        column_prev =  "Close"
        price_change = (price.loc[i+1, column_next] - price.loc[i, column_prev]) / price.loc[i, column_prev]
        sp500_change = (sp500.loc[i+1, column_next] - sp500.loc[i, column_prev]) / sp500.loc[i, column_prev]
        row['movement'] = price_change

        price_change_normalized =  price_change - sp500_change
        row['movement_normalized'] = price_change_normalized

        if price_change_normalized >= 0.01: # value from paper
            row['label'] = "MOVE" # price movement: UP
        elif price_change_normalized <= -0.01: # value from paper
            row['label'] =  "MOVE"# price movement: DOWN
        else:
            row['label'] =  "STAY"# price movement: STAY

        price_movement = pd.concat([price_movement, row], axis=0)

    price_movement.reset_index(drop=True, inplace=True)

    del price, sp500
    
    # merge stock quotes and text

    reports = pd.merge(reports, price_movement, on='date', how='left')
    reports.dropna(axis=0, inplace=True)

    del price_movement
    
    # combine tickers
    
    reports_dataset = pd.concat([reports_dataset, reports], axis=0)
    
    del reports

reports_dataset.reset_index(drop=True, inplace=True)
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

iteration 1 of 9 | ticker: AAPL
iteration 2 of 9 | ticker: ADBE
iteration 3 of 9 | ticker: AMZN
iteration 4 of 9 | ticker: GOOG
iteration 5 of 9 | ticker: HPQ
iteration 6 of 9 | ticker: IBM
iteration 7 of 9 | ticker: INTC
iteration 8 of 9 | ticker: MSFT
iteration 9 of 9 | ticker: NVDA
finished. time elapsed: 87.99 sec


In [3]:
# reports_dataset.to_csv("ReportsDatasetStep2.csv")
# reports_dataset = pd.read_csv("ReportsDatasetStep2.csv")

In [4]:
############################################
################## PART 3 ##################
############################################

# split data to train / validation / test

# features
train_start, train_end = '2001-12-31', '2008-12-31' # train period from paper
x_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['text']

val_start, val_end = train_end, '2010-12-31'# validation period from paper
x_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['text']

test_start, test_end =  val_end, '2012-12-31'# test period from paper
x_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['text']

# target variable
# I added a comparison to "MOVE" in order to work with binary data explicitly
y_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['label'] == "MOVE"
y_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['label'] == "MOVE"
y_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['label'] == "MOVE"

In [5]:
############################################
################## PART 4 ##################
############################################

time_ = time.time()

# create features

# 1. Unigrams
# example: vectorizer_params = {}
#          vectorizer = Vectorizer(**vectorizer_params)
#          unigrams_train_features = vectorizer.fit_transform(x_train)
# ...
vectorizer_params = {"ngram_range": (1, 1), # we are using unigrams
                     "min_df": 10} # according to authors

# I do not remove stop-words since it was not mentioned in the original paper

vectorizer = CountVectorizer(**vectorizer_params) 
unigrams_train_features = vectorizer.fit_transform(x_train)

# I decided to try to implement feature selection using PMI
# Found the implementation guide here: 
# https://stackoverflow.com/questions/46752650/information-gain-calculation-with-scikit-learn?rq=1

feature_scores = mutual_info_classif(unigrams_train_features, y_train, 
                                     discrete_features=True,
                                     random_state=RANDOM_STATE)

vocab = [el for el, score in
         zip(vectorizer.get_feature_names(), feature_scores) if
         score > 0.01]

# Using 0.01 as a threshold since we ends up with 2411 features
# which is pretty close to 2319 features left in the original paper 

vectorizer_params['vocabulary'] = vocab

vectorizer = CountVectorizer(**vectorizer_params) 

unigrams_train_features = vectorizer.fit_transform(x_train)
unigrams_val_features = vectorizer.transform(x_val)
unigrams_test_features = vectorizer.transform(x_test)

print('Created {} features using unigrams'.format(unigrams_train_features.shape[1]))

# 2. NMF vector for 50, 100, and 200 components
# example: nmf_params = {}
#          nmf = NMF(**nmf_params)
#          nmf_train_features = nmf.fit_transform(unigrams_train_features)
# ...

nmf_params = {'n_components': 50,
              'random_state': RANDOM_STATE,
              'l1_ratio': 0.5,
              'alpha': 0.1}
# nmf_params were chosen based on
# https://scikit-learn.org/0.18/auto_examples/applications/topics_extraction_with_nmf_lda.html

nmf = NMF(**nmf_params)
nmf_50_train_features = nmf.fit_transform(unigrams_train_features)
nmf_50_val_features = nmf.transform(unigrams_val_features)
nmf_50_test_features = nmf.transform(unigrams_test_features)

print('nmf 50 features created')

nmf_params['n_components'] = 100
nmf = NMF(**nmf_params)
nmf_100_train_features = nmf.fit_transform(unigrams_train_features)
nmf_100_val_features = nmf.transform(unigrams_val_features)
nmf_100_test_features = nmf.transform(unigrams_test_features)

print('nmf 100 features created')

nmf_params['n_components'] = 200
nmf = NMF(**nmf_params)
nmf_200_train_features = nmf.fit_transform(unigrams_train_features)
nmf_200_val_features = nmf.transform(unigrams_val_features)
nmf_200_test_features = nmf.transform(unigrams_test_features)

print('nmf 200 features created')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

Created 2411 features using unigrams
nmf 50 features created
nmf 100 features created
nmf 200 features created
finished. time elapsed: 114.03 sec


In [6]:
############################################
################## PART 5 ##################
############################################

# combine all features to one feature space

# 1. NMF 50 features
nmf_50_x_train = hstack([unigrams_train_features, 
                         nmf_50_train_features])

nmf_50_x_val = hstack([unigrams_val_features,   
                       nmf_50_val_features])

nmf_50_x_test = hstack([unigrams_test_features,  
                        nmf_50_test_features])

# 2. NMF 100 features
nmf_100_x_train = hstack([unigrams_train_features, 
                         nmf_100_train_features])

nmf_100_x_val = hstack([unigrams_val_features,   
                       nmf_100_val_features])

nmf_100_x_test = hstack([unigrams_test_features,  
                        nmf_100_test_features])


# 3. NMF 200 features
nmf_200_x_train = hstack([unigrams_train_features, 
                         nmf_200_train_features])

nmf_200_x_val = hstack([unigrams_val_features,   
                       nmf_200_val_features])

nmf_200_x_test = hstack([unigrams_test_features,  
                        nmf_200_test_features])

# 4. Ensemble features
ensemble_x_train = hstack([unigrams_train_features, 
                           nmf_50_train_features, 
                           nmf_100_train_features, 
                           nmf_200_train_features])

ensemble_x_val = hstack([unigrams_val_features, 
                         nmf_50_val_features, 
                         nmf_100_val_features, 
                         nmf_200_val_features])

ensemble_x_test = hstack([unigrams_test_features, 
                          nmf_50_test_features, 
                          nmf_100_test_features, 
                          nmf_200_test_features])

In [7]:
############################################
################## PART 6 ##################
############################################

time_ = time.time()

# set basic classifier parameters, fit classifiers. 
# Hint: initialize new basic classifier before new training process

# example: basic_classifier_params = {}
#          basic_classifier = BasicClassifier(**basic_classifier_params)


# 1. Unigrams
# example: rf_unigrams_model = ...
# ...
rf_params = {"n_estimators": 2000, # 2000 trees according to the paper
             "n_jobs": -1,
             "random_state": RANDOM_STATE}
rf_unigrams_model = RandomForestClassifier(**rf_params)
rf_unigrams_model.fit(unigrams_train_features, y_train)

print('rf_unigrams_model trained')

# 2. NMF 50

rf_nmf_50_model = RandomForestClassifier(**rf_params)
rf_nmf_50_model.fit(nmf_50_x_train, y_train)

print('rf_nmf_50_model trained')

# 3. NMF 100
rf_nmf_100_model = RandomForestClassifier(**rf_params)
rf_nmf_100_model.fit(nmf_100_x_train, y_train)

print('rf_nmf_100_model trained')

# 4. NMF 200
rf_nmf_200_model = RandomForestClassifier(**rf_params)
rf_nmf_200_model.fit(nmf_200_x_train, y_train)

print('rf_nmf_200_model trained')

# 5. Ensemble
rf_ensemble_model = RandomForestClassifier(**rf_params)
rf_ensemble_model.fit(ensemble_x_train, y_train)

print('rf_ensemble_model trained')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_unigrams_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_50_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_100_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_200_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_ensemble_model trained
finished. time elapsed: 30.28 sec


In [8]:
############################################
################## PART 7 ##################
############################################

# calculate quality measures

# 1. Unigrams
# example: rf_unigrams_quality = Quality(y_true, y_pred)

y_pred = rf_unigrams_model.predict(unigrams_test_features)
rf_unigrams_quality = accuracy_score(y_test, y_pred)

# 2. NMF 50
y_pred = rf_nmf_50_model.predict(nmf_50_x_test)
rf_nmf_50_quality = accuracy_score(y_test, y_pred)


# 3. NMF 100
y_pred = rf_nmf_100_model.predict(nmf_100_x_test)
rf_nmf_100_quality = accuracy_score(y_test, y_pred)


# 4. NMF 200
y_pred = rf_nmf_200_model.predict(nmf_200_x_test)
rf_nmf_200_quality = accuracy_score(y_test, y_pred)

# 5. Ensemble
y_pred = rf_ensemble_model.predict(ensemble_x_test)
rf_ensemble_quality = accuracy_score(y_test, y_pred)

# save results

results = pd.DataFrame({'model': ['rf_unigrams_model', 'rf_nmf_50_model', 
                                  'rf_nmf_100_model', 'rf_nmf_200_model', 'rf_ensemble_model'], 
                        'n_features': [rf_unigrams_model.n_features_, rf_nmf_50_model.n_features_, 
                                       rf_nmf_100_model.n_features_, rf_nmf_200_model.n_features_, 
                                       rf_ensemble_model.n_features_], 
                        'accuracy': [rf_unigrams_quality, rf_nmf_50_quality, 
                                     rf_nmf_100_quality, rf_nmf_200_quality, rf_ensemble_quality]}, 
                       columns=['model', 'n_features', 'accuracy'])
results.to_csv('data/results.csv', index=False)
results

Unnamed: 0,model,n_features,accuracy
0,rf_unigrams_model,2411,0.760163
1,rf_nmf_50_model,2461,0.764228
2,rf_nmf_100_model,2511,0.768293
3,rf_nmf_200_model,2611,0.764228
4,rf_ensemble_model,2761,0.760163


**I have less features since I'm using PMI for feature selection as it was mentioned in the original paper. I've decided to leave calculations with less features since it works faster and shows a better result**

# Errors analysis

Let's inspect where our best model makes errors

In [9]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

%matplotlib inline

In [10]:
y_pred = rf_nmf_100_model.predict(nmf_100_x_test)
confusion_matrix(y_test, y_pred)

array([[131,  27],
       [ 30,  58]])

It would be pretty hard to analize false pozitive errors. Let's try to take a closer look to the example of false positive error (model believes that the piece of news will hane an effect on the share price)

*Just a reminder: I set "MOVE" answer as True and "STAY" as False*

In [11]:
false_positive = ((y_test == False) & (y_pred == True))
x_test[false_positive.index[2]]

'Submission of Matters to a Vote of Security Holders Check the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions: Item 5.07 Submission of Matters to a Vote of Security Holders The Annual Meeting of Shareholders (the "Annual Meeting") of Apple Inc. (the "Company") was held on February 23, 2011. At the Annual Meeting, the shareholders voted on the following six proposals and cast their votes as described below. Proposal 1 The individuals listed below were elected at the Annual Meeting to serve a one-year term on the Company\'s Board of Directors (the "Board"). Proposal 2 Proposal 2 was a management proposal to ratify the appointment of Ernst & Young LLP as the Company\'s independent registered public accounting firm for fiscal year 2011, as described in the proxy materials. This proposal was approved. Proposal 3 Proposal 3 was a management proposal to hold an advisory vote on ex

**We may see here some positive words like "satisfy", "proposals", "approved", "recommendation", "consistent". Yet, we, as a human beings may understand that it is not so positive information: this is all mainly about some company governance tasks.**

# Playing with models

In the following section I'm going to check the performance of XGBClassifier, LinearSVC and LogisticRegression on the dataset created in the previous section using f1-score and roc-auc

In [12]:
# ! pip install xgboost

In [13]:
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

We are going to use not tuned classifiers in this section

In [14]:
time_ = time.time()

params = {"random_state": RANDOM_STATE}

def create_and_fit_models(X, y, params):
    return {"xgb": XGBClassifier(n_jobs=-1, **params).fit(X, y),
            "logit": LogisticRegression(**params).fit(X, y),
            "svm": LinearSVC(**params).fit(X, y),
            "rf": RandomForestClassifier(**params).fit(X, y)}

# 1. Unigrams


unigrams_models = create_and_fit_models(unigrams_train_features, y_train, params)

print('unigrams_models trained')

# 2. NMF 50

nmf_50_models = create_and_fit_models(nmf_50_x_train, y_train, params)

print('nmf_50_models trained')

# 3. NMF 100
nmf_100_models = create_and_fit_models(nmf_100_x_train, y_train, params)

print('nmf_100_models trained')

# 4. NMF 200
nmf_200_models = create_and_fit_models(nmf_200_x_train, y_train, params)

print('nmf_200_models trained')

# 5. Ensemble
ensemble_models = create_and_fit_models(ensemble_x_train, y_train, params)

print('ensemble_models trained')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

unigrams_models trained
nmf_50_models trained
nmf_100_models trained
nmf_200_models trained
ensemble_models trained
finished. time elapsed: 10.63 sec


In [15]:
def get_quality_scores(features_name, models, X_test, y_test):
    res = []
    for model_name, model in models.items():
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_pred)
        res.append({"model": model_name,
                    "features": features_name,
                    "f1 score": f1,
                    "roc_auc_score": roc_auc})
    return res
    
res = []
# 1. Unigrams
# example: rf_unigrams_quality = Quality(y_true, y_pred)

res.extend(get_quality_scores("unigrams", unigrams_models,
                              unigrams_test_features, y_test))

# 2. NMF 50
res.extend(get_quality_scores("nmf_50", nmf_50_models,
                              nmf_50_x_test, y_test))

# 3. NMF 100
res.extend(get_quality_scores("nmf_100", nmf_100_models,
                               nmf_100_x_test, y_test))

# 4. NMF 200
res.extend(get_quality_scores("nmf_200", nmf_200_models,
                               nmf_200_x_test, y_test))

# 5. Ensemble
res.extend(get_quality_scores("ensemble", ensemble_models,
                               ensemble_x_test, y_test))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [16]:
df = pd.DataFrame(res, columns=["model", "features", "f1 score", "roc_auc_score"])
df.sort_values(["model", "f1 score"], ascending=False, inplace=True)

df.to_csv('data/results2.csv', index=False)

In [39]:
df

Unnamed: 0,model,features,f1 score,roc_auc_score
0,xgb,unigrams,0.666667,0.740291
12,xgb,nmf_200,0.651429,0.728927
16,xgb,ensemble,0.643275,0.723892
4,xgb,nmf_50,0.635294,0.718211
8,xgb,nmf_100,0.635294,0.718211
6,svm,nmf_50,0.56044,0.653697
14,svm,nmf_200,0.553191,0.643556
2,svm,unigrams,0.544503,0.634062
10,svm,nmf_100,0.544503,0.634062
18,svm,ensemble,0.541176,0.64744


We may see that not tuned RF and XGB models outperforms logit and svm models

# Models tuning

Finally, we may use our validation dataset to tune parameters of our models!

We are going to tune XGB and logit models using unigrams as features; RF and SVM models - using nmf_50 features.

We will use f1 score and roc-auc score to compare our models.

In [17]:
from scipy import sparse
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import make_scorer
from tqdm import tqdm_notebook
import warnings
warnings.simplefilter("ignore")

In [18]:
def prep_dataset(dataset):
    """
    we want to tune parameters using validation set
    GridSearchCV with default cv param wouldn't help us with that
    We need to use PredefinedSplit as mentioned in
    https://stackoverflow.com/questions/43764999/python-machine-learning-perform-a-grid-search-on-custom-validation-set/43766334#43766334
    """
    my_test_fold = []

    for i in range(dataset["X_train"].shape[0]):
        my_test_fold.append(-1)

    for i in range(dataset["X_val"].shape[0]):
        my_test_fold.append(0)
    
    dataset['X_united'] = sparse.vstack([dataset["X_train"], dataset["X_val"]])
    dataset['y_united'] = np.concatenate([y_train.values, y_val.values])
    dataset['cv'] = PredefinedSplit(test_fold=my_test_fold)
    return dataset

In [19]:
unigrams_dataset = prep_dataset(
    {"X_train": unigrams_train_features,
     "y_train": y_train,
     "X_val": unigrams_val_features,
     "y_val": y_val,
     "X_test": unigrams_test_features,
     "y_test": y_test})

nmf_50_dataset = prep_dataset(
    {"X_train": nmf_50_x_train,
     "y_train": y_train,
     "X_val": nmf_50_x_val,
     "y_val": y_val,
     "X_test": nmf_50_x_test,
     "y_test": y_test})

In [20]:
model_to_dataset = {
    "xgb": unigrams_dataset,
    "logit": unigrams_dataset,
    "svm": nmf_50_dataset,
    "rf": nmf_50_dataset}

In [32]:
models = {"xgb": XGBClassifier(random_state=RANDOM_STATE),
         "logit": LogisticRegression(random_state=RANDOM_STATE),
         "svm": LinearSVC(random_state=RANDOM_STATE),
         "rf": RandomForestClassifier(random_state=RANDOM_STATE)}

all_model_params = {
    "xgb": {"gamma": [0., 0.3, 0.5],
            "max_depth": [3, 6, 10],
            "n_estimators": [20, 200]},
    "logit": {"penalty": ["l1", "l2"],
              "class_weight": [None, "balanced"],
              "C": [0.1, 1, 10, 100]},
    "svm": {"class_weight": [None, "balanced"],
            "C": [0.1, 1, 10, 100]},
    "rf": {"class_weight": [None], # excluded "balanced" since it always loses to None
           "n_estimators": [10, 20, 200, 2000]}}

In [33]:
res = []
with tqdm_notebook(total=2*len(models)) as f:
    for scorer_func, scorer_name in [(roc_auc_score, "roc-auc"), (f1_score, "f1")]:
        for model_name, model in models.items():
            model_dataset = model_to_dataset[model_name]
            model_params = all_model_params[model_name]
            gs = GridSearchCV(model, param_grid=model_params, scoring=make_scorer(scorer_func),
                              n_jobs=-1, cv=model_dataset['cv'])
            gs.fit(model_dataset['X_united'], model_dataset['y_united'])
            y_pred = gs.best_estimator_.predict(model_dataset['X_test'])
            test_score = scorer_func(model_dataset['y_test'], y_pred)
            res.append({"model": model_name,
                        "scorer": scorer_name,
                        "best_params": gs.best_params_,
                        "val_score": gs.best_score_,
                        "test_score": test_score})
            f.update()

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)




In [35]:
pd.set_option('display.max_colwidth', -1)
tuned = pd.DataFrame(res, columns=["model", "scorer", "best_params", "val_score", "test_score"])
tuned.sort_values(["scorer", "test_score"], ascending=False, inplace=True)

tuned.to_csv('data/results3.csv', index=False)
tuned

Unnamed: 0,model,scorer,best_params,val_score,test_score
3,rf,roc-auc,"{'class_weight': None, 'n_estimators': 20}",0.749635,0.754891
0,xgb,roc-auc,"{'gamma': 0.3, 'max_depth': 10, 'n_estimators': 20}",0.763846,0.747267
1,logit,roc-auc,"{'C': 0.1, 'class_weight': None, 'penalty': 'l1'}",0.692383,0.699223
2,svm,roc-auc,"{'C': 0.1, 'class_weight': None}",0.635131,0.634781
7,rf,f1,"{'class_weight': None, 'n_estimators': 20}",0.700565,0.682927
4,xgb,f1,"{'gamma': 0.3, 'max_depth': 10, 'n_estimators': 20}",0.732673,0.674419
5,logit,f1,"{'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}",0.663551,0.547486
6,svm,f1,"{'C': 0.1, 'class_weight': 'balanced'}",0.6,0.523256


After tuning of hyperparameters we managed to achieve better results with all of the tested models (which is an obvious result)

RF is still ahead of the other models. It seems pretty interesting that the best RF model has only 20 trees. I expected to see 2000 trees since authors of the paper used 2000 trees.