## Задание Basic code.
## По мотивам статьи: 2014 - On the Importance of Text Analysis for Stock Price Prediction

In [30]:
# load modules
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
from scipy.sparse import hstack, vstack
import pandas as pd
import re
import datetime
import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import NMF, LatentDirichletAllocation

import warnings
warnings.simplefilter("ignore")

RANDOM_STATE = 42

In [2]:
time_ = time.time()

def create_reports(tickers):
    sp500_ticker = "gspc" # S&P500 ticker from paper

    reports_dataset = pd.DataFrame(columns=['ticker', 'date', 'time', 'text', 'movement', 'movement_normalized', 'label'])

    for iter_, ticker in enumerate(tickers):
        ############################################
        ################## PART 1 ##################
        ############################################

        print('iteration %i of %i | ticker: %s' % (iter_+1, len(tickers), ticker))

        # load data

        # 1. stock quotes
        price = pd.read_csv('data/price_history/'+ticker+'.csv')
        price = price[price['Date'] > '2001-12-31']
        price = price.sort_values('Date').reset_index(drop=True)
        sp500 = pd.read_csv('data/price_history/'+sp500_ticker+'.csv')
        sp500 = sp500[sp500['Date'] > '2001-12-31']
        sp500 = sp500.sort_values('Date').reset_index(drop=True)

        # 2. 8K reports
        with open('data/8K/'+ticker, 'r') as f:
            f_lines = f.readlines()
        raw_8k_reports = pd.Series(' '.join(f_lines).split('</DOCUMENT>')[:-1])

        def transform_8k_report_to_dataframe(report):
            # transform 8k report to pd.DataFrame row
            #
            # report: string
            # result: pd.DataFrame

            result = pd.DataFrame(columns=['ticker', 'date', 'time', 'text'], index=[0])

            result['ticker'] = report.split('FILE:')[1].split('/')[0]

            datetime_ = report.split('TIME:')[1].split('\n')[0]
            datetime_ = datetime.datetime.strptime(datetime_, '%Y%m%d%H%M%S')
            result['date'] = str(datetime_.date())
            result['time'] = str(datetime_.time())

            text = report.split('ITEM:')[-1]
            text = text.replace('QuickLinks', '').replace('Click here to rapidly navigate through this document', '')
            text = ' '.join(text.split())
            result['text'] = text

            return result

        reports = pd.DataFrame(columns=[])
        for report in raw_8k_reports:
            row = transform_8k_report_to_dataframe(report)
            reports = pd.concat([reports, row], axis=0)
        reports.reset_index(drop=True, inplace=True)

        del f_lines, raw_8k_reports, report


        ############################################
        ################## PART 2 ##################
        ############################################

        # create binary markup {MOVE, STAY} for price data
        # aggregate Up and DOWN labels to MOVE label

        price_movement = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'])

        for i, j in price.iloc[:-1,:].iterrows():
            row = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'], index=[0])
            row['date'] = j['Date']

            column_next =  "Open"
            column_prev =  "Close"
            price_change = (price.loc[i+1, column_next] - price.loc[i, column_prev]) / price.loc[i, column_prev]
            sp500_change = (sp500.loc[i+1, column_next] - sp500.loc[i, column_prev]) / sp500.loc[i, column_prev]
            row['movement'] = price_change

            price_change_normalized =  price_change - sp500_change
            row['movement_normalized'] = price_change_normalized

            if price_change_normalized >= 0.01: # value from paper
                row['label'] = "MOVE" # price movement: UP
            elif price_change_normalized <= -0.01: # value from paper
                row['label'] =  "MOVE"# price movement: DOWN
            else:
                row['label'] =  "STAY"# price movement: STAY

            price_movement = pd.concat([price_movement, row], axis=0)

        price_movement.reset_index(drop=True, inplace=True)

        del price, sp500

        # merge stock quotes and text

        reports = pd.merge(reports, price_movement, on='date', how='left')
        reports.dropna(axis=0, inplace=True)

        del price_movement

        # combine tickers

        reports_dataset = pd.concat([reports_dataset, reports], axis=0)

        del reports

    reports_dataset.reset_index(drop=True, inplace=True)
    return reports_dataset

tickers = ["AAPL", "ADBE", "AMZN", "GOOG", "HPQ", "IBM", "INTC", "MSFT", "NVDA"] # tickers for {Apple,Adobe,Amazon,Google,HP,IBM,Intel,MicroSoft,NVidia}
reports_dataset = create_reports(tickers)

print('finished. time elapsed: %.2f sec' % (time.time() - time_))

iteration 1 of 9 | ticker: AAPL
iteration 2 of 9 | ticker: ADBE
iteration 3 of 9 | ticker: AMZN
iteration 4 of 9 | ticker: GOOG
iteration 5 of 9 | ticker: HPQ
iteration 6 of 9 | ticker: IBM
iteration 7 of 9 | ticker: INTC
iteration 8 of 9 | ticker: MSFT
iteration 9 of 9 | ticker: NVDA
finished. time elapsed: 92.32 sec


In [3]:
reports_dataset.to_csv("ReportsDatasetStep2.csv")
reports_dataset = pd.read_csv("ReportsDatasetStep2.csv")

In [4]:
############################################
################## PART 3 ##################
############################################

# split data to train / validation / test

# features
def train_val_test_split(reports_dataset):
    train_start, train_end = '2001-12-31', '2008-12-31' # train period from paper
    x_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['text']

    val_start, val_end = train_end, '2010-12-31'# validation period from paper
    x_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['text']

    test_start, test_end =  val_end, '2012-12-31'# test period from paper
    x_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['text']

    # target variable
    # I added a comparison to "MOVE" in order to work with binary data explicitly
    y_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['label'] == "MOVE"
    y_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['label'] == "MOVE"
    y_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['label'] == "MOVE"

    return x_train, y_train, x_val, y_val, x_test, y_test

x_train, y_train, x_val, y_val, x_test, y_test = train_val_test_split(reports_dataset)

In [7]:
############################################
################## PART 4 ##################
############################################

time_ = time.time()

# create features

def create_features(x_train, y_train, x_val, y_val, x_test, y_test):

    # 1. Unigrams
    # example: vectorizer_params = {}
    #          vectorizer = Vectorizer(**vectorizer_params)
    #          unigrams_train_features = vectorizer.fit_transform(x_train)
    # ...
    vectorizer_params = {"ngram_range": (1, 1), # we are using unigrams
                         "min_df": 10} # according to authors

    # I do not remove stop-words since it was not mentioned in the original paper

    vectorizer = CountVectorizer(**vectorizer_params) 
    unigrams_train_features = vectorizer.fit_transform(x_train)

    # I decided to try to implement feature selection using PMI
    # Found the implementation guide here: 
    # https://stackoverflow.com/questions/46752650/information-gain-calculation-with-scikit-learn?rq=1

    feature_scores = mutual_info_classif(unigrams_train_features, y_train, 
                                         discrete_features=True,
                                         random_state=RANDOM_STATE)

    vocab = [el for el, score in
             zip(vectorizer.get_feature_names(), feature_scores) if
             score > 0.01]

    # Using 0.01 as a threshold since we ends up with 2411 features
    # which is pretty close to 2319 features left in the original paper 

    vectorizer_params['vocabulary'] = vocab

    vectorizer = CountVectorizer(**vectorizer_params) 

    unigrams_train_features = vectorizer.fit_transform(x_train)
    unigrams_val_features = vectorizer.transform(x_val)
    unigrams_test_features = vectorizer.transform(x_test)

    print('Created {} features using unigrams'.format(unigrams_train_features.shape[1]))

    # 2. NMF vector for 50, 100, and 200 components
    # example: nmf_params = {}
    #          nmf = NMF(**nmf_params)
    #          nmf_train_features = nmf.fit_transform(unigrams_train_features)
    # ...

    nmf_params = {'n_components': 50,
                  'random_state': RANDOM_STATE,
                  'l1_ratio': 0.5,
                  'alpha': 0.1}
    # nmf_params were chosen based on
    # https://scikit-learn.org/0.18/auto_examples/applications/topics_extraction_with_nmf_lda.html

    nmf = NMF(**nmf_params)
    nmf_50_train_features = nmf.fit_transform(unigrams_train_features)
    nmf_50_val_features = nmf.transform(unigrams_val_features)
    nmf_50_test_features = nmf.transform(unigrams_test_features)

    print('nmf 50 features created')

    nmf_params['n_components'] = 100
    nmf = NMF(**nmf_params)
    nmf_100_train_features = nmf.fit_transform(unigrams_train_features)
    nmf_100_val_features = nmf.transform(unigrams_val_features)
    nmf_100_test_features = nmf.transform(unigrams_test_features)

    print('nmf 100 features created')

    nmf_params['n_components'] = 200
    nmf = NMF(**nmf_params)
    nmf_200_train_features = nmf.fit_transform(unigrams_train_features)
    nmf_200_val_features = nmf.transform(unigrams_val_features)
    nmf_200_test_features = nmf.transform(unigrams_test_features)

    print('nmf 200 features created')
    return (unigrams_train_features, unigrams_val_features, unigrams_test_features,
            nmf_50_train_features, nmf_50_val_features, nmf_50_test_features,
            nmf_100_train_features, nmf_100_val_features, nmf_100_test_features,
            nmf_200_train_features, nmf_200_val_features, nmf_200_test_features,
            y_train, y_val, y_test)

unigrams_train_features, unigrams_val_features, unigrams_test_features, \
    nmf_50_train_features, nmf_50_val_features, nmf_50_test_features, \
    nmf_100_train_features, nmf_100_val_features, nmf_100_test_features, \
    nmf_200_train_features, nmf_200_val_features, nmf_200_test_features, \
    y_train, y_val, y_test = \
    create_features(x_train, y_train, x_val, y_val, x_test, y_test)
    
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

Created 2411 features using unigrams
nmf 50 features created
nmf 100 features created
nmf 200 features created
finished. time elapsed: 118.37 sec


In [8]:
############################################
################## PART 5 ##################
############################################

# combine all features to one feature space

# 1. NMF 50 features
def unite_features(unigrams_train_features, unigrams_val_features, unigrams_test_features,
                   nmf_50_train_features, nmf_50_val_features, nmf_50_test_features,
                   nmf_100_train_features, nmf_100_val_features, nmf_100_test_features,
                   nmf_200_train_features, nmf_200_val_features, nmf_200_test_features,
                   y_train, y_val, y_test,
                   dict_view=False):
    nmf_50_x_train = hstack([unigrams_train_features, 
                             nmf_50_train_features])

    nmf_50_x_val = hstack([unigrams_val_features,   
                           nmf_50_val_features])

    nmf_50_x_test = hstack([unigrams_test_features,  
                            nmf_50_test_features])

    # 2. NMF 100 features
    nmf_100_x_train = hstack([unigrams_train_features, 
                             nmf_100_train_features])

    nmf_100_x_val = hstack([unigrams_val_features,   
                           nmf_100_val_features])

    nmf_100_x_test = hstack([unigrams_test_features,  
                            nmf_100_test_features])


    # 3. NMF 200 features
    nmf_200_x_train = hstack([unigrams_train_features, 
                             nmf_200_train_features])

    nmf_200_x_val = hstack([unigrams_val_features,   
                           nmf_200_val_features])

    nmf_200_x_test = hstack([unigrams_test_features,  
                            nmf_200_test_features])

    # 4. Ensemble features
    ensemble_x_train = hstack([unigrams_train_features, 
                               nmf_50_train_features, 
                               nmf_100_train_features, 
                               nmf_200_train_features])

    ensemble_x_val = hstack([unigrams_val_features, 
                             nmf_50_val_features, 
                             nmf_100_val_features, 
                             nmf_200_val_features])

    ensemble_x_test = hstack([unigrams_test_features, 
                              nmf_50_test_features, 
                              nmf_100_test_features, 
                              nmf_200_test_features])
    data = (unigrams_train_features, unigrams_val_features,
            unigrams_test_features,
            nmf_50_x_train, nmf_50_x_val, nmf_50_x_test,
            nmf_100_x_train, nmf_100_x_val, nmf_100_x_test,
            nmf_200_x_train, nmf_200_x_val, nmf_200_x_test,
            ensemble_x_train, ensemble_x_val, ensemble_x_test,
            y_train, y_val, y_test)
    if dict_view:
        names = ("unigrams_train_features", "unigrams_val_features",
                 "unigrams_test_features",
                 "nmf_50_x_train", "nmf_50_x_val", "nmf_50_x_test",
                 "nmf_100_x_train", "nmf_100_x_val", "nmf_100_x_test",
                 "nmf_200_x_train", "nmf_200_x_val", "nmf_200_x_test",
                 "ensemble_x_train", "ensemble_x_val", "ensemble_x_test",
                 "y_train", "y_val", "y_test")
        return dict(zip(names, data))
    return data

unigrams_train_features, unigrams_val_features, \
    unigrams_test_features, \
    nmf_50_x_train, nmf_50_x_val, nmf_50_x_test, \
    nmf_100_x_train, nmf_100_x_val, nmf_100_x_test, \
    nmf_200_x_train, nmf_200_x_val, nmf_200_x_test, \
    ensemble_x_train, ensemble_x_val, ensemble_x_test, \
    y_train, y_val, y_test = \
    unite_features(unigrams_train_features, unigrams_val_features, unigrams_test_features,
                   nmf_50_train_features, nmf_50_val_features, nmf_50_test_features,
                   nmf_100_train_features, nmf_100_val_features, nmf_100_test_features,
                   nmf_200_train_features, nmf_200_val_features, nmf_200_test_features,
                   y_train, y_val, y_test)

In [9]:
############################################
################## PART 6 ##################
############################################

time_ = time.time()

# set basic classifier parameters, fit classifiers. 
# Hint: initialize new basic classifier before new training process

# example: basic_classifier_params = {}
#          basic_classifier = BasicClassifier(**basic_classifier_params)


# 1. Unigrams
# example: rf_unigrams_model = ...
# ...
rf_params = {"n_estimators": 2000, # 2000 trees according to the paper
             "n_jobs": -1,
             "random_state": RANDOM_STATE}
rf_unigrams_model = RandomForestClassifier(**rf_params)
rf_unigrams_model.fit(unigrams_train_features, y_train)

print('rf_unigrams_model trained')

# 2. NMF 50

rf_nmf_50_model = RandomForestClassifier(**rf_params)
rf_nmf_50_model.fit(nmf_50_x_train, y_train)

print('rf_nmf_50_model trained')

# 3. NMF 100
rf_nmf_100_model = RandomForestClassifier(**rf_params)
rf_nmf_100_model.fit(nmf_100_x_train, y_train)

print('rf_nmf_100_model trained')

# 4. NMF 200
rf_nmf_200_model = RandomForestClassifier(**rf_params)
rf_nmf_200_model.fit(nmf_200_x_train, y_train)

print('rf_nmf_200_model trained')

# 5. Ensemble
rf_ensemble_model = RandomForestClassifier(**rf_params)
rf_ensemble_model.fit(ensemble_x_train, y_train)

print('rf_ensemble_model trained')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_unigrams_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_50_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_100_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_nmf_200_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

rf_ensemble_model trained
finished. time elapsed: 30.44 sec


In [10]:
############################################
################## PART 7 ##################
############################################

# calculate quality measures

# 1. Unigrams
# example: rf_unigrams_quality = Quality(y_true, y_pred)

y_pred = rf_unigrams_model.predict(unigrams_test_features)
rf_unigrams_quality = accuracy_score(y_test, y_pred)

# 2. NMF 50
y_pred = rf_nmf_50_model.predict(nmf_50_x_test)
rf_nmf_50_quality = accuracy_score(y_test, y_pred)


# 3. NMF 100
y_pred = rf_nmf_100_model.predict(nmf_100_x_test)
rf_nmf_100_quality = accuracy_score(y_test, y_pred)


# 4. NMF 200
y_pred = rf_nmf_200_model.predict(nmf_200_x_test)
rf_nmf_200_quality = accuracy_score(y_test, y_pred)

# 5. Ensemble
y_pred = rf_ensemble_model.predict(ensemble_x_test)
rf_ensemble_quality = accuracy_score(y_test, y_pred)

# save results

results = pd.DataFrame({'model': ['rf_unigrams_model', 'rf_nmf_50_model', 
                                  'rf_nmf_100_model', 'rf_nmf_200_model', 'rf_ensemble_model'], 
                        'n_features': [rf_unigrams_model.n_features_, rf_nmf_50_model.n_features_, 
                                       rf_nmf_100_model.n_features_, rf_nmf_200_model.n_features_, 
                                       rf_ensemble_model.n_features_], 
                        'accuracy': [rf_unigrams_quality, rf_nmf_50_quality, 
                                     rf_nmf_100_quality, rf_nmf_200_quality, rf_ensemble_quality]}, 
                       columns=['model', 'n_features', 'accuracy'])
results.to_csv('data/results.csv', index=False)
results

Unnamed: 0,model,n_features,accuracy
0,rf_unigrams_model,2411,0.760163
1,rf_nmf_50_model,2461,0.764228
2,rf_nmf_100_model,2511,0.768293
3,rf_nmf_200_model,2611,0.764228
4,rf_ensemble_model,2761,0.760163


**I have less features since I'm using PMI for feature selection as it was mentioned in the original paper. I've decided to leave calculations with less features since it works faster and shows a better result**

# Errors analysis

Let's inspect where our best model makes errors

In [11]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt

%matplotlib inline

In [12]:
y_pred = rf_nmf_100_model.predict(nmf_100_x_test)
confusion_matrix(y_test, y_pred)

array([[131,  27],
       [ 30,  58]])

It would be pretty hard to analize false pozitive errors. Let's try to take a closer look to the example of false positive error (model believes that the piece of news will hane an effect on the share price)

*Just a reminder: I set "MOVE" answer as True and "STAY" as False*

In [13]:
false_positive = ((y_test == False) & (y_pred == True))
x_test[false_positive.index[2]]

'Submission of Matters to a Vote of Security Holders Check the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions: Item 5.07 Submission of Matters to a Vote of Security Holders The Annual Meeting of Shareholders (the "Annual Meeting") of Apple Inc. (the "Company") was held on February 23, 2011. At the Annual Meeting, the shareholders voted on the following six proposals and cast their votes as described below. Proposal 1 The individuals listed below were elected at the Annual Meeting to serve a one-year term on the Company\'s Board of Directors (the "Board"). Proposal 2 Proposal 2 was a management proposal to ratify the appointment of Ernst & Young LLP as the Company\'s independent registered public accounting firm for fiscal year 2011, as described in the proxy materials. This proposal was approved. Proposal 3 Proposal 3 was a management proposal to hold an advisory vote on ex

**We may see here some positive words like "satisfy", "proposals", "approved", "recommendation", "consistent". Yet, we, as a human beings may understand that it is not so positive information: this is all mainly about some company governance tasks.**

# Creating new datasets

In this section I'm going to create 2 new datasets using the algorithm used in the previous section.

The first one, banks_dataset will be formed based on the information from the 3 large banks: JPMorgan Chase & Co. (JPM), Bank of America Corp (BAC) and Citigroup Inc. (C)

The second one, cg_dataset will be formed based on the information from the 3 large companies which provides consumer goods: Gap Inc. (GPS), Nike (NKE) and Hasbro Inc. (HAS) 

In [49]:
time_ = time.time()
banks_tickers = ["JPM", "BAC", "C"]
banks_dataset = unite_features(*create_features(*train_val_test_split(create_reports(banks_tickers))),
                               dict_view=True)
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

iteration 1 of 3 | ticker: JPM
iteration 2 of 3 | ticker: BAC
iteration 3 of 3 | ticker: C
Created 1129 features using unigrams
nmf 50 features created
nmf 100 features created
nmf 200 features created
finished. time elapsed: 140.28 sec


In [50]:
time_ = time.time()
cg_tickers = ["GPS", "NKE", "HAS"]
cg_dataset = unite_features(*create_features(*train_val_test_split(create_reports(cg_tickers))),
                            dict_view=True)
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

iteration 1 of 3 | ticker: GPS
iteration 2 of 3 | ticker: NKE
iteration 3 of 3 | ticker: HAS
Created 1961 features using unigrams
nmf 50 features created
nmf 100 features created
nmf 200 features created
finished. time elapsed: 96.46 sec


In [52]:
def prepare_dataset(dataset, name):
    prepped = {}
    prepped["name"] = name
    X = {}
    X["unigrams"] = {"train": dataset['unigrams_train_features'],
                     "val": dataset['unigrams_val_features'],
                     "test": dataset['unigrams_test_features']}   
    X["nmf50"] = {"train": dataset['nmf_50_x_train'],
                  "val": dataset['nmf_50_x_val'],
                  "test": dataset['nmf_50_x_test']}
    X["nmf100"] = {"train": dataset['nmf_100_x_train'],
                   "val": dataset['nmf_100_x_val'],
                   "test": dataset['nmf_100_x_test']}
    X["nmf200"] = {"train": dataset['nmf_200_x_train'],
                   "val": dataset['nmf_200_x_val'],
                   "test": dataset['nmf_200_x_test']}
    X["ensemble"] = {"train": dataset['ensemble_x_train'],
                     "val": dataset['ensemble_x_val'],
                     "test": dataset['ensemble_x_test']}
    prepped["X"] = X
    prepped['y'] = {"train": dataset['y_train'],
                    "val": dataset['y_val'],
                    "test": dataset['y_test']}
    return prepped
    
banks_dataset = prepare_dataset(banks_dataset, "banks")
cg_dataset = prepare_dataset(cg_dataset, "consumer_goods")

Now we are ready to build some models!

# Playing with models

In the following section I'm going to check the performance of XGBClassifier, LinearSVC and LogisticRegression on the dataset created in the previous section using f1-score and roc-auc

In [18]:
# ! pip install xgboost

In [19]:
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

We are going to use not tuned classifiers in this section

In [54]:
time_ = time.time()

params = {"random_state": RANDOM_STATE}

def create_and_fit_models(dataset, params):
    trained_models = {}
    for features_names, X_data in dataset['X'].items():
        X = X_data["train"]
        y = dataset['y']["train"]
        trained_models[features_names] = \
            {"xgb": XGBClassifier(n_jobs=-1, **params).fit(X, y),
             "logit": LogisticRegression(**params).fit(X, y),
             "svm": LinearSVC(**params).fit(X, y),
             "rf": RandomForestClassifier(**params).fit(X, y)}
        print('{} trained'.format(features_names))
    return trained_models


banks_models = create_and_fit_models(banks_dataset, params)
cg_models = create_and_fit_models(cg_dataset, params)

print('finished. time elapsed: %.2f sec' % (time.time() - time_))

unigrams trained
nmf50 trained
nmf100 trained
nmf200 trained
ensemble trained
unigrams trained
nmf50 trained
nmf100 trained
nmf200 trained
ensemble trained
finished. time elapsed: 15.09 sec


In [56]:
def get_quality_scores(dataset, models, test_plan):
    res = []
    for features_names, X_data in dataset['X'].items():
        X_test = X_data['test']
        y_test = dataset['y']['test']        
        for model_name, model in models[features_names].items():
            y_pred = model.predict(X_test)
            f1 = f1_score(y_test, y_pred)
            roc_auc = roc_auc_score(y_test, y_pred)
            res.append({"model": model_name,
                        "dataset": dataset["name"],
                        "features": features_names,
                        "f1 score": f1,
                        "roc_auc_score": roc_auc})
    return res

banks_summary = get_quality_scores(banks_dataset, banks_models, testing_plan)
cg_summary = get_quality_scores(cg_dataset, cg_models, testing_plan)

In [57]:
df = pd.DataFrame(banks_summary + cg_summary, columns=["dataset", "model", "features", "f1 score", "roc_auc_score"])
df.sort_values(["dataset", "model", "f1 score"], ascending=False, inplace=True)

df.to_csv('data/results2.csv', index=False)

In [58]:
df

Unnamed: 0,dataset,model,features,f1 score,roc_auc_score
28,consumer_goods,xgb,nmf100,0.313725,0.526786
36,consumer_goods,xgb,ensemble,0.27907,0.549107
32,consumer_goods,xgb,nmf200,0.25,0.551339
20,consumer_goods,xgb,unigrams,0.205128,0.535714
24,consumer_goods,xgb,nmf50,0.046512,0.426339
34,consumer_goods,svm,nmf200,0.470588,0.571429
38,consumer_goods,svm,ensemble,0.470588,0.571429
22,consumer_goods,svm,unigrams,0.424242,0.540179
26,consumer_goods,svm,nmf50,0.424242,0.540179
30,consumer_goods,svm,nmf100,0.424242,0.540179


After building not tuned models for banks and consumer goods companies we may notice that the quality of the classification is higher when we use linear models. Probably this result may be explained by the size of the feature space: linear models always show pretty good performance when we use many features, especially if a lot of them have not any effect on the resulting variable

It is likely that we may see the increase of quality of ensemble models if we tune them a little. Let's try to do it!

# Models tuning

Finally, we may use our validation dataset to tune parameters of our models!

We are going to tune models using those features which allowed to achieve the best quality with each specific model.

We will use f1 score and roc-auc score to compare our models.

In [63]:
from scipy import sparse
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import make_scorer
from tqdm import tqdm_notebook

In [79]:
def prep_validation(dataset):
    """
    we want to tune parameters using validation set
    GridSearchCV with default cv param wouldn't help us with that
    We need to use PredefinedSplit as mentioned in
    https://stackoverflow.com/questions/43764999/python-machine-learning-perform-a-grid-search-on-custom-validation-set/43766334#43766334
    """
    my_test_fold = []
    for features_names, X_data in dataset['X'].items(): 
        X_data['united'] = sparse.vstack([X_data['train'], X_data['val']])
    
    for i in range(X_data['train'].shape[0]):
        my_test_fold.append(-1)

    for i in range(X_data['val'].shape[0]):
        my_test_fold.append(0)

    dataset['y']['united'] = np.concatenate([dataset['y']['train'].values, dataset['y']['val'].values])
    dataset['split'] = PredefinedSplit(test_fold=my_test_fold)
    return dataset

In [80]:
banks_dataset = prep_validation(banks_dataset)
cg_dataset = prep_validation(cg_dataset)

In [81]:
model_to_dataset = {
    ("banks", "xgb"): "nmf200",
    ("banks", "logit"): "ensemble",
    ("banks", "svm"): "unigrams",
    ("banks", "rf"): "nmf100",
    ("consumer_goods", "xgb"): "nmf100",
    ("consumer_goods", "logit"): "nmf200",
    ("consumer_goods", "svm"): "nmf200",
    ("consumer_goods", "rf"): "ensemble"}

In [82]:
all_model_params = {
    "xgb": {"gamma": [0.25, 0.5],
            "max_depth": [3, 6, 10],
            "n_estimators": [20, 200]},
    "logit": {"penalty": ["l1", "l2"],
              "C": [0.1, 1, 10, 100]},
    "svm": {"class_weight": [None, "balanced"],
            "C": [0.1, 1, 10, 100]},
    "rf": {"n_estimators": [10, 20, 200, 2000]}}

In [84]:
time_ = time.time()
res = []
with tqdm_notebook(total=2*2*4) as f:
    for scorer_func, scorer_name in [(roc_auc_score, "roc-auc"), (f1_score, "f1")]:
        for dataset in [banks_dataset, cg_dataset]:
            models = {"xgb": XGBClassifier(random_state=RANDOM_STATE),
                      "logit": LogisticRegression(random_state=RANDOM_STATE),
                      "svm": LinearSVC(random_state=RANDOM_STATE),
                      "rf": RandomForestClassifier(random_state=RANDOM_STATE)}
            for model_name, model in models.items():
                model_X = dataset["X"][model_to_dataset[(dataset["name"], model_name)]]
                model_params = all_model_params[model_name]
                gs = GridSearchCV(model, param_grid=model_params, scoring=make_scorer(scorer_func),
                                  n_jobs=-1, cv=dataset['split'])
                gs.fit(model_X['united'], dataset['y']['united']);
                y_pred = gs.best_estimator_.predict(model_X['test'])
                test_score = scorer_func(dataset['y']['test'], y_pred)
                res.append({"dataset": dataset['name'],
                            "model": model_name,
                            "scorer": scorer_name,
                            "best_params": gs.best_params_,
                            "val_score": gs.best_score_,
                            "test_score": test_score})
                f.update()

print('finished. time elapsed: %.2f sec' % (time.time() - time_))

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': [0.0, 0.3, 0.5], 'max_depth': [3, 6, 10], 'n_estimators': [20, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'C': [0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': [None, 'balanced'], 'n_estimators': [10, 20, 200, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(f1_score), verbose=0)




In [86]:
pd.set_option('display.max_colwidth', -1)
tuned = pd.DataFrame(res, columns=["dataset", "model", "scorer", "best_params", "val_score", "test_score"])
tuned.sort_values(["dataset", "scorer", "test_score"], ascending=False, inplace=True)

tuned.to_csv('data/results3.csv', index=False)
tuned

Unnamed: 0,dataset,model,scorer,best_params,val_score,test_score
7,consumer_goods,rf,roc-auc,"{'class_weight': 'balanced', 'n_estimators': 200}",0.689189,0.667411
4,consumer_goods,xgb,roc-auc,"{'gamma': 0.0, 'max_depth': 10, 'n_estimators': 20}",0.666216,0.645089
5,consumer_goods,logit,roc-auc,"{'C': 0.1, 'class_weight': None, 'penalty': 'l1'}",0.68027,0.584821
6,consumer_goods,svm,roc-auc,"{'C': 0.1, 'class_weight': None}",0.564324,0.524554
15,consumer_goods,rf,f1,"{'class_weight': 'balanced', 'n_estimators': 200}",0.54902,0.530612
12,consumer_goods,xgb,f1,"{'gamma': 0.0, 'max_depth': 10, 'n_estimators': 20}",0.551724,0.526316
14,consumer_goods,svm,f1,"{'C': 0.1, 'class_weight': None}",0.551724,0.4
13,consumer_goods,logit,f1,"{'C': 100, 'class_weight': None, 'penalty': 'l1'}",0.615385,0.354839
2,banks,svm,roc-auc,"{'C': 100, 'class_weight': None}",0.602273,0.584914
1,banks,logit,roc-auc,"{'C': 100, 'class_weight': None, 'penalty': 'l2'}",0.519318,0.506098


After tuning of hyperparameters we see that tuned RF and XGB models outperforms tuned linear models on the dataset of consumer goods companies. And we may notice 180 degrees situation with banks dataset. Also we may notice that there is a higher f1 and roc-auc scores for consumer goods companies.

Probably, it may be explained by the excessive amount of financial data in banks reports and lack of the changes in the banking sector (this means that the information on the banks 8K reports do not influence on the core business and as a result we have a lot of the useless features): that is why in banking sector we see a lower quality of classificators and the supremacy of linear models.

**In this notebook we tried to build a model inspired by the paper "On the Importance of Text Analysis for Stock Price Prediction". We created different set of features based on the vectorization of the 8K reports text. We managed to prove that there is a connection between 8K reports and stock proces behavior.**

**After that we builded 4 different models using data of 3 major American banks and 3 major consumer goods companies. We noticed that not tuned linear models shows a better quality on this data than not tuned ensemble models**

**Finally, we tried to tune those models using features which gave us the best quality on the previous step and made some very interesting conclusions using this results**