## Задание Basic code.
## По мотивам статьи: 2014 - [On the Importance of Text Analysis for Stock Price Prediction](https://nlp.stanford.edu/pubs/lrec2014-stock.pdf)
## [Данные](https://nlp.stanford.edu/pubs/stock-event.html)

In [2]:
# load modules

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
from scipy.sparse import hstack, vstack
import pandas as pd
import re
import datetime
import time
import os
from pyunpack import Archive

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score # quality measure from paper

from sklearn.ensemble import  RandomForestClassifier # basic classifier from paper
from sklearn.decomposition import NMF, LatentDirichletAllocation

  from numpy.core.umath_tests import inner1d


In [53]:
if not os.path.exists('data'):
    os.mkdir('data')
    print('Скачай данные из https://nlp.stanford.edu/pubs/stock-event.html в папку data!')
    
if not os.path.exists('data\my8K'):
    os.mkdir('data\my8K')

In [52]:
time_ = time.time()

tickers = ['AAPL', 'ADBE', 'AMZN', 'GOOG', 'HPQ', 'IBM', 'INTC', 'MSFT', 'NVDA'] # tickers for {Apple,Adobe,Amazon,Google,HP,IBM,Intel,MicroSoft,NVidia}
sp500_ticker =  'gspc'# S&P500 ticker from paper

price_history_path = 'data\price_history\\'
eightk_path = 'data\8K\\'

reports_dataset = pd.DataFrame(columns=['ticker', 'date', 'time', 'text', 'movement', 'movement_normalized', 'label'])

for iter_, ticker in enumerate(tickers):
    ############################################
    ################## PART 1 ##################
    ############################################
    
    print('iteration %i of %i | ticker: %s' % (iter_+1, len(tickers), ticker))
    
    # load data
    
    # 1. stock quotes
    price = pd.read_csv(price_history_path + ticker+'.csv')
    price = price[price['Date'] > '2001-12-31']
    price = price.sort_values('Date').reset_index(drop=True)
    sp500 = pd.read_csv(price_history_path+sp500_ticker+'.csv')
    sp500 = sp500[sp500['Date'] > '2001-12-31']
    sp500 = sp500.sort_values('Date').reset_index(drop=True)

    # 2. 8K reports
    try:
        Archive('data\8K\\' + ticker + '.gz').extractall('data\my8K')
    except UnicodeDecodeError:
        pass
    with open('data\my8K\\' + ticker, 'r') as f:
        f_lines = f.readlines()
    raw_8k_reports = pd.Series(' '.join(f_lines).split('</DOCUMENT>')[:-1])

    def transform_8k_report_to_dataframe(report):
        # transform 8k report to pd.DataFrame row
        #
        # report: string
        # result: pd.DataFrame

        result = pd.DataFrame(columns=['ticker', 'date', 'time', 'text'], index=[0])

        result['ticker'] = report.split('FILE:')[1].split('/')[0]

        datetime_ = report.split('TIME:')[1].split('\n')[0]
        datetime_ = datetime.datetime.strptime(datetime_, '%Y%m%d%H%M%S')
        result['date'] = str(datetime_.date())
        result['time'] = str(datetime_.time())

        text = report.split('ITEM:')[-1]
        text = text.replace('QuickLinks', '').replace('Click here to rapidly navigate through this document', '')
        text = ' '.join(text.split())
        result['text'] = text

        return result

    reports = pd.DataFrame(columns=[])
    for report in raw_8k_reports:
        row = transform_8k_report_to_dataframe(report)
        reports = pd.concat([reports, row], axis=0)
    reports.reset_index(drop=True, inplace=True)

    del f_lines, raw_8k_reports, report
    
    
    ############################################
    ################## PART 2 ##################
    ############################################
    
    # create binary markup {MOVE, STAY} for price data
    # aggregate Up and DOWN labels to MOVE label

    price_movement = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'])

    for i, j in price.iloc[:-1,:].iterrows():
        row = pd.DataFrame(columns=['date', 'movement', 'movement_normalized', 'label'], index=[0])
        row['date'] = j['Date']

        column_next =  'Open' # value from paper
        column_prev =  'Close' # value from paper
        price_change = (price.loc[i+1, column_next] - price.loc[i, column_prev]) / price.loc[i, column_prev]
        sp500_change = (sp500.loc[i+1, column_next] - sp500.loc[i, column_prev]) / sp500.loc[i, column_prev]
        row['movement'] = price_change

        price_change_normalized =  price_change - sp500_change# price_change normalization, described in paper
        row['movement_normalized'] = price_change_normalized

        if price_change_normalized >= 0.01: # value from paper
            row['label'] =  'UP' # price movement: UP
        elif price_change_normalized <= -0.01: # value from paper
            row['label'] =  'DOWN' # price movement: DOWN
        else:
            row['label'] =  'STAY' # price movement: STAY

        price_movement = pd.concat([price_movement, row], axis=0)

    price_movement.reset_index(drop=True, inplace=True)

    del price, sp500
    
    # merge stock quotes and text

    reports = pd.merge(reports, price_movement, on='date', how='left')
    reports.dropna(axis=0, inplace=True)

    del price_movement
    
    # combine tickers
    
    reports_dataset = pd.concat([reports_dataset, reports], axis=0)
    
    del reports

reports_dataset.reset_index(drop=True, inplace=True)
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

iteration 1 of 9 | ticker: AAPL
iteration 2 of 9 | ticker: ADBE
iteration 3 of 9 | ticker: AMZN
iteration 4 of 9 | ticker: GOOG
iteration 5 of 9 | ticker: HPQ
iteration 6 of 9 | ticker: IBM
iteration 7 of 9 | ticker: INTC
iteration 8 of 9 | ticker: MSFT
iteration 9 of 9 | ticker: NVDA
finished. time elapsed: 118.94 sec


In [55]:
############################################
################## PART 3 ##################
############################################

# split data to train / validation / test

# features
train_start, train_end = '2001-12-31', '2008-12-31' # train period from paper
x_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['text']

val_start, val_end =  '2008-12-31', '2010-12-31' # validation period from paper
x_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['text']

test_start, test_end = '2010-12-31', '2012-12-31' # test period from paper
x_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['text']

# target variable
y_train = reports_dataset[(train_start < reports_dataset['date']) & (reports_dataset['date'] <= train_end)]['label']
y_val = reports_dataset[(val_start < reports_dataset['date']) & (reports_dataset['date'] <= val_end)]['label']
y_test = reports_dataset[(test_start < reports_dataset['date']) & (reports_dataset['date'] <= test_end)]['label']

In [56]:
############################################
################## PART 4 ##################
############################################

time_ = time.time()

# create features

# 1. Unigrams
# example: vectorizer_params = {}
#          vectorizer = Vectorizer(**vectorizer_params)
#          unigrams_train_features = vectorizer.fit_transform(x_train)
# ...
#vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
unigrams_train_features = vectorizer.fit_transform(x_train)
unigrams_val_features = vectorizer.transform(x_val)
unigrams_test_features = vectorizer.transform(x_test)
print('unigrams created')
# To check feature names: vectorizer.get_feature_names()

# 2. NMF vector for 50, 100, and 200 components
# example: nmf_params = {}
#          nmf = NMF(**nmf_params)
#          nmf_train_features = nmf.fit_transform(unigrams_train_features)

nmf50 = NMF(50)
nmf_50_train_features = nmf50.fit_transform(unigrams_train_features)
nmf_50_val_features = nmf50.transform(unigrams_val_features)
nmf_50_test_features = nmf50.transform(unigrams_test_features)
print('nmf 50 features created')
# To check matrix shape: nmf_50_train_features.shape

nmf100 = NMF(100)
nmf_100_train_features = nmf100.fit_transform(unigrams_train_features)
nmf_100_val_features = nmf100.transform(unigrams_val_features)
nmf_100_test_features = nmf100.transform(unigrams_test_features)
print('nmf 100 features created')

nmf200 = NMF(200)
nmf_200_train_features = nmf200.fit_transform(unigrams_train_features)
nmf_200_val_features = nmf200.transform(unigrams_val_features)
nmf_200_test_features = nmf200.transform(unigrams_test_features)
print('nmf 200 features created')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

unigrams created
nmf 50 features created
nmf 100 features created
nmf 200 features created
finished. time elapsed: 386.83 sec


In [57]:
############################################
################## PART 5 ##################
############################################

# combine all features to one feature space

# 1. NMF 50 features
nmf_50_x_train = hstack([unigrams_train_features, 
                         nmf_50_train_features])

nmf_50_x_val = hstack([unigrams_val_features,   
                       nmf_50_val_features])

nmf_50_x_test = hstack([unigrams_test_features,  
                        nmf_50_test_features])

# 2. NMF 100 features
nmf_100_x_train = hstack([unigrams_train_features, 
                         nmf_100_train_features])

nmf_100_x_val = hstack([unigrams_val_features,   
                       nmf_100_val_features])

nmf_100_x_test = hstack([unigrams_test_features,  
                        nmf_100_test_features])

# 3. NMF 200 features
nmf_200_x_train = hstack([unigrams_train_features, 
                         nmf_200_train_features])

nmf_200_x_val = hstack([unigrams_val_features,   
                       nmf_200_val_features])

nmf_200_x_test = hstack([unigrams_test_features,  
                        nmf_200_test_features])

# 4. Ensemble features
ensemble_x_train = hstack([unigrams_train_features, 
                           nmf_50_train_features, 
                           nmf_100_train_features, 
                           nmf_200_train_features])

ensemble_x_val = hstack([unigrams_val_features, 
                           nmf_50_val_features, 
                           nmf_100_val_features, 
                           nmf_200_val_features])

ensemble_x_test = hstack([unigrams_test_features, 
                           nmf_50_test_features, 
                           nmf_100_test_features, 
                           nmf_200_test_features])

In [58]:
############################################
################## PART 6 ##################
############################################

time_ = time.time()

# set basic classifier parameters, fit classifiers. 
# Hint: initialize new basic classifier before new training process

# example: basic_classifier_params = {}
#          basic_classifier = BasicClassifier(**basic_classifier_params)


# 1. Unigrams
# example: rf_unigrams_model = ...
# ...
#basic_classifier = BasicClassifier(**basic_classifier_params)
#clf = RandomForestClassifier(n_estimators=10, max_depth=None,
#                             min_samples_split=2, random_state=0)
rf_unigrams_model = RandomForestClassifier()
rf_unigrams_model.fit(unigrams_train_features, y_train)
#print(clf.feature_importances_)
#scores = cross_val_score(clf, X, y, cv=5)
#scores.mean()
print('rf_unigrams_model trained')

# 2. NMF 50
rf_nmf_50_model = RandomForestClassifier(n_estimators=2000)
rf_nmf_50_model.fit(nmf_50_x_train, y_train)

print('rf_nmf_50_model trained')

# 3. NMF 100
rf_nmf_100_model = RandomForestClassifier(n_estimators=2000)
rf_nmf_100_model.fit(nmf_100_x_train, y_train)

print('rf_nmf_100_model trained')

# 4. NMF 200
rf_nmf_200_model = RandomForestClassifier(n_estimators=2000)
rf_nmf_200_model.fit(nmf_200_x_train, y_train)

print('rf_nmf_200_model trained')

# 5. Ensemble
rf_ensemble_model = RandomForestClassifier(n_estimators=2000)
rf_ensemble_model.fit(ensemble_x_train, y_train)

print('rf_ensemble_model trained')
print('finished. time elapsed: %.2f sec' % (time.time() - time_))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_unigrams_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_nmf_50_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_nmf_100_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_nmf_200_model trained


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

rf_ensemble_model trained
finished. time elapsed: 140.82 sec


In [59]:
############################################
################## PART 7 ##################
############################################

# calculate quality measures

# 1. Unigrams
# example: rf_unigrams_quality = Quality(y_true, y_pred)
rf_unigrams_quality = accuracy_score(y_val, rf_unigrams_model.predict(unigrams_val_features))

# 2. NMF 50
rf_nmf_50_quality = accuracy_score(y_val, rf_nmf_50_model.predict(nmf_50_x_val))

# 3. NMF 100
rf_nmf_100_quality = accuracy_score(y_val, rf_nmf_100_model.predict(nmf_100_x_val))

# 4. NMF 200
rf_nmf_200_quality = accuracy_score(y_val, rf_nmf_200_model.predict(nmf_200_x_val))

# 5. Ensemble
rf_ensemble_quality = accuracy_score(y_val, rf_ensemble_model.predict(ensemble_x_val))

# save results
results = pd.DataFrame({'model': ['rf_unigrams_model', 'rf_nmf_50_model', 
                                  'rf_nmf_100_model', 'rf_nmf_200_model', 'rf_ensemble_model'], 
                        'n_features': [rf_unigrams_model.n_features_, rf_nmf_50_model.n_features_, 
                                       rf_nmf_100_model.n_features_, rf_nmf_200_model.n_features_, 
                                       rf_ensemble_model.n_features_], 
                        'accuracy': [rf_unigrams_quality, rf_nmf_50_quality, 
                                     rf_nmf_100_quality, rf_nmf_200_quality, rf_ensemble_quality]}, 
                       columns=['model', 'n_features', 'accuracy'])
results.to_csv('data/results.csv', index=False)
results

Unnamed: 0,model,n_features,accuracy
0,rf_unigrams_model,16420,0.586667
1,rf_nmf_50_model,16470,0.662222
2,rf_nmf_100_model,16520,0.657778
3,rf_nmf_200_model,16620,0.657778
4,rf_ensemble_model,16770,0.662222
