# Testing the packages

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(1, '../functions')
import data_utilities as data_u
import dict_utilities as dict_u
import nlp_utilities as nlp_u
import time

#from jupyterthemes import jtplot
#jtplot.style()
%load_ext autoreload
%autoreload 2
%matplotlib inline

data_folder = '../data/'
dict_dir = data_folder + 'data_dict.pkl'


## Testing dict_utilities

In [50]:
import dict_utilities as dict_u

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers already saved : {data_dict.keys()}")

dict_u.reset_dict(dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers after reset : {data_dict.keys()}")

Tickers already saved : dict_keys(['google', 'exxon'])
Tickers after reset : dict_keys([])


## Testing data_utilities

These functions now label the data at the same time, so it can take quite a long time ! Approximately 1 min every 100 labels ... This is not optimized !

Add news to data dictionary from database

In [3]:
import data_utilities as data_u

search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers added : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers added : dict_keys(['google', 'exxon'])

Number of news in ticker Google : 3608


Add news to data dictionary from Twitter account

In [51]:
date_since = "2020-11-13"
nb_items = 1000
language = "en"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_ids = ['Google', 'Total']
print(f"Tickers before operation : {data_dict.keys()}\n")
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,\
                    dict_dir, retweet=False, from_ids=from_ids)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers before operation : dict_keys([])

Tickers now : dict_keys(['google', 'total'])

Number of news in ticker Google : 1000


Add news to data dictionary from all over Twitter

In [47]:
date_since = "2020-11-13"
nb_items = 100
language = "fr"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_words = ['Google', 'Facebook']
print(f"Tickers before operation : {data_dict.keys()}\n")
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,\
                    dict_dir, retweet=False, from_words=from_words)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon', 'total'])

Tickers now : dict_keys(['google', 'exxon', 'total', 'facebook'])

Number of news in ticker Google : 3808


In [15]:
dict_u.reset_dict(dict_dir)
search_word = "Google"
news_to_read = "Reuters"
data_u.add_news_to_dict(search_word, data_folder, news_to_read, dict_dir)


KeyboardInterrupt: 

## Testing nlp_utilities

### BOW/TFIDF

In [4]:
import nlp_utilities as nlp_u

data_dict = dict_u.get_dict(dict_dir)
df = data_dict['google']

bow, countvect, feat2word = nlp_u.df_to_bow(df, TFIDF=True)

print("Document - words matrix:", bow.shape)
print("First words:", countvect.get_feature_names()[0:100])

Document - words matrix: (3608, 9966)
First words: ['aa', 'abandon', 'abandoned', 'abb', 'abdominal', 'abide', 'abiding', 'ability', 'able', 'aboard', 'aborted', 'abortive', 'abound', 'abrasive', 'abroad', 'abrupt', 'abruptly', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', 'abstain', 'absurd', 'abu', 'abundance', 'abundant', 'abuse', 'abusive', 'abuzz', 'abyss', 'academic', 'academy', 'accelerate', 'accelerated', 'acceleration', 'accelerator', 'accelerometer', 'accent', 'accept', 'acceptable', 'acceptance', 'accepted', 'access', 'accessible', 'accessory', 'accident', 'accidental', 'accidentally', 'acclaim', 'accommodate', 'accommodating', 'accommodation', 'accommodative', 'accompany', 'accomplish', 'accomplished', 'accord', 'accordance', 'according', 'accordingly', 'account', 'accountability', 'accountable', 'accountancy', 'accountant', 'accounting', 'accredited', 'accretive', 'accrue', 'accuracy', 'accurate', 'accurately', 'accusation'

In [5]:
bow

<3608x9966 sparse matrix of type '<class 'numpy.float64'>'
	with 575506 stored elements in Compressed Sparse Row format>

### Word2Vec

In [28]:
model = nlp_u.df_to_vec(df)
model.wv.most_similar(positive="high")

[('low', 0.7797523140907288),
 ('hit', 0.7690551280975342),
 ('record', 0.654312789440155),
 ('highest', 0.6522954106330872),
 ('galloping', 0.6289225220680237),
 ('bullion', 0.6248561143875122),
 ('mid', 0.615885317325592),
 ('gold', 0.604141116142273),
 ('heavy', 0.6038429737091064),
 ('level', 0.603442907333374)]

### Topic models : LDA and NMF

In [45]:
import nlp_utilities as nlp_u

#Fresh start
search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
format_cols = ["Text", "Author", "Date"]
dict_u.reset_dict(dict_dir)
data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir, format_cols)
data_dict = dict_u.get_dict(dict_dir)

df = data_dict['google']
n_words = 20

#LDA
print("Fitting LDA model (tf features)")
X, lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics = 5, TF = True)
feature_names = countvect.get_feature_names()

print("Topics in tf-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

print("\nFitting LDA model (BOW features)")
X, lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics = 5, TF = False)
feature_names = countvect.get_feature_names()

print("Topics in BOW-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

#NMF
print("\nFitting NMF model")
nmf, countvect, feat2word = nlp_u.df_to_nmf(df, n_topics = 5)
feature_names = countvect.get_feature_names()

print("Topics in NMF model:")
nlp_u.print_topics(nmf, feature_names, n_words)


Fitting LDA model (tf features)
Topics in tf-LDA model:
Topic #0: percent said new stock year york company dow rose market index content billion time yahoo high medium higher video average
Topic #1: said video web new site year also medium clip journal york yahoo advertising deal time music service company content last
Topic #2: said percent company new apple year billion yahoo market million would stock quarter also search business time last could one
Topic #3: percent billion said stock web company cash new year fell market would index growth week also york average last high
Topic #4: said percent new yahoo year web company stock billion quarter search million business advertising mobile video york also share system

Fitting LDA model (BOW features)
Topics in BOW-LDA model:
Topic #0: noble energy nook wind solar grid power electric duke smart gas toy oil book bookstore electricity renewable gold project lan
Topic #1: said court patent case commission government information federal la

# Start learning

## Get features

In [47]:
import nlp_utilities as nlp_u
import dict_utilities as dict_u
import data_utilities as data_u

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers available : {data_dict.keys()}")
print(f"Number of news labelled for Google : {len(data_dict['google'])}")

Tickers available : dict_keys(['google'])
Number of news labelled for Google : 11195


In [50]:
data_dict["google"].tail(5)

Unnamed: 0,Text,Author_x,Date_x,Label_x,Author_y,Date_y,Label_y
11190,By Amy Thomson and Matthew Campbell Nov. 25 (...,,NaT,,[A m y T h o m s o n a n d M a t t h e w...,2013-11-25 00:01:00,-0.780701
11191,"Softbank Corp. (9984) , the Japanesemajority o...",,NaT,,[E h r e n G o o s s e n s],2013-11-25 18:27:48,-0.780701
11192,Working in secret like the programsthey’re rev...,,NaT,,[M a r g a r e t T a l e v],2013-11-25 18:18:51,-0.780701
11193,People who sign up as drivers forUber Technolo...,,NaT,,[M a r k M i l i a n],2013-11-25 05:00:00,-0.780701
11194,"Yahoo! Inc. ’s push to hire Katie Couric , wh...",,NaT,,[r i a n W o m a c k a n d D a v i d H...,2013-11-25 17:26:38,-0.780701


### W2V features

In [37]:
%%time
df = data_dict["google"]
X_w2v = nlp_u.get_w2v_features(df, stop_words = None, language = 'en', size=200, window=5, min_count=1)

Wall time: 57.2 s


### TFIDF features

In [43]:
%%time
n_samples = len(data_dict["google"])
df = data_dict["google"].iloc[0:n_samples]
X_bow = nlp_u.get_bow_features(df, stop_words = None, language = 'en', TFIDF = True)

Wall time: 30.1 s


### LDA features

In [51]:
%%time
n_samples = len(data_dict["google"])
df = data_dict["google"].iloc[0:n_samples]
X_lda = nlp_u.get_lda_features(df, n_topics = 5, stop_words = None, language = 'en', TF = True)

Wall time: 45 s


### NMF features

In [54]:
%%time
n_samples = len(data_dict["google"])
df = data_dict["google"].iloc[0:n_samples]
X_nmf = nlp_u.get_nmf_features(df, n_topics=5, stop_words = None, language = 'en')

Wall time: 33.9 s


## Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn import tree
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
#Add Voting regression when all have been tested ?

#The classifiers we will compare
clfs = { "Random Forests" : RandomForestRegressor(n_estimators=100, criterion='entropy'),
         "Gradient Boosting" : GradientBoostingRegressor(n_estimators=100),
         "Decision Tree" : tree.DecisionTreeRegressor(),
         "SVR" : svm.SVR(kernel='rbf', C = 1),
         "Gaussian Process" : GaussianProcessRegressor(n_restarts_optimizer = 3),
         "Adaboost" : AdaBoostRegressor(tree.DecisionTreeRegressor(criterion='entropy', max_depth = 3), n_estimators=100)}

#The models we will compare
models = { "TFIDF" : X_bow,
           "Word2Vec" : X_w2v,
           "LDA" : X_lda,
           "NMF" : X_nmf}

y = df["Label"]
results = {}
for model_name in models:
    for clf_name in clfs:
        X = models[model_name]
        clf = clfs[clf_name]
        scores = cross_val_score(clf, X, y)
        results[model_name + " + " + clf_name] = np.mean(scores)
        


In [4]:
import re
def standardize_date(date, news_to_read):
    '''Returns a standard datetime64 format of given date taken from Reuters/Bloomberg dataset'''
    if (news_to_read == "Reuters") :
        date = re.split(' |, ', date)
        month2number = {"Jan" : '01', "Feb" : '02', "Mar" : '03', "Apr" : '04', "May" : '05', "Jun" : '06',
                        "Jul" : '07', "Aug" : '08', "Sep" : '09', "Oct" : '10', "Nov": '11', "Dec" : '12'}
        month = month2number[date[1]]
        day = date[2]
        year = date[3]
        hour_min = date[4]
        if 'p' in hour_min:
            hour_min = re.split(':|p', hour_min)
            hour, minutes = str((12+int(hour_min[0]))%12), hour_min[1]
        else:
            hour_min = re.split(':|a', hour_min)
            hour, minutes = hour_min[0], hour_min[1]

        hour = '0' + hour if len(hour)==1 else hour
        minutes = '0' + minutes if len(minutes)==1 else minutes
        day = '0' + day if len(day)==1 else day

        return np.datetime64(year+'-'+month+'-'+day+'T'+hour+':'+minutes)
    
    elif(news_to_read == "Bloomberg") :
        return np.datetime64(date[:-1])


def get_df_news(data_folder, news_to_read, format_cols = ["Text", "Author", "Date"]):
    '''Creates a dataframe from a ".parquet.gzip" file
    data_folder : directory of the parquet file
    news_to_read : "Bloomberg" or "Reuters"
    format_cols : temporary argument until we agree on the format of dataframe'''
    df = pd.read_parquet(data_folder + 'financial_data' + news_to_read + '.parquet.gzip')
    df = df.rename(columns = {'Article':'Text', 'Journalists':'Author'})
    df["Date"] = df["Date"].map(lambda date : standardize_date(date, news_to_read))
    return df[format_cols]

dict_u.reset_dict(dict_dir)
search_word = "Google"
news_to_read = "Reuters"

dfA = get_df_news(data_folder, "Reuters").head(20)
dfB = get_df_news(data_folder, "Bloomberg").head(20)

In [8]:
dfB

Unnamed: 0,Text,Author,Date
0,"Inco Ltd., the Canadian nickel producerbeing b...",[Dale Crofts],2006-10-20 20:16:16
1,Jim Cramer recommended that viewersbuy shares...,[Steven Bodzin],2006-10-21 00:08:44
2,European Union Energy CommissionerAndris Pieba...,[Thomas Bauer],2006-10-23 11:51:36
3,A former worker at a Wisconsinprinting plant ...,[David Glovin],2006-10-23 20:00:29
4,"Bare Escentuals Inc.'s cosmetics area fad, mak...",[Steven Bodzin],2006-10-24 01:32:04
5,"Russia 's state-run OAO Gazprom, theworld's bi...",[Daryna Krasnolutska],2006-10-24 10:53:59
6,"Huaneng Power International Inc. (902) ,the la...",[Wing-Gar Cheng],2006-10-24 11:48:37
7,Cia. Vale do Rio Doce paid $13.3billion to a...,[Heloiza Canassa],2006-10-24 21:46:57
8,"Ambac Financial Group Inc. (ABKFQ) , theworld'...",[Christine Richard],2006-10-25 16:33:03
9,"Wheeling-Pittsburgh Corp., a WestVirginia-base...",[Dale Crofts],2006-10-25 20:28:44


In [9]:
%%time
dfC = pd.merge(dfA,dfB , how='outer', on=["Text"])
dfC

Wall time: 0 ns


Unnamed: 0,Text,Author_x,Date_x,Author_y,Date_y
0,"ANCHORAGE, Alaska (Reuters) - Exxon Mobil ( X...",[Yereth Rosen],2006-10-20 06:15:00,,NaT
1,SAN FRANCISCO/NEW YORK (Reuters) - Wall Stree...,"[Paul Thomasch, Eric Auchard]",2006-10-20 04:25:00,,NaT
2,FRANKFURT (Reuters) - Internet service provid...,[],2006-10-21 02:21:00,,NaT
3,WASHINGTON (Reuters) - The central bank is ex...,[Alister Bull],2006-10-22 00:14:00,,NaT
4,"LIMA, Peru (Reuters) - Argentine oil company ...",[],2006-10-21 08:11:00,,NaT
5,BRUSSELS (Reuters) - The European Union will ...,[William Schomberg],2006-10-22 06:46:00,,NaT
6,FRANKFURT (Reuters) - DaimlerChrysler DCXGn.D...,[],2006-10-23 03:51:00,,NaT
7,NEW YORK (Reuters) - Oilfield services compan...,[],2006-10-22 08:36:00,,NaT
8,MELBOURNE (Reuters) - Foster's Group Ltd. FGL...,[],2006-10-23 01:50:00,,NaT
9,"(Adds analyst comment, closing stock activity)...",[],2006-10-23 04:17:00,,NaT
