# Testing the packages

In [10]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(1, '../functions')
import data_utilities as data_u
import dict_utilities as dict_u
import nlp_utilities as nlp_u


#from jupyterthemes import jtplot
#jtplot.style()
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Folder holding the project's data files, relative to this notebook.
data_folder = '../data/'
# Location of the stored ticker dictionary. Despite the "_dir" suffix this is
# the path of a single .pkl file, not a directory.
dict_dir = os.path.join(data_folder, 'data_dict.pkl')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Testing dict_utilities

In [44]:
import dict_utilities as dict_u

# Smoke test for dict_utilities: show what is currently stored at `dict_dir`,
# reset the store, then check that nothing is left.
# WARNING: the reset is destructive -- the tickers printed first are gone afterwards.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers already saved : {data_dict.keys()}")

dict_u.reset_dict(dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers after reset : {data_dict.keys()}")

# Fail loudly instead of relying on someone reading the printed keys.
assert len(data_dict.keys()) == 0, f"reset_dict left tickers behind: {list(data_dict.keys())}"

Tickers already saved : dict_keys(['google', 'exxon', 'total'])
Tickers after reset : dict_keys([])


## Testing data_utilities

Add news from the database to the data dictionary

In [45]:
import data_utilities as data_u

# Ask data_utilities to add news for each search word (source: `news_to_read`)
# to the dictionary stored at `dict_dir`, then reload it to inspect the result.
search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
format_cols = ["Text", "Author", "Date"]  # reused by the Twitter cells below

data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir, format_cols)

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers added : {data_dict.keys()}\n")

# Tickers are looked up under the lower-cased search word; make sure each one
# exists before indexing, so a missing ticker gives a clear message.
missing = [word for word in search_words if word.lower() not in data_dict.keys()]
assert not missing, f"No ticker was created for: {missing}"

print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers added : dict_keys(['google', 'exxon'])

Number of news in ticker Google : 3608


Add tweets from specific Twitter accounts to the data dictionary

In [46]:
# Add tweets posted by the accounts in `from_ids` to the stored dictionary.
date_since = "2020-11-13"
nb_items = 100
language = "fr"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_ids = ['Google', 'Total']

# Re-read the stored dictionary so "before" shows its current content rather
# than whatever `data_dict` an earlier cell happened to leave in the kernel.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers before operation : {data_dict.keys()}\n")

# `format_cols` is the column list defined in the news cell above.
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,
                          format_cols, dict_dir, retweet=False, from_ids=from_ids)

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
# Report on this cell's own first account instead of `search_words`, which
# belongs to a different cell and only matched by coincidence.
print(f"Number of news in ticker {from_ids[0]} : {len(data_dict[from_ids[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon'])

Tickers now : dict_keys(['google', 'exxon', 'total'])

Number of news in ticker Google : 3708


Add tweets from all over Twitter (keyword search) to the data dictionary

In [47]:
# Add tweets matching the keywords in `from_words` to the stored dictionary.
date_since = "2020-11-13"
nb_items = 100
language = "fr"
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_words = ['Google', 'Facebook']

# Re-read the stored dictionary so "before" shows its current content rather
# than whatever `data_dict` an earlier cell happened to leave in the kernel.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers before operation : {data_dict.keys()}\n")

# `format_cols` is the column list defined in the news cell above.
data_u.add_tweets_to_dict(date_since, nb_items, language, codes,
                          format_cols, dict_dir, retweet=False, from_words=from_words)

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
# Report on this cell's own first keyword instead of `search_words`, which
# belongs to a different cell and only matched by coincidence.
print(f"Number of news in ticker {from_words[0]} : {len(data_dict[from_words[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon', 'total'])

Tickers now : dict_keys(['google', 'exxon', 'total', 'facebook'])

Number of news in ticker Google : 3808


## Testing nlp_utilities

### BOW/TFIDF

In [27]:
import nlp_utilities as nlp_u

# Vectorize the 'google' ticker with nlp_utilities (TFIDF=True) and show the
# matrix shape plus the first 100 vocabulary entries.
data_dict = dict_u.get_dict(dict_dir)
df = data_dict['google']

bow, countvect, feat2word = nlp_u.df_to_bow(df, TFIDF=True)

print("Document - words matrix:", bow.shape)

# scikit-learn deprecated `get_feature_names` in 1.0 and removed it in 1.2 in
# favour of `get_feature_names_out`. Prefer the new method and fall back to the
# old one, so the cell runs on both sides of that change.
# NOTE(review): assumes `countvect` is a scikit-learn vectorizer -- confirm in nlp_utilities.
get_names = getattr(countvect, "get_feature_names_out", None) or countvect.get_feature_names
# Convert to plain str so the printed list looks the same whichever method was used.
first_words = [str(word) for word in get_names()[:100]]
print("First words:", first_words)

Document - words matrix: (4020, 12978)
First words: ['aa', 'aal', 'aba', 'aback', 'abacus', 'abandon', 'abandoned', 'abandonment', 'abate', 'abb', 'abbey', 'abbreviate', 'abdominal', 'abide', 'abiding', 'ability', 'able', 'ably', 'aboard', 'aborted', 'abortion', 'abortive', 'abound', 'abrasive', 'abroad', 'abrupt', 'abruptly', 'absence', 'absent', 'absentee', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', 'abstain', 'abstract', 'abstractly', 'absurd', 'abu', 'abundance', 'abundant', 'abundantly', 'abuse', 'abuser', 'abusive', 'abuzz', 'abysmal', 'abyss', 'academic', 'academy', 'acca', 'accelerate', 'accelerated', 'acceleration', 'accelerator', 'accelerometer', 'accent', 'accept', 'acceptable', 'acceptance', 'accepted', 'access', 'accessible', 'accession', 'accessory', 'accident', 'accidental', 'accidentally', 'acclaim', 'accommodate', 'accommodating', 'accommodation', 'accommodative', 'accompany', 'accomplish', 'accomplished', 'accord', 'accordance', 'accord

### Word2Vec

In [28]:
# Build a word-vector model from `df` (the 'google' frame loaded in the
# BOW/TFIDF cell) and display the entries most similar to "high".
# NOTE(review): presumably a gensim Word2Vec model, given the `.wv.most_similar`
# API -- confirm in nlp_utilities.
# NOTE(review): no seed is set in this cell, so the similarity scores may differ
# between runs -- confirm whether df_to_vec fixes one.
model = nlp_u.df_to_vec(df)
model.wv.most_similar(positive="high")

[('low', 0.7797523140907288),
 ('hit', 0.7690551280975342),
 ('record', 0.654312789440155),
 ('highest', 0.6522954106330872),
 ('galloping', 0.6289225220680237),
 ('bullion', 0.6248561143875122),
 ('mid', 0.615885317325592),
 ('gold', 0.604141116142273),
 ('heavy', 0.6038429737091064),
 ('level', 0.603442907333374)]

In [7]:
# NOTE(review): `data_samples` is not assigned in any of the cells above, and
# this cell's execution count (In [7]) is out of sequence with its neighbours.
# It looks like a leftover from an earlier session and, as far as the cells
# above show, would raise NameError on Restart & Run All. Define it explicitly
# or drop the cell.
# NOTE(review): the displayed frame has Author_x/Date_x/Author_y/Date_y next to
# Author/Date, which looks like merge suffixes leaking from wherever news and
# tweets are combined -- confirm.
data_samples

Unnamed: 0,Text,Author_x,Date_x,Author_y,Date_y,Author,Date
0,SAN FRANCISCO/NEW YORK (Reuters) - Wall Stree...,"[Paul Thomasch, Eric Auchard]","Fri Oct 20, 2006 4:25pm EDT",,NaT,,NaT
1,FRANKFURT (Reuters) - Internet service provid...,[],"Sat Oct 21, 2006 2:21pm EDT",,NaT,,NaT
2,NEW YORK (Reuters) - U.S. stocks should exten...,"[ers, Chris S]","Mon Oct 23, 2006 5:24am EDT",,NaT,,NaT
3,NEW YORK (Reuters) - U.S. stocks rallied on M...,[Vivianne Rodrigues],"Mon Oct 23, 2006 5:37pm EDT",,NaT,,NaT
4,LOS ANGELES (Reuters) - Amazon.com on Tuesday...,"[ria Sage, Alex]","Tue Oct 24, 2006 7:39pm EDT",,NaT,,NaT
...,...,...,...,...,...,...,...
3803,https://t.co/1r1aITGy2l un dernier pour la route?,,,,NaT,jt50manche,2020-11-29 15:58:57
3804,@Microsoft Et ça c’est de la résilience ou ? \...,,,,NaT,Jemexprime1,2020-11-29 15:58:55
3805,Horowitz: étude CDC: 85% des cas de COVID-19 e...,,,,NaT,paxipax_heleane,2020-11-29 15:58:12
3806,@readarrt Tape Aubameyang Kroos sur Google tu ...,,,,NaT,Banalyste,2020-11-29 15:58:11


### Topic models : LDA and NMF

In [45]:
import nlp_utilities as nlp_u


def _vocabulary(vectorizer):
    """Return the vectorizer's feature names as a plain list of str.

    scikit-learn deprecated ``get_feature_names`` in 1.0 and removed it in 1.2
    in favour of ``get_feature_names_out``. The new method is preferred and the
    old one is the fallback, so this works on both sides of that change.
    NOTE(review): assumes the vectorizers returned by nlp_utilities are
    scikit-learn vectorizers -- confirm.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    return [str(name) for name in getter()]


#Fresh start
# WARNING: reset_dict wipes whatever is stored at `dict_dir` before the news are re-added.
search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
format_cols = ["Text", "Author", "Date"]
dict_u.reset_dict(dict_dir)
data_u.add_news_to_dict(search_words, data_folder, news_to_read, dict_dir, format_cols)
data_dict = dict_u.get_dict(dict_dir)

df = data_dict['google']
n_words = 20  # number of words printed per topic

#LDA
print("Fitting LDA model (tf features)")
lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics=5, TF=True)
feature_names = _vocabulary(countvect)

print("Topics in tf-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

print("\nFitting LDA model (BOW features)")
lda, countvect, feat2word = nlp_u.df_to_lda(df, n_topics=5, TF=False)
feature_names = _vocabulary(countvect)

print("Topics in BOW-LDA model:")
nlp_u.print_topics(lda, feature_names, n_words)

#NMF
print("\nFitting NMF model")
nmf, countvect, feat2word = nlp_u.df_to_nmf(df, n_topics=5)
feature_names = _vocabulary(countvect)

print("Topics in NMF model:")
nlp_u.print_topics(nmf, feature_names, n_words)


Fitting LDA model (tf features)
Topics in tf-LDA model:
Topic #0: percent said new stock year york company dow rose market index content billion time yahoo high medium higher video average
Topic #1: said video web new site year also medium clip journal york yahoo advertising deal time music service company content last
Topic #2: said percent company new apple year billion yahoo market million would stock quarter also search business time last could one
Topic #3: percent billion said stock web company cash new year fell market would index growth week also york average last high
Topic #4: said percent new yahoo year web company stock billion quarter search million business advertising mobile video york also share system

Fitting LDA model (BOW features)
Topics in BOW-LDA model:
Topic #0: noble energy nook wind solar grid power electric duke smart gas toy oil book bookstore electricity renewable gold project lan
Topic #1: said court patent case commission government information federal la