# Testing the packages

In [26]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(1, '../functions')
import data_utilities as data_u
import dict_utilities as dict_u

#from jupyterthemes import jtplot
#jtplot.style()
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Folder holding the project's data files, relative to the notebook's working directory.
data_folder = '../data/'
# Location of the pickled data dictionary inside the data folder.
dict_dir = os.path.join(data_folder, 'data_dict.pkl')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Testing dict_utilities

In [44]:
import dict_utilities as dict_u

# Read the dictionary currently stored at dict_dir and list the tickers it holds.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers already saved : {data_dict.keys()}")

# WARNING: destructive step - reset_dict is called on the persisted file itself,
# so everything saved so far is expected to be lost (the recorded output shows
# an empty key set afterwards). Re-read the file to confirm the reset took effect.
dict_u.reset_dict(dict_dir)
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers after reset : {data_dict.keys()}")

Tickers already saved : dict_keys(['google', 'exxon', 'total'])
Tickers after reset : dict_keys([])


## Testing data_utilities

Add news from the database to the data dictionary

In [45]:
import data_utilities as data_u

# Parameters of the import: which companies to look for, which news source
# to read, and the column layout passed on to the helper.
search_words = ['Google', 'Exxon']
news_to_read = "Reuters"
format_cols = ["Text", "Author", "Date"]

# Arguments are positional; their order must match add_news_to_dict's signature.
data_u.add_news_to_dict(
    search_words,
    data_folder,
    news_to_read,
    dict_dir,
    format_cols,
)

# Re-read the persisted dictionary so the report reflects what is on disk.
# Tickers are looked up under the lower-cased search word.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers added : {data_dict.keys()}\n")
print(f"Number of news in ticker {search_words[0]} : {len(data_dict[search_words[0].lower()])}")

Tickers added : dict_keys(['google', 'exxon'])

Number of news in ticker Google : 3608


Add tweets from specific Twitter accounts to the data dictionary

In [46]:
# Query parameters for the Twitter import.
date_since = "2020-11-13"   # earliest tweet date requested
nb_items = 100              # presumably the number of tweets fetched per account - TODO confirm
language = "fr"
# NOTE(review): twitter_codes.txt presumably holds the Twitter API credentials - confirm,
# and make sure that file is never committed.
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_ids = ['Google', 'Total']

# Reload from disk instead of relying on a `data_dict` left over from a previous
# cell, so the "before" snapshot is always the persisted state.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers before operation : {data_dict.keys()}\n")

# `format_cols` is defined in the Reuters cell above; that cell must have been run first.
# No backslash is needed: the open parenthesis already continues the line.
data_u.add_tweets_to_dict(
    date_since, nb_items, language, codes,
    format_cols, dict_dir, retweet=False, from_ids=from_ids,
)

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
# Report on a ticker this cell actually updated (keys are the lower-cased names),
# rather than on `search_words` from another cell.
print(f"Number of news in ticker {from_ids[0]} : {len(data_dict[from_ids[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon'])

Tickers now : dict_keys(['google', 'exxon', 'total'])

Number of news in ticker Google : 3708


Add tweets matching the search words, from all of Twitter, to the data dictionary

In [47]:
# Query parameters for the Twitter import.
date_since = "2020-11-13"   # earliest tweet date requested
nb_items = 100              # presumably the number of tweets fetched per search word - TODO confirm
language = "fr"
# NOTE(review): twitter_codes.txt presumably holds the Twitter API credentials - confirm,
# and make sure that file is never committed.
codes = data_u.get_codes(data_folder + "twitter_codes.txt")
from_words = ['Google', 'Facebook']

# Reload from disk instead of relying on a `data_dict` left over from a previous
# cell, so the "before" snapshot is always the persisted state.
data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers before operation : {data_dict.keys()}\n")

# `format_cols` is defined in the Reuters cell above; that cell must have been run first.
# No backslash is needed: the open parenthesis already continues the line.
data_u.add_tweets_to_dict(
    date_since, nb_items, language, codes,
    format_cols, dict_dir, retweet=False, from_words=from_words,
)

data_dict = dict_u.get_dict(dict_dir)
print(f"Tickers now : {data_dict.keys()}\n")
# Report on a ticker this cell actually updated (keys are the lower-cased words),
# rather than on `search_words` from another cell.
print(f"Number of news in ticker {from_words[0]} : {len(data_dict[from_words[0].lower()])}")

Tickers before operation : dict_keys(['google', 'exxon', 'total'])

Tickers now : dict_keys(['google', 'exxon', 'total', 'facebook'])

Number of news in ticker Google : 3808


## Testing nlp_utilities

Transform a company's DataFrame into a bag-of-words (BOW) matrix with TF-IDF weighting

In [27]:
import nlp_utilities as nlp_u

# Load the persisted dictionary and pick the news DataFrame of one company.
data_dict = dict_u.get_dict(dict_dir)
df = data_dict['google']

# df_to_bow returns the document-term matrix, the fitted vectorizer and a
# feature-index -> word mapping (the latter is kept for later inspection).
bow, countvect, feat2word = nlp_u.df_to_bow(df, TFIDF=True)

# Assumes `countvect` is a scikit-learn vectorizer - TODO confirm in nlp_utilities.
# scikit-learn deprecated get_feature_names() in 1.0 and removed it in 1.2, so
# prefer get_feature_names_out() and fall back to the old name otherwise.
if hasattr(countvect, "get_feature_names_out"):
    # The new API returns a NumPy array; convert to plain str so the printed
    # list looks the same as with the legacy method.
    first_words = [str(word) for word in countvect.get_feature_names_out()[0:100]]
else:
    first_words = countvect.get_feature_names()[0:100]

print("Document - words matrix:", bow.shape)
print("First words:", first_words)

Document - words matrix: (4020, 12978)
First words: ['aa', 'aal', 'aba', 'aback', 'abacus', 'abandon', 'abandoned', 'abandonment', 'abate', 'abb', 'abbey', 'abbreviate', 'abdominal', 'abide', 'abiding', 'ability', 'able', 'ably', 'aboard', 'aborted', 'abortion', 'abortive', 'abound', 'abrasive', 'abroad', 'abrupt', 'abruptly', 'absence', 'absent', 'absentee', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', 'abstain', 'abstract', 'abstractly', 'absurd', 'abu', 'abundance', 'abundant', 'abundantly', 'abuse', 'abuser', 'abusive', 'abuzz', 'abysmal', 'abyss', 'academic', 'academy', 'acca', 'accelerate', 'accelerated', 'acceleration', 'accelerator', 'accelerometer', 'accent', 'accept', 'acceptable', 'acceptance', 'accepted', 'access', 'accessible', 'accession', 'accessory', 'accident', 'accidental', 'accidentally', 'acclaim', 'accommodate', 'accommodating', 'accommodation', 'accommodative', 'accompany', 'accomplish', 'accomplished', 'accord', 'accordance', 'accord

Transform a company's DataFrame into a Word2Vec model

In [28]:
# Build the embedding model from the company's news DataFrame.
model = nlp_u.df_to_vec(df)

# Sanity check: list the nearest neighbours of "high" in the embedding space.
# The scores vary from one training run to the next unless df_to_vec fixes a seed.
word_vectors = model.wv
word_vectors.most_similar(positive="high")

[('low', 0.7797523140907288),
 ('hit', 0.7690551280975342),
 ('record', 0.654312789440155),
 ('highest', 0.6522954106330872),
 ('galloping', 0.6289225220680237),
 ('bullion', 0.6248561143875122),
 ('mid', 0.615885317325592),
 ('gold', 0.604141116142273),
 ('heavy', 0.6038429737091064),
 ('level', 0.603442907333374)]