# **Libraries**

In [None]:
    #GENERIC
import matplotlib.pyplot as plt
import seaborn as sns
from random import random
import pandas as pd
import numpy as np
from datetime import datetime
import json
import time
import os
from tqdm import tqdm
tqdm.pandas()
import pickle
from joblib import dump
from joblib import load
from pprint import pprint
    #PRE PROCESSING
import re
import nltk  
import collections
from nltk.corpus import stopwords
    #Embedding
from gensim.models import KeyedVectors
import operator
    #Sentiment Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support, plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
    #Finbert
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
# Change the working directory to the project folder so that the relative data
# paths used by the cells below resolve.
# Fill in PROJECT_DIR; while it is left empty the current working directory is
# kept, because os.chdir("") raises FileNotFoundError.
PROJECT_DIR = r""
if PROJECT_DIR:
    os.chdir(PROJECT_DIR)

# **Labelled Data Exploration**

Two Datasets:
> **FinancialPhraseBank** as used in Malo, P., Sinha, A., Takala, P., Korhonen, P. and Wallenius, J. (2014): “Good debt or bad debt: Detecting semantic orientations in economic texts.” Journal of the American Society for Information Science and Technology https://arxiv.org/abs/1307.5336  |  https://www.kaggle.com/ankurzing/sentiment-analysis-for-financial-news/version/5


> **SemEval2017** Task 5: Fine-Grained Sentiment Analysis on Financial Microblogs and News (Cortis et al., 2017)

## Full Data

In [None]:
# FinancialPhraseBank: CSV read without a header row, ISO-8859-1 encoded.
FinancialPhraseBank = pd.read_csv(
    "0)Headlines_Data_OG/FinancialPhraseBank.csv",
    encoding="ISO-8859-1",
    header=None,
)
display(FinancialPhraseBank.shape, FinancialPhraseBank.head(3))

# SemEval2017 Task 5: the training and trial splits are read from separate JSON files.
SemEval_Training = pd.read_json("0)Headlines_Data_OG/SemEval2017_Trainingdata.json")
display(SemEval_Training.shape, SemEval_Training.head(3))

SemEval_Trial = pd.read_json("0)Headlines_Data_OG/SemEval2017_Trialdata.json")
display(SemEval_Trial.shape, SemEval_Trial.head(3))

 * Individual Analysis of each source of data

In [None]:
    # FinancialPhraseBank Data
    # Count the rows per distinct value of column 0. The frame was read with
    # header=None, so its columns carry integer labels.
FinancialPhraseBank[0].value_counts()

In [None]:
    #SemEval Data
SemEval = pd.concat([SemEval_Training,SemEval_Trial], ignore_index=True)
    # Distribution of sentiment
fig = plt.figure(figsize=(15, 7))
ax1 = fig.add_subplot(1, 1, 1)
SemEval["sentiment"].hist(bins=50, ax=ax1)
ax1.set_xlabel('Sentiment')
ax1.set_ylabel('Sample')
ax1.set_title('Sentiment distribution')
plt.xlim(-1,1)
plt.show()
    # Healines Examples
for sent_value in np.linspace(-1,1,21):
        temp_df = SemEval.iloc[(SemEval['sentiment']-sent_value).abs().argsort()[:1]]
        display(f'Sentiment {round(temp_df.iloc[0]["sentiment"],2)}  | Title: {temp_df.iloc[0]["title"]}')

## Compiled Clean Data

 * Main Data Adjustments

In [None]:
# Main data adjustments
# Work on copies so the raw frames loaded earlier are left untouched.
FinancialPhraseBank_0 = FinancialPhraseBank.copy()
SemEval_0 = SemEval.copy()

# Give the header-less FinancialPhraseBank columns names, and drop the SemEval
# columns that are not used afterwards.
FinancialPhraseBank_0 = FinancialPhraseBank_0.rename(columns={0: "sentiment", 1: "title"})
SemEval_0 = SemEval_0.drop(columns=['id', 'company'])

# Discrete label "Sent": 1 = positive, -1 = negative, 0 = everything else (neutral).
# For SemEval, scores > 0 are positive, scores < 0 negative and the remainder neutral.
phrasebank_label = FinancialPhraseBank_0["sentiment"]
FinancialPhraseBank_0["Sent"] = np.select(
    [phrasebank_label == "positive", phrasebank_label == "negative"],
    [1, -1],
    default=0,
)
semeval_score = SemEval_0["sentiment"]
SemEval_0["Sent"] = np.select([semeval_score > 0, semeval_score < 0], [1, -1], default=0)

# Stack both sources into one frame.
headlines_sent_0 = pd.concat([FinancialPhraseBank_0, SemEval_0], ignore_index=True)

# Duplicates: some SemEval titles refer to several companies and so appear more
# than once. Record how often each title occurs and keep only titles seen once.
title_occurrences = headlines_sent_0.groupby('title')['title'].transform('count')
headlines_sent_0['Duplicate_Title'] = title_occurrences
headlines_sent_1 = headlines_sent_0[headlines_sent_0['Duplicate_Title'] < 2]

In [None]:
# Row counts before de-duplication (repeated titles included) and after keeping
# only the titles that occur exactly once.
print("Duplicated: ", len(headlines_sent_0))
print("NO Duplicated: ", len(headlines_sent_1))

In [None]:
#Save Data
# Pickle the de-duplicated headlines frame, but only when the user answers
# exactly "yes" at the prompt; any other answer skips the write.
save_check = input("Save Data?")
if save_check == "yes":
    with open("1)Transformed_Headlines/IS_Headlines_Data", "wb") as fp:   
        pickle.dump(headlines_sent_1, fp)  

In [None]:
# Quick look at the de-duplicated frame: its shape, its first three rows and the
# number of rows per "Sent" label.
display(headlines_sent_1.shape)
display(headlines_sent_1.head(3))
display(headlines_sent_1["Sent"].value_counts()) #Sentiment Distribution (unbalanced)

In [None]:
# Show the rows whose "Sent" label is 0, i.e. the neutral headlines.
headlines_sent_1[headlines_sent_1["Sent"]==0]

# Functions for generic pre-processing

Useful Links

1) https://dylancastillo.co/nlp-snippets-clean-and-tokenize-text-with-python/#:~:text=Remove%20all%20special%20characters%20and%20punctuation%20In%20cases,import%20re%20sample_text%20%3D%20%22Sample%20text%20123%20%21%21%21%21

2) https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

In [None]:
def text_processing(text, multiexpression_dic=None, stopwords=None, special_char=False, numbers=True, lowercase=False, stemm=False, lemm=False):
    '''Tokenize a string and apply the clean-up steps selected through the arguments.

    Steps run in this fixed order: multi-word expression replacement, tokenization,
    stop-word removal, punctuation removal, digit masking, lowercasing, stemming
    and lemmatisation.

    :parameter
    :param text: string - sentence/corpus to be processed
    :param multiexpression_dic: dict or None - maps a raw multi-word expression to the
        single token that replaces it in the text; None or an empty container skips the step
    :param stopwords: list - tokens to remove (exact, case-sensitive match); None keeps all tokens
    :param special_char: bool - whether the characters . , ; : are stripped from tokens
        (those placed between digits and tokens that are multi-expression replacements are kept)
    :param numbers: bool - whether every digit is replaced by "#"
    :param lowercase: bool - whether words are converted to lowercase or not
    :param stemm: bool - whether stemming (Porter) is to be applied
    :param lemm: bool - whether lemmatisation (WordNet) is to be applied
    :return: list of str - the processed tokens
    '''
    # None (the default) or any empty container means "no multi-word expressions".
    # The previous default, an empty list, failed on .items().
    if not multiexpression_dic:
        multiexpression_dic = {}

    # MultiExpression Words (1st step): done on the raw string, before tokenizing,
    # so that an expression ends up as one token.
    for key, value in multiexpression_dic.items():
        if key in text:
            text = text.replace(key, value)
    # Word Tokenize (2nd step): from here on `text` is a list of tokens.
    text = nltk.word_tokenize(text)
    # StopWord Removal (3rd step)
    if stopwords is not None:
        text = [word for word in text if word not in stopwords]
    # Punctuation removal (except between digits and inside multi-expressions)
    if special_char == True:
        # A set keeps the per-token membership test constant-time even for a very
        # large expression dictionary.
        multiexpressions = set(multiexpression_dic.values())
        text = [word if word in multiexpressions else re.sub(r"(?<!\d)[.,;:](?!\d)", "", word) for word in text]
        # Tokens that consisted only of punctuation are now empty: drop them.
        text = [word for word in text if word != ""]
    # Digit masking: every digit becomes "#"
    if numbers == True:
        text = [re.sub(r'\d', '#', str(word)) for word in text]
    # Conversion to lowercase
    if lowercase == True:
        text = [word.lower() for word in text]
    # Stemming (remove -ing, -ly, ...). `text` is already a token list here, so it
    # is iterated directly instead of being split again.
    if stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        text = [ps.stem(word) for word in text]
    # Lemmatisation (convert the word into root word)
    if lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text]
    return text

# Tables with Performance of Model Data

In [None]:
# Load, reset and/or save the model-performance table. Each step runs only when
# the user answers "yes" (case-insensitive) to its prompt.
## Load the table from the Excel file
load_data = input("Load Table Data?")
if load_data.lower() == "yes":
    ML_table_Full = pd.read_excel("3)Table_Plots/ML_Performance.xlsx", index_col=0)
## Initiate a new, empty ML_table
reset_data = input("Reset Table Data?")
if reset_data.lower() == "yes":
    ML_table_Full = pd.DataFrame([], columns=["ML","Params","Mean Train","Mean Test","Train","Validation"])
## On a fresh kernel with both prompts declined there is no table yet: start
## from an empty one so this cell and the later "Append Data" steps do not fail
## with a NameError.
if "ML_table_Full" not in globals():
    ML_table_Full = pd.DataFrame([], columns=["ML","Params","Mean Train","Mean Test","Train","Validation"])
## Save the table to a date-stamped Excel file
save_data = input("Save the Data?")
if save_data.lower() == "yes":
    print("Saving")
    datestring_time = datetime.strftime(datetime.now(),"%m_%d_%Y")
    ML_table_Full = ML_table_Full.drop_duplicates()
    # The context manager writes and closes the workbook on exit; ExcelWriter.save()
    # is no longer available in current pandas releases.
    with pd.ExcelWriter("3)Table_Plots/"+str(datestring_time) + "__ML_Performance.xlsx", engine='xlsxwriter') as writer:
        ML_table_Full.to_excel(writer, sheet_name='ML')

# Word2Vec Embedding + ML


> **Google word2vec >**  https://code.google.com/archive/p/word2vec/

> **Intuition of Models >**  https://thinkinfi.com/simple-doc2vec-explained/

> **Averaging vectors intuition >**https://stats.stackexchange.com/questions/318882/what-does-average-of-word2vec-vector-mean/318891

> **Averaging vectors papers >** "Sentiment  Analysis  of  Twitter  Messages  using 
Word2vec  by  Weighted  Average" (Kamel, 2019)

> **Pre-Processing Specs >** https://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/

> **Intuition >**https://wiki.pathmind.com/word2vec#:~:text=Word2vec%20is%20a%20two%2Dlayer,deep%20neural%20networks%20can%20understand.

## Specific Functions for embedding Analysis

In [None]:
def embedding_coverage(vocab,model):
    '''Report how much of a vocabulary is covered by a pretrained embedding model.

    Prints the share of distinct words found in the model and the share of all
    word occurrences (count-weighted) found in the model. Both shares are
    reported as 0 when the vocabulary is empty.

    :param vocab: dictionary with count of words in data to be vectorized
    :param model: pretrained model used; must support model[word] and raise
        KeyError for a word it does not contain
    :return: list of (word, count) tuples for the words missing from the model,
        ordered from the highest count to the lowest
    '''
    common_words = {}
    specific_vocab = {}
    n_words_common = 0
    n_specific_vocab = 0
    for word, count in vocab.items():
        try:
            common_words[word] = model[word]
        except KeyError:
            # Only a failed lookup marks the word as missing; any other error
            # (including KeyboardInterrupt) is allowed to propagate.
            specific_vocab[word] = count
            n_specific_vocab += count
        else:
            n_words_common += count

    # Guard both ratios so an empty vocabulary does not divide by zero.
    n_words_total = n_words_common + n_specific_vocab
    vocab_share = len(common_words) / len(vocab) if len(vocab) else 0.0
    text_share = n_words_common / n_words_total if n_words_total else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending sort followed by a reversal, which keeps the original tie order.
    sorted_x = sorted(specific_vocab.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

## Pre-Processing

In [None]:
#Loading headlines
# NOTE: pickle.load can execute code embedded in the file, so only load
# pickles that were produced by this notebook.
with open("1)Transformed_Headlines/IS_Headlines_Data", "rb") as fp:  
    IS_headlines = pickle.load(fp)
#Load Google vocab
# This file is written by the "Save Word2Vec Vocabulary?" step of the
# Vectorization section further down, so that step must have been run once.
with open("PreTrainedModels/Word2Vec_Vocab", "rb") as fp:  
    Vocab_in_google_model = pickle.load(fp)

In [None]:
# Feature extraction
# Tokens that are removed from every headline.
stop_words_google = ["to", "of", "and", "a"]

# Multi-word expressions: vocabulary entries containing "_" whose text before
# and after the first underscore is at least two characters long.
mulitword_expressions_google = [
    vocab
    for vocab in Vocab_in_google_model
    if "_" in vocab
    and len(vocab.split("_", 1)[0]) >= 2
    and len(vocab.split("_", 1)[1]) >= 2
]
# Lookup from the space-separated spelling to the underscore-joined vocabulary entry.
multiexpression_google_dic = {exp.replace("_", " "): exp for exp in mulitword_expressions_google}


def _process_title(row):
    # Tokenize one headline with the settings used for the Google word2vec vocabulary.
    return text_processing(
        row["title"],
        multiexpression_dic=multiexpression_google_dic,
        stopwords=stop_words_google,
        special_char=True,
        numbers=True,
        lowercase=False,
        stemm=False,
        lemm=False,
    )


# Processing: one token list per headline, stored in a new column.
IS_headlines["Processed_Text"] = IS_headlines.progress_apply(_process_title, axis=1)
headline_processed_0 = IS_headlines["Processed_Text"].tolist()

# Save the processed frame only when the user answers exactly "yes".
save_check = input("Save Processed Sentences?")
if save_check == "yes":
    with open("1)Transformed_Headlines/IS_Headlines_Data_+token", "wb") as fp:
        pickle.dump(IS_headlines, fp)

## Vectorization

In [None]:
#GET Processed Headlines
# Read the file that the Pre-Processing step writes ("IS_Headlines_Data_+token").
# The name used here before, "IS_Data_+token", is not written anywhere in the
# notebook, so a fresh run failed with FileNotFoundError.
with open("1)Transformed_Headlines/IS_Headlines_Data_+token", "rb") as fp:  
    IS_headlines = pickle.load(fp)

#ELIMINATE NEUTRAL FOR BINARY MODEL
# Answering "yes" drops every row labelled 0 so only the -1 / 1 classes remain.
model_type = input("USE BINARY MODEL?")
if model_type == "yes":
    IS_headlines = IS_headlines[IS_headlines["Sent"]!=0].copy()

#LIST of Headlines & Vocabulary
headline_processed_0 = list(IS_headlines["Processed_Text"])
# Token -> occurrence count over all processed headlines. This replaces the call
# to build_vocab, which is not defined in the visible part of this notebook.
headline_processed_0_vocab = collections.Counter(
    token for sentence in headline_processed_0 for token in sentence
)

#Check empty lists: show the headlines left with fewer than two tokens
IS_headlines[IS_headlines.apply(lambda x: len(x["Processed_Text"])<2, axis=1)]

In [None]:
#Load google word2vec trained model, saved in desktop
# The file name is resolved against the current working directory and is read
# in the binary word2vec format.
filename = "GoogleNews-vectors-negative300.bin"
google_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
#Check coverage of pretrained model given current vocabulary
# Runs only when the user answers exactly "yes"; displays the words that have no
# embedding, as returned by embedding_coverage.
coverage_check = input("Check coverage of model?")
if coverage_check == "yes":
    word_coverage = embedding_coverage(headline_processed_0_vocab,google_model)
    display(word_coverage)
    
#Get and save google vocab
# The vocabulary list is always rebuilt from the loaded model; it is written to
# disk only when the user answers exactly "yes".
Vocab_in_google_model = list(google_model.index_to_key)
save_check = input("Save Word2Vec Vocabulary?")
if save_check == "yes":
    with open("PreTrainedModels/Word2Vec_Vocab", "wb") as fp:   
        pickle.dump(Vocab_in_google_model, fp)

In [None]:
#Going from word vectors to sentence vectors
# A sentence vector is the element-wise mean of the vectors of its tokens that
# the model knows. Membership is tested against the model's own key index (a
# dict) rather than the Vocab_in_google_model list, which had to be scanned
# from the start for every single token.
known_tokens = google_model.key_to_index
IS_headlines["Vectorized_Text"] = IS_headlines.progress_apply(lambda x: np.mean([google_model[token] for token in x["Processed_Text"] if token in known_tokens], axis=0),axis=1)
# A headline without any known token averages an empty list, which yields a
# float64 NaN instead of an array: take those rows out.
IS_headlines = IS_headlines[IS_headlines.apply(lambda x: type(x["Vectorized_Text"])!=np.float64, axis=1)]
#SAVE sentence vectors (only when the user answers exactly "yes")
save_check = input("Save Vectorized Sentences?")
if save_check == "yes":
    with open("1)Transformed_Headlines/IS_Data_+token_+vector", "wb") as fp:   
        pickle.dump(IS_headlines, fp)  

In [None]:
#Check the model's 10 closest words to the (hard-coded) query word 'rise'
check = input("Check word for model closest?")
if check == "yes":
    sim_words = google_model.most_similar('rise', topn = 10)
    # A bare expression inside an `if` body is not echoed by the notebook, so
    # the result has to be displayed explicitly.
    display(sim_words)

## Sentiment Models (TRAINING)

In [None]:
# Load the vectorized headlines written by the Vectorization step.
with open("1)Transformed_Headlines/IS_Data_+token_+vector", "rb") as fp:
    IS_headlines_processed = pickle.load(fp)

# Hold out 20% of the headlines as the test set; the fixed seed makes the split repeatable.
sentence_vectors = list(IS_headlines_processed["Vectorized_Text"])
sentence_labels = list(IS_headlines_processed["Sent"])
headlines_train, headlines_test, sent_train, sent_test = train_test_split(
    sentence_vectors,
    sentence_labels,
    test_size=0.2,
    random_state=42,
)

### Baseline Model
Based on Majority Class

In [None]:
# Majority baseline
train_label_counts = pd.Series(sent_train).value_counts()
display(train_label_counts)
display(pd.Series(sent_test).value_counts())
# Predict the most frequent training label everywhere. It is derived from the
# data instead of being fixed to 1, so the baseline stays a majority baseline
# when label 1 is not the largest class (e.g. when neutral rows are kept).
majority_class = train_label_counts.idxmax()
y_train_pred_baseline = [majority_class]*len(sent_train)
y_test_pred_baseline = [majority_class]*len(sent_test)

  #Train
print("-----TRAIN-----")
# zero_division=0: classes that are never predicted score 0 without a warning.
p, r, f, s = precision_recall_fscore_support(sent_train, y_train_pred_baseline, average="macro", zero_division=0)
print("Precision: {:.2%}".format(p))
print("Recall: {:.2%}".format(r))
print("F score: {:.2%}".format(f))
  #Test
print("-----TEST-----")
p, r, f, s = precision_recall_fscore_support(sent_test, y_test_pred_baseline, average="macro", zero_division=0)
print("Precision: {:.2%}".format(p))
print("Recall: {:.2%}".format(r))
print("F score: {:.2%}".format(f))

In [None]:
# Score the baseline predictions with macro-averaged metrics on the full train
# and test splits, then append the two F-scores to the performance table.
print("Baseline Model")

print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train, y_train_pred_baseline, average="macro"
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test, y_test_pred_baseline, average="macro"
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# The baseline has no parameters and no cross-validation scores, hence the "-" placeholders.
baseline_row = ["Baseline Model", "-", "-", "-", train_f, test_f]
ML_table = pd.DataFrame(
    [baseline_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### SVM 

In [None]:
# IPython help lookup: shows the documentation of SVC (development aid only).
?SVC

#### Linear 

In [None]:
# Grid search for the linear-kernel SVC: candidate C values and both
# decision-function shapes, scored by macro F1 with 5-fold cross-validation.
LinearSVC = SVC()
param_grid = {
    "C": [2.5, 3, 3.2, 3.5, 3.7, 4],
    "kernel": ["linear"],
    "decision_function_shape": ["ovo", "ovr"],
    "random_state": [42],
}
grid_search_LinearSVC = GridSearchCV(
    LinearSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_LinearSVC.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_LinearSVC.best_estimator_)
print("Validation Score", grid_search_LinearSVC.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_LinearSVC.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_LinearSVC.best_estimator_ to disk with joblib's dump, only
# when the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_LinearSVC.best_estimator_, '2)Sentiment_Models/LinearSVC_2class.joblib')

In [None]:
# Load the stored best linear SVC, cross-validate it on the training split and
# report macro precision / recall / F-score on the full train and test splits.
Best_LinearSVC = load("2)Sentiment_Models/LinearSVC_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_LinearSVC,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Linear SVC")
print("-----TRAIN-----")
train_predictions = Best_LinearSVC.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_LinearSVC.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Linear SVC", Best_LinearSVC, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

#### Poly 

In [None]:
# Grid search for the polynomial-kernel SVC over C, degree and gamma, scored by
# macro F1 with 5-fold cross-validation.
PolySVC = SVC()
param_grid = {
    "C": [4, 4.5],
    "kernel": ["poly"],
    "degree": [2, 3],
    "gamma": ["scale", 0.7, 0.8],
    "decision_function_shape": ["ovr"],
    "random_state": [42],
}
grid_search_PolySVC = GridSearchCV(
    PolySVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_PolySVC.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_PolySVC.best_estimator_)
print("Validation Score", grid_search_PolySVC.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_PolySVC.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_PolySVC.best_estimator_ to disk with joblib's dump, only
# when the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_PolySVC.best_estimator_, '2)Sentiment_Models/PolySVC_2class.joblib')

In [None]:
# Load the stored best polynomial SVC, cross-validate it on the training split
# and report macro precision / recall / F-score on the full train and test splits.
Best_PolySVC = load("2)Sentiment_Models/PolySVC_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_PolySVC,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Poly SVC")
print("-----TRAIN-----")
train_predictions = Best_PolySVC.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_PolySVC.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Poly SVC", Best_PolySVC, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

#### Rbf 

In [None]:
# Grid search for the RBF-kernel SVC: candidate C values and both
# decision-function shapes, scored by macro F1 with 5-fold cross-validation.
RbfSVC = SVC()
param_grid = {
    "C": [3.5, 3.7, 3.8, 3.9, 4],
    "kernel": ["rbf"],
    "decision_function_shape": ["ovo", "ovr"],
    "random_state": [42],
}
grid_search_RbfSVC = GridSearchCV(
    RbfSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_RbfSVC.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_RbfSVC.best_estimator_)
print("Validation Score", grid_search_RbfSVC.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_RbfSVC.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_RbfSVC.best_estimator_ to disk with joblib's dump, only
# when the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_RbfSVC.best_estimator_, '2)Sentiment_Models/RbfSVC_2class.joblib')

In [None]:
# Load the stored best RBF SVC, cross-validate it on the training split and
# report macro precision / recall / F-score on the full train and test splits.
Best_RbfSVC = load("2)Sentiment_Models/RbfSVC_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_RbfSVC,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Rbf SVC")
print("-----TRAIN-----")
train_predictions = Best_RbfSVC.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_RbfSVC.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Rbf SVC", Best_RbfSVC, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

#### Sigmoid 

In [None]:
# Grid search for the sigmoid-kernel SVC: candidate C values and both
# decision-function shapes, scored by macro F1 with 5-fold cross-validation.
# gamma is not part of the grid, so it stays at the estimator's default.
SigmoidSVC = SVC()
param_grid = {
    "C": [0.5, 0.55, 0.6, 0.65, 0.7, 0.8],
    "kernel": ["sigmoid"],
    "decision_function_shape": ["ovo", "ovr"],
    "random_state": [42],
}
grid_search_SigmoidSVC = GridSearchCV(
    SigmoidSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_SigmoidSVC.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_SigmoidSVC.best_estimator_)
print("Validation Score", grid_search_SigmoidSVC.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_SigmoidSVC.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_SigmoidSVC.best_estimator_ to disk with joblib's dump, only
# when the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_SigmoidSVC.best_estimator_, '2)Sentiment_Models/SigmoidSVC_2class.joblib')

In [None]:
# Load the stored best sigmoid SVC, cross-validate it on the training split and
# report macro precision / recall / F-score on the full train and test splits.
Best_SigmoidSVC = load("2)Sentiment_Models/SigmoidSVC_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_SigmoidSVC,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Sigmoid SVC")
print("-----TRAIN-----")
train_predictions = Best_SigmoidSVC.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_SigmoidSVC.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Sigmoid SVC", Best_SigmoidSVC, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Random Forest

In [None]:
# IPython help lookup: shows the documentation of RandomForestClassifier (development aid only).
?RandomForestClassifier

In [None]:
# Grid search for the random forest: two max_features candidates with the other
# settings fixed, scored by macro F1 with 5-fold cross-validation.
RFC = RandomForestClassifier()
param_grid = {
    "n_estimators": [500],
    "max_features": [100, 200],
    "max_depth": [9],
    "min_samples_leaf": [4],
    "random_state": [42],
}
grid_search_RFC = GridSearchCV(
    RFC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_RFC.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_RFC.best_estimator_)
print("Validation Score", grid_search_RFC.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_RFC.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_RFC.best_estimator_ to disk with joblib's dump, only when
# the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_RFC.best_estimator_, '2)Sentiment_Models/RFC_2class.joblib')

In [None]:
# Load the stored best random forest, cross-validate it on the training split
# and report macro precision / recall / F-score on the full train and test splits.
Best_RFC = load("2)Sentiment_Models/RFC_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_RFC,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Random Forest")
print("-----TRAIN-----")
train_predictions = Best_RFC.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_RFC.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Random Forest", Best_RFC, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Logistic Regression

In [None]:
# IPython help lookup: shows the documentation of LogisticRegression (development aid only).
?LogisticRegression

In [None]:
# Grid search for the elastic-net logistic regression (saga solver) over C,
# l1_ratio and max_iter, scored by macro F1 with 5-fold cross-validation.
LR = LogisticRegression()
param_grid = {
    "C": [4, 4.5, 5, 5.5],
    "l1_ratio": [0.4, 0.5, 0.6],
    "penalty": ["elasticnet"],
    "solver": ["saga"],
    "max_iter": [200, 500],
    "random_state": [42],
}
grid_search_LR = GridSearchCV(
    LR,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Time the search; `end` holds the elapsed seconds, not a timestamp.
start = time.time()
grid_search_LR.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Best configuration and its mean cross-validated score.
print("Best estimators", grid_search_LR.best_estimator_)
print("Validation Score", grid_search_LR.best_score_)

# One line per candidate, highest mean validation score first.
cv_summary = grid_search_LR.cv_results_
val_scores = cv_summary["mean_test_score"]
train_scores = cv_summary["mean_train_score"]
params = list(map(str, cv_summary["params"]))
ranked_candidates = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked_candidates:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store model
# Writes grid_search_LR.best_estimator_ to disk with joblib's dump, only when
# the user answers exactly "yes".
save_model_best = input("Save model best params")
if save_model_best == "yes":
    dump(grid_search_LR.best_estimator_, '2)Sentiment_Models/LR_2class.joblib')

In [None]:
# Load the stored best logistic regression, cross-validate it on the training
# split and report macro precision / recall / F-score on the full train and test splits.
Best_LR = load("2)Sentiment_Models/LR_2class.joblib")

# An empty parameter grid: the search evaluates this single configuration only.
gs_best = GridSearchCV(
    Best_LR,
    {},
    cv=5,
    scoring='f1_macro',
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
best_cv = gs_best.cv_results_
grid_search_results = sorted(
    zip(best_cv["mean_test_score"], best_cv["mean_train_score"], map(str, best_cv["params"])),
    reverse=True,
)
val_score, train_score = grid_search_results[0][:2]

print("Logistic Regression")
print("-----TRAIN-----")
train_predictions = Best_LR.predict(headlines_train)
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, train_predictions, average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_predictions = Best_LR.predict(headlines_test)
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, test_predictions, average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append this model's scores to the performance table.
table_row = ["Logistic Regression", Best_LR, train_score, val_score, train_f, test_f]
ML_table = pd.DataFrame(
    [table_row],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Naive Bayes

In [None]:
?GaussianNB()

In [None]:
# Gaussian Naive Bayes; nothing is tuned, so the grid is empty (one candidate).
NB = GaussianNB()
param_grid = {}
# 5-fold cross-validation scored by macro F1; a failing fit raises instead of being scored.
grid_search_NB = GridSearchCV(
    NB,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Wall-clock duration of the whole search.
start = time.time()
grid_search_NB.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_NB.best_estimator_)
print("Validation Score", grid_search_NB.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_NB.cv_results_["mean_test_score"]
train_scores = grid_search_NB.cv_results_["mean_train_score"]
params = list(map(str, grid_search_NB.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the fitted estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_NB.best_estimator_, '2)Sentiment_Models/NB_2class.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_NB = load("2)Sentiment_Models/NB_2class.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_NB,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("Naive Bayes")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train, Best_NB.predict(headlines_train), average="macro"
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test, Best_NB.predict(headlines_test), average="macro"
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Naive Bayes", Best_NB, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Neural Networks

In [None]:
?MLPClassifier

In [None]:
# Multi-layer perceptron classifier.
NN = MLPClassifier()
param_grid = dict(
    hidden_layer_sizes=[(50, 50, 50), (50, 100, 50), (100,)],
    activation=["relu"],
    alpha=[0.06, 0.07],
    # Alternatives noted by the author: "constant", "invscaling", "adaptive".
    # NOTE(review): scikit-learn documents learning_rate as used only with solver="sgd",
    # and no solver is set here - confirm this entry has the intended effect.
    learning_rate=["invscaling"],
    max_iter=[1000],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_NN = GridSearchCV(
    NN,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Wall-clock duration of the whole search.
start = time.time()
grid_search_NN.fit(headlines_train, sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_NN.best_estimator_)
print("Validation Score", grid_search_NN.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_NN.cv_results_["mean_test_score"]
train_scores = grid_search_NN.cv_results_["mean_train_score"]
params = list(map(str, grid_search_NN.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_NN.best_estimator_, '2)Sentiment_Models/NN_2class.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_NN = load("2)Sentiment_Models/NN_2class.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_NN,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(headlines_train, sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("Neural Networks")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train, Best_NN.predict(headlines_train), average="macro"
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test, Best_NN.predict(headlines_test), average="macro"
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Neural Networks", Best_NN, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

# FinBert

Model implementation references:
> **Financial News Sentiment Analysis using FinBERT  >** https://medium.com/@ravirajshinde2000/financial-news-sentiment-analysis-using-finbert-25afcc95e65f

> **SentenceTransformers Documentation  >** https://www.sbert.net/

> **Training Overview  >** https://www.sbert.net/docs/training/overview.html/

> **How to Train BERT  >** https://towardsdatascience.com/how-to-train-bert-aaad00533168

> **Richer Sentence Embeddings using Sentence-BERT — Part I>** https://medium.com/genei-technology/richer-sentence-embeddings-using-sentence-bert-part-i-ce1d9e0b1343

## Specific Functions for embedding Analysis

In [None]:
def FinBERT_tokenization_outputlayer(sentence):
    '''Run a sentence through FinBERT and return its raw output layer.

    Uses the notebook-level ``tokenizer`` and ``finbert`` objects, which must
    be loaded before this function is called.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    numpy.ndarray or None
        First element of the model output, detached and converted to NumPy.
        Each row holds the three class scores in the order
        {0:'neutral', 1:'positive', 2:'negative'}.
        ``None`` when tokenization or the forward pass fails; the error is
        printed and not re-raised, so callers must handle ``None`` entries.
    '''
    try:
        # Tokenize. Truncate explicitly: without it an over-long sentence only
        # fails later inside the model (BERT position embeddings stop at 512 tokens).
        inputs = tokenizer(sentence, return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        # Output layer: index 0 of the model output holds the class scores.
        outputs = finbert(**inputs)[0]
        array_output = outputs.detach().numpy()
        return array_output
    except Exception as ex:
        # Best effort: report the failing sentence and keep the batch going.
        print("Error {} for sentence >  {}".format(ex,sentence))
        return None

In [None]:
# Pre-trained FinBERT tone checkpoint: tokenizer first, then the 3-label classification model.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)

In [None]:
#?tokenizer
#pprint(vars(tokenizer))

## Initiating Models data

In [None]:
#Loading Processed Headlines
# NOTE: pickle can execute arbitrary code on load - only open files produced by this project.
with open("1)Transformed_Headlines/IS_Data_Finbert", "rb") as fp:  
    IS_headlines_processed = pickle.load(fp)

# Optionally recompute the FinBERT output layer for every title and cache the result on disk.
resave_output = input("RESave Finbert Output?")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if resave_output.strip().lower() == "yes":
    IS_headlines_processed["Finbert_output"] = IS_headlines_processed.progress_apply(lambda x: FinBERT_tokenization_outputlayer(x["title"]), axis=1)
    with open("1)Transformed_Headlines/IS_Data_Complete", "wb") as fp:   
        pickle.dump(IS_headlines_processed, fp)  
elif "Finbert_output" not in IS_headlines_processed.columns:
    # The recompute was skipped and the base file carries no FinBERT column:
    # fall back to the cached result written above on an earlier run, instead
    # of failing with a KeyError in the split below.
    with open("1)Transformed_Headlines/IS_Data_Complete", "rb") as fp:
        IS_headlines_processed = pickle.load(fp)

# Split the dataset into training and test sets (80/20, fixed seed for reproducibility)
headlines_train, headlines_test, sent_train, sent_test = train_test_split(list(IS_headlines_processed["Finbert_output"]), list(IS_headlines_processed["Sent"]), 
                                                          test_size=0.2, random_state=42)

## Untouched Model

In [None]:
# Score FinBERT's own output layer directly, without any extra classifier on top.
print("fINBERT ")
# h[0][1:] drops the first (neutral) score and keeps [positive, negative];
# argmax position 0 therefore maps to label 1 and position 1 to label -1.
labels = {0:1,1:-1}

print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train,
    [labels[np.argmax(h[0][1:])] for h in headlines_train],
    average="macro",
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))

print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test,
    [labels[np.argmax(h[0][1:])] for h in headlines_test],
    average="macro",
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add the row to the running comparison table; there are no tuned params or CV scores here.
ML_table = pd.DataFrame(
    [["fINBERT", "-", "-", "-", train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

## Finbert + ML

### Extra Layers (SVM Linear)

In [None]:
# Support vector classifier; the grid restricts it to the linear kernel.
LinearSVC = SVC()
param_grid = dict(
    C=[3.7, 4, 4.1, 4.2, 4.3],
    kernel=["linear"],
    # NOTE(review): scikit-learn documents decision_function_shape as ignored for
    # binary problems - confirm both values are worth searching.
    decision_function_shape=["ovo", "ovr"],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_LinearSVC = GridSearchCV(
    LinearSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_LinearSVC.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_LinearSVC.best_estimator_)
print("Validation Score", grid_search_LinearSVC.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_LinearSVC.cv_results_["mean_test_score"]
train_scores = grid_search_LinearSVC.cv_results_["mean_train_score"]
params = list(map(str, grid_search_LinearSVC.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_LinearSVC.best_estimator_, '2)Sentiment_Models/LinearSVC_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_LinearSVC_Finbert = load("2)Sentiment_Models/LinearSVC_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_LinearSVC_Finbert,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("Linear SVC")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train,
    Best_LinearSVC_Finbert.predict([list(h[0]) for h in headlines_train]),
    average="macro",
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test,
    Best_LinearSVC_Finbert.predict([list(h[0]) for h in headlines_test]),
    average="macro",
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Finbert Linear SVC", Best_LinearSVC_Finbert, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Extra Layers (SVM Poly)

In [None]:
# Support vector classifier; the grid restricts it to a degree-3 polynomial kernel.
PolySVC = SVC()
param_grid = dict(
    C=[0.3, 0.4, 0.5, 0.6],
    kernel=["poly"],
    degree=[3],
    gamma=["auto"],
    decision_function_shape=["ovr"],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_PolySVC = GridSearchCV(
    PolySVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_PolySVC.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_PolySVC.best_estimator_)
print("Validation Score", grid_search_PolySVC.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_PolySVC.cv_results_["mean_test_score"]
train_scores = grid_search_PolySVC.cv_results_["mean_train_score"]
params = list(map(str, grid_search_PolySVC.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_PolySVC.best_estimator_, '2)Sentiment_Models/PolySVC_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_PolySVC_Finbert = load("2)Sentiment_Models/PolySVC_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(Best_PolySVC_Finbert, {}, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True,
                           error_score= "raise") 
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(zip(gs_best.cv_results_["mean_test_score"], gs_best.cv_results_["mean_train_score"], [str(x) for x in gs_best.cv_results_["params"]]), reverse=True)
# Top row holds (mean validation score, mean train score, params).
val_score = grid_search_results[0][0]
train_score = grid_search_results[0][1]
# Macro-averaged metrics of the loaded estimator on the full train and test splits.
# Heading fixed: this cell evaluates the polynomial-kernel model, not the linear one.
print("Poly SVC")
# Train split
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, Best_PolySVC_Finbert.predict([list(h[0]) for h in headlines_train]), average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
# Test split
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, Best_PolySVC_Finbert.predict([list(h[0]) for h in headlines_test]), average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))
# Add this model's row to the running comparison table.
ML_table = pd.DataFrame([["Finbert Poly SVC",Best_PolySVC_Finbert,train_score,val_score,train_f,test_f]], columns=["ML","Params","Mean Train","Mean Test","Train","Validation"])
ML_table_Full = pd.concat([ML_table_Full,ML_table])

### Extra Layers (SVM Rbf)

In [None]:
# Support vector classifier; the grid restricts it to the RBF kernel.
RbfSVC = SVC()
param_grid = dict(
    C=[3, 3.1, 3.2, 3.5, 3.7],
    kernel=["rbf"],
    # NOTE(review): scikit-learn documents decision_function_shape as ignored for
    # binary problems - confirm both values are worth searching.
    decision_function_shape=["ovo", "ovr"],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_RbfSVC = GridSearchCV(
    RbfSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_RbfSVC.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_RbfSVC.best_estimator_)
print("Validation Score", grid_search_RbfSVC.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_RbfSVC.cv_results_["mean_test_score"]
train_scores = grid_search_RbfSVC.cv_results_["mean_train_score"]
params = list(map(str, grid_search_RbfSVC.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_RbfSVC.best_estimator_, '2)Sentiment_Models/RbfSVC_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_RbfSVC_Finbert = load("2)Sentiment_Models/RbfSVC_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(Best_RbfSVC_Finbert, {}, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True,
                           error_score= "raise") 
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(zip(gs_best.cv_results_["mean_test_score"], gs_best.cv_results_["mean_train_score"], [str(x) for x in gs_best.cv_results_["params"]]), reverse=True)
# Top row holds (mean validation score, mean train score, params).
val_score = grid_search_results[0][0]
train_score = grid_search_results[0][1]
# Macro-averaged metrics of the loaded estimator on the full train and test splits.
# Heading fixed: this cell evaluates the RBF-kernel model, not the linear one.
print("Rbf SVC")
# Train split
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, Best_RbfSVC_Finbert.predict([list(h[0]) for h in headlines_train]), average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
# Test split
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, Best_RbfSVC_Finbert.predict([list(h[0]) for h in headlines_test]), average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))
# Add this model's row to the running comparison table.
ML_table = pd.DataFrame([["Finbert Rbf SVC",Best_RbfSVC_Finbert,train_score,val_score,train_f,test_f]], columns=["ML","Params","Mean Train","Mean Test","Train","Validation"])
ML_table_Full = pd.concat([ML_table_Full,ML_table])

### Extra Layers (SVM Sigmoid)

In [None]:
# Support vector classifier; the grid restricts it to the sigmoid kernel.
SigmoidSVC = SVC()
param_grid = dict(
    C=[0.5, 0.55, 0.6, 0.65, 0.7, 0.8],
    kernel=["sigmoid"],
    gamma=["scale", "auto", 0.1],
    # NOTE(review): scikit-learn documents decision_function_shape as ignored for
    # binary problems - confirm both values are worth searching.
    decision_function_shape=["ovo", "ovr"],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_SigmoidSVC = GridSearchCV(
    SigmoidSVC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_SigmoidSVC.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_SigmoidSVC.best_estimator_)
print("Validation Score", grid_search_SigmoidSVC.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_SigmoidSVC.cv_results_["mean_test_score"]
train_scores = grid_search_SigmoidSVC.cv_results_["mean_train_score"]
params = list(map(str, grid_search_SigmoidSVC.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_SigmoidSVC.best_estimator_, '2)Sentiment_Models/Sigmoid_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_Sigmoid_Finbert = load("2)Sentiment_Models/Sigmoid_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(Best_Sigmoid_Finbert, {}, cv=5,
                           scoring='f1_macro', 
                           return_train_score=True,
                           error_score= "raise") 
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(zip(gs_best.cv_results_["mean_test_score"], gs_best.cv_results_["mean_train_score"], [str(x) for x in gs_best.cv_results_["params"]]), reverse=True)
# Top row holds (mean validation score, mean train score, params).
val_score = grid_search_results[0][0]
train_score = grid_search_results[0][1]
# Macro-averaged metrics of the loaded estimator on the full train and test splits.
# Heading fixed: this cell evaluates the sigmoid-kernel model, not the linear one.
print("Sigmoid SVC")
# Train split
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(sent_train, Best_Sigmoid_Finbert.predict([list(h[0]) for h in headlines_train]), average="macro")
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
# Test split
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(sent_test, Best_Sigmoid_Finbert.predict([list(h[0]) for h in headlines_test]), average="macro")
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))
# Add this model's row to the running comparison table.
ML_table = pd.DataFrame([["Finbert Sigmoid SVC",Best_Sigmoid_Finbert,train_score,val_score,train_f,test_f]], columns=["ML","Params","Mean Train","Mean Test","Train","Validation"])
ML_table_Full = pd.concat([ML_table_Full,ML_table])

### Extra Layers (Neural Network)

In [None]:
# Small multi-layer perceptron stacked on the FinBERT outputs.
NN = MLPClassifier()
param_grid = dict(
    hidden_layer_sizes=[(3,), (5,), (4,)],
    activation=["relu"],
    alpha=[0.8, 1, 1.2],
    # Alternatives noted by the author: "constant", "invscaling", "adaptive".
    # NOTE(review): scikit-learn documents learning_rate as used only with solver="sgd",
    # and no solver is set here - confirm this entry has the intended effect.
    learning_rate=["invscaling"],
    max_iter=[1000],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_NN = GridSearchCV(
    NN,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_NN.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_NN.best_estimator_)
print("Validation Score", grid_search_NN.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_NN.cv_results_["mean_test_score"]
train_scores = grid_search_NN.cv_results_["mean_train_score"]
params = list(map(str, grid_search_NN.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_NN.best_estimator_, '2)Sentiment_Models/NN_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_NN_Finbert = load("2)Sentiment_Models/NN_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_NN_Finbert,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("Neural Networks")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train,
    Best_NN_Finbert.predict([list(h[0]) for h in headlines_train]),
    average="macro",
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test,
    Best_NN_Finbert.predict([list(h[0]) for h in headlines_test]),
    average="macro",
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Finbert Neural Networks", Best_NN_Finbert, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Extra Layers (Logistic Regression)

In [None]:
# Unpenalised logistic regression stacked on the FinBERT outputs.
LR = LogisticRegression()
param_grid = dict(
    # Disabled entries kept from the elastic-net search:
    #   C=[4, 4.5, 5, 5.5]
    #   l1_ratio=[0.4, 0.5, 0.6]
    # NOTE(review): newer scikit-learn releases expect penalty=None rather than the
    # string "none" - confirm against the installed version.
    penalty=["none"],  # alternatives noted by the author: "elasticnet", "l2"
    solver=["sag", "lbfgs", "newton-cg"],  # "saga" is not searched here
    max_iter=[200, 500],
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_LR = GridSearchCV(
    LR,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_LR.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_LR.best_estimator_)
print("Validation Score", grid_search_LR.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_LR.cv_results_["mean_test_score"]
train_scores = grid_search_LR.cv_results_["mean_train_score"]
params = list(map(str, grid_search_LR.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Sanity check on one training sample. Uses the freshly tuned estimator because
# Best_LR_Finbert is only loaded in a later cell, so referencing it here fails on a clean run.
grid_search_LR.best_estimator_.predict(headlines_train[0][0].reshape(1, -1))

In [None]:
# Class labels seen by the freshly tuned estimator. Best_LR_Finbert is only loaded
# in a later cell, so referencing it here fails on a clean run.
grid_search_LR.best_estimator_.classes_

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_LR.best_estimator_, '2)Sentiment_Models/LR_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_LR_Finbert = load("2)Sentiment_Models/LR_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_LR_Finbert,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("Logistic Regression")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train,
    Best_LR_Finbert.predict([list(h[0]) for h in headlines_train]),
    average="macro",
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test,
    Best_LR_Finbert.predict([list(h[0]) for h in headlines_test]),
    average="macro",
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Finbert Logistic Regression", Best_LR_Finbert, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Extra Layers (Random Forest)

In [None]:
# Random forest stacked on the FinBERT outputs; the grid holds a single configuration.
RFC = RandomForestClassifier()
param_grid = dict(
    n_estimators=[500],
    max_features=[3],
    max_depth=[3],
    # Disabled entries kept from the original grid:
    #   min_samples_leaf=[4]
    #   max_features=["auto", "sqrt"]
    random_state=[42],
)
# 5-fold search scored by macro F1; a failing fit raises instead of being scored.
grid_search_RFC = GridSearchCV(
    RFC,
    param_grid,
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)

# Features: the first row of each stored FinBERT output. Time the whole search.
start = time.time()
grid_search_RFC.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Selected configuration and its mean cross-validated score.
print("Best estimators", grid_search_RFC.best_estimator_)
print("Validation Score", grid_search_RFC.best_score_)

# Every candidate, highest mean validation score first.
val_scores = grid_search_RFC.cv_results_["mean_test_score"]
train_scores = grid_search_RFC.cv_results_["mean_train_score"]
params = list(map(str, grid_search_RFC.cv_results_["params"]))
for val_score, train_score, param in sorted(zip(val_scores, train_scores, params), reverse=True):
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Persist the tuned estimator only when the user explicitly confirms.
save_model_best = input("Save model best params")
# Normalise the reply so "Yes" or " yes " is not silently treated as a refusal.
if save_model_best.strip().lower() == "yes":
    dump(grid_search_RFC.best_estimator_, '2)Sentiment_Models/RFC_2class_Finbert.joblib')

In [None]:
# Reload the persisted estimator so the figures below describe the saved model.
Best_RFC_Finbert = load("2)Sentiment_Models/RFC_2class_Finbert.joblib")
# Empty grid -> a single candidate, i.e. a 5-fold cross-validation of this one configuration.
gs_best = GridSearchCV(
    Best_RFC_Finbert,
    {},
    cv=5,
    scoring="f1_macro",
    return_train_score=True,
    error_score="raise",
)
# Features: the first row of each stored FinBERT output.
gs_best.fit([list(h[0]) for h in headlines_train], sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        map(str, gs_best.cv_results_["params"]),
    ),
    reverse=True,
)
# Top row holds (mean validation score, mean train score, params).
val_score, train_score = grid_search_results[0][:2]

# Macro-averaged metrics of the loaded estimator on the full train and test splits.
print("RFC")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train,
    Best_RFC_Finbert.predict([list(h[0]) for h in headlines_train]),
    average="macro",
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test,
    Best_RFC_Finbert.predict([list(h[0]) for h in headlines_test]),
    average="macro",
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Add this model's row to the running comparison table.
ML_table = pd.DataFrame(
    [["Finbert RFC", Best_RFC_Finbert, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

### Extra Layers (Naive Bayes)

In [None]:
# Gaussian Naive Bayes classifier fitted on the FinBERT outputs of the headlines.
NB = GaussianNB()
# No hyper-parameters are tuned: the empty grid yields a single candidate, so
# the search below is a plain 5-fold cross-validation of the defaults.
param_grid = {}
grid_search_NB = GridSearchCV(
    estimator=NB,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=5,
    return_train_score=True,
    error_score="raise",
)

# Time the search. `end` holds the elapsed wall-clock seconds, not a timestamp.
start = time.time()
grid_search_NB.fit([list(h[0]) for h in headlines_train], sent_train)
end = time.time() - start
print(f"Took {end} seconds")

# Winning estimator and its mean cross-validated score.
print("Best estimators", grid_search_NB.best_estimator_)
print("Validation Score", grid_search_NB.best_score_)

# Every candidate, best validation score first.
cv_results = grid_search_NB.cv_results_
val_scores = cv_results["mean_test_score"]
train_scores = cv_results["mean_train_score"]
params = list(map(str, cv_results["params"]))
ranked = sorted(zip(val_scores, train_scores, params), reverse=True)
for val_score, train_score, param in ranked:
    print('Val:{:.2%};  Train:{:.2%}; Param:{};'.format(val_score, train_score, param))

In [None]:
# Store the best Naive Bayes estimator found by the grid search, but only
# after an explicit confirmation. The answer is normalised (surrounding
# whitespace removed, lower-cased) so that "Yes", " yes" or "y" are not
# silently ignored.
save_model_best = input("Save model best params").strip().lower()
if save_model_best in ("yes", "y"):
    dump(grid_search_NB.best_estimator_, '2)Sentiment_Models/NB_2class_Finbert.joblib')

In [None]:
# Reload the stored Naive Bayes model and report cross-validated scores plus
# metrics on the full train split and on the test split.
Best_NB_Finbert = load("2)Sentiment_Models/NB_2class_Finbert.joblib")

# Feature matrices: element [0] of every FinBERT output, as a plain list.
nb_train_features = [list(h[0]) for h in headlines_train]
nb_test_features = [list(h[0]) for h in headlines_test]

# An empty grid makes GridSearchCV act as a plain 5-fold cross-validation of
# the loaded estimator's parameters; it is only used for the mean scores.
gs_best = GridSearchCV(
    estimator=Best_NB_Finbert,
    param_grid={},
    scoring="f1_macro",
    cv=5,
    return_train_score=True,
    error_score="raise",
)
gs_best.fit(nb_train_features, sent_train)
grid_search_results = sorted(
    zip(
        gs_best.cv_results_["mean_test_score"],
        gs_best.cv_results_["mean_train_score"],
        [str(x) for x in gs_best.cv_results_["params"]],
    ),
    reverse=True,
)
# Best (here: only) candidate -> (mean validation score, mean train score).
val_score, train_score = grid_search_results[0][:2]

# Metrics of the loaded estimator itself (presumably already fitted when it
# was saved -- it is not refitted in this cell).
print("Naive Bayes")
print("-----TRAIN-----")
train_p, train_r, train_f, train_s = precision_recall_fscore_support(
    sent_train, Best_NB_Finbert.predict(nb_train_features), average="macro"
)
print("Precision: {:.2%}".format(train_p))
print("Recall: {:.2%}".format(train_r))
print("F score: {:.2%}".format(train_f))
print("-----TEST-----")
test_p, test_r, test_f, test_s = precision_recall_fscore_support(
    sent_test, Best_NB_Finbert.predict(nb_test_features), average="macro"
)
print("Precision: {:.2%}".format(test_p))
print("Recall: {:.2%}".format(test_r))
print("F score: {:.2%}".format(test_f))

# Append one summary row to the comparison table. The "Validation" column
# receives the test-split F score. Re-running this cell appends the row again.
ML_table = pd.DataFrame(
    [["Finbert Naive Bayes", Best_NB_Finbert, train_score, val_score, train_f, test_f]],
    columns=["ML", "Params", "Mean Train", "Mean Test", "Train", "Validation"],
)
ML_table_Full = pd.concat([ML_table_Full, ML_table])

# Top Models 
Further Tests for Decision on Close Models

> FinBERT with **SVM (rbf)** Vs FinBERT with **NN**

## Auxiliary Functions & Loading Models/Data

In [None]:
# Load the headlines together with their FinBERT outputs.
# NOTE: pickle.load executes arbitrary code from the file -- only open pickles
# produced by this project.
with open("1)Transformed_Headlines/IS_Data_Finbert", "rb") as pickle_file:
    IS_headlines_complete = pickle.load(pickle_file)

# Features (FinBERT outputs) and labels as plain Python lists.
headlines_list = list(IS_headlines_complete["Finbert_output"])
sent_list = list(IS_headlines_complete["Sent"])

# 80/20 train/test split with a fixed seed.
headlines_train, headlines_test, sent_train, sent_test = train_test_split(
    headlines_list,
    sent_list,
    test_size=0.2,
    random_state=42,
)

## Extra Cross Validation & Comparison

In [None]:
# Show the stored models. os.listdir already returns a list, but in arbitrary
# order, so sort it to get the same listing on every run.
sorted(os.listdir("2)Sentiment_Models"))

### Model 1

In [None]:
# Model 1: 5-fold cross-validation of the stored RBF-kernel SVC on the whole
# labelled dataset (train and test together).
# NOTE: the variable name says "LinearSVC", but the file loaded is
# RbfSVC_2class_wProb_Finbert.joblib; the name is kept unchanged in case other
# cells refer to it.
Best_LinearSVC_Finbert = load("2)Sentiment_Models/RbfSVC_2class_wProb_Finbert.joblib")
# An empty grid makes GridSearchCV act as a plain cross-validation of the
# loaded estimator's parameters.
gs_best = GridSearchCV(Best_LinearSVC_Finbert, {}, cv=5,
                       scoring='f1_macro',
                       return_train_score=True,
                       error_score="raise")
gs_best.fit([list(h[0]) for h in headlines_list], sent_list)

# Mean scores over the folds. The label now names the model that is actually
# evaluated (it used to read "Linear SVC").
print("SVM (rbf)")
test_scores = gs_best.cv_results_["mean_test_score"]
train_scores = gs_best.cv_results_["mean_train_score"]
print("Test Scores: {}".format(test_scores))
print("Train Scores: {}".format(train_scores))

# Per-fold scores, one entry per split. The fold count is read from the fitted
# search so it cannot drift from the `cv` argument above.
test_splits_M1 = [gs_best.cv_results_["split" + str(i) + "_test_score"] for i in range(gs_best.n_splits_)]
train_splits_M1 = [gs_best.cv_results_["split" + str(i) + "_train_score"] for i in range(gs_best.n_splits_)]

### Model 2

In [None]:
# Model 2: 5-fold cross-validation of the stored neural-network model on the
# whole labelled dataset (train and test together).
# NOTE: the variable name says "Logistic", but the file loaded is
# NN_2class_Finbert.joblib; the name is kept unchanged in case other cells
# refer to it.
Best_Logistic_Finbert = load("2)Sentiment_Models/NN_2class_Finbert.joblib")
# An empty grid makes GridSearchCV act as a plain cross-validation of the
# loaded estimator's parameters.
gs_best = GridSearchCV(Best_Logistic_Finbert, {}, cv=5,
                       scoring='f1_macro',
                       return_train_score=True,
                       error_score="raise")
gs_best.fit([list(h[0]) for h in headlines_list], sent_list)

# Mean scores over the folds. The label now names the model that is actually
# evaluated (it used to read "Linear SVC", copied from the Model 1 cell).
print("NN")
test_scores = gs_best.cv_results_["mean_test_score"]
train_scores = gs_best.cv_results_["mean_train_score"]
print("Test Scores: {}".format(test_scores))
print("Train Scores: {}".format(train_scores))

# Per-fold scores, one entry per split. The fold count is read from the fitted
# search so it cannot drift from the `cv` argument above.
test_splits_M2 = [gs_best.cv_results_["split" + str(i) + "_test_score"] for i in range(gs_best.n_splits_)]
train_splits_M2 = [gs_best.cv_results_["split" + str(i) + "_train_score"] for i in range(gs_best.n_splits_)]

## Chosen Model

In [None]:
# Load the chosen model (RBF SVC) and score it on the test split.
Best_model = load("2)Sentiment_Models/RbfSVC_2class_wProb_Finbert.joblib")

# Predict once and reuse the result for every metric below.
test_features = [list(h[0]) for h in headlines_test]
test_predictions = Best_model.predict(test_features)

# average=None returns one value per class instead of a macro average.
p, r, f, s = precision_recall_fscore_support(sent_test, test_predictions, average=None)
print("Precision: {}".format(p))
print("Recall: {}".format(r))
print("F score: {}".format(f))
print("Instances {}".format(s))
print("Accuracy {}".format(accuracy_score(sent_test, test_predictions)))

In [None]:
# Confusion matrix of the chosen model on the test split, normalised over the
# true labels (each row sums to 100%) and rendered as percentages.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions ConfusionMatrixDisplay.from_estimator is the replacement (this also
# requires changing the import cell).
test_features = [list(h[0]) for h in headlines_test]
conf_matrix = plot_confusion_matrix(
    Best_model,
    test_features,
    sent_test,
    normalize='true',
    values_format=".1%",
    cmap=plt.cm.Greys,
)

# To export the figure: conf_matrix.figure_.savefig('conf_mat.png', dpi=300)

In [None]:
# Row-normalised confusion matrix of the chosen model on the test split,
# drawn as an annotated seaborn heatmap and saved to disk.
test_predictions = Best_model.predict([list(h[0]) for h in headlines_test])
conf_m = confusion_matrix(sent_test, test_predictions, normalize='true')

ax = plt.subplot()
# annot=True writes the value into every cell; fmt='.2%' shows it as a percentage.
sns.heatmap(conf_m, annot=True, fmt='.2%', ax=ax, cmap='Blues_r')

# Axis titles and class names.
# NOTE(review): assumes the labels are -1/1 so that the sorted label order is
# negative, then positive -- confirm against the "Sent" column.
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
class_names = ['negative', 'positive']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

ax.figure.savefig('svm_conf.png', transparent=True, dpi=400)

# Processed data Stats

* Overview of data distribution

In [None]:
# Load the fully processed headlines.
# NOTE: pickle.load executes arbitrary code from the file -- only open pickles
# produced by this project.
with open("1)Transformed_Headlines/IS_Data_Complete", "rb") as pickle_file:
    IS_headlines_processed = pickle.load(pickle_file)
# Quick look at the available columns and the first three rows.
display(IS_headlines_processed.columns, IS_headlines_processed.head(3))

In [None]:
def _describe_labels(banner, labels):
    """Print the size and class balance of ``labels`` under ``banner``.

    :param banner: header line printed before the statistics
    :param labels: sized iterable of sentiment labels (1 / -1)
    :return: tuple ``(number of labels, Counter of label values)``
    """
    print(banner)
    n_labels = len(labels)
    print("Size: ", n_labels)
    tally = collections.Counter(labels)
    print("Value Count: ", tally)
    print("Value Percentage >  [1]:{:.0%} , [-1]:{:.0%}".format(tally[1] / n_labels, tally[-1] / n_labels))
    return n_labels, tally


# 80/20 split with a fixed seed.
# NOTE: this rebinds headlines_train / headlines_test / sent_train / sent_test,
# which previously held the split of the FinBERT outputs; from here on they
# hold the "Processed_Text" entries.
headlines_train, headlines_test, sent_train, sent_test = train_test_split(
    list(IS_headlines_processed["Processed_Text"]),
    list(IS_headlines_processed["Sent"]),
    test_size=0.2,
    random_state=42,
)

# Class distribution of the full dataset and of each split.
size, counts = _describe_labels("----- Full -----", IS_headlines_processed["Sent"])
size_train, train_counts = _describe_labels("----- Train -----", sent_train)
size_test, test_counts = _describe_labels("----- Test -----", sent_test)

* Headlines word analysis

In [None]:
def build_vocab(headlines):
    """Return the number of occurrences of each word in the full dataset.

    :param headlines: iterable of headlines, each an iterable of words
    :return: dictionary mapping every word to its count, in order of first
        appearance
    """
    vocab = {}
    for sentence in headlines:
        for word in sentence:
            # dict.get supplies 0 for unseen words, so no exception handling
            # is needed to start a new counter.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab

In [None]:
# Headline size statistics over the whole dataset.
# len(headline) counts the items of each "Processed_Text" entry (presumably a
# list of words); joining the items first gives the number of characters.
word_counts = [len(headline) for headline in IS_headlines_processed["Processed_Text"]]
char_counts = [len(''.join(headline)) for headline in IS_headlines_processed["Processed_Text"]]
print('Average word length of Headlines is {0:.0f}.'.format(np.mean(word_counts)))
print('Std Dev word length of Headlines is {0:.1f}.'.format(np.std(word_counts)))
print('Max word length of Headlines is {0:.0f}.'.format(np.max(word_counts)))
print('Average character length of Headlines is {0:.0f}.'.format(np.mean(char_counts)))

In [None]:
# Headline size statistics for a single class: only rows with Sent == 1.
temp = IS_headlines_processed.loc[IS_headlines_processed["Sent"] == 1]
# len(headline) counts the items of each "Processed_Text" entry (presumably a
# list of words); joining the items first gives the number of characters.
class_word_counts = [len(headline) for headline in temp["Processed_Text"]]
class_char_counts = [len(''.join(headline)) for headline in temp["Processed_Text"]]
print('Average word length of Headlines is {0:.0f}.'.format(np.mean(class_word_counts)))
print('Std Dev word length of Headlines is {0:.1f}.'.format(np.std(class_word_counts)))
print('Max word length of Headlines is {0:.0f}.'.format(np.max(class_word_counts)))
print('Average character length of Headlines is {0:.0f}.'.format(np.mean(class_char_counts)))

In [None]:
# Print, in lower case, every headline in the "title" column that contains the
# keyword (case-insensitive match).
keyword = "finnish"
for headline in IS_headlines_processed["title"]:
    lowered = headline.lower()
    if keyword in lowered:
        print(lowered)