In [52]:
# import standard libraries
import pandas as pd
import numpy as np

# import spacy for NLP and re for regular expressions
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import re

import pickle
# import sklearn transformers, models and pipelines
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import train_test_split

# No spaCy model is loaded in this setup cell: the pipeline that is actually
# used for vectorisation ('en_core_web_lg') is loaded right before the training
# vectors are built, and `nlp` is not used before that point. Loading
# 'en_core_web_sm' here only added start-up time and an extra model dependency.

# Let pandas render up to 400 characters per text cell so whole phrases are visible
pd.set_option('display.max_colwidth', 400)

In [53]:
# Read the cleaned phrase-bank CSV; later cells use its 'clean_text' and 'label' columns
df_fin_phrase = pd.read_csv(
    filepath_or_buffer="../../Data/Prepared/CleanDatasets/fin_phrase_bank_clean.csv"
)

In [54]:
# Hold out 30% of the phrases for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_fin_phrase['clean_text'],
    df_fin_phrase['label'],
    random_state=42,
    test_size=0.3,
)

In [55]:
# Load the large English pipeline; the parser and NER components are disabled
# because only the document vectors are needed here.
nlp = spacy.load('en_core_web_lg', disable=["parser", "ner"])

# One vector per training phrase (str() guards against non-string cells)
docs_train = [nlp(str(phrase)).vector for phrase in X_train]

# NOTE: X_train is rebound from raw text to the stacked vector matrix
X_train = np.vstack(docs_train)
print('Shape of train set: {}'.format(X_train.shape))


Shape of train set: (3392, 300)


In [56]:
# Vectorise the held-out phrases with the same `nlp` pipeline used for the
# training set. Each value is coerced with str() exactly as in the training
# cell, so a non-string entry (e.g. a NaN read back from the CSV) does not make
# `nlp(...)` raise for the test split only.
docs_test = [nlp(str(doc)).vector for doc in X_test]

# NOTE: X_test is rebound from raw text to the stacked vector matrix
X_test = np.vstack(docs_test)
print('Shape of test set: {}'.format(X_test.shape))

Shape of test set: (1454, 300)


In [57]:
# Single-step pipeline: multinomial logistic regression on the document vectors
word2vec_pipe = Pipeline(
    steps=[
        (
            'estimator',
            LogisticRegression(
                C=1e3,
                solver='lbfgs',
                multi_class='multinomial',
                random_state=17,
                n_jobs=4,
                max_iter=10000,
            ),
        ),
    ]
)



In [58]:
# Cross-validate on the training vectors and report the mean micro-averaged F1
print(
    'F1 score: {:.3f}'.format(
        np.mean(
            cross_val_score(
                estimator=word2vec_pipe,
                X=X_train,
                y=y_train,
                scoring='f1_micro',
            )
        )
    )
)

F1 score: 0.691


In [59]:
# Train on the full training matrix, then label the held-out vectors.
# `fit` returns the fitted pipeline itself, so the two steps can be chained.
pred = word2vec_pipe.fit(X_train, y_train).predict(X_test)



In [60]:
# Re-run the identical split (same seed and test size) to get the raw text back:
# X_train / X_test were overwritten with vector matrices in the vectorisation
# cells, and the results frame below needs the original phrases.
# NOTE: after this cell X_train / X_test hold text again, not vectors.
X_train, X_test, y_train, y_test = train_test_split(
    df_fin_phrase['clean_text'],
    df_fin_phrase['label'],
    random_state=42,
    test_size=0.3,
)

In [67]:
# Collect the test phrases with their true and predicted numeric classes
df = pd.DataFrame({'text': X_test, 'truelabelNum': y_test, 'prediclabelNum': pred})

# class 0: negative, class 1: neutral, class 2: positive
label_names = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Fix: the readable columns were previously swapped ('pred_label' was built from
# the true classes and 'true_label' from the predictions). Each column is now
# derived from its matching numeric column; column order is unchanged.
df['pred_label'] = df['prediclabelNum'].map(label_names)
df['true_label'] = df['truelabelNum'].map(label_names)

In [68]:
# Persist the per-phrase results; the index is not written to the file
df.to_csv(
    path_or_buf='../../Data/Prepared/Word2vecAnalysis/FinancialWord2vecLogicReg.csv',
    index=False,
)

# Testing the model

In [62]:
# Hand-written sentences for a quick sanity check of the fitted pipeline
sentences = ['growth is strong and we have plenty of liquidity.', 
               'there is a shortage of capital, and we need extra financing.', 
              'formulation patents might protect Vasotec to a limited extent.']

# Vectorise every sentence once (previously this was redone on each loop pass)
sentences_vector = np.vstack([nlp(str(doc)).vector for doc in sentences])

# Fix: the old loop branched on the *product* of all predictions, so it printed
# the same label once per sentence and that label did not correspond to any
# individual sentence. Predict once and report one label per sentence instead.
# class 0: negative, class 1: neutral, anything else is reported as positive
# (matching the original if/elif/else fall-through).
for predicted_class in word2vec_pipe.predict(sentences_vector):
    if predicted_class == 0:
        print('Negative')
    elif predicted_class == 1:
        print('Neutral')
    else:
        print('Positive')

Neutral
Neutral
Neutral


In [65]:

# Serialise the fitted pipeline with pickle.
# NOTE(review): the '.pt' suffix is kept because other code may load this exact
# path, but the file is a plain pickle, not a PyTorch checkpoint.
filename = '../../Data/Models/FinancialLogitRec_word2vec.pt'

# Use a context manager so the file handle is flushed and closed even if
# pickling fails (the previous bare open() was never closed).
with open(filename, 'wb') as model_file:
    pickle.dump(word2vec_pipe, model_file)