In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import os

In [2]:
#Notebook-wide display settings
#Plots: apply the ggplot style sheet to every matplotlib figure
plt.style.use('ggplot')

#Tables: do not truncate long text columns, so headlines are shown in full
pd.set_option('display.max_colwidth', None)

In [3]:
#Read in rawData
#Project root: honour FINANCIAL_NEWS_PROJECT_DIR when it is set so the notebook can
#run on another machine; otherwise fall back to the original checkout location.
path= os.environ.get(
    'FINANCIAL_NEWS_PROJECT_DIR',
    r'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/FinancialNewsSentimentAnalysis/',
)
config_name= 'config.yaml'

#read yaml file (safe_load only builds plain Python data, never arbitrary objects)
with open(os.path.join(path, config_name)) as file:
    config = yaml.safe_load(file)

#The CSV is read without a header row, so the two columns are named explicitly
rawData= pd.read_csv(config['paths']['rawData'], header=None)
rawData.columns= ['sentiment', 'newsHeadline']

**Preview Data**

In [4]:
#Summarise the raw data: dimensions, class balance, and the first few rows
shape_text= str(rawData.shape)
sentiment_counts= rawData['sentiment'].value_counts()

print('Data Shape: %s\n' % shape_text)
print('Value counts: \n%s\n' % sentiment_counts)
display(rawData.head())

Data Shape: (4846, 2)

Value counts: 
neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64



Unnamed: 0,sentiment,newsHeadline
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said ."
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported ."
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales ."


Preprocess data

In [5]:
#Switch the working directory to the project root before importing from `src`
#(presumably so the local package resolves - confirm against the repo layout).
#FINANCIAL_NEWS_PROJECT_DIR overrides the default checkout location when set.
os.chdir(os.environ.get(
    'FINANCIAL_NEWS_PROJECT_DIR',
    '/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/FinancialNewsSentimentAnalysis',
))
from src.preprocessing.preprocessing import Preprocessor

#Normalize, tokenize and lemmatize text
#The three steps are called in this order on the same Preprocessor object; the
#result is read back from its `lemmaTokens` attribute below.
tokens= Preprocessor(rawData.loc[:,'newsHeadline'])

tokens.normalization()
tokens.tokenization()
tokens.lemmatization()

#Append lemma to raw data (work on a copy so rawData itself stays untouched)
cleanDf= rawData.copy()
cleanDf['lemmaTokens']= tokens.lemmaTokens

#Join lemmaTokens into a single string and append to cleanDf
sentences= cleanDf['lemmaTokens'].apply(' '.join)
cleanDf.loc[:, 'lemmaTokensSentences']= sentences

#Check
display(cleanDf.head())

Unnamed: 0,sentiment,newsHeadline,lemmaTokens,lemmaTokensSentences
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .","[according, gran, company, plan, move, production, russia, although, company, growing]",according gran company plan move production russia although company growing
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .","[technopolis, plan, develop, stage, area, le, 100000, square, meter, order, host, company, working, computer, technology, telecommunication, statement, said]",technopolis plan develop stage area le 100000 square meter order host company working computer technology telecommunication statement said
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .","[international, electronic, industry, company, elcoteq, laid, ten, employee, tallinn, facility, contrary, earlier, layoff, company, contracted, rank, office, worker, daily, postimees, reported]",international electronic industry company elcoteq laid ten employee tallinn facility contrary earlier layoff company contracted rank office worker daily postimees reported
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,"[new, production, plant, company, would, increase, capacity, meet, expected, increase, demand, would, improve, use, raw, material, therefore, increase, production, profitability]",new production plant company would increase capacity meet expected increase demand would improve use raw material therefore increase production profitability
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .","[according, company, updated, strategy, year, 20092012, basware, target, longterm, net, sale, growth, range, 20, 40, operating, profit, margin, 10, 20, net, sale]",according company updated strategy year 20092012 basware target longterm net sale growth range 20 40 operating profit margin 10 20 net sale


Split data

In [6]:
#Target: sentiment labels as an array; features: the space-joined lemma strings
y= cleanDf['sentiment'].values
X= cleanDf['lemmaTokensSentences']

In [7]:
#Initialize data representation object
#DataRepresentation is a project class; it receives the text features and labels.
from src.preprocessing.preprocessing import DataRepresentation

trial1= DataRepresentation(X, y)
#Later cells read X_train / y_train / X_test / y_test from trial1, so this call is
#presumably what creates the train/test split - TODO confirm split ratio and seed.
trial1.split_data()

Evaluate Base Models

In [8]:
#Create BoW representation of data
#NOTE(review): judging by the variable name this covers the training split only -
#confirm in DataRepresentation.bag_of_words.
X_train_BoW= trial1.bag_of_words()

In [9]:
#Init Models object and evaluate base models
from src.modeling.modeling import Models

#Models wraps the BoW training features together with the training labels
baseModels= Models(X_train_BoW, trial1.y_train)

#Prints average accuracy / macro-F1 and their standard deviations for each base model
baseModels.base_model_evaluation()

Logistic Regression Base Performance Metrics:
Average Accuracy: 0.74
Accuracy Standard Deviation: 0.01
Average F1 Macro: 0.66
Accuracy F1 Macro Standard Deviation: 0.02
Naive Bayes Base Performance Metrics:
Average Accuracy: 0.7
Accuracy Standard Deviation: 0.01
Average F1 Macro: 0.62
Accuracy F1 Macro Standard Deviation: 0.02
Random Forest Base Performance Metrics:
Average Accuracy: 0.74
Accuracy Standard Deviation: 0.01
Average F1 Macro: 0.63
Accuracy F1 Macro Standard Deviation: 0.02
KNN Base Performance Metrics:
Average Accuracy: 0.64
Accuracy Standard Deviation: 0.01
Average F1 Macro: 0.42
Accuracy F1 Macro Standard Deviation: 0.01




Linear SVC Base Performance Metrics:
Average Accuracy: 0.71
Accuracy Standard Deviation: 0.01
Average F1 Macro: 0.64
Accuracy F1 Macro Standard Deviation: 0.02




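Logistic Regression, Random Forest, and Linear SVC had the strongest base scores above (accuracy 0.71–0.74, macro F1 0.63–0.66), so we'll tune the hyperparameters of those three models.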

### Logistic Regression

In [10]:
#Reload edited project modules automatically before each cell runs, so changes to
#the `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
from src.modeling.modeling import Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#Init training data object
iteration1= Models(X_train_BoW, trial1.y_train)

#Tune Logistic Regression
#Base estimator: liblinear cannot fit a multinomial model, so any candidate that did
#not override the solver would fail. saga supports multinomial and every penalty
#(l1, l2, elasticnet, none), which makes it a valid fallback for the search.
logR= LogisticRegression(multi_class= "multinomial", solver='saga')
#Search space is read from the project config
logR_tuned= iteration1.random_search_cv(
    estimator= logR,
    parameters= config['hyperparameters']['logisticRegression']['param_grid'],
)


Traceback (most recent call last):
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.

Traceback (most recent call last):
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/s

In [12]:
#Evaluate
#Format as a percentage with one decimal directly; multiplying a rounded float by
#100 and calling str() can print artifacts such as 56.99999999999999%.
print('Tuned Logistic Regression model best score: {:.1%}'.format(logR_tuned.best_score_))
print('Tuned Logistic Regression best model: \n%s' % (logR_tuned.best_estimator_))

Tuned Logistic Regression model best score: 74.3%
Tuned Logistic Regression best model: 
LogisticRegression(C=1, multi_class='multinomial', n_jobs=-1, random_state=24,
                   solver='sag')


In [13]:
import pickle

#Save trained model
#The whole tuned search result is pickled, not only its best estimator.
path= config['paths']['trainedModels']
#Context manager flushes and closes the file handle even if pickling fails
with open(path + '/naiveLogReg', 'wb') as model_file:
    pickle.dump(logR_tuned, model_file)

### Linear SVC

In [14]:
#Tune linearSVC
#Search space is read from the project config
linearSVC_grid= config['hyperparameters']['linearSVC']['param_grid']

#Default LinearSVC is the base estimator handed to the project's random search helper
linearSVC= LinearSVC()
linearSVC_tuned= iteration1.random_search_cv(
    estimator= linearSVC,
    parameters= linearSVC_grid,
)

Traceback (most recent call last):
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/svm/_classes.py", line 234, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/svm/_base.py", line 974, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "/Users/ksharma/miniforge3/envs/NLP/lib/python3.8/site-packages/sklearn/svm/_base.py", line 830, in _get_liblinear_solver_type
    raise ValueError('Unsupported set of arguments: %s, '
ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True

Traceback (m

In [15]:
#Evaluate
#Format as a percentage with one decimal directly; multiplying a rounded float by
#100 and calling str() can print artifacts such as 56.99999999999999%.
print('Tuned Linear SVC model best score: {:.1%}'.format(linearSVC_tuned.best_score_))
print('Tuned Linear SVC best model: \n%s' % (linearSVC_tuned.best_estimator_))

Tuned Linear SVC model best score: 74.6%
Tuned Linear SVC best model: 
LinearSVC(C=0.1, max_iter=100, random_state=24)


In [16]:
#Save trained model
#The whole tuned search result is pickled, not only its best estimator.
path= config['paths']['trainedModels']
#Context manager flushes and closes the file handle even if pickling fails
with open(path + '/naiveLinSVC', 'wb') as model_file:
    pickle.dump(linearSVC_tuned, model_file)

### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

#Search space is read from the project config
rf_grid= config['hyperparameters']['randomForestClassifier']['param_grid']

#Init base Random Forest (fixed seed so the base estimator is reproducible)
rf= RandomForestClassifier(random_state= 24)
rf_tuned= iteration1.random_search_cv(
    estimator= rf,
    parameters= rf_grid,
)

In [18]:
#Evaluate
#Format as a percentage with one decimal directly; multiplying a rounded float by
#100 and calling str() can print artifacts such as 56.99999999999999%.
print('Tuned Random Forest classifier best score: {:.1%}'.format(rf_tuned.best_score_))
print('Tuned Random Forest classifier model: \n%s' % (rf_tuned.best_estimator_))

Tuned Random Forest classifier best score: 70.6%
Tuned Random Forest classifier model: 
RandomForestClassifier(class_weight='balanced', max_depth=7,
                       max_features='sqrt', n_estimators=200, n_jobs=-1,
                       random_state=24)


In [19]:
#Save trained model
#The whole tuned search result is pickled, not only its best estimator.
path= config['paths']['trainedModels']
#Context manager flushes and closes the file handle even if pickling fails
with open(path + '/naiveRanFor', 'wb') as model_file:
    pickle.dump(rf_tuned, model_file)

In [20]:
#Sanity check before export: shape of the training features (shown as the cell output)
trial1.X_train.shape

(3876,)

In [21]:
#Sanity check before export: shape of the training labels; should match the features' row count
trial1.y_train.shape

(3876,)

In [26]:
#Export training and test data
path= config['paths']['processedData']

#One CSV per split, written without header row or index column, in the original order
splits= (
    ('X_train', trial1.X_train),
    ('y_train', trial1.y_train),
    ('X_test', trial1.X_test),
    ('y_test', trial1.y_test),
)
for split_name, split_values in splits:
    pd.DataFrame(split_values).to_csv(path + '/' + split_name + '.csv', header=None, index=None)