In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import os

In [2]:
# Notebook-wide display preferences
# Use the ggplot theme for every matplotlib figure drawn below
plt.style.use('ggplot')

# Show text columns in full (no truncation) when DataFrames are rendered
pd.options.display.max_colwidth = None

In [3]:
#Read in rawData
# The project root defaults to the original author's checkout. Set the
# FNSA_PROJECT_ROOT environment variable to run this notebook from another
# machine or location without editing the cell.
path= os.environ.get(
    'FNSA_PROJECT_ROOT',
    r'/Users/ksharma/Documents/ML Engineer/Machine Learning/Projects/FinancialNewsSentimentAnalysis/',
)
config_name= 'config.yaml'

#Read yaml file; safe_load only builds plain Python objects from the YAML
with open(os.path.join(path, config_name)) as file:
    config = yaml.safe_load(file)

#The CSV is read without a header row, so the two columns are named explicitly
rawData= pd.read_csv(config['paths']['rawData'], header=None)
rawData.columns= ['sentiment', 'newsHeadline']

**Preview Data**

In [4]:
# Quick overview: dimensions, class balance of the labels, and the first rows
print('Data Shape: {}\n'.format(rawData.shape))
print('Value counts: \n{}\n'.format(rawData['sentiment'].value_counts()))
display(rawData.head())

Data Shape: (4846, 2)

Value counts: 
neutral     2879
positive    1363
negative     604
Name: sentiment, dtype: int64



Unnamed: 0,sentiment,newsHeadline
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said ."
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported ."
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales ."


Preprocess data

In [5]:
#Normalize, tokenize and lemmatize text

#Switch to the project root before importing from the project-local `src`
#package. Reuses `path` from the config cell rather than repeating the
#absolute path as a second literal that could drift out of sync.
os.chdir(path)

from src.preprocessing.preprocessing import Preprocessor

#Process newsHeadline data
tokens= Preprocessor(rawData.loc[:,'newsHeadline'])

#Run the preprocessing stages on the Preprocessor instance.
#NOTE(review): presumably each stage consumes the previous one's result, so
#the call order matters - confirm against the Preprocessor implementation.
tokens.normalization()
tokens.tokenization()
tokens.lemmatization()

#Append lemma to a copy of the raw data so rawData itself stays unmodified
cleanDf= rawData.copy()
cleanDf['lemmaTokens']= tokens.lemmaTokens

#Join lemmaTokens into a single space-separated string and append to cleanDf
sentences= cleanDf['lemmaTokens'].apply(' '.join)
cleanDf.loc[:, 'lemmaTokensSentences']= sentences

#Check
display(cleanDf.head())

Unnamed: 0,sentiment,newsHeadline,lemmaTokens,lemmaTokensSentences
0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .","[according, gran, company, plan, move, production, russia, although, company, growing]",according gran company plan move production russia although company growing
1,neutral,"Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .","[technopolis, plan, develop, stage, area, le, 100000, square, meter, order, host, company, working, computer, technology, telecommunication, statement, said]",technopolis plan develop stage area le 100000 square meter order host company working computer technology telecommunication statement said
2,negative,"The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .","[international, electronic, industry, company, elcoteq, laid, ten, employee, tallinn, facility, contrary, earlier, layoff, company, contracted, rank, office, worker, daily, postimees, reported]",international electronic industry company elcoteq laid ten employee tallinn facility contrary earlier layoff company contracted rank office worker daily postimees reported
3,positive,With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .,"[new, production, plant, company, would, increase, capacity, meet, expected, increase, demand, would, improve, use, raw, material, therefore, increase, production, profitability]",new production plant company would increase capacity meet expected increase demand would improve use raw material therefore increase production profitability
4,positive,"According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .","[according, company, updated, strategy, year, 20092012, basware, target, longterm, net, sale, growth, range, 20, 40, operating, profit, margin, 10, 20, net, sale]",according company updated strategy year 20092012 basware target longterm net sale growth range 20 40 operating profit margin 10 20 net sale


Split data

In [6]:
# Target: sentiment labels as a plain array
y= cleanDf['sentiment'].values
# Features: the space-joined lemma strings, kept as a pandas Series
X= cleanDf['lemmaTokensSentences']

In [7]:
# DataRepresentation comes from the project-local module src.preprocessing.preprocessing
from src.preprocessing.preprocessing import DataRepresentation

# Wrap the features and labels, then split them. Later cells read the result
# through attributes of this object (trial1.X_train, trial1.y_train).
# NOTE(review): the split ratio and any random seed are not visible here -
# confirm inside split_data() that they are fixed so results are reproducible.
trial1= DataRepresentation(X, y)
trial1.split_data()

In [8]:
# Sanity check: size of the training split
trial1.X_train.shape

(3876,)

Create a bag-of-words (BoW) representation of the training data

In [9]:
# Build the bag-of-words matrix via the project helper.
# NOTE(review): presumably fitted on the training split only - confirm in bag_of_words().
X_train_BoW= trial1.bag_of_words()

In [10]:
# Inspect the matrix summary (dimensions, dtype, number of stored entries)
X_train_BoW

<3876x9152 sparse matrix of type '<class 'numpy.int64'>'
	with 47754 stored elements in Compressed Sparse Row format>

In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
from src.modeling.modeling import Models

In [37]:
# Pass the bag-of-words training matrix and the training labels to the project's Models helper
baseModels= Models(X_train_BoW, trial1.y_train)

In [38]:
# Prints, for each baseline model, its individual scores plus their average and
# standard deviation.
# NOTE(review): five scores per model suggests 5-fold cross-validation - confirm in Models.
baseModels.base_model_evaluation()

LogisticRegression(multi_class='multinomial', n_jobs=-1) Scores: [0.73969072 0.74709677 0.72516129 0.75225806 0.75096774]
Average Sccore: 0.74
Standard Deviations: 0.01
RandomForestClassifier(n_jobs=-1, random_state=24) Scores: [0.74484536 0.73677419 0.7316129  0.7316129  0.75354839]
Average Sccore: 0.74
Standard Deviations: 0.01
KNeighborsClassifier(n_jobs=-1) Scores: [0.63917526 0.64       0.64129032 0.64516129 0.65548387]
Average Sccore: 0.64
Standard Deviations: 0.01




LinearSVC(multi_class='crammer_singer', random_state=24) Scores: [0.68943299 0.71354839 0.70967742 0.70967742 0.72387097]
Average Sccore: 0.71
Standard Deviations: 0.01




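We'll optimize Logistic Regression and Random Forest Classifier going forward, since they achieved the highest average scores above (0.74 each, versus 0.71 for LinearSVC and 0.64 for KNeighborsClassifier).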