In [38]:
import pandas as pd
import numpy as np

#Below libraries will be used for data preparation
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing

In [39]:
df_train = pd.read_csv("finbank_data_train.csv")
df_val = pd.read_csv("finbank_data_val.csv")
df_test = pd.read_csv("finbank_data_test.csv")

In [40]:
#remove stop words, perform lemmatization and remove symbols from the texts
lemma = WordNetLemmatizer()
stop = stopwords.words('english')
def Text_clean(txt):
    txt = txt.lower()
    words = nltk.word_tokenize(txt)
    words = ' '.join([lemma.lemmatize(word) for word in words if word not in stop])
    words_joined = "".join(words)
    text = re.sub('[^a-z]',' ',words_joined) #remove symbols 
    return text  

#apply Text_clean to the text column in the table, and create a new column "Text_clean"
df_train['text_clean'] = df_train['text'].apply(Text_clean)
df_val['text_clean'] = df_val['text'].apply(Text_clean)
df_test['text_clean'] = df_test['text'].apply(Text_clean)

In [41]:
#transform the labels to numerical format
le = preprocessing.LabelEncoder()
df_train['label_number'] = le.fit_transform(df_train['label'])
df_val['label_number'] = le.fit_transform(df_val['label'])
df_test['label_number'] = le.fit_transform(df_test['label'])

In [42]:
#Although we have splitted the database into train, validation and test for the project, to accomodate neural network approach,
#we will combine train and validation data in machine learning approach, because we will use 10-fold cross validation.
df_train = pd.concat([df_train, df_val]) 

In [43]:
df_train

Unnamed: 0,text,label,text_clean,label_number
0,"Cost cutting measures , which have produced ar...",positive,cost cutting measure produced around eur m ...,2
1,The Finnish national carrier said net loss in ...,negative,finnish national carrier said net loss april j...,0
2,"In total , more than 3000 surveillance cameras...",neutral,total surveillance camera handled manag...,1
3,"Rohwedder Group is an automotive supplies , te...",neutral,rohwedder group automotive supply telecommun...,1
4,The Russian gas giant invested another 46 mill...,neutral,russian gas giant invested another million ...,1
...,...,...,...,...
579,"Thus , SysOpen Digia has , in accordance with ...",neutral,thus sysopen digia accordance chapter s...,1
580,The measures result from decreased demand in t...,negative,measure result decreased demand technical desi...,0
581,A Helsinki : ELIiV today reported EPS of EUR1 ...,positive,helsinki eliiv today reported eps eur ...,2
582,Profit for the period fell to EUR 1.6 mn from ...,negative,profit period fell eur mn eur mn janua...,0


In [44]:
df_test

Unnamed: 0,text,label,text_clean,label_number
0,Operating profit excluding non-recurring items...,negative,operating profit excluding non recurring item ...,0
1,Finnish waste management and cleaning group La...,negative,finnish waste management cleaning group lassil...,0
2,The liquidity providing was interrupted on May...,negative,liquidity providing interrupted may ...,0
3,"At 1.33 pm , the OMX Helsinki 25 was 0.30 pct ...",negative,pm omx helsinki pct lower ...,0
4,In addition the deal includes a call option wh...,positive,addition deal includes call option enable mari...,2
...,...,...,...,...
579,The decision will have to be made whether the ...,neutral,decision made whether group agrees import esto...,1
580,"It grew in Finland , Norway , Denmark and the ...",neutral,grew finland norway denmark baltic country,1
581,"According to CEO Matti Karppinen , Frozen Food...",positive,according ceo matti karppinen frozen food po...,2
582,We are pleased to invite you to join M-real 's...,neutral,pleased invite join m real s international co...,1


In [45]:
#prepare feature(x) and target(y) for machine learning model

#define x and y 
x_train = df_train.text_clean
y_train=df_train.label_number

#x_val = df_val.text_clean
#y_val=df_val.label_number

x_test=df_test.text_clean
y_test=df_test.label_number

In [46]:
#We use two vectorization methods, bag of words and TF-IDF, to compare the results.  
#bag of words vectorization
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

vec_train = vectorizer.transform(x_train)
#vec_val = vectorizer.transform(x_val)
vec_test  = vectorizer.transform(x_test)

vec_train.toarray() #check the result


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
#TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, ngram_range=(1,2))
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
#tfidf_val = tfidf_vectorizer.transform(x_val)
tfidf_test = tfidf_vectorizer.transform(x_test)

tfidf_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import KFold, cross_val_score

#we use three machine learning models, Multinomial Naive bayes, SVM and logistic regression, to compare the results

# Multinomial Naive bayes bag of words  
mnb_vec = MultinomialNB() 
mnb_vec.fit(vec_train, y_train)

# Multinomial Naive bayes TF-IDF
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(tfidf_train, y_train)

#SVM model bag of words
svm_vec =SVC(probability=True,kernel='linear')
svm_vec.fit(vec_train, y_train)

#SVM model TF-IDF
svm_tfidf =SVC(probability=True,kernel='linear')
svm_tfidf.fit(tfidf_train, y_train)

#logistic regression bag of words
lr_vec = LogisticRegression(max_iter = 1000)
lr_vec.fit(vec_train, y_train)

#logistic regression TF-IDF
lr_tfidf =LogisticRegression(max_iter = 1000)
lr_tfidf.fit(tfidf_train, y_train)


LogisticRegression(max_iter=1000)

In [49]:
# 10-folds cross validation
kfold = KFold(n_splits=10)
acc_mnb_vec = cross_val_score(estimator = mnb_vec, X = vec_train, y = y_train, cv = kfold,scoring='accuracy')
acc_mnb_tfidf = cross_val_score(estimator = mnb_tfidf, X = tfidf_train, y = y_train, cv = kfold,scoring='accuracy')

acc_svm_vec = cross_val_score(estimator = svm_vec, X = vec_train, y = y_train, cv = kfold,scoring='accuracy')
acc_svm_tfidf = cross_val_score(estimator = svm_tfidf, X = tfidf_train, y = y_train, cv = kfold,scoring='accuracy')

acc_lr_vec = cross_val_score(estimator = lr_vec, X = vec_train, y = y_train, cv = kfold,scoring='accuracy')
acc_lr_tfidf = cross_val_score(estimator = lr_tfidf, X = tfidf_train, y = y_train, cv = kfold,scoring='accuracy')

print("mnb vec Accuracy:", acc_mnb_vec.mean(),
      "mnb tfidf Accuracy:", acc_mnb_tfidf.mean(),
       "svm vec Accuracy:", acc_svm_vec.mean(),
       "svm tfidf Accuracy:", acc_svm_tfidf.mean(),
       "lr vec Accuracy:", acc_lr_vec.mean(),
       "lr tfidf Accuracy:", acc_lr_tfidf.mean(),
     )

mnb vec Accuracy: 0.7993184863298931 mnb tfidf Accuracy: 0.848011225783089 svm vec Accuracy: 0.8742632627195366 svm tfidf Accuracy: 0.8968962520369365 lr vec Accuracy: 0.8792102118413906 lr tfidf Accuracy: 0.8740738728951657


In [50]:
#use test data to evaluate the result
from sklearn.metrics import  accuracy_score, f1_score

# Multinomial Naive bayes bag of words  
pred_mnb_vec = mnb_vec.predict(vec_test)
acc_mnb_vec = accuracy_score(y_test, pred_mnb_vec)
f1_mnb_vec = f1_score(y_test, pred_mnb_vec,  average = 'macro')

# Multinomial Naive bayes TF-IDF
pred_mnb_tfidf = mnb_tfidf.predict(tfidf_test)
acc_mnb_tfidf = accuracy_score(y_test, pred_mnb_tfidf)
f1_mnb_tfidf = f1_score(y_test, pred_mnb_tfidf, average = 'macro')

#SVM model bag of words
pred_svm_vec = svm_vec.predict(vec_test)
acc_svm_vec = accuracy_score(y_test, pred_svm_vec)
f1_svm_vec = f1_score(y_test, pred_svm_vec,average = 'macro')

#SVM model TF-IDF
pred_svm_tfidf =svm_tfidf.predict(tfidf_test)
acc_svm_tfidf = accuracy_score(y_test, pred_svm_tfidf)
f1_svm_tfidf = f1_score(y_test, pred_svm_tfidf, average = 'macro')

#logistic regression bag of words
pred_lr_vec = lr_vec.predict(vec_test)
acc_lr_vec = accuracy_score(y_test, pred_lr_vec)
f1_lr_vec = f1_score(y_test, pred_lr_vec, average = 'macro')

#logistic regression TF-IDF
pred_lr_tfidf =lr_tfidf.predict(tfidf_test)
acc_lr_tfidf = accuracy_score(y_test, pred_lr_tfidf)
f1_lr_tfidf = f1_score(y_test, pred_lr_tfidf, average = 'macro')


results_table = pd.DataFrame([['Multinomial Naive bayes bag of words',acc_mnb_vec, f1_mnb_vec],
                             ['Multinomial Naive bayes TF-IDF',acc_mnb_tfidf, f1_mnb_tfidf],
                             ['SVM model bag of words',acc_svm_vec, f1_svm_vec],
                             ['SVM model TF-IDF',acc_svm_tfidf, f1_svm_tfidf],
                             ['logistic regression bag of words',acc_lr_vec, f1_lr_vec],
                             ['logistic regression TF-IDF',acc_lr_tfidf, f1_lr_tfidf]], 
                             columns = ['Model','Accuracy','F1 score'])
results_table

Unnamed: 0,Model,Accuracy,F1 score
0,Multinomial Naive bayes bag of words,0.791096,0.789908
1,Multinomial Naive bayes TF-IDF,0.847603,0.847514
2,SVM model bag of words,0.876712,0.876778
3,SVM model TF-IDF,0.893836,0.894636
4,logistic regression bag of words,0.886986,0.88712
5,logistic regression TF-IDF,0.873288,0.874811


In [51]:
df_news_Earnings = pd.read_csv("news_Earnings.csv")
df_news_Economic = pd.read_csv("news_Economic.csv")
df_news_StockMarket = pd.read_csv("news_StockMarket.csv")

df_news_Earnings['text_clean'] = df_news_Earnings['text'].apply(Text_clean)
df_news_Economic['text_clean'] = df_news_Economic['text'].apply(Text_clean)
df_news_StockMarket['text_clean'] = df_news_StockMarket['text'].apply(Text_clean)

df_news_Earnings['label_number'] = le.fit_transform(df_news_Earnings['label'])
df_news_Economic['label_number'] = le.fit_transform(df_news_Economic['label'])
df_news_StockMarket['label_number'] = le.fit_transform(df_news_StockMarket['label'])

x_Earnings = tfidf_vectorizer.transform(df_news_Earnings.text_clean)
x_Economic = tfidf_vectorizer.transform(df_news_Economic.text_clean)
x_StockMarket = tfidf_vectorizer.transform(df_news_StockMarket.text_clean)

df_news_Earnings['test_result'] = svm_tfidf.predict(x_Earnings)
df_news_Economic['test_result'] = svm_tfidf.predict(x_Economic)
df_news_StockMarket['test_result'] = svm_tfidf.predict(x_StockMarket)


In [52]:
df_news_Earnings

Unnamed: 0,text,label,text_clean,label_number,test_result
0,Tokyo Inflation Slows Ahead of BOJ Leadership ...,positive,tokyo inflation slows ahead boj leadership cha...,2,1
1,Fed Watchdog Launches Investigation Into SVB S...,neutral,fed watchdog launch investigation svb supervision,1,1
2,Fed Officials See More Work on Inflation Despi...,neutral,fed official see work inflation despite bank s...,1,1
3,Colombia Hikes Interest Rate to 24-Year High t...,neutral,colombia hike interest rate year high tame ...,1,1
4,"Australian Inflation Eases, Bolstering Case fo...",positive,australian inflation eas bolstering case rat...,2,1
5,Goldman Sachs says the banking meltdown is a ‘...,positive,goldman sachs say banking meltdown headwind ...,2,1
6,Banking crisis has ‘definitely’ tipped the U.S...,negative,banking crisis definitely tipped u s clos...,0,1
7,Existing home sales jump in February as mortga...,positive,existing home sale jump february mortgage rate...,2,1
8,Top economist Mohamed El-Erian warns that ‘ero...,negative,top economist mohamed el erian warns erosion...,0,1
9,Labor market: Workers are getting two jobs ami...,negative,labor market worker getting two job amid inf...,0,0


In [53]:
df_news_Economic

Unnamed: 0,text,label,text_clean,label_number,test_result
0,Homebuilder Lennar sees 'no disruption' in len...,positive,homebuilder lennar see no disruption lendin...,2,1
1,Media giants signal ad market 'stabilized' aft...,positive,medium giant signal ad market stabilized br...,2,1
2,Costco Q2 earnings: Stock slips after mixed re...,neutral,costco q earnings stock slip mixed result,1,0
3,"Salesforce earnings: Company reports beats, st...",positive,salesforce earnings company report beat st...,2,1
4,"Salesforce's activist investors: Who are they,...",neutral,salesforce s activist investor want,1,1
5,J.M. Smucker CEO: PB&J is our 'bread & butter'...,positive,j m smucker ceo pb j bread butter ...,2,1
6,HP earnings: Sales decline ‘driven by the macr...,negative,hp earnings sale decline driven macroecono...,0,2
7,"Warner Bros. Discovery sinks on earnings miss,...",negative,warner bros discovery sink earnings miss we...,0,0
8,Lucid earnings: Stock slides as EV-maker misse...,negative,lucid earnings stock slide ev maker miss rev...,0,2
9,Shake Shack earnings come mostly in line with ...,neutral,shake shack earnings come mostly line estimate,1,2


In [54]:
df_news_StockMarket

Unnamed: 0,text,label,text_clean,label_number,test_result
0,Zoetis Stock Could Rise 30% as People Spend Mo...,positive,zoetis stock could rise people spend pet,2,1
1,Dow Jones Rallies 250 Points; Lululemon Soars ...,positive,dow jones rally point lululemon soar ...,2,2
2,"U.S. Stocks Open Higher, Following Internation...",positive,u s stock open higher following internation...,2,2
3,Bank Crisis Credit Crunch Will Clip S&P 500 Ea...,negative,bank crisis credit crunch clip p earning...,0,2
4,Micron Stock Jumps Despite Sales Plunging 53%....,neutral,micron stock jump despite sale plunging ...,1,1
5,Deere Is Sowing the Seeds of Success. Why the ...,positive,deere sowing seed success stock ride economi...,2,1
6,TikTok parent ByteDance has special stock owne...,neutral,tiktok parent bytedance special stock owned ch...,1,1
7,These Stocks Are Moving the Most Today: Micron...,neutral,stock moving today micron lululemon alib...,1,1
8,Wall Street's sharks are circling 'dumb and gr...,negative,wall street s shark circling dumb greedy b...,0,1
9,Climate Funding Gets Squeezed by Volatile Markets,negative,climate funding get squeezed volatile market,0,1
