# Sentiment Analysis of Financial News Headlines

### Dataset:
https://raw.githubusercontent.com/subashgandyer/datasets/main/financial_news_headlines_sentiment.csv

## Dataset 

In [1]:
import pandas as pd

url = 'https://raw.githubusercontent.com/subashgandyer/datasets/main/financial_news_headlines_sentiment.csv'
data = pd.read_csv(url, encoding='latin1', names=['label', 'news']) # import data

## Explore the dataset

In [2]:
data

Unnamed: 0,label,news
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [3]:
data.label.isna().sum() # check if there are any nulls in labels

0

In [4]:
data.news.isna().sum() # check if there are any nulls in features

0

In [5]:
data.label.value_counts() # check our labels

neutral     2879
positive    1363
negative     604
Name: label, dtype: int64

In [6]:
import re

symbols = set()
for news in data.news.tolist():
    syms = re.findall(r'\W', news)
    for sym in syms:
        symbols.add(sym) # check for non char or non numerical values
        
symbols

{' ',
 '!',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '=',
 '?',
 '`',
 '\x88',
 '\x8b',
 '\x93',
 '\x97',
 '£',
 '¦'}

## Clean the data

In [7]:
def remove_symbol(x):
    return re.sub(r'\W', ' ', x) # replace non char/number with a space

In [8]:
data.news = data.news.apply(remove_symbol) # replace non char/number with a space

symbols = set()
for news in data.news.tolist():
    syms = re.findall(r'\W', news)
    for sym in syms:
        symbols.add(sym)

symbols # double check that we only have char and numbers

{' '}

## BoW model 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()          # create bow model
bow.fit(data.news)               # train bow model
X_bow = bow.transform(data.news) # vectorize features using Bag of Words
X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Tf-idf model

In [10]:
data.news # call clean data for features

0       According to Gran   the company has no plans t...
1       Technopolis plans to develop in stages an area...
2       The international electronic industry company ...
3       With the new production plant the company woul...
4       According to the company  s updated strategy f...
                              ...                        
4841    LONDON MarketWatch    Share prices ended lower...
4842    Rinkuskiai  s beer sales fell by 6 5 per cent ...
4843    Operating profit fell to EUR 35 4 mn from EUR ...
4844    Net sales of the Paper segment decreased to EU...
4845    Sales in Finland decreased by 10 5   in Januar...
Name: news, Length: 4846, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data.news) # vectorize feayures using tfidf
X_tfidf.toarray()

array([[0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.147059, 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       ...,
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ]])

In [12]:
X = X_tfidf # we will continue using tfidf rather than CBoW

In [13]:
X.toarray()

array([[0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.147059, 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       ...,
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ]])

While tfidf and bow are similar, tfidf helps us place importance on certain words. This could give us an edge for our model and due to this, we will use tfidf over bow.

## Handle imbalance dataset

In [14]:
from imblearn.over_sampling import SMOTE

In [15]:
from imblearn.over_sampling import SMOTE

oversampling = SMOTE(sampling_strategy='auto')
X, y = oversampling.fit_resample(X, data.label) 
y.value_counts() # double check to see if we balanced out the data

neutral     2879
positive    2879
negative    2879
Name: label, dtype: int64

## Split train test data

In [16]:
lookup = {'neutral': 0,
          'positive': 1,
          'negative': 2} # define how we want to encode our labels

In [17]:
def convert_label(x):
    return lookup.get(x) # function to encode our labels

In [18]:
y_converted = y.apply(convert_label) # encoding our labels

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X,
                                                     y_converted,
                                                     test_size=0.1,
                                                     random_state=1) # split our data to train and test  data

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7773, 10070), (864, 10070), (7773,), (864,))

## 9. Classification Algorithm [10 points]
- Train
- Predict

In [21]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(random_state=1, verbose=0) # creating sgd classifier 
sgd.fit(X_train, y_train) # training sgd
sgd_pred = sgd.predict(X_test) # making sgd predict y_test

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=1, verbose=1) # creating random forest classifier
rf.fit(X_train, y_train) # training random forest
rf_pred = rf.predict(X_test) # making random forest predict y_test

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


## Metrics

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score

sgd_mat = confusion_matrix(y_test, sgd_pred)
rf_mat = confusion_matrix(y_test, rf_pred)    # get confusion matrix for both sgd and random forest

In [24]:
sgd_mat

array([[263,  35,  11],
       [ 26, 235,   3],
       [  1,   0, 290]], dtype=int64)

In [25]:
rf_mat

array([[291,  14,   4],
       [ 49, 212,   3],
       [ 12,   1, 278]], dtype=int64)

In [26]:
accuracy_score(y_test, sgd_pred) # accuracy of sgd

0.9120370370370371

In [27]:
accuracy_score(y_test, rf_pred) # accuracy of random forest, sgd is slightly better

0.9039351851851852