In [34]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
import string
from tensorflow import keras
from nltk.corpus import stopwords
from nltk import PorterStemmer
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
df  = pd.read_csv('financial_sentiment.csv')

In [3]:
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [4]:
df.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

In [5]:
df['Sentiment'].value_counts()

Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64

In [6]:
puncs = string.punctuation
puncs

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def remove_punctuations(text):
    return text.translate(str.maketrans(' ', ' ', puncs))

In [8]:
df['Sentence'] = df['Sentence'].apply(remove_punctuations)

In [9]:
def remove_stopwords(text):
    cleaned_text = []
    text = text.split(' ')
    for word in text:
        if word in stopwords.words('english'):
            cleaned_text.append("")
        else:
            cleaned_text.append(word)
    cleaned_str = cleaned_text[:]
    return ' '.join(cleaned_str)

In [10]:
df['Sentence'] = df['Sentence'].apply(remove_stopwords)

In [11]:
df = df[df['Sentence'].apply(lambda x: len(x.split()) >= 5)]

In [12]:
ps = PorterStemmer()

In [13]:
def stemming(text):
    return " ".join([ps.stem(word) for word in text.split(' ')])

In [14]:
df['Sentence'] = df['Sentence'].apply(stemming)

In [15]:
X = df['Sentence']
Y = df['Sentiment']

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

In [17]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [18]:
wv = Word2Vec(vector_size=100, min_count=2, workers=6)

In [19]:
corpus = x_train.to_list()

In [20]:
input_corpus = []

for sentence in corpus:
    input_corpus.append(sentence.split(' '))

In [21]:
wv.build_vocab(input_corpus)

In [22]:
wv.train(input_corpus, total_examples=wv.corpus_count, epochs=wv.epochs)

(267378, 485975)

In [23]:
def document_vectorizer(doc):
    doc = [word for word in doc.split() if word in wv.wv.index_to_key]
    return np.mean(wv.wv[doc], axis=0)

In [24]:
document_vectorizer('the geosolut technolog   leverag benefon   gp solut   provid locat base search technolog    commun platform  locat relev multimedia content     new   power commerci model ')

array([-1.61473244e-01,  2.12832719e-01,  1.99001968e-01,  2.14132279e-01,
       -9.33123529e-02, -4.96508837e-01,  2.01662242e-01,  6.66209638e-01,
       -2.43261769e-01, -1.54276326e-01,  1.09487474e-01, -2.93196887e-01,
       -1.46771267e-01,  1.51218072e-01,  7.72585720e-02, -2.60082800e-02,
        1.32303506e-01, -2.12638348e-01, -1.48452803e-01, -6.63034797e-01,
        3.30885231e-01,  2.51840413e-01,  2.66760081e-01, -4.36811766e-04,
       -2.21049994e-01,  3.87863070e-02, -1.41406760e-01, -9.32805985e-02,
       -4.54795212e-01, -9.59474444e-02,  2.65165508e-01, -4.63950541e-03,
        2.42000937e-01, -2.20693558e-01, -8.86337161e-02,  2.63509721e-01,
        3.91096890e-01, -3.30680311e-01, -1.09677613e-01, -3.94177377e-01,
       -1.41956642e-01, -4.47916165e-02, -2.96374500e-01,  7.34105706e-02,
        3.55456680e-01, -1.09327182e-01, -3.94090205e-01,  2.26625964e-01,
        7.18851537e-02,  1.91524714e-01, -4.52317782e-02,  7.97542036e-02,
       -2.01431870e-01, -

In [25]:
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

In [26]:
from tqdm import tqdm

In [27]:
x_train['Sentence'].values[4449]

'insur old mutual pick standard bank hemphil  new ceo'

In [28]:
input_x = []
for doc in tqdm(x_train['Sentence'].values):
    input_x.append(document_vectorizer(doc))

  0%|          | 0/4531 [00:00<?, ?it/s]

100%|██████████| 4531/4531 [00:02<00:00, 2201.82it/s]


In [29]:
test_x = []
for doc in tqdm(x_test['Sentence'].values):
    test_x.append(document_vectorizer(doc))

100%|██████████| 1133/1133 [00:00<00:00, 2000.98it/s]


In [31]:
x_train = pd.DataFrame(input_x)

In [32]:
x_test = pd.DataFrame(test_x)

In [33]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.065180,0.047393,-0.278572,0.054961,-0.031771,-0.824347,0.158188,1.106758,-0.243654,0.316453,...,0.708754,0.046114,0.278076,-0.248840,1.710252,0.023575,-0.473314,-0.711494,0.031659,0.250035
1,-0.132199,0.168852,0.149055,0.171227,-0.076316,-0.445166,0.171261,0.592303,-0.199368,-0.109264,...,0.210108,-0.020960,0.055184,0.141529,0.620137,0.313458,-0.108117,-0.356073,0.116703,0.082185
2,-0.131432,0.175389,0.157716,0.192019,-0.082181,-0.440687,0.171520,0.599991,-0.208943,-0.122566,...,0.221674,-0.019684,0.047049,0.148593,0.641646,0.335417,-0.120881,-0.367762,0.119429,0.084265
3,-0.119690,0.164854,0.110994,0.181838,-0.080271,-0.514079,0.179196,0.695668,-0.214826,-0.070413,...,0.288664,-0.012950,0.080446,0.104609,0.798205,0.308668,-0.170043,-0.426662,0.111443,0.114751
4,-0.080714,0.111533,0.089116,0.106594,-0.050041,-0.281815,0.105255,0.377127,-0.121493,-0.070072,...,0.130755,-0.011297,0.035960,0.088551,0.397768,0.198146,-0.071384,-0.224143,0.072020,0.052232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4526,-0.120349,0.159164,0.138883,0.154837,-0.070032,-0.401689,0.156226,0.539385,-0.186336,-0.106543,...,0.185919,-0.015625,0.049283,0.126782,0.559375,0.287423,-0.097047,-0.319445,0.107496,0.073171
4527,-0.161192,0.219307,0.210620,0.255361,-0.115812,-0.651596,0.238353,0.881353,-0.297987,-0.166253,...,0.330815,-0.036700,0.091299,0.171524,0.953727,0.444943,-0.180488,-0.544302,0.170642,0.119135
4528,-0.072521,0.145689,0.052844,0.181196,-0.081720,-0.507918,0.162997,0.685408,-0.199122,-0.019366,...,0.335518,0.006020,0.098136,0.082026,0.850471,0.266974,-0.209178,-0.440364,0.095453,0.131223
4529,-0.128059,0.181737,0.148587,0.194918,-0.089350,-0.509447,0.191452,0.702318,-0.231405,-0.102552,...,0.293210,-0.019773,0.068850,0.124524,0.795496,0.349724,-0.159801,-0.435084,0.127403,0.108350


In [36]:
rf_clf = RandomForestClassifier(max_depth=10, max_features=0.8)
nb_clf = GaussianNB()

In [37]:
rf_clf.fit(x_train, y_train)

  rf_clf.fit(x_train, y_train)


In [39]:
print(classification_report(y_test, rf_clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.29      0.05      0.09       173
           1       0.60      0.88      0.71       608
           2       0.56      0.34      0.42       352

    accuracy                           0.58      1133
   macro avg       0.48      0.42      0.41      1133
weighted avg       0.54      0.58      0.53      1133



In [40]:
nb_clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [41]:
print(classification_report(y_test, nb_clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.28      0.40      0.33       173
           1       0.58      0.81      0.67       608
           2       0.30      0.03      0.06       352

    accuracy                           0.50      1133
   macro avg       0.39      0.41      0.35      1133
weighted avg       0.45      0.50      0.43      1133

