### This notebook  is used to choose the best classification model that can be used for predicting the sentiment of the data

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [108]:
test = pd.read_csv('sent_data_base.csv')
test.rename({'title':'Title'},axis=1,inplace=True)
test.rename({'date':'Date'},axis=1,inplace=True)

In [91]:
df = pd.read_csv('sentiment_gpt4_only.csv')

In [92]:
#Converting each sentiment into a value, (5 for positive, 4 for negative ...)
le=LabelEncoder()
le.fit(df['Sentiment'].values)
df['val_Sentiment']=le.transform(df['Sentiment'].values)

In [95]:
#merging the two datasets, the one used for training and the one that will be predicted. This is essential in order to vectorize the data
df_combined = pd.concat([df, test], ignore_index=True)

In [96]:
df_combined

Unnamed: 0,Date,Title,Sentiment,val_Sentiment
0,2010-01-04 00:00:00,Global Stocks and Commodities Rally on First T...,positive,5.0
1,2010-01-04 00:00:00,Dollar Slumps Amid Worldwide Manufacturing Imp...,negative,4.0
2,2010-01-04 00:00:00,Oil Prices Surge Above $81 a Barrel Due to U.S...,negative,4.0
3,2010-01-04 00:00:00,"S&P 500 Sees 1.6 Percent Increase, Hits 15-Mon...",positive,5.0
4,2010-01-04 00:00:00,"Argentina's Merval Index Reaches Record High, ...",positive,5.0
...,...,...,...,...
8722,2010-10-09 07:00:00+00:00,A better way - The Economist,,
8723,2010-04-11 07:00:00+00:00,Euro Nations Offer Greece $40 Billion Backstop...,,
8724,2011-12-02 08:00:00+00:00,GMAC Mortgage to halt most new lending in Mass...,,
8725,2011-12-08 08:00:00+00:00,"ECB cuts interest rates, signals willingness t...",,


#### we used two types of prediction, the first one is to predict whether the sentiment is positive or negative or indecisive and the second one is used to predict whether the sentiment value is 0, 1 or 2

In [97]:
#vectorisation 
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_combined['Title'])
y = df['Sentiment'][:8139]
y_bis = df['val_Sentiment'] 
X_train, X_test, y_train, y_test, y_bis_train,y_bis_test = train_test_split(X[:8139], y,y_bis, test_size=0.2, random_state=42)

In [98]:
X_2 = X[8139:]

In [10]:
#bayesian classification
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7352579852579852
              precision    recall  f1-score   support

- indecisive       0.00      0.00      0.00         3
  - negative       0.00      0.00      0.00         2
  - positive       0.00      0.00      0.00         2
  indecisive       1.00      0.01      0.02       126
    negative       0.70      0.88      0.78       806
    positive       0.78      0.70      0.74       689

    accuracy                           0.74      1628
   macro avg       0.41      0.27      0.26      1628
weighted avg       0.76      0.74      0.70      1628



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
model = MultinomialNB()
model.fit(X_train, y_bis_train)
y_bis_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_bis_test, y_bis_pred)}")
print(classification_report(y_bis_test, y_bis_pred))

Accuracy: 0.7352579852579852
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         2
           3       1.00      0.01      0.02       126
           4       0.70      0.88      0.78       806
           5       0.78      0.70      0.74       689

    accuracy                           0.74      1628
   macro avg       0.41      0.27      0.26      1628
weighted avg       0.76      0.74      0.70      1628



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
#decision tree
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7039312039312039


In [42]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_bis_train)

predictions = clf.predict(X_test)

accuracy = accuracy_score(y_bis_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7002457002457002


In [43]:
#XGBoost classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_bis_train)
predictions = xgb.predict(X_test)
accuracy = accuracy_score(y_bis_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.769041769041769


In [100]:
#Random forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7807125307125307


In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_bis_train)
predictions = rf.predict(X_test)
accuracy = accuracy_score(y_bis_test, predictions)
print("Accuracy:", accuracy)

In all models, sentiment prediction is better when it is not labelled and we observe that the best model is random forest classifier

In [101]:
Y = rf.predict(X_2)

In [102]:
test['Sentiment'] = Y

In [105]:
test

Unnamed: 0,Date,Title,Sentiment
0,2010-02-04 08:00:00+00:00,"CNNMoney.com Market Report - Feb. 4, 2010 - CNN",positive
1,2011-08-08 07:00:00+00:00,Debt crisis sends financial markets into turmo...,negative
2,2011-10-04 07:00:00+00:00,"Market Report - Oct. 4, 2011 - CNNMoney - CNN",positive
3,2010-12-02 08:00:00+00:00,Natural Gas Weekly Update - EIA,negative
4,2011-01-05 08:00:00+00:00,World food prices enter 'danger territory' to ...,negative
...,...,...,...
583,2010-10-09 07:00:00+00:00,A better way - The Economist,positive
584,2010-04-11 07:00:00+00:00,Euro Nations Offer Greece $40 Billion Backstop...,positive
585,2011-12-02 08:00:00+00:00,GMAC Mortgage to halt most new lending in Mass...,negative
586,2011-12-08 08:00:00+00:00,"ECB cuts interest rates, signals willingness t...",negative


In [107]:
test.to_csv('sent_predicted.csv',index=False)