### This notebook trains a sentiment classifier on labelled news headlines and uses it to label the 2008–2011 headline archives

In [67]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [68]:
# Labelled headlines (the code below reads its Date, Title and Sentiment columns).
df = pd.read_csv('sentiment_gpt4_only.csv')

# Yearly archives, loaded in chronological order.
df_2008, df_2009, df_2010, df_2011 = (
    pd.read_csv(archive_path)
    for archive_path in (
        'archive/2008.csv',
        'archive/2009.csv',
        'archive/2010.csv',
        'archive/2011.csv',
    )
)

In [69]:
# Remove the 'Unnamed: 0' column from every yearly frame, in place.
for yearly_frame in (df_2008, df_2009, df_2010, df_2011):
    yearly_frame.drop(columns=['Unnamed: 0'], inplace=True)

In [70]:
# Align the yearly frames with the labelled data, which uses 'Title' for the text column.
for yearly_frame in (df_2008, df_2009, df_2010, df_2011):
    yearly_frame.rename(columns={'News': 'Title'}, inplace=True)

In [71]:
# Discard rows containing any missing value; the vectoriser cannot handle NaN titles.
for yearly_frame in (df_2008, df_2009, df_2010, df_2011):
    yearly_frame.dropna(inplace=True)

In [72]:
# Report (rows, columns) for each frame after cleaning.
labelled_frames = (
    ('df shape : ', df),
    ('df_2008 shape : ', df_2008),
    ('df_2009 shape : ', df_2009),
    ('df_2010 shape : ', df_2010),
    ('df_2011 shape : ', df_2011),
)
for caption, frame in labelled_frames:
    print(caption, frame.shape)

df shape :  (8139, 3)
df_2008 shape :  (360, 2)
df_2009 shape :  (359, 2)
df_2010 shape :  (359, 2)
df_2011 shape :  (357, 2)


In [73]:
# Stack the labelled rows first, then each year in order, under a fresh 0..n-1 index.
# The row order matters: later cells slice the feature matrix by position.
frames_in_order = [df, df_2008, df_2009, df_2010, df_2011]
df_combined = pd.concat(frames_in_order, ignore_index=True)

In [74]:
# Vectorisation
# TF-IDF is fitted on every title (labelled + yearly archives) so all rows share
# one vocabulary. Rows of X follow the row order of df_combined.
# NOTE(review): fitting on all titles lets held-out/test text influence the IDF
# weights; fit on the training split only if a strict evaluation is required.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_combined['Title'])

# The labelled rows are the first len(df) rows of df_combined. Derive that count
# from the data instead of hard-coding it, so features and labels stay aligned
# if the labelled CSV changes.
n_labelled = len(df)
y = df['Sentiment']

# 80/20 split of the labelled rows with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X[:n_labelled], y, test_size=0.2, random_state=42
)

In [75]:
# Slice the per-year feature rows out of X.
# X follows the concatenation order [df, df_2008, df_2009, df_2010, df_2011],
# so each year's start offset is the running total of the preceding frame
# lengths. Computing the offsets avoids hard-coded row counts going stale.
start_2008 = len(df)
start_2009 = start_2008 + len(df_2008)
start_2010 = start_2009 + len(df_2009)
start_2011 = start_2010 + len(df_2010)

X_2008 = X[start_2008:start_2009]
X_2009 = X[start_2009:start_2010]
X_2010 = X[start_2010:start_2011]
X_2011 = X[start_2011:]

# Each slice must have exactly one row per headline of its year; otherwise the
# predictions assigned back to the frames later would be misaligned.
assert X_2011.shape[0] == len(df_2011), "feature rows do not match df_2011"

In [76]:
# Train a random forest on the labelled training split and report held-out accuracy.
# A fixed random_state makes the fitted forest reproducible across kernel
# restarts, and with it the accuracy and the exported predictions.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

predictions = rf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7794840294840295


In [77]:
# Predict a sentiment label for every headline in each yearly feature matrix.
Y_2008, Y_2009, Y_2010, Y_2011 = (
    rf.predict(yearly_features)
    for yearly_features in (X_2008, X_2009, X_2010, X_2011)
)

In [78]:
# Store each year's predicted labels in a 'Sentiment' column of that year's frame.
yearly_pairs = zip(
    (df_2008, df_2009, df_2010, df_2011),
    (Y_2008, Y_2009, Y_2010, Y_2011),
)
for yearly_frame, predicted_labels in yearly_pairs:
    yearly_frame['Sentiment'] = predicted_labels

In [79]:
# Write each labelled yearly frame to its own CSV, without the index column.
yearly_outputs = (
    (df_2008, 'sent_predicted_2008.csv'),
    (df_2009, 'sent_predicted_2009.csv'),
    (df_2010, 'sent_predicted_2010.csv'),
    (df_2011, 'sent_predicted_2011.csv'),
)
for yearly_frame, csv_path in yearly_outputs:
    yearly_frame.to_csv(csv_path, index=False)

In [80]:
# Parse the labelled frame's Date column into pandas datetimes.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [81]:
# Keep only the calendar date (datetime.date objects), dropping any time-of-day.
# Re-parsing with pd.to_datetime first makes this cell safe to re-run: after one
# run the column holds plain date objects, on which the .dt accessor would raise.
df['Date'] = pd.to_datetime(df['Date']).dt.date

In [82]:
# Load previously predicted sentiments and reduce their Date column to plain dates.
# NOTE(review): 'sent_predicted.csv' is not written anywhere in the visible cells;
# presumably it comes from another notebook or run. Confirm its provenance.
df_1 = pd.read_csv('sent_predicted.csv')
df_1_timestamps = pd.to_datetime(df_1['Date'])
df_1['Date'] = df_1_timestamps.dt.date

In [83]:
# Merge all labelled and predicted frames into one table with a fresh 0..n-1 index.
# NOTE(review): df and df_1 carry datetime.date objects in 'Date', while the yearly
# frames keep whatever the archive CSVs contained. Confirm the formats agree.
all_sentiment_frames = [df, df_1, df_2008, df_2009, df_2010, df_2011]
concatenated_df = pd.concat(all_sentiment_frames, ignore_index=True)

In [84]:
# Display the combined frame as the cell output for a visual sanity check.
concatenated_df

Unnamed: 0,Date,Title,Sentiment
0,2010-01-04,Global Stocks and Commodities Rally on First T...,positive
1,2010-01-04,Dollar Slumps Amid Worldwide Manufacturing Imp...,negative
2,2010-01-04,Oil Prices Surge Above $81 a Barrel Due to U.S...,negative
3,2010-01-04,"S&P 500 Sees 1.6 Percent Increase, Hits 15-Mon...",positive
4,2010-01-04,"Argentina's Merval Index Reaches Record High, ...",positive
...,...,...,...
10157,2011-12-27,iraq in 2011: troop departures and power strug...,negative
10158,2011-12-28,"deeply hated, but present: a u.s. touch at kim...",negative
10159,2011-12-29,space plan from china broadens challenge to u....,negative
10160,2011-12-30,major shale gas basins as spain acts to cut de...,negative


In [85]:
# Persist the combined sentiment table; the index is omitted from the file.
combined_csv_path = 'concatenated_sentiment_f.csv'
concatenated_df.to_csv(combined_csv_path, index=False)