<a href="https://colab.research.google.com/github/Gopalkrishna002/Sentiment-analysis/blob/main/coop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
data = pd.read_csv(r"/content/twitter_training.csv",names=['dummy', 'country', 'label', 'text'])
data

Unnamed: 0,dummy,country,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
53822,2048,CallOfDuty,Negative,@ CfDuty fix their game new omg challenge with...
53823,2048,CallOfDuty,Negative,@CallofDuty fix your game new omg challenge re...
53824,2048,CallOfDuty,Negative,Dear @CallofDuty fix your stupid game new omg ...
53825,2048,CallOfDuty,Negative,@CallofDuty fix your game new omg challenge cu...


In [4]:
data.head()

Unnamed: 0,dummy,country,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
data.drop(['dummy'], axis=1, inplace=True,)

In [7]:
data.isna().any()

country    False
label      False
text        True
dtype: bool

In [6]:
data.isna().sum()

country      0
label        0
text       492
dtype: int64

In [8]:
data.dropna(inplace=True)

In [9]:
data.text.duplicated().any()

True

In [10]:
data[data.text == "from"]

Unnamed: 0,country,label,text
5231,Amazon,Neutral,from
6479,Amazon,Negative,from
34961,Microsoft,Negative,from
53513,RedDeadRedemption(RDR),Positive,from


In [11]:
data.label.value_counts()

Positive      15960
Negative      14784
Neutral       12732
Irrelevant     9859
Name: label, dtype: int64

In [12]:
data.country.value_counts()

LeagueOfLegends                      2377
Verizon                              2365
Microsoft                            2361
Dota2                                2359
WorldOfCraft                         2357
ApexLegends                          2353
NBA2K                                2343
CallOfDutyBlackopsColdWar            2343
FIFA                                 2324
Overwatch                            2316
Battlefield                          2316
HomeDepot                            2292
PlayStation5(PS5)                    2291
Hearthstone                          2286
CS-GO                                2284
Xbox(Xseries)                        2283
Borderlands                          2280
Amazon                               2276
Google                               2274
Fortnite                             2249
RedDeadRedemption(RDR)               2249
PlayerUnknownsBattlegrounds(PUBG)    2234
AssassinsCreed                       2234
CallOfDuty                        

In [13]:
data.text.value_counts()

At the same time, despite the fact that there are currently some 100 million people living below the poverty line, most of them do not have access to health services and do not have access to health care, while most of them do not have access to health care.    120
                                                                                                                                                                                                                                                                      120
It is not the first time that the EU Commission has taken such a step.                                                                                                                                                                                                120
<unk>                                                                                                                                                                                                     

In [14]:
labels_counts = pd.DataFrame(data.label.value_counts())
labels_counts

Unnamed: 0,label
Positive,15960
Negative,14784
Neutral,12732
Irrelevant,9859


In [15]:
nlp = spacy.load("en_core_web_sm")

In [16]:
def preprocess_data(text):
    tokens = nlp(text)
    filtered_token = []
    for token in tokens:
        if token.is_punct or token.is_stop:
            continue
        filtered_token.append(token.lemma_)
    return " ".join(filtered_token)

In [17]:
data['preprocessed_data'] = data.text.apply(preprocess_data)

In [21]:
encoder = LabelEncoder()
data.label = encoder.fit_transform(data.label)

In [22]:
encoder

In [23]:
data.label

0        3
1        3
2        3
3        3
4        3
        ..
53822    1
53823    1
53824    1
53825    1
53826    1
Name: label, Length: 53335, dtype: int64

In [24]:
encoder.classes_

array([0, 1, 2, 3])

In [25]:
X = data['preprocessed_data']
y = data['label']

In [26]:
x_train, x_test, y_train,y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [27]:
v = TfidfVectorizer()
X_train_normalized = v.fit_transform(x_train)
X_test_normalized = v.transform(x_test)

In [28]:
nb_clf = MultinomialNB()
nb_clf.fit(X_train_normalized, y_train)

In [29]:
y_pred = nb_clf.predict(X_test_normalized)

In [30]:
print(accuracy_score(y_test, y_pred))

0.7438830036561358


In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.49      0.65      1972
           1       0.70      0.85      0.77      2957
           2       0.86      0.64      0.74      2546
           3       0.67      0.88      0.76      3192

    accuracy                           0.74     10667
   macro avg       0.80      0.72      0.73     10667
weighted avg       0.78      0.74      0.74     10667



In [32]:
rf_clf = RandomForestClassifier(n_estimators=60)
rf_clf.fit(X_train_normalized, y_train)

In [34]:
y_pred = rf_clf.predict(X_test_normalized)

In [33]:
print(accuracy_score(y_test, y_pred))

0.7438830036561358


In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.84      0.90      1972
           1       0.92      0.93      0.92      2957
           2       0.94      0.89      0.92      2546
           3       0.86      0.95      0.90      3192

    accuracy                           0.91     10667
   macro avg       0.92      0.90      0.91     10667
weighted avg       0.91      0.91      0.91     10667



In [36]:
test_data = pd.read_csv("/content/twitter_training.csv", names=['Tweet ID', 'entity', 'label', 'text'])
test_data.head(5)

Unnamed: 0,Tweet ID,entity,label,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [37]:
y_test = test_data.loc[5231].label
x = test_data.loc[5231].text

In [38]:
x

'from'

In [39]:
y_test

'Neutral'

In [40]:
preproc_x = preprocess_data(x)

In [41]:
preproc_x

''

In [42]:
x_testing = v.transform([preproc_x])

In [43]:
x_testing.shape

(1, 21497)

In [44]:
y_pred = rf_clf.predict(x_testing)

In [45]:
print(f"the predicted output is {y_pred} and it corresponds to {encoder.classes_[y_pred]} and\n the true value is {y_test}")

the predicted output is [3] and it corresponds to [3] and
 the true value is Neutral


In [46]:
print(y_test)

Neutral


# New Section