**SENTIMENT ANALYSIS**

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

# Assuming the dataset is a CSV file, load it into a Pandas DataFrame
df = pd.read_csv(path + '/twitter_training.csv') # Load data into a dataframe, assuming your csv is in the default dataset path


Downloading from https://www.kaggle.com/api/v1/datasets/download/jp797498e/twitter-entity-sentiment-analysis?dataset_version_number=2...


100%|██████████| 1.99M/1.99M [00:01<00:00, 1.81MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


In [None]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


Deleting unwanted columns

In [None]:
df.drop(columns = ['2401'], inplace = True)

In [None]:
df.head()

Unnamed: 0,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


Renaming columns

In [None]:
df.columns

Index(['Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [None]:
df.rename(columns={'Borderlands' :'Game_name',
            'Positive' : 'Sentiment',
           'im getting on borderlands and i will murder you all ,' : 'comment'}, inplace = True)

In [None]:
df.head()

Unnamed: 0,Game_name,Sentiment,comment
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


Handling missing values

In [None]:
df.isna().sum()

Unnamed: 0,0
Game_name,0
Sentiment,0
comment,686


In [None]:
df.shape

(74681, 3)

In [None]:
df = df.dropna()

In [None]:
df.isna().sum()

Unnamed: 0,0
Game_name,0
Sentiment,0
comment,0


Creating a new dataset

In [None]:
df.Sentiment.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Negative,22358
Positive,20654
Neutral,18108
Irrelevant,12875


In [None]:
df_positive = df[df.Sentiment == 'Positive'].iloc[:2000]
df_negative = df[df.Sentiment == 'Negative'].iloc[:2000]
df_neutral = df[df.Sentiment == 'Neutral'].iloc[:2000]
df_irr = df[df.Sentiment == 'Irrelevant'].iloc[:2000]

In [None]:
df2 = pd.concat([df_positive, df_negative, df_neutral, df_irr],axis=0)

In [None]:
df2.shape

(8000, 3)

In [None]:
df2.Sentiment.value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Positive,2000
Negative,2000
Neutral,2000
Irrelevant,2000


In [None]:
df2.head()

Unnamed: 0,Game_name,Sentiment,comment
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


Categorical to numerical data conversion

In [None]:
df2.Game_name.unique()

array(['Borderlands', 'CallOfDutyBlackopsColdWar', 'Amazon', 'Overwatch',
       'Xbox(Xseries)'], dtype=object)

In [None]:
game_df = pd.get_dummies(df2['Game_name']).astype('int')

In [None]:
game_df

Unnamed: 0,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
10628,0,0,0,0,1
10629,0,0,0,0,1
10630,0,0,0,0,1
10631,0,0,0,0,1


In [None]:
df2.drop(columns = ['Game_name'], inplace = True)

In [None]:
df2.head()

Unnamed: 0,Sentiment,comment
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [None]:
df2.shape , game_df.shape

((8000, 2), (8000, 5))

Text preprocessing

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
def lemmatization(text):
    doc = nlp(text)
    lemmalist = [word.lemma_ for word in doc]
    return " ".join([token.lemma_ for token in doc])

In [None]:
df2['lemma'] = df2['comment'].apply(lemmatization)

In [None]:
df2.head()

Unnamed: 0,Sentiment,comment,lemma
0,Positive,I am coming to the borders and I will kill you...,"I be come to the border and I will kill you all ,"
1,Positive,im getting on borderlands and i will kill you ...,"I m get on borderland and I will kill you all ,"
2,Positive,im coming on borderlands and i will murder you...,I m come on borderland and I will murder you a...
3,Positive,im getting on borderlands 2 and i will murder ...,I m get on borderland 2 and I will murder you ...
4,Positive,im getting into borderlands and i can murder y...,I m get into borderland and I can murder you a...


In [None]:
def remove_stopwords(text):
    doc = nlp(text)
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)

In [None]:
df2['final'] = df2['lemma'].apply(remove_stopwords)

In [None]:
df2.head()

Unnamed: 0,Sentiment,comment,lemma,final
0,Positive,I am coming to the borders and I will kill you...,"I be come to the border and I will kill you all ,","come border kill ,"
1,Positive,im getting on borderlands and i will kill you ...,"I m get on borderland and I will kill you all ,","m borderland kill ,"
2,Positive,im coming on borderlands and i will murder you...,I m come on borderland and I will murder you a...,"m come borderland murder ,"
3,Positive,im getting on borderlands 2 and i will murder ...,I m get on borderland 2 and I will murder you ...,"m borderland 2 murder ,"
4,Positive,im getting into borderlands and i can murder y...,I m get into borderland and I can murder you a...,"m borderland murder ,"


In [None]:
df2.drop(columns = ['comment', 'lemma'], inplace = True)

In [None]:
df2.head()

Unnamed: 0,Sentiment,final
0,Positive,"come border kill ,"
1,Positive,"m borderland kill ,"
2,Positive,"m come borderland murder ,"
3,Positive,"m borderland 2 murder ,"
4,Positive,"m borderland murder ,"


In [None]:
sentiment_df = pd.concat([df2, game_df], axis = 1)

In [None]:
sentiment_df.head()

Unnamed: 0,Sentiment,final,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,Positive,"come border kill ,",0,1,0,0,0
1,Positive,"m borderland kill ,",0,1,0,0,0
2,Positive,"m come borderland murder ,",0,1,0,0,0
3,Positive,"m borderland 2 murder ,",0,1,0,0,0
4,Positive,"m borderland murder ,",0,1,0,0,0


In [None]:
df2.shape

(8000, 2)

In [None]:
df2.isna().sum()

Unnamed: 0,0
Sentiment,0
final,0


Dividing the dataset

In [None]:
x = sentiment_df.drop(columns = ['Sentiment'])
y = sentiment_df['Sentiment']

In [None]:
x.head()

Unnamed: 0,final,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries)
0,"come border kill ,",0,1,0,0,0
1,"m borderland kill ,",0,1,0,0,0
2,"m come borderland murder ,",0,1,0,0,0
3,"m borderland 2 murder ,",0,1,0,0,0
4,"m borderland murder ,",0,1,0,0,0


TFIDF Vectorizer

In [None]:
tfidf = TfidfVectorizer()

In [None]:
tfidf.fit(x['final'])

In [None]:
tfidf_df = tfidf.transform(x['final'])

In [None]:
tfidf_matrix = tfidf.transform(x['final']).toarray()

In [None]:
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
vectorizer_df = pd.DataFrame(tfidf_matrix, columns=tfidf.get_feature_names_out())

In [None]:
vectorizer_df.head()

Unnamed: 0,00,000,01,02,03,03573057,0359873057,04,05,06,...,zxxxvids,zyfapoihpy,ееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееее,есть,июля,сетью,третьарце,اللعبه,حبيت,خلاص
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
x.drop(columns=['final'], inplace=True)

In [None]:
x.shape, vectorizer_df.shape

((8000, 5), (8000, 6720))

In [None]:
x = pd.concat([x, vectorizer_df], axis = 1)

In [None]:
x.head()

Unnamed: 0,Amazon,Borderlands,CallOfDutyBlackopsColdWar,Overwatch,Xbox(Xseries),00,000,01,02,03,...,zxxxvids,zyfapoihpy,ееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееееее,есть,июля,сетью,третьарце,اللعبه,حبيت,خلاص
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
x.isna().sum()

Unnamed: 0,0
Amazon,952
Borderlands,952
CallOfDutyBlackopsColdWar,952
Overwatch,952
Xbox(Xseries),952
...,...
сетью,952
третьарце,952
اللعبه,952
حبيت,952


In [None]:
x = x.fillna(0)

In [None]:
x.isna().sum()

Unnamed: 0,0
Amazon,0
Borderlands,0
CallOfDutyBlackopsColdWar,0
Overwatch,0
Xbox(Xseries),0
...,...
сетью,0
третьарце,0
اللعبه,0
حبيت,0


Train - Test split

In [None]:
x.shape , y.shape

((8952, 6725), (8000,))

In [None]:
x = x.iloc[:8000]

In [None]:
x.shape , y.shape

((8000, 6725), (8000,))

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((6400, 6725), (1600, 6725), (6400,), (1600,))

Building the model

In [None]:
model = RandomForestClassifier(n_jobs = 1,
                               random_state = 42,
                               n_estimators = 200)
model.fit(x_train, y_train)

In [None]:
pred = model.predict(x_test)

In [None]:
y_test[7:11]

Unnamed: 0,Sentiment
2674,Positive
210,Neutral
7657,Negative
3704,Positive


In [None]:
pred[7:11]

array(['Positive', 'Neutral', 'Negative', 'Positive'], dtype=object)

In [None]:
accuracy_score(y_test, pred) *100

71.0

In [None]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

  Irrelevant       0.73      0.73      0.73       399
    Negative       0.73      0.59      0.65       379
     Neutral       0.70      0.74      0.72       397
    Positive       0.69      0.78      0.73       425

    accuracy                           0.71      1600
   macro avg       0.71      0.71      0.71      1600
weighted avg       0.71      0.71      0.71      1600

