In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the 'punkt_tab' tokenizer package into the local NLTK data directory.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Exploratory Data Analysis

In [7]:
# Column names are supplied by hand through `names=` rather than read from the file.
# NOTE(review): the 'country' values in the output below are names such as "Borderlands",
# not countries, so the column name looks like a misnomer; confirm before renaming.
columns = ['id','country','Label','Text']
# NOTE(review): "/content/..." is an absolute path; the CSV must exist at exactly this location.
data=pd.read_csv(r"/content/twitter_training.csv",names=columns)
data.head()



Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [8]:
# Summarize column dtypes, non-null counts and memory usage of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   Label    74682 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [9]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
id,0
country,0
Label,0
Text,686


In [10]:
# Drop every row with a missing value in any column, modifying `data` in place.
# The surviving rows keep their original index labels (the index is not reset).
data.dropna(inplace=True)

In [11]:
# Re-check after the drop: every column should now report zero missing values.
data.isnull().sum()

Unnamed: 0,0
id,0
country,0
Label,0
Text,0


In [12]:
# Number of rows per sentiment label (class-balance check).
data['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Negative,22358
Positive,20655
Neutral,18108
Irrelevant,12875


# Data Preprocessing

In [15]:
# NLTK packages used by the preprocessing function: tokenizer data, stop-word list, WordNet.
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Lazily-built NLTK objects shared by every preprocess_nltk call.
_NLTK_RESOURCES = {}


def _nltk_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        _NLTK_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
    return _NLTK_RESOURCES["lemmatizer"], _NLTK_RESOURCES["stop_words"]


def preprocess_nltk(text):
    """Normalize one piece of text before it is vectorized.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stop words are discarded; the remaining
    tokens are lemmatized and joined with single spaces.

    Parameters
    ----------
    text : str or None
        Raw text. A missing value (``None`` or a float NaN, which is how pandas
        represents an empty text cell) is treated as empty text.

    Returns
    -------
    str
        The space-joined lemmatized tokens; ``""`` for missing or empty input.

    Raises
    ------
    AttributeError
        If ``text`` is any other non-string object (it has no ``.lower()``).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # The stop-word list and the lemmatizer are built once and reused: loading
    # them on every call is wasted work when this runs over a whole column.
    lemmatizer, stop_words = _nltk_resources()

    tokens = word_tokenize(text.lower())  # Tokenization
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(kept_tokens)

# Clean every tweet once up front; this new column is what the models are trained on.
data['Preprocessed Text'] = data['Text'].apply(preprocess_nltk)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# Label encoding: replace the string labels with integer codes, overwriting 'Label' in place.
# LabelEncoder numbers the classes in sorted order, so for the four labels counted earlier:
#   0 = Irrelevant, 1 = Negative, 2 = Neutral, 3 = Positive.
# NOTE(review): re-running this cell would refit `le` on the integer codes, and the original
# class names would no longer be available from `le.classes_`.

le=LabelEncoder()
data["Label"]=le.fit_transform(data["Label"])

In [19]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the class mix the
# same in both splits, and the fixed seed makes the split repeatable.
# NOTE(review): the head() output shows several near-identical tweets sharing one id; a
# row-level random split may place such variants on both sides — confirm whether a
# split grouped by id is needed.
features = data["Preprocessed Text"]
targets = data["Label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

-> Stratify: ensures the proportion of each label in `Label` is the same in both the training and testing datasets.

In [20]:
# Sanity-check the sizes of the two splits.
X_train.shape,X_test.shape

((59196,), (14800,))

# Making Model 0__0

In [25]:
# Baseline model: TF-IDF features fed into a multinomial Naive Bayes classifier.
nb_steps = [
    ("vectorizer", TfidfVectorizer()),
    ("naive_bayes", MultinomialNB()),
]
pipe = Pipeline(nb_steps)
pipe

In [26]:
# Train the Naive Bayes pipeline on the training split.
pipe.fit(X_train,y_train)

In [28]:
# Predict encoded labels for the held-out test split.
y_pred=pipe.predict(X_test)

In [29]:
# Overall accuracy of the Naive Bayes baseline. Arguments follow scikit-learn's
# (y_true, y_pred) order, matching the classification_report call.
accuracy_score(y_test, y_pred)

0.7146621621621622

In [31]:
# Per-class precision / recall / F1 for the Naive Bayes baseline.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.41      0.57      2575
           1       0.65      0.89      0.75      4472
           2       0.81      0.61      0.70      3622
           3       0.69      0.81      0.74      4131

    accuracy                           0.71     14800
   macro avg       0.77      0.68      0.69     14800
weighted avg       0.75      0.71      0.70     14800



## Checking with RandomForest o.O

In [33]:
# Second model: TF-IDF features fed into a random forest.
# The forest is seeded (same seed as the train/test split) so that a fresh
# Restart & Run All trains the same model and reports the same scores.
pipe2=Pipeline([
    ("vectorizer",TfidfVectorizer()),
    ("mo",RandomForestClassifier(random_state=42)),
])
pipe2

In [34]:
# Train the random-forest pipeline on the same training split.
pipe2.fit(X_train,y_train)

In [35]:
# Predict on the held-out test split.
# NOTE: `y_pred` is reused, so the Naive Bayes predictions are overwritten here.
y_pred=pipe2.predict(X_test)

In [36]:
# Only a cell's last expression is displayed automatically, so the accuracy has to be
# printed explicitly to appear alongside the per-class report.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.85      0.90      2575
           1       0.92      0.92      0.92      4472
           2       0.86      0.92      0.89      3622
           3       0.90      0.91      0.91      4131

    accuracy                           0.91     14800
   macro avg       0.91      0.90      0.91     14800
weighted avg       0.91      0.91      0.91     14800



# Testing using validation data ^___^

In [42]:
# Load the separate validation file using the same hand-written column names as the
# training data.
# NOTE(review): "/content/..." is an absolute path; the CSV must exist at exactly this location.
test=pd.read_csv(r"/content/twitter_validation.csv",names=columns)
test.head()

Unnamed: 0,id,country,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [44]:
# Pick the validation row labelled 10 and show its raw text next to its true label.
test_text = test['Text'][10]
print(f"{test_text} ===> {test['Label'][10]}")

The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out. ===> Positive


In [45]:
# Apply the same preprocessing used for the training text. The result is wrapped in a
# one-element list, which is what gets passed to predict() in the next cell.
test_text_processed = [preprocess_nltk(test_text)]
test_text_processed


['professional dota 2 scene fucking exploding completely welcome get garbage']

In [47]:
# Predict the encoded label for the single preprocessed tweet.
# NOTE(review): this rebinds `test_text` from the raw tweet string to the prediction array,
# so the preprocessing cell above cannot be re-run without first re-selecting the tweet.
test_text = pipe2.predict(test_text_processed)


In [48]:
# classes[i] must be the label that the training LabelEncoder coded as i. The encoder
# sorts class names (Irrelevant, Negative, Neutral, Positive), so the names are read
# from the fitted encoder `le` instead of being typed out by hand, which keeps the
# order and spelling in sync with the training labels.
classes = list(le.classes_)

print(f"True Label: {test['Label'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')


True Label: Positive
Predict Label: Positive


In [49]:
# Preprocess every validation tweet with the same function used for the training text.
test["Preprocessed Text"]=test["Text"].apply(preprocess_nltk)

In [50]:
# Predict encoded labels for the whole validation set with the random-forest pipeline.
y_predd=pipe2.predict(test["Preprocessed Text"])

In [55]:
# Shape check: number of predictions produced.
y_predd.shape

(1000,)

In [57]:
# Shape check: number of true labels; should match the number of predictions.
test["Label"].shape

(1000,)

In [51]:
# Encode the validation labels with the encoder that was fitted on the training labels.
# Fitting a second LabelEncoder here would derive the codes from whichever classes occur
# in the validation file, so they would match the model's codes only by coincidence.
# `le_model` is kept as an alias of `le` so existing references to that name still work.
le_model = le
test['Label'] = le_model.transform(test['Label'])


In [60]:
# Score the random forest on the validation file: overall accuracy, then per-class metrics.
# Arguments follow scikit-learn's (y_true, y_pred) order in both calls.
# NOTE(review): the validation accuracy printed below is higher than the held-out test
# accuracy above; worth checking whether validation tweets also occur in the training file.
print(accuracy_score(test["Label"], y_predd))
print(classification_report(test["Label"],y_predd))

0.964
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       172
           1       0.96      0.97      0.96       266
           2       0.95      0.97      0.96       285
           3       0.96      0.97      0.97       277

    accuracy                           0.96      1000
   macro avg       0.97      0.96      0.96      1000
weighted avg       0.96      0.96      0.96      1000



In [61]:
import pickle

# Persist the fitted random-forest pipeline to disk.
# NOTE(review): only `pipe2` is pickled; code that loads it presumably has to run
# preprocess_nltk on raw text itself before calling predict — confirm with consumers.
with open('sentiment_analysis_model.pkl', mode='wb') as model_file:
    pickle.dump(pipe2, model_file)
