# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

import spacy

import warnings
warnings.filterwarnings('ignore')

# Explore Data

In [2]:
col = ['id','country','Label','Text']
data = pd.read_csv("/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv", names=col)

In [3]:
data.head()

Unnamed: 0,id,country,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
data.tail()

Unnamed: 0,id,country,Label,Text
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [5]:
data.shape

(74682, 4)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       74682 non-null  int64 
 1   country  74682 non-null  object
 2   Label    74682 non-null  object
 3   Text     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [7]:
data['Label'].value_counts()

Label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [8]:
print(f"{data['Text'][2]} -> {data['Label'][2]}")

im getting on borderlands and i will kill you all, -> Positive


# Preprocessing

In [9]:
data.dropna(inplace=True)

In [10]:
# Preprocess Function
nlp = spacy.load("en_core_web_sm") 
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [11]:
data['Preprocessed Text'] = data['Text'].apply(preprocess) 

In [12]:
data

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [13]:
# Encoding target column
le = LabelEncoder()
data['Label'] = le.fit_transform(data['Label'])

In [14]:
data

Unnamed: 0,id,country,Label,Text,Preprocessed Text
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,3,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,3,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,3,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,3,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,3,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,3,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


In [15]:
# Split data into train and test

x_train, x_test, y_train, y_test = train_test_split(data['Preprocessed Text'], data['Label'], 
                                                    test_size=0.2, random_state=42, stratify=data['Label'])

In [16]:
x_test.shape

(14800,)

In [17]:
x_train.shape

(59196,)

# ML Model

#### Naive Bayes 

In [18]:
# classifier
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (MultinomialNB()))         
])

In [19]:
clf.fit(x_train, y_train)

In [20]:
y_pred = clf.predict(x_test)

In [21]:
print(accuracy_score(y_test, y_pred))

0.7312837837837838


In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.46      0.62      2575
           1       0.65      0.90      0.76      4472
           2       0.84      0.63      0.72      3622
           3       0.71      0.81      0.76      4131

    accuracy                           0.73     14800
   macro avg       0.79      0.70      0.71     14800
weighted avg       0.77      0.73      0.72     14800



#### Random Forest

In [23]:
clf = Pipeline([
    ('vectorizer_tri_grams', TfidfVectorizer()),
    ('naive_bayes', (RandomForestClassifier()))         
])

In [24]:
clf.fit(x_train, y_train)

In [25]:
y_pred = clf.predict(x_test)

In [26]:
print(accuracy_score(y_test, y_pred))

0.9117567567567567


In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      2575
           1       0.92      0.93      0.93      4472
           2       0.93      0.89      0.91      3622
           3       0.85      0.94      0.90      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.91      0.91     14800
weighted avg       0.91      0.91      0.91     14800



# Testing

In [28]:
test_data = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', names=col)

In [29]:
test_data.head()

Unnamed: 0,id,country,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [30]:
test_txt = test_data['Text'][25]
print(f"{test_txt} ===> {test_data['Label'][25]}")

#gtc20 -  nice, motivational, and very accessible Nvidia/AI product fair + related tech talks
nvidia.com/en-us/gtc/keyn…
interesting interaction/social activities: braindates, dinner with strangers, ...  and free attendance for universities: reg.rainfocus.com/flow/nvidia/gt… ===> Neutral


In [31]:
# Apply preprocess

test_txt_processed = [preprocess(test_txt)]
test_txt_processed

['gtc20   nice motivational accessible Nvidia AI product fair + related tech talk \n nvidia.com/en-us/gtc/keyn \n interesting interaction social activity braindate dinner stranger   free attendance university reg.rainfocus.com/flow/nvidia/gt']

In [32]:
# Get Prediction

test_txt = clf.predict(test_txt_processed)

In [33]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_data['Label'][25]}")
print(f'Predict Label: {classes[test_txt[0]]}')

True Label: Neutral
Predict Label: Negative
