In [2]:
import spacy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Load Dataset

In [3]:
columns = ['id','country','Sentiment','tweets']

In [4]:
df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',names= columns)

In [5]:
df.head()

Unnamed: 0,id,country,Sentiment,tweets
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
df.tail()

Unnamed: 0,id,country,Sentiment,tweets
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...
74681,9200,Nvidia,Positive,Just like the windows partition of my Mac is l...


In [7]:
df.shape

(74682, 4)

# Exploration Dataset

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   country    74682 non-null  object
 2   Sentiment  74682 non-null  object
 3   tweets     73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [9]:
df.describe()

Unnamed: 0,id
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [10]:
df.isnull().sum()

id             0
country        0
Sentiment      0
tweets       686
dtype: int64

In [11]:
df['Sentiment'].value_counts()

Sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

# Preprocessing

In [12]:
# Remove rows with missing Text values
df.dropna(subset=['tweets'], inplace=True)


In [13]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

In [14]:
# use this utility function to get the preprocessed text data
def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [15]:
df['Preprocessed Tweets'] = df['tweets'].apply(preprocess) 

In [16]:
df

Unnamed: 0,id,country,Sentiment,tweets,Preprocessed Tweets
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,m get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,m get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,m come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,m get borderland 2 murder
...,...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...,realize Windows partition Mac like 6 year Nvid...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...,realize Mac window partition 6 year Nvidia dri...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...,realize window partition Mac 6 year Nvidia dri...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...,realize window partition Mac like 6 year Nvidi...


# Label Encoding

In [17]:
label_encoding = LabelEncoder()

In [18]:
df['Encoded_Sentiment'] = label_encoding.fit_transform(df['Sentiment'])

In [19]:
# Check the mapping of encoded labels
print(list(label_encoding.classes_))

['Irrelevant', 'Negative', 'Neutral', 'Positive']


In [20]:
# Split the dataset into training and testing sets
X = df['Preprocessed Tweets']
y = df['Encoded_Sentiment']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [22]:
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (59196,)
Testing data shape: (14800,)


# Models

---

## Random Forest model

In [23]:
rf_pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
rf_pipeline.fit(X_train, y_train)

In [24]:
rf_pred = rf_pipeline.predict(X_test)

In [25]:
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Random Forest Accuracy: 0.9115540540540541


In [26]:
print(classification_report(y_test, rf_pred, target_names=label_encoding.classes_))


              precision    recall  f1-score   support

  Irrelevant       0.96      0.86      0.91      2575
    Negative       0.92      0.93      0.93      4472
     Neutral       0.94      0.89      0.91      3622
    Positive       0.85      0.94      0.90      4131

    accuracy                           0.91     14800
   macro avg       0.92      0.91      0.91     14800
weighted avg       0.91      0.91      0.91     14800



---

## Naive Bayse 

In [27]:
nb_pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [28]:
nb_pipeline.fit(X_train, y_train)

In [29]:
nb_pred = nb_pipeline.predict(X_test)

In [30]:
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_pred))

Naive Bayes Accuracy: 0.7312837837837838


In [31]:
print(classification_report(y_test, nb_pred, target_names=label_encoding.classes_))

              precision    recall  f1-score   support

  Irrelevant       0.95      0.46      0.62      2575
    Negative       0.65      0.90      0.76      4472
     Neutral       0.84      0.63      0.72      3622
    Positive       0.71      0.81      0.76      4131

    accuracy                           0.73     14800
   macro avg       0.79      0.70      0.71     14800
weighted avg       0.77      0.73      0.72     14800



---

## XGboost Model

In [32]:
import xgboost as xgb

# Build a pipeline with TfidfVectorizer and XGBoost
xgb_pipeline = make_pipeline(TfidfVectorizer(), xgb.XGBClassifier(eval_metric='mlogloss'))

In [33]:
xgb_pipeline.fit(X_train, y_train)

In [34]:
# Make predictions and evaluate the model
xgb_pred = xgb_pipeline.predict(X_test)

In [35]:
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))

XGBoost Accuracy: 0.6462162162162162


In [37]:
print(classification_report(y_test, xgb_pred, target_names=label_encoding.classes_))


              precision    recall  f1-score   support

  Irrelevant       0.74      0.35      0.47      2575
    Negative       0.59      0.84      0.69      4472
     Neutral       0.71      0.58      0.64      3622
    Positive       0.66      0.68      0.67      4131

    accuracy                           0.65     14800
   macro avg       0.67      0.61      0.62     14800
weighted avg       0.66      0.65      0.64     14800



---

## Gradient Boosting 

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

# Build a pipeline with TfidfVectorizer and Gradient Boosting Classifier
gb_pipeline = make_pipeline(TfidfVectorizer(), GradientBoostingClassifier())

In [39]:
gb_pipeline.fit(X_train, y_train)


In [40]:
# Make predictions and evaluate the model
gb_pred = gb_pipeline.predict(X_test)

In [41]:
print("Gradient Boosting Classifier Accuracy:", accuracy_score(y_test, gb_pred))

Gradient Boosting Classifier Accuracy: 0.5354054054054054


In [42]:
print(classification_report(y_test, gb_pred, target_names=label_encoding.classes_))


              precision    recall  f1-score   support

  Irrelevant       0.67      0.17      0.27      2575
    Negative       0.47      0.82      0.60      4472
     Neutral       0.62      0.44      0.51      3622
    Positive       0.59      0.54      0.56      4131

    accuracy                           0.54     14800
   macro avg       0.59      0.49      0.49     14800
weighted avg       0.58      0.54      0.51     14800



---

---

# Test Model

In [43]:
test_df = pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv', names=columns)
test_df.head()

Unnamed: 0,id,country,Sentiment,tweets
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [44]:
test_text = test_df['tweets'][10]
print(f"{test_text} ===> {test_df['Sentiment'][10]}")

The professional dota 2 scene is fucking exploding and I completely welcome it.

Get the garbage out. ===> Positive


In [45]:
test_text_processed = [preprocess(test_text)]
test_text_processed

['professional dota 2 scene fucking explode completely welcome \n\n garbage']

In [47]:
test_text = rf_pipeline.predict(test_text_processed)


In [48]:
classes = ['Irrelevant', 'Natural', 'Negative', 'Positive']

print(f"True Label: {test_df['Sentiment'][10]}")
print(f'Predict Label: {classes[test_text[0]]}')

True Label: Positive
Predict Label: Positive
