In [37]:
import kagglehub
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

import re
import emoji
import os
import matplotlib.pyplot as plt

import seaborn as sns
import string
import numpy as np
import random
from plotly import graph_objs as go

In [38]:
# Download the tweet-sentiment dataset through kagglehub and report where the files are
dataset_handle = "yasserh/twitter-tweets-sentiment-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Nicolás\.cache\kagglehub\datasets\yasserh\twitter-tweets-sentiment-dataset\versions\1


In [39]:
# Locate the CSV inside the download directory
csv_name = "Tweets.csv"
file_path = os.path.join(path, csv_name)

# Load it into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# DATA CLEANING

**Dropping columns**

To predict the sentiment behind the tweet using BOW and Bayesian Probability, we will have to drop some columns:
- `textID`: Unique identifier for each tweet. It carries no information about the sentiment, so we can drop the column.

- `selected_text`: Using this column would be a bit like cheating, since we are precisely trying to predict which words are most related to each sentiment and `selected_text` already gives us that information. Although we can use the column later on to compare the results of our prediction with the selected text in the dataset, we will drop it for now.

In [40]:
# Remove the identifier and the pre-selected phrase columns
columns_to_drop = ['textID', 'selected_text']
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


**Quick look at our data**

Let's take a quick look at how the tweets are distributed across the sentiment classes; we will visualize the data in more detail later on.

In [41]:
# Count the non-missing tweets per sentiment, most frequent class first
tweets_per_sentiment = df.groupby('sentiment')['text'].count()
temp = tweets_per_sentiment.reset_index().sort_values(by='text', ascending=False)
temp.style.background_gradient(cmap='Blues_r')

Unnamed: 0,sentiment,text
1,neutral,11117
2,positive,8582
0,negative,7781


**Mapping labels**

The original target labels are the strings `negative`, `neutral` and `positive`. The classifier needs numeric classes, and I think the most intuitive encoding is 0 for negative, 1 for neutral and 2 for positive, so I map the labels to those values.

In [42]:
# Mapping labels: encode the string sentiments as integers (negative=0, neutral=1, positive=2)
label_mapping = {'negative': 0, 'neutral': 1, 'positive':2}

# Values that are already encoded (e.g. when this cell is re-run) pass through
# unchanged; a plain .map(label_mapping) would silently turn them all into NaN.
encoded_sentiment = df['sentiment'].map(lambda label: label_mapping.get(label, label))

# Fail loudly on anything that is neither a known label nor an already-encoded value,
# instead of carrying NaN targets into the model.
unknown_labels = set(encoded_sentiment.unique()) - set(label_mapping.values())
if unknown_labels:
    raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")

# int64 matches the dtype the dictionary-based mapping produced
df['sentiment'] = encoded_sentiment.astype("int64")

df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


**Cleaning data**

Let's make sure that there aren't any empty cells on our dataset and that we can work with all the data properly.

In [43]:
# Count the tweets whose text is missing (NaN), then replace those gaps with empty strings
missing_text_count = df["text"].isna().sum()
print("There is a total of", missing_text_count, "empty cell")
df["text"] = df["text"].fillna("")

There is a total of 1 empty cell


In [44]:
# Sanity check: count the NaN tweets that remain in the text column
remaining_missing = df["text"].isna().sum()
print("After cleaning the column there are", remaining_missing, "empty cells")

def preprocess_tweet(tweet):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: drop @mentions, #hashtags and URLs; spell emoji out as
    text; remove every character that is not an ASCII letter or whitespace;
    collapse whitespace runs; lowercase and trim the ends.

    Args:
        tweet: The raw tweet. Anything that is not a ``str`` (e.g. NaN) is
            treated as missing.

    Returns:
        The cleaned text, or ``""`` for non-string input or when nothing
        survives the cleaning.
    """
    if not isinstance(tweet, str):  # Invalid values check
        return ""
    tweet = re.sub(r"@\w+", "", tweet)  # Eliminate mentions
    tweet = re.sub(r"#\w+", "", tweet)  # Eliminate hashtags
    tweet = re.sub(r"http\S+|www\S+", "", tweet)  # Eliminate URL's
    # Space delimiters (instead of the default ":") keep an emoji's name from
    # fusing with the neighbouring word once punctuation is stripped below.
    tweet = emoji.demojize(tweet, delimiters=(" ", " "))  # Convert emoji to text
    tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)  # Eliminate special characters
    # The removals above leave gaps behind; collapse them to single spaces
    tweet = re.sub(r"\s+", " ", tweet)
    return tweet.lower().strip()  # Eliminate uppercase and surrounding spaces

# Clean every tweet and keep the result in a new column alongside the raw text
df['cleaned_text'] = df["text"].map(preprocess_tweet)

After cleaning the column there are 0 empty cells


In [45]:
# cleaned_text now holds the processed version of every tweet, so the raw text column can go
df = df.drop(columns='text')
df.head()

Unnamed: 0,sentiment,cleaned_text
0,1,id have responded if i were going
1,0,sooo sad i will miss you here in san diego
2,0,my boss is bullying me
3,0,what interview leave me alone
4,0,sons of why couldnt they put them on the rele...


In [46]:
# Some tweets end up blank after cleaning; count them before removing them
is_blank = df["cleaned_text"].eq("")
empty_count = int(is_blank.sum())
print(f"Number of rows with empty text: {empty_count}")

Number of rows with empty text: 7


In [47]:
# Keep only the rows whose cleaned text is not blank
has_text = df["cleaned_text"].ne("")
df = df.loc[has_text]

# Confirm that no blank rows are left
empty_count = int(df["cleaned_text"].eq("").sum())
print(f"Number of rows with empty text: {empty_count}")

Number of rows with empty text: 0


**Data visualization**

I will do a little visualization of the data here, but the "big" part will come afterwards, once I have done the predictions, to compare the results of my analysis with the column `selected_text` in the original dataset.

In [48]:
# Distribution of sentiments in the dataset
# Recount from the current frame rather than reusing the `temp` table: `temp` was
# built before the blank tweets were dropped, so it no longer matches the data
# that is actually modelled.
sentiment_names = {0: "negative", 1: "neutral", 2: "positive"}
sentiment_counts = df["sentiment"].value_counts()  # sorted by count, largest first

fig = go.Figure(go.Funnelarea(
    # Show readable class names; fall back to the raw value if it is not an encoded label
    text = [sentiment_names.get(label, label) for label in sentiment_counts.index],
    values = sentiment_counts.values,
    title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
    ))
fig.show()

# CLASSIFICATION

Now we will divide our data into train and test and start training our model to be able to predict.

In [49]:
# Hold out 20% of the tweets for evaluation; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["sentiment"], test_size=0.2, random_state=42)

# Bag-of-words features: the vocabulary is learned on the training split only
# and then reused, unchanged, to encode the test split
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the word counts
model = MultinomialNB()
model.fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)

# Pin the label order explicitly so the class names (and the confusion-matrix
# rows/columns) stay aligned with 0/1/2 even if a class is missing from a split
class_labels = [0, 1, 2]
class_names = ["Negative", "Neutral", "Positive"]

print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_labels))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.59      0.63      1538
     Neutral       0.60      0.65      0.63      2237
    Positive       0.70      0.70      0.70      1720

    accuracy                           0.65      5495
   macro avg       0.66      0.65      0.65      5495
weighted avg       0.65      0.65      0.65      5495

Confusion Matrix:
[[ 910  531   97]
 [ 370 1462  405]
 [  85  439 1196]]
