In [89]:
import kagglehub
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

import re
import emoji
import os
import matplotlib.pyplot as plt
import random

import seaborn as sns
import string
import numpy as np
import random
from plotly import graph_objs as go

In [90]:
path = kagglehub.dataset_download("yasserh/twitter-tweets-sentiment-dataset")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Nicolás\.cache\kagglehub\datasets\yasserh\twitter-tweets-sentiment-dataset\versions\1


In [91]:
file_path = os.path.join(path, "Tweets.csv")

original_df = pd.read_csv(file_path, encoding="latin1")
original_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# DATA CLEANING

**Dropping columns**

To predict the sentiment behind the tweet using BOW and Bayesian Probability, we will have to drop some columns:
- `textID`: Unique identificator for each tweet, it doesn't add any info to the sentiment so we can drop the column

- `selected_text`: Using this column would be a little bit of cheating as we are precisely trying to predict which words are the most related to each sentiment and selected_text is already giving us that info. Although we can use the column later on to compare the results of our prediction with the selected text in the dataset, we will drop if for now.

In [92]:
df = original_df.drop(['textID', 'selected_text'], axis=1)
df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


**Quick look at our data**

Let's take a little look into the way data is organized in our dataset, we will visualize better our data later on.

In [93]:
temp = df.groupby('sentiment').count()['text'].reset_index().sort_values(by='text',ascending=False)
temp.style.background_gradient(cmap='Blues_r')

Unnamed: 0,sentiment,text
1,neutral,11117
2,positive,8582
0,negative,7781


**Mapping labels**

Original target labels are 4 for positive, 2 for neutral and 0 for negative, but i think it would be easier to interprate and more intuitive if i changed the labels to 2 for positive, 1 for neutral and 0 for negative

In [94]:
# Mapping labels
label_mapping = {'negative': 0, 'neutral': 1, 'positive':2}
df['sentiment'] = df['sentiment'].map(label_mapping)

df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


**Cleaning data**

Let's make sure that there aren't any empty cells on our dataset and that we can work with all the data properly.

In [95]:
# Let's see if there are any NaN's in our tweets and treat them in case there are
print("There is a total of", df["text"].isna().sum(), "NaN's")
df["text"] = df["text"].fillna("")

There is a total of 1 NaN's


In [96]:
print("After cleaning the column there are", df["text"].isna().sum(), "NaN's")

def preprocess_tweet(tweet):
    if not isinstance(tweet, str):  # Invalid values check
        return ""
    tweet = re.sub(r"@\w+", "", tweet)  # Eliminate mentions
    tweet = re.sub(r"#\w+", "", tweet)  # Eliminate hashtags
    tweet = re.sub(r"http\S+|www\S+", "", tweet)  # Eliminate URL's
    tweet = emoji.demojize(tweet)  # Convert emoji to text
    tweet = re.sub(r"[^a-zA-Z\s]", "", tweet)  # Eliminate special characters
    tweet = tweet.lower().strip()  # Eliminate uppercase and spaces
    return tweet

df['cleaned_text'] = df["text"].apply(preprocess_tweet)


After cleaning the column there are 0 NaN's


In [97]:
empty_count = df[df["cleaned_text"] == ""].shape[0]
print(f"Number of rows with empty text: {empty_count}")

df = df[df["cleaned_text"] != ""]
empty_count = df[df["cleaned_text"] == ""].shape[0]
print(f"After dropping the rows there are {empty_count} with empty text")

Number of rows with empty text: 7
After dropping the rows there are 0 with empty text


In [98]:
# We don't need the column text anymore as cleaned_text has all the important and clean information from that column so we drop it
df = df.drop('text', axis=1)
df.head()

Unnamed: 0,sentiment,cleaned_text
0,1,id have responded if i were going
1,0,sooo sad i will miss you here in san diego
2,0,my boss is bullying me
3,0,what interview leave me alone
4,0,sons of why couldnt they put them on the rele...


**Data visualization**

I will do a little visualization of the data but the "big" part will come afterwards, once i have done the predictions, to compare the results of my analysis with the column `selected_text` in the original dataset.

In [99]:
# Distribution of sentiments in the dataset
fig = go.Figure(go.Funnelarea(
    text =temp.sentiment,
    values = temp.text,
    title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
    ))
fig.show()

# CLASSIFICATION

Now we will divide our data into train and test and start training our model to be able to predict.

In [100]:
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_text"], df["sentiment"], test_size=0.2, random_state=42)

vectorizer = CountVectorizer(max_features=10000, stop_words="english")
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_bow, y_train)

y_pred = model.predict(X_test_bow)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.59      0.63      1538
     Neutral       0.60      0.65      0.63      2237
    Positive       0.70      0.70      0.70      1720

    accuracy                           0.65      5495
   macro avg       0.66      0.65      0.65      5495
weighted avg       0.65      0.65      0.65      5495

Confusion Matrix:
[[ 910  531   97]
 [ 370 1462  405]
 [  85  439 1196]]


In [101]:
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary)

['aaaaaa' 'aaaaaaaaaaa' 'aaaaaaaaaahhhhhhhh' ... 'zulu' 'zuluxhosa' 'zzzz']


In [102]:
log_probs = model.feature_log_prob_
print(log_probs)

[[-10.76774788 -10.76774788 -10.0746007  ... -10.76774788 -10.76774788
  -10.0746007 ]
 [-10.27194284 -10.27194284 -10.96509002 ... -10.27194284 -10.27194284
  -10.96509002]
 [-10.87066149 -10.87066149 -10.87066149 ... -10.17751431 -10.87066149
  -10.17751431]]


In [103]:
n_keywords = 10

for i, label in enumerate(["Negative", "Neutral", "Positive"]):
    top_indices = np.argsort(log_probs[i])[-n_keywords:]
    top_words = [vocabulary[j] for j in top_indices]
    print(f"\nPalabras clave para la clase '{label}':")
    print(", ".join(top_words))



Palabras clave para la clase 'Negative':
day, going, sorry, work, sad, miss, like, dont, just, im

Palabras clave para la clase 'Neutral':
today, lol, got, work, like, dont, going, day, just, im

Palabras clave para la clase 'Positive':
like, great, thanks, mothers, just, im, happy, love, good, day


In [104]:
#Transformar un ejemplo específico al formato BoW
n = random.randint(0,1000)
example_tweet = X_test.iloc[n]
print(example_tweet)

example_bow = vectorizer.transform([example_tweet])

freakin hawt guys at this restaurantwhere im eating dinner with my father


In [105]:
tweet_indices = example_bow.indices
tweet_words = [vocabulary[i] for i in tweet_indices]
tweet_contributions = {word: log_probs[:, idx] for word, idx in zip(tweet_words, tweet_indices)}

for word, contrib in tweet_contributions.items():
    print(f"Palabra: {word}, Contribuciones (Negative, Neutral, Positive): {contrib}")

Palabra: dinner, Contribuciones (Negative, Neutral, Positive): [-8.20279852 -7.27621056 -6.95863849]
Palabra: eating, Contribuciones (Negative, Neutral, Positive): [-7.72322544 -7.15842753 -7.65178567]
Palabra: father, Contribuciones (Negative, Neutral, Positive): [-9.15830996 -9.57879565 -9.07890203]
Palabra: freakin, Contribuciones (Negative, Neutral, Positive): [-8.28284123 -9.3556521  -9.07890203]
Palabra: guys, Contribuciones (Negative, Neutral, Positive): [-6.83592224 -6.83795563 -6.45182089]
Palabra: hawt, Contribuciones (Negative, Neutral, Positive): [-10.76774788  -9.57879565 -10.17751431]
Palabra: im, Contribuciones (Negative, Neutral, Positive): [-3.87612198 -4.22457066 -4.44903923]


In [106]:
predicted_class = model.predict(example_bow)[0]
keywords = sorted(tweet_contributions.items(), key=lambda x: x[1][predicted_class], reverse=True)

print(f"Palabras clave seleccionadas para el tweet: {[word for word, contrib in keywords[:5]]}")

Palabras clave seleccionadas para el tweet: ['im', 'guys', 'eating', 'dinner', 'freakin']


In [114]:
def extract_keywords(tweet, vectorizer, model, log_probs, top_n=5):
    bow = vectorizer.transform([tweet])
    tweet_indices = bow.indices
    tweet_words = [vocabulary[i] for i in tweet_indices]
    tweet_contributions = {word: log_probs[:, idx] for word, idx in zip(tweet_words, tweet_indices)}
    predicted_class = model.predict(bow)[0]
    sorted_keywords = sorted(tweet_contributions.items(), key=lambda x: x[1][predicted_class], reverse=True)
    top_keywords = {word for word, contrib in sorted_keywords[:top_n]}
    ordered_keywords = [word for word in tweet.split() if word in top_keywords]
    return " ".join(ordered_keywords)

# We apply it to the original dataset and we compare it to the oirignal column
original_df["cleaned_text"] = df["cleaned_text"]
original_df["predicted_selected_text"] = df["cleaned_text"].apply(lambda tweet: extract_keywords(tweet, vectorizer, model, log_probs))
comparison = original_df[["cleaned_text", "selected_text", "predicted_selected_text"]].head(20)
comparison

Unnamed: 0,cleaned_text,selected_text,predicted_selected_text
0,id have responded if i were going,"I`d have responded, if I were going",id going
1,sooo sad i will miss you here in san diego,Sooo SAD,sooo sad miss san diego
2,my boss is bullying me,bullying me,boss
3,what interview leave me alone,leave me alone,interview leave
4,sons of why couldnt they put them on the rele...,"Sons of ****,",sons releases bought
5,some shameless plugging for the best rangers f...,http://www.dothebouncy.com/smf - some shameles...,plugging best rangers forum earth
6,am feedings for the baby are fun when he is al...,fun,baby fun smiles
7,soooo high,Soooo high,soooo high
8,both of you,Both of you,
9,journey wow u just became cooler hehe is that...,Wow... u just became cooler.,journey wow just hehe possible
