In [26]:
import os
import random
import re
import string

import emoji
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from plotly import graph_objs as go
from plotly.subplots import make_subplots

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Binarizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [27]:
# Fetch the Kaggle dataset; `path` is the local directory the files end up in
path = kagglehub.dataset_download(
    "yasserh/twitter-tweets-sentiment-dataset"
)
print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\Nicolás\.cache\kagglehub\datasets\yasserh\twitter-tweets-sentiment-dataset\versions\1


In [28]:
# The tweets live in a single CSV inside the downloaded dataset directory
file_path = os.path.join(path, "Tweets.csv")

# `original_df` keeps every column as loaded; the working frame `df` is derived from it
original_df = pd.read_csv(filepath_or_buffer=file_path, encoding="latin1")
original_df.head(5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


# DATA CLEANING

**Dropping columns**

To predict the sentiment behind each tweet using BoW and Bayesian probability, we will have to drop some columns:
- `textID`: Unique identifier for each tweet. It doesn't add any information about the sentiment, so we can drop the column.

- `selected_text`: Using this column would be a little bit of cheating, as we are precisely trying to predict which words are the most related to each sentiment and `selected_text` already gives us that information. Although we can use the column later on to compare the results of our prediction with the selected text in the dataset, we will drop it for now.

In [29]:
# Working frame: everything except the tweet id and the hand-selected span
df = original_df.drop(columns=['textID', 'selected_text'])
df.head(5)

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


**Quick look at our data**

Let's take a quick look at how the data is distributed in our dataset; we will visualize it in more detail later on.

In [30]:
# Number of (non-missing) tweets per sentiment label, largest class first
temp = (
    df.groupby('sentiment')['text']
    .count()
    .reset_index()
    .sort_values(by='text', ascending=False)
)
temp.style.background_gradient(cmap='Blues_r')

Unnamed: 0,sentiment,text
1,neutral,11117
2,positive,8582
0,negative,7781


**Mapping labels**

The original target labels are the strings `negative`, `neutral` and `positive`. To make them easier to work with, I map them to integers: 0 for negative, 1 for neutral and 2 for positive.

In [31]:
# Mapping labels: string sentiment -> integer code
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

# Always map from the untouched string labels in `original_df` (aligned on the
# index of `df`) rather than from df['sentiment'] itself: mapping the already
# encoded column a second time would silently turn every label into NaN when
# this cell is re-run.
df['sentiment'] = original_df.loc[df.index, 'sentiment'].map(label_mapping)

# Series.map yields NaN for any label missing from the mapping, so fail loudly
# instead of carrying NaN targets into the classifiers.
assert df['sentiment'].notna().all(), "Found sentiment labels not covered by label_mapping"

df.head()

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0


**Cleaning data**

Let's make sure that there aren't any empty cells on our dataset and that we can work with all the data properly.

In [32]:
# Report how many tweets are missing, then replace each NaN with an empty
# string so that no missing value is left in the text column
print(f"There is a total of {df['text'].isna().sum()} NaN's")
df["text"] = df["text"].fillna(value="")

There is a total of 1 NaN's


In [33]:
# Sanity check: count the missing tweets again now that NaN's were filled
print(f"After cleaning the column there are {df['text'].isna().sum()} NaN's")

def preprocess_tweet(tweet):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: drop @mentions, #hashtags and URLs; spell emoji out as
    words; remove every character that is not an ASCII letter, whitespace or
    ``*``; lowercase; collapse runs of whitespace into single spaces.

    Args:
        tweet: Raw tweet text. Any non-``str`` value (e.g. NaN) is treated as
            an empty tweet.

    Returns:
        The cleaned tweet, or ``""`` when the input is not a string or nothing
        is left after cleaning.
    """
    if not isinstance(tweet, str):  # Invalid values check
        return ""
    tweet = re.sub(r"@\w+", "", tweet)  # Eliminate mentions
    tweet = re.sub(r"#\w+", "", tweet)  # Eliminate hashtags
    tweet = re.sub(r"http\S+|www\S+", "", tweet)  # Eliminate URL's
    # Convert emoji to text. Space delimiters (instead of the default ':')
    # keep the emoji name apart from adjacent words once punctuation is removed.
    tweet = emoji.demojize(tweet, delimiters=(" ", " "))
    # Emoji names join their words with '_'; turn it into a space so the words
    # are not glued together by the character filter below. This also splits
    # any other underscore-joined token into separate words.
    tweet = tweet.replace("_", " ")
    # Eliminate special characters. Note that '*' is part of the character
    # class, so runs of asterisks such as "****" are kept.
    tweet = re.sub(r"[^a-zA-Z\s*]", "", tweet)
    # Eliminate uppercase and surplus spaces: removing mentions/hashtags/URLs
    # leaves gaps in the middle of the text that strip() alone would not fix.
    tweet = " ".join(tweet.lower().split())
    return tweet

# Run the cleaner over every tweet and store the result next to the raw text
df['cleaned_text'] = df["text"].map(preprocess_tweet)


After cleaning the column there are 0 NaN's


In [34]:
# How many tweets ended up as an empty string after cleaning?
empty_count = int((df["cleaned_text"] == "").sum())
print(f"Number of rows with empty text: {empty_count}")

# Keep only the rows that still have some text, then count again to confirm
df = df[~(df["cleaned_text"] == "")]
empty_count = int((df["cleaned_text"] == "").sum())
print(f"After dropping the rows there are {empty_count} with empty text")

Number of rows with empty text: 6
After dropping the rows there are 0 with empty text


In [35]:
# `cleaned_text` now holds the cleaned version of every tweet, so the raw
# `text` column is no longer needed and can be dropped
df = df.drop(columns='text')
df.head(5)

Unnamed: 0,sentiment,cleaned_text
0,1,id have responded if i were going
1,0,sooo sad i will miss you here in san diego
2,0,my boss is bullying me
3,0,what interview leave me alone
4,0,sons of **** why couldnt they put them on the ...


**Data visualization**

I will do a little visualization of the data now, but the "big" part will come afterwards, once I have done the predictions, to compare the results of my analysis with the column `selected_text` in the original dataset.

In [36]:
# Distribution of sentiments in the dataset, drawn from the per-label counts in `temp`
fig = go.Figure(
    data=go.Funnelarea(
        text=temp["sentiment"],
        values=temp["text"],
        title={"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"},
    )
)
fig.show()

# CLASSIFICATION

Now we will divide our data into train and test and start training our model to be able to predict.

### MultinomialNB with BoW

In [37]:
# Hold out 20% of the tweets for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts; the vocabulary is learned on the training split only
vectorizer = CountVectorizer(stop_words="english", max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# fit() returns the estimator itself, so creation and training can be chained
model = MultinomialNB().fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)

# target_names are listed in the order of the integer labels 0, 1, 2
print(
    "Classification Report:",
    classification_report(y_test, y_pred, target_names=["Negative", "Neutral", "Positive"]),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

    Negative       0.68      0.58      0.62      1563
     Neutral       0.59      0.67      0.63      2199
    Positive       0.72      0.70      0.71      1733

    accuracy                           0.65      5495
   macro avg       0.66      0.65      0.65      5495
weighted avg       0.66      0.65      0.65      5495

Confusion Matrix:
[[ 906  561   96]
 [ 352 1463  384]
 [  83  443 1207]]


### CategoricalNB with TF-IDF

### Model comparison with cross-validation (BoW)

In [39]:
# BoW vectorisation: raw term counts over a vocabulary capped at 5000 terms
vectorizer = CountVectorizer(max_features=5000)
# Keep the matrix sparse: every estimator compared below accepts scipy sparse
# input, whereas .toarray() would allocate a dense n_tweets x 5000 array that
# is almost entirely zeros.
# NOTE(review): the vocabulary is fitted on the full corpus before the folds
# are split, so validation tweets influence which terms are kept — consider
# wrapping vectorizer + classifier in a Pipeline if this matters.
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['sentiment']  # Labels

# k-fold configuration: 5 stratified, shuffled folds with a fixed seed
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Classifiers to compare (three Naive Bayes variants plus logistic regression)
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    BernoulliNB(),
    ComplementNB()
]

# Cross-validated macro-F1 scores, one array of per-fold scores per classifier
cv_results = [
    cross_val_score(classifier, X, y, scoring="f1_macro", cv=kfold, n_jobs=-1)
    for classifier in classifiers
]

# Mean and standard deviation of the fold scores
cv_means = [result.mean() for result in cv_results]
cv_std = [result.std() for result in cv_results]

# Collect the results in a DataFrame
cv_res = pd.DataFrame({
    "CrossValMeans": cv_means,
    "CrossValErrors": cv_std,
    # Derive the labels from the estimators themselves so they cannot drift
    # out of sync with the `classifiers` list
    "Algorithm": [type(classifier).__name__ for classifier in classifiers]
})
cv_res = cv_res.sort_values(by="CrossValMeans", ascending=True)

# Results figure: bar chart in the wide left column, table in the right one
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "bar"}, {"type": "table"}]],
    column_widths=[0.7, 0.3],
    subplot_titles=("Cross Validation F1 Scores", "Cross Validation Error Table"),
)

# Bar chart: one horizontal bar per algorithm, fold std as the error bar
fig.add_trace(
    go.Bar(
        y=cv_res['Algorithm'],
        x=cv_res['CrossValMeans'],
        orientation='h',
        error_x={"type": 'data', "array": cv_res['CrossValErrors']},
        marker={"color": cv_res['CrossValMeans'], "colorscale": 'Agsunset'},
    ),
    row=1,
    col=1,
)

# Results table with the same three columns of `cv_res`
fig.add_trace(
    go.Table(
        header={
            "values": ["Algorithm", "Cross Validation Means", "Cross Validation Errors"],
            "fill_color": 'grey',
            "align": 'left',
        },
        cells={
            "values": [cv_res['Algorithm'], cv_res['CrossValMeans'], cv_res['CrossValErrors']],
            "fill_color": 'black',
            "align": 'left',
        },
    ),
    row=1,
    col=2,
)

# Layout configuration
fig.update_layout(template="plotly_dark", showlegend=False, height=600)

# Show the figure
fig.show()

### Hyperparameters

In [40]:
"""
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
import pandas as pd

# Configuración inicial
f1_scorer = make_scorer(f1_score, average='macro')
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Modelos y sus hiperparámetros
models = {
    'MultinomialNB': MultinomialNB(),
    'BernoulliNB': BernoulliNB(),
    'ComplementNB': ComplementNB(),
    'LogisticRegression': LogisticRegression(random_state=42)
}

param_grids = {
    'MultinomialNB': {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]},
    'BernoulliNB': {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]},
    'ComplementNB': {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]},
    'LogisticRegression': {
        'C': [0.01, 0.1, 1.0, 10.0],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'class_weight': [None, 'balanced']
    }
}

# Procesamiento BoW y TF-IDF
vectorizers = {
    'BoW': CountVectorizer(max_features=5000),
    'TF-IDF': TfidfVectorizer(max_features=5000)
}

# Resultados de las búsquedas
results = {}

for vec_name, vectorizer in vectorizers.items():
    print(f"Optimizing models for {vec_name}...")
    X = vectorizer.fit_transform(df['cleaned_text']).toarray()
    y = df['sentiment']
    
    # Almacenar resultados para este vectorizador
    vec_results = {}

    for model_name, model in models.items():
        print(f"Optimizing {model_name}...")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[model_name],
            cv=kfold,
            scoring=f1_scorer,
            n_jobs=-1
        )
        
        # Realizar la búsqueda
        grid_search.fit(X, y)
        
        # Guardar el mejor modelo y los resultados
        vec_results[model_name] = {
            'best_estimator': grid_search.best_estimator_,
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_
        }
    
    results[vec_name] = vec_results

# Mostrar los mejores resultados
for vec_name, vec_results in results.items():
    print(f"\nResults for {vec_name}:")
    for model_name, res in vec_results.items():
        print(f"{model_name} -> Best F1 Score: {res['best_score']:.4f} | Best Params: {res['best_params']}")

"""

'\nfrom sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\nfrom sklearn.model_selection import StratifiedKFold, GridSearchCV\nfrom sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import make_scorer, f1_score\nimport pandas as pd\n\n# Configuración inicial\nf1_scorer = make_scorer(f1_score, average=\'macro\')\nkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n\n# Modelos y sus hiperparámetros\nmodels = {\n    \'MultinomialNB\': MultinomialNB(),\n    \'BernoulliNB\': BernoulliNB(),\n    \'ComplementNB\': ComplementNB(),\n    \'LogisticRegression\': LogisticRegression(random_state=42)\n}\n\nparam_grids = {\n    \'MultinomialNB\': {\'alpha\': [0.1, 0.5, 1.0, 1.5, 2.0]},\n    \'BernoulliNB\': {\'alpha\': [0.1, 0.5, 1.0, 1.5, 2.0]},\n    \'ComplementNB\': {\'alpha\': [0.1, 0.5, 1.0, 1.5, 2.0]},\n    \'LogisticRegression\': {\n        \'C\': [0.01, 0.1, 1.0, 1

### EDA

Now I would like to create my own selected-words column, compare it with the `selected_text` column that originally came with the dataset, and base my study on both.

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

# TF-IDF features over all cleaned tweets (vocabulary capped at 5000 terms)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['sentiment']

# L1-penalised, class-balanced logistic regression; fit() returns the estimator
model = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
).fit(X, y)

def extract_keywords(tweet, vectorizer, model, percentile=40):
    """Pick the words of ``tweet`` that push ``model`` towards its predicted class.

    Every vocabulary term present in the tweet gets a contribution equal to
    its vectorizer weight times the model coefficient for the predicted class.
    Terms whose contribution reaches the given percentile of the positive
    contributions are kept and returned in the order they appear in the tweet.

    Args:
        tweet: Text to analyse. It is split on whitespace when the selected
            terms are put back in order.
        vectorizer: Fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.
        model: Fitted linear classifier exposing ``coef_``, ``classes_`` and
            ``predict``.
        percentile: Percentile (0-100) of the positive contributions used as
            the cut-off.

    Returns:
        The selected words joined by single spaces, or ``tweet`` unchanged
        when no term contributes positively or none of the selected terms
        appears as a whitespace-separated word of the tweet.
    """
    tfidf = vectorizer.transform([tweet])
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    predicted_class = model.predict(tfidf)[0]
    predicted_class_index = np.asarray(model.classes_).tolist().index(predicted_class)

    coefs = np.atleast_2d(model.coef_)
    if coefs.shape[0] == 1 and len(model.classes_) == 2:
        # Binary linear models store a single coefficient row that scores
        # classes_[1]; evidence for classes_[0] is that same row negated.
        # Indexing the row by class index would fail (index 1) or use the
        # wrong sign (index 0).
        class_coefs = coefs[0] if predicted_class_index == 1 else -coefs[0]
    else:
        class_coefs = coefs[predicted_class_index]

    # Column indices of the vocabulary terms that occur in this tweet
    tfidf_indices = tfidf.nonzero()[1]
    word_contributions = {
        feature_names[idx]: class_coefs[idx] * tfidf[0, idx] for idx in tfidf_indices
    }

    contributions_array = np.array(list(word_contributions.values()))
    positive_contributions = contributions_array[contributions_array > 0]
    if positive_contributions.size == 0:
        # Nothing supports the predicted class: fall back to the whole tweet
        return tweet

    threshold = np.percentile(positive_contributions, percentile)
    top_keywords = {word for word, contrib in word_contributions.items() if contrib >= threshold}
    ordered_keywords = [word for word in tweet.split() if word in top_keywords]

    return " ".join(ordered_keywords) if ordered_keywords else tweet

# Attach the cleaned tweet and the model-derived keywords to the original
# frame; assignment aligns on the index, so rows missing from `df` get NaN
original_df["cleaned_text"] = df["cleaned_text"]
original_df["predicted_selected_text"] = df["cleaned_text"].apply(
    extract_keywords, args=(vectorizer, model)
)

# Side-by-side view of the dataset's selection and ours for the first 20 rows
comparison = original_df.loc[:, ["cleaned_text", "selected_text", "predicted_selected_text"]].head(20)

comparison

Unnamed: 0,cleaned_text,selected_text,predicted_selected_text
0,id have responded if i were going,"I`d have responded, if I were going",if going
1,sooo sad i will miss you here in san diego,Sooo SAD,sad miss
2,my boss is bullying me,bullying me,my boss
3,what interview leave me alone,leave me alone,what
4,sons of **** why couldnt they put them on the ...,"Sons of ****,",why couldnt they already
5,some shameless plugging for the best rangers f...,http://www.dothebouncy.com/smf - some shameles...,best earth
6,am feedings for the baby are fun when he is al...,fun,baby fun he smiles
7,soooo high,Soooo high,soooo high
8,both of you,Both of you,both
9,journey wow u just became cooler hehe is that...,Wow... u just became cooler.,journey wow hehe
