In [1]:
#Load the libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [2]:
# cargar csv
df = pd.read_csv('data/goemotions_clean.csv')
df.head()

Unnamed: 0,text,emotion
0,Shhh dont give idea,anger
1,Thank much kind stranger I really need,gratitude
2,Ion know would better buy trim make hard dose,neutral
3,Im honestly surprised We fallen much farther,excitement
4,Jurisprudence fetishist get technicality,neutral


# Regresión logística

In [3]:
# Split del dataset (80-20)
X = df['text']
y = df['emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Stop words
stop_words = stopwords.words('english')

# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0.01, max_df=0.95, binary=False, ngram_range=(1,2), stop_words=stop_words)

# Pipeline for BoW
pipeline_bow = Pipeline([
    ('vectorizer', cv),
    ('classifier', LogisticRegression(penalty='l2', max_iter=1000, C=1, random_state=42))
])

# Training the model for Bag of words
pipeline_bow.fit(X_train, y_train)


In [5]:
# Predicting the model for bag of words
lr_bow_predict = pipeline_bow.predict(X_test)

# Accuracy score for bag of words
lr_bow_score = accuracy_score(y_test, lr_bow_predict)

In [6]:
# Tfidf vectorizer
tv = TfidfVectorizer(min_df=0.01, max_df=0.95, use_idf=True, ngram_range=(1,2), stop_words=stop_words)

# Pipeline for TF-IDF
pipeline_tfidf = Pipeline([
    ('vectorizer', tv),
    ('classifier', LogisticRegression(penalty='l2', max_iter=1000, C=1, random_state=42))
])


In [7]:
# Training the model for TF-IDF
pipeline_tfidf.fit(X_train, y_train)

# Predicting the model for TF-IDF
lr_tfidf_predict = pipeline_tfidf.predict(X_test)

# Accuracy score for TF-IDF
lr_tfidf_score = accuracy_score(y_test, lr_tfidf_predict)

In [8]:
print("lr_bow_score :", lr_bow_score)
print("lr_tfidf_score :", lr_tfidf_score)

lr_bow_score : 0.31121642969984203
lr_tfidf_score : 0.30847504878728743


In [13]:
# Guardar modelos
from joblib import dump

# Guardar el modelo Bag of Words
dump(pipeline_bow, './models/logReg_bow.joblib')

# Guardar el modelo TF-IDF
dump(pipeline_tfidf, './models/logReg_tfidf.joblib')

['./models/linReg_tfidf.joblib']

In [12]:
import numpy as np

# Texto a predecir
new_text = ["I hate you"]

# Obtener las probabilidades de las clases usando el modelo Bag of Words
probabilities_bow = pipeline_bow.predict_proba(new_text)

# Obtener las probabilidades de las clases usando el modelo TF-IDF
probabilities_tfidf = pipeline_tfidf.predict_proba(new_text)

# Obtener las clases del modelo
classes = pipeline_bow.classes_

# Obtener los índices de las tres mayores probabilidades para Bag of Words
top3_indices_bow = np.argsort(probabilities_bow[0])[-3:][::-1]

# Obtener los índices de las tres mayores probabilidades para TF-IDF
top3_indices_tfidf = np.argsort(probabilities_tfidf[0])[-3:][::-1]

# Imprimir las tres emociones más probables con sus probabilidades para Bag of Words
print("Top 3 predicted emotions with probabilities using Bag of Words model:")
for index in top3_indices_bow:
    print(f"{classes[index]}: {probabilities_bow[0][index]:.2%}")

# Imprimir las tres emociones más probables con sus probabilidades para TF-IDF
print("\nTop 3 predicted emotions with probabilities using TF-IDF model:")
for index in top3_indices_tfidf:
    print(f"{classes[index]}: {probabilities_tfidf[0][index]:.2%}")


Top 3 predicted emotions with probabilities using Bag of Words model:
anger: 24.59%
neutral: 17.89%
annoyance: 11.80%

Top 3 predicted emotions with probabilities using TF-IDF model:
anger: 37.37%
annoyance: 13.09%
neutral: 11.66%


# Naive Bayes

In [15]:
# Split del dataset
X = df['text']
y = df['emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Stop words
stop_words = stopwords.words('spanish')

# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0.01, max_df=0.95, binary=False, ngram_range=(1,2), stop_words=stop_words)

# Tfidf vectorizer
tv = TfidfVectorizer(min_df=0.01, max_df=0.95, use_idf=True, ngram_range=(1,2), stop_words=stop_words)

# Naive Bayes classifier for BoW
mnb_bow = Pipeline([
    ('vectorizer', cv),
    ('classifier', MultinomialNB())
])

# Training the model for Bag of words
mnb_bow.fit(X_train, y_train)


In [17]:
# Predicting the model for bag of words
mnb_bow_predict = mnb_bow.predict(X_test)

# Accuracy score for bag of words
mnb_bow_score = accuracy_score(y_test, mnb_bow_predict)

In [18]:
# Naive Bayes classifier for TF-IDF
mnb_tfidf = Pipeline([
    ('vectorizer', tv),
    ('classifier', MultinomialNB())
])

# Training the model for TF-IDF
mnb_tfidf.fit(X_train, y_train)

# Predicting the model for TF-IDF
mnb_tfidf_predict = mnb_tfidf.predict(X_test)

# Accuracy score for TF-IDF features
mnb_tfidf_score = accuracy_score(y_test, mnb_tfidf_predict)

In [19]:
print("mnb_bow_score :", mnb_bow_score)
print("mnb_tfidf_score :", mnb_tfidf_score)

mnb_bow_score : 0.30726698262243285
mnb_tfidf_score : 0.29388532664250533


In [21]:
# Guardar modelos
from joblib import dump

# Guardar el modelo Bag of Words
dump(mnb_bow, './models/naive_bow.joblib')

# Guardar el modelo TF-IDF
dump(mnb_tfidf, './models/naive_tfidf.joblib')

['./models/naive_tfidf.joblib']

In [20]:
import numpy as np

# Texto a predecir
new_text = ["I hate you"]

# Obtener las probabilidades de las clases usando el modelo Bag of Words
probabilities_bow = mnb_bow.predict_proba(new_text)

# Obtener las probabilidades de las clases usando el modelo TF-IDF
probabilities_tfidf = mnb_tfidf.predict_proba(new_text)

# Obtener las clases del modelo
classes = mnb_bow.classes_

# Obtener los índices de las tres mayores probabilidades para Bag of Words
top3_indices_bow = np.argsort(probabilities_bow[0])[-3:][::-1]

# Obtener los índices de las tres mayores probabilidades para TF-IDF
top3_indices_tfidf = np.argsort(probabilities_tfidf[0])[-3:][::-1]

# Imprimir las tres emociones más probables con sus probabilidades para Bag of Words
print("Top 3 predicted emotions with probabilities using Bag of Words model:")
for index in top3_indices_bow:
    print(f"{classes[index]}: {probabilities_bow[0][index]:.2%}")

# Imprimir las tres emociones más probables con sus probabilidades para TF-IDF
print("\nTop 3 predicted emotions with probabilities using TF-IDF model:")
for index in top3_indices_tfidf:
    print(f"{classes[index]}: {probabilities_tfidf[0][index]:.2%}")


Top 3 predicted emotions with probabilities using Bag of Words model:
anger: 35.72%
neutral: 15.72%
annoyance: 12.37%

Top 3 predicted emotions with probabilities using TF-IDF model:
anger: 22.77%
neutral: 20.33%
annoyance: 12.25%
