<a href="https://colab.research.google.com/github/JaiderMon/ProyectoCDD/blob/main/CDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# NLTK resources required by the text-cleaning step: stop-word lists,
# the punkt_tab tokenizer models and the WordNet corpus.
nltk_resources = ('stopwords', 'punkt_tab', 'wordnet')
download_status = [nltk.download(resource) for resource in nltk_resources]

# Last expression of the cell: shows the status returned by the final download.
download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Deletes ASCII punctuation plus the Spanish inverted marks, which
# string.punctuation (ASCII only) does not contain.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡')

# Filled on first use by _load_language_resources() so the stop-word corpora
# are read once instead of once per processed text.
_STOP_WORDS = None
_LEMMATIZER = None


def _load_language_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(
            stopwords.words('spanish') + stopwords.words('english')
        )
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of cleaned tokens.

    Steps, in order: lowercase; remove URLs; remove @mentions and #hashtags;
    remove punctuation (ASCII plus '¿' and '¡'); remove digits; tokenize;
    drop Spanish and English stop words; lemmatize each remaining token.

    Args:
        text: The raw text. ``None`` and NaN (what pandas yields for a missing
            field) are treated as empty text; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|\#\w+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\d+', '', text)

    # Nothing but whitespace left: skip tokenization and resource loading.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _load_language_resources()
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so
    # Spanish tokens presumably pass through unchanged -- confirm.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return ' '.join(tokens)

In [None]:
# Pipe-delimited CSV with the emotions dataset, fetched straight from GitHub.
DATA_URL = 'https://raw.githubusercontent.com/adiacla/sentimientos/refs/heads/main/emociones.csv'
data = pd.read_csv(DATA_URL, sep='|')

In [None]:
# Run every raw tweet through the cleaning function and keep the result
# next to the original text.
data['cleaned_tweet'] = data['tweet'].map(preprocess_text)

# Model inputs (cleaned text) and targets (emotion names).
X, y = data['cleaned_tweet'], data['emotion']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each emotion name to an integer id, then one-hot encode the ids.
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)
y_categorical = to_categorical(y_encoded)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

In [None]:
# Vocabulary cap passed to the tokenizer, and the sequence length used for padding.
max_words, max_len = 5000, 100

# The word index is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each split's texts into lists of word ids with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to the same length (max_len) so they can be batched.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# Embedding -> LSTM -> dense classifier, with one softmax unit per emotion class
# (the class count is the width of the one-hot targets).
#
# `input_length` is intentionally not passed to Embedding: Keras 3 documents it
# as deprecated, and the layer infers the sequence length from its input.
# NOTE(review): a non-zero `recurrent_dropout` presumably prevents the LSTM from
# using the fused cuDNN kernel on GPU, which would explain slow epochs -- confirm
# before changing, since removing it alters regularisation.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(y_categorical.shape[1], activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# Stop training once validation loss stops improving and roll the model back to
# its best epoch. In the recorded run val_loss bottomed out at epoch 3 (0.9022)
# and climbed afterwards while training loss kept falling, so without this the
# model kept (and later saved) is an overfit one.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# `epochs` is now an upper bound; 20% of the training data is held out for
# validation, which is what the callback monitors.
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

Epoch 1/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 331ms/step - accuracy: 0.4117 - loss: 1.8169 - val_accuracy: 0.6748 - val_loss: 0.9923
Epoch 2/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 336ms/step - accuracy: 0.7040 - loss: 0.9135 - val_accuracy: 0.6989 - val_loss: 0.9127
Epoch 3/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 337ms/step - accuracy: 0.7500 - loss: 0.7546 - val_accuracy: 0.7117 - val_loss: 0.9022
Epoch 4/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 326ms/step - accuracy: 0.7809 - loss: 0.6637 - val_accuracy: 0.7151 - val_loss: 0.9369
Epoch 5/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 336ms/step - accuracy: 0.8080 - loss: 0.5874 - val_accuracy: 0.7151 - val_loss: 0.9765
Epoch 6/10
[1m817/817[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 335ms/step - accuracy: 0.8247 - loss: 0.5310 - val_accuracy: 0.7142 - val_loss: 1.0625
Epoc

In [None]:
# Persist the trained model to a `.keras` file in the working directory.
MODEL_PATH = 'modelo_emociones.keras'
model.save(MODEL_PATH)


In [None]:
def predict_emotion(text):
    """Predict the emotion label for a single piece of raw text.

    The text goes through the same pipeline as the training data: cleaning
    with ``preprocess_text``, conversion to word ids with the fitted
    ``tokenizer``, and padding to ``max_len``.

    Args:
        text: The raw text to classify.

    Returns:
        The label that ``encoder`` maps the highest-scoring class index to.
    """
    cleaned = preprocess_text(text)
    model_input = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=max_len,
    )

    # One row of class scores for the single input text.
    scores = model.predict(model_input, verbose=0)
    best_class = np.argmax(scores)

    # Translate the integer class id back into its original label.
    return encoder.inverse_transform([best_class])[0]


In [None]:
# Smoke test: classify one hand-written Spanish sentence and show the result.
test_phrase = "Tengo mucho miedo por lo que pase mañana"
print(f"\nPredicción para: '{test_phrase}'")
predicted_emotion = predict_emotion(test_phrase)
print("Emoción predicha:", predicted_emotion)



Predicción para: 'Tengo mucho miedo por lo que pase mañana'
Emoción predicha: fear


In [None]:
import pickle

In [None]:
# Save the fitted tokenizer so the same word index can be reused at inference time.
with open("tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Save the fitted label encoder so class ids can be mapped back to emotion names.
with open("label_encoder.pickle", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file, pickle.HIGHEST_PROTOCOL)