In [3]:
# train.py
import pandas as pd
import numpy as np
import joblib
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report



In [11]:
paths = [
    "data/large_emotion_data/goemotions_1.csv",
    "data/large_emotion_data/goemotions_2.csv",
    "data/large_emotion_data/goemotions_3.csv"
]

In [12]:
dfs = [pd.read_csv(p) for p in paths]
merged = pd.concat(dfs, ignore_index=True)

In [13]:
# Drop duplicates by text
merged = merged.drop_duplicates(subset=["text"]).reset_index(drop=True)

In [14]:
# Identify emotion columns
ignore_cols = {
    "text","id","author","subreddit","link_id","parent_id","created_utc",
    "rater_id","example_very_unclear"
}

In [15]:
emotion_cols = [c for c in merged.columns if c not in ignore_cols]

In [16]:
# Keep only rows with exactly one emotion label
mask_single = merged[emotion_cols].sum(axis=1) == 1
filtered = merged.loc[mask_single].copy()


In [17]:
# Create a single emotion column
filtered["emotion"] = filtered[emotion_cols].idxmax(axis=1)

In [18]:

# Final dataset
final_df = filtered[["text", "emotion"]].reset_index(drop=True)

print(f"Dataset size: {final_df.shape[0]} rows")
print(final_df.head())

Dataset size: 46741 rows
                                                text    emotion
0                                    That game hurt.    sadness
1     You do right, if you don't care then fuck 'em!    neutral
2                                 Man I love reddit.       love
3  [NAME] was nowhere near them, he was by the Fa...    neutral
4  Right? Considering it’s such an important docu...  gratitude


In [19]:
# ---------------------------
# Step 2: Encode labels
# ---------------------------
label_encoder = LabelEncoder()
final_df["label"] = label_encoder.fit_transform(final_df["emotion"])
print("Classes:", label_encoder.classes_)


Classes: ['admiration' 'amusement' 'anger' 'annoyance' 'approval' 'caring'
 'confusion' 'curiosity' 'desire' 'disappointment' 'disapproval' 'disgust'
 'embarrassment' 'excitement' 'fear' 'gratitude' 'grief' 'joy' 'love'
 'nervousness' 'neutral' 'optimism' 'pride' 'realization' 'relief'
 'remorse' 'sadness' 'surprise']


In [21]:
# Step 3: spaCy preprocessing
# ---------------------------
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_sm")

def preprocess_text_spacy(text):
    doc = nlp(str(text).lower())
    tokens = [
        token.lemma_ for token in doc
        if token.is_alpha and not token.is_stop and not token.is_punct
    ]
    return " ".join(tokens)

final_df["clean_text"] = final_df["text"].apply(preprocess_text_spacy)
print("Sample cleaned text:", final_df["clean_text"].iloc[0])


Loading spaCy model...
Sample cleaned text: game hurt


In [22]:
# Step 4: Train/val/test split
# ---------------------------
train_texts, test_texts, y_train, y_test = train_test_split(
    final_df["clean_text"], final_df["label"],
    test_size=0.10, random_state=42, stratify=final_df["label"]
)
train_texts, val_texts, y_train, y_val = train_test_split(
    train_texts, y_train,
    test_size=0.10, random_state=42, stratify=y_train
)


In [23]:
# Step 5: Tokenization & padding
# ---------------------------
MAX_WORDS = 20000
MAX_LEN = 128

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
X_val   = pad_sequences(tokenizer.texts_to_sequences(val_texts),   maxlen=MAX_LEN)
X_test  = pad_sequences(tokenizer.texts_to_sequences(test_texts),  maxlen=MAX_LEN)

In [27]:

# Step 6: Build BiLSTM model
# ---------------------------
num_classes = len(label_encoder.classes_)
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(num_classes, activation="softmax")
])
model.build(input_shape=(None, MAX_LEN))
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)  # Explicitly build
model.summary()
# After you create X_train
model(X_train)  # run one forward pass



<tf.Tensor: shape=(37859, 28), dtype=float32, numpy=
array([[0.03583091, 0.03498027, 0.03576785, ..., 0.0354261 , 0.03578747,
        0.0352505 ],
       [0.03587824, 0.03499416, 0.03608534, ..., 0.03545102, 0.03564062,
        0.03492374],
       [0.03589177, 0.03515601, 0.03614329, ..., 0.03502701, 0.03587715,
        0.03507457],
       ...,
       [0.03605322, 0.03516023, 0.03595654, ..., 0.0353754 , 0.03546734,
        0.03488575],
       [0.03628229, 0.03519404, 0.03559295, ..., 0.03515723, 0.03541458,
        0.03532596],
       [0.03592187, 0.03541268, 0.03603682, ..., 0.03534621, 0.0357894 ,
        0.03544947]], dtype=float32)>

In [28]:
# Step 7: Train
# ---------------------------
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/15
[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 107ms/step - accuracy: 0.3494 - loss: 2.6423 - val_accuracy: 0.4345 - val_loss: 2.2227
Epoch 2/15
[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 75ms/step - accuracy: 0.4452 - loss: 2.1060 - val_accuracy: 0.4390 - val_loss: 2.1803
Epoch 3/15
[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 73ms/step - accuracy: 0.4842 - loss: 1.8348 - val_accuracy: 0.4362 - val_loss: 2.2209
Epoch 4/15
[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m661s[0m 559ms/step - accuracy: 0.5475 - loss: 1.5872 - val_accuracy: 0.4103 - val_loss: 2.3258
Epoch 5/15
[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 71ms/step - accuracy: 0.6116 - loss: 1.3747 - val_accuracy: 0.3901 - val_loss: 2.5632


In [29]:
# Step 8: Evaluate
# ---------------------------
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


[1m147/147[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 22ms/step
                precision    recall  f1-score   support

    admiration       0.43      0.45      0.44       292
     amusement       0.51      0.57      0.54       171
         anger       0.30      0.16      0.20       141
     annoyance       0.40      0.03      0.05       206
      approval       0.44      0.03      0.05       304
        caring       0.00      0.00      0.00        91
     confusion       0.00      0.00      0.00       133
     curiosity       0.00      0.00      0.00       165
        desire       0.83      0.18      0.29        57
disappointment       0.00      0.00      0.00       118
   disapproval       0.00      0.00      0.00       206
       disgust       0.50      0.04      0.08        71
 embarrassment       0.00      0.00      0.00        35
    excitement       0.00      0.00      0.00        80
          fear       0.83      0.11      0.19        47
     gratitude       0.82  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [32]:
# Step 9: Save artifacts
# ---------------------------
model.save("emotion_model.keras")
joblib.dump(tokenizer, "tokenizer.joblib")
joblib.dump(label_encoder, "label_encoder.joblib")

print("✅ Saved emotion_model.keras, tokenizer.joblib, label_encoder.joblib")

✅ Saved emotion_model.keras, tokenizer.joblib, label_encoder.joblib


In [33]:
# your model is trained and saved, you can test it with 5 custom inputs like this:
from tensorflow.keras.models import load_model
nlp = spacy.load("en_core_web_sm")
model = load_model("saved_model_emotion_model_tf_bilstm/emotion_model.keras")
tokenizer = joblib.load("saved_model_emotion_model_tf_bilstm/tokenizer.joblib")
label_encoder = joblib.load("saved_model_emotion_model_tf_bilstm/label_encoder.joblib")
MAX_LEN = 128

In [34]:
# Same preprocessing as training
def preprocess_text_spacy(text):
    doc = nlp(str(text).lower())
    return " ".join([t.lemma_ for t in doc if t.is_alpha and not t.is_stop and not t.is_punct])

In [35]:
def predict_emotion(text):
    cleaned = preprocess_text_spacy(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=MAX_LEN)
    probs = model.predict(pad)[0]
    idx = int(np.argmax(probs))
    return {"text": text, "emotion": label_encoder.inverse_transform([idx])[0], "confidence": float(np.max(probs))}

In [36]:
# Test with 5 sample inputs
test_sentences = [
    "I’m so happy to see you!",
    "I can’t believe this happened 😡",
    "I’m feeling really down today",
    "Wow, I didn’t expect that!",
    "I’m a bit worried about tomorrow"
]

In [37]:

for sentence in test_sentences:
    print(predict_emotion(sentence))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 530ms/step
{'text': 'I’m so happy to see you!', 'emotion': 'joy', 'confidence': 0.4356193542480469}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
{'text': 'I can’t believe this happened 😡', 'emotion': 'neutral', 'confidence': 0.31213754415512085}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
{'text': 'I’m feeling really down today', 'emotion': 'neutral', 'confidence': 0.4235203266143799}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
{'text': 'Wow, I didn’t expect that!', 'emotion': 'admiration', 'confidence': 0.15229642391204834}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
{'text': 'I’m a bit worried about tomorrow', 'emotion': 'neutral', 'confidence': 0.2076166570186615}
