In [3]:
# ============================
# 1. Setup & Imports
# ============================
import pandas as pd
import numpy as np
import os, joblib, re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, SpatialDropout1D

# ============================
# 2. Load Dataset
# ============================

df = pd.read_csv('df.csv')

print("Dataset size:", len(df))
print(df.head())

texts = df['text'].astype(str).values
labels = df['label'].values
severities = df['severity'].values

# ============================
# 3. Encode Labels
# ============================
label_enc = LabelEncoder()
y_labels = label_enc.fit_transform(labels)
sev_enc = LabelEncoder()
y_sev = sev_enc.fit_transform(severities)

print("Emotion classes:", list(label_enc.classes_))
print("Severity classes:", list(sev_enc.classes_))

# ============================
# 4. Train-Test Split
# ============================
X_train, X_test, y_train, y_test, ys_train, ys_test = train_test_split(
    texts, y_labels, y_sev,
    test_size=0.2, stratify=labels, random_state=42
)

# ============================
# 5. Tokenizer & Sequences
# ============================
vocab_size = 20000
maxlen = 60
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding="post", truncating="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding="post", truncating="post")

# Save tokenizer + encoders
os.makedirs("artifacts", exist_ok=True)
joblib.dump(tokenizer, "artifacts/tokenizer.pkl")
joblib.dump(label_enc, "artifacts/label_encoder.pkl")
joblib.dump(sev_enc, "artifacts/severity_encoder.pkl")

# ============================
# 6. Emotion Model (BiLSTM)
# ============================
input_text = Input(shape=(maxlen,))
x = Embedding(vocab_size, 128, input_length=maxlen)(input_text)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(LSTM(64))(x)
x = Dropout(0.3)(x)
out_emotion = Dense(len(label_enc.classes_), activation="softmax")(x)

emotion_model = Model(input_text, out_emotion)
emotion_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
emotion_model.summary()

# ============================
# 7. Severity Model (Conv1D)
# ============================
input_text2 = Input(shape=(maxlen,))
y = Embedding(vocab_size, 128, input_length=maxlen)(input_text2)
y = Conv1D(128, 3, activation="relu", padding="same")(y)
y = GlobalMaxPooling1D()(y)
y = Dropout(0.3)(y)
out_severity = Dense(len(sev_enc.classes_), activation="softmax")(y)

sev_model = Model(input_text2, out_severity)
sev_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
sev_model.summary()

# ============================
# 8. Training
# ============================
es = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3, restore_best_weights=True)

history_emotion = emotion_model.fit(
    X_train_pad, y_train,
    validation_split=0.1,
    epochs=8,
    batch_size=128,
    callbacks=[es],
    verbose=2
)

history_sev = sev_model.fit(
    X_train_pad, ys_train,
    validation_split=0.1,
    epochs=8,
    batch_size=128,
    callbacks=[es],
    verbose=2
)

# ============================
# 9. Evaluation
# ============================
pred_em = np.argmax(emotion_model.predict(X_test_pad), axis=1)
pred_sev = np.argmax(sev_model.predict(X_test_pad), axis=1)

print("=== Emotion Report ===")
print(classification_report(y_test, pred_em, target_names=label_enc.classes_))

print("=== Severity Report ===")
print(classification_report(ys_test, pred_sev, target_names=sev_enc.classes_))

# ============================
# 10. Save Models
# ============================
emotion_model.save("artifacts/emotion_model.keras")
sev_model.save("artifacts/severity_model.keras")

print("✅ Models + encoders saved in ./artifacts/")


Dataset size: 10204
      id                                               text       label  \
0   8991  Gusse se sab tod dena chahta hoon kabhi kabhi....       anger   
1   3434  Constant worrying about future, kya hoga? rank...     anxiety   
2   1088  Midterms -> sleepless nights, headache and str...      stress   
3   6486  Hostel mein sab busy hai, kisi se baat karne k...  loneliness   
4  10063     Average day, nothing major happened in class 😤     neutral   

  severity  
0    green  
1      red  
2    green  
3    green  
4    green  
Emotion classes: ['anger', 'anxiety', 'burnout', 'loneliness', 'neutral', 'sadness', 'stress']
Severity classes: ['amber', 'green', 'red']




Epoch 1/8
58/58 - 10s - 176ms/step - accuracy: 0.6760 - loss: 1.1140 - val_accuracy: 0.9829 - val_loss: 0.1423
Epoch 2/8
58/58 - 6s - 96ms/step - accuracy: 0.9981 - loss: 0.0572 - val_accuracy: 0.9988 - val_loss: 0.0175
Epoch 3/8
58/58 - 5s - 92ms/step - accuracy: 1.0000 - loss: 0.0119 - val_accuracy: 0.9988 - val_loss: 0.0079
Epoch 4/8
58/58 - 5s - 89ms/step - accuracy: 1.0000 - loss: 0.0058 - val_accuracy: 0.9988 - val_loss: 0.0053
Epoch 5/8
58/58 - 5s - 90ms/step - accuracy: 1.0000 - loss: 0.0041 - val_accuracy: 0.9988 - val_loss: 0.0044
Epoch 1/8
58/58 - 3s - 56ms/step - accuracy: 0.7111 - loss: 0.7669 - val_accuracy: 0.7099 - val_loss: 0.7181
Epoch 2/8
58/58 - 2s - 29ms/step - accuracy: 0.7163 - loss: 0.6924 - val_accuracy: 0.7099 - val_loss: 0.6958
Epoch 3/8
58/58 - 2s - 29ms/step - accuracy: 0.7159 - loss: 0.6808 - val_accuracy: 0.7099 - val_loss: 0.6884
Epoch 4/8
58/58 - 2s - 30ms/step - accuracy: 0.7159 - loss: 0.6717 - val_accuracy: 0.7111 - val_loss: 0.6858
Epoch 5/8
58/58 -