In [1]:
# 1️⃣ Import library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# 2️⃣ Load dataset
df = pd.read_csv("data/processed/emotion_dataset_preprocessed.csv")

In [3]:
# 3️⃣ Split data
X = df["clean_text"]
y = df["emotion_label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [6]:
# 4️⃣ TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf =X.isna().sum()tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


SyntaxError: invalid syntax (2656741250.py, line 3)

In [7]:
X.isna().sum()


16

In [8]:
X = df["clean_text"].fillna("")
y = df["emotion_label"]

# lalu lanjut split ulang
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [9]:
(df["clean_text"].str.strip() == "").sum()


0

In [10]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
# 5️⃣ Train baseline model
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

In [12]:
# 6️⃣ Evaluate
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.92      0.91      0.92     11463
        fear       0.87      0.86      0.86      9542
         joy       0.92      0.94      0.93     28214
        love       0.83      0.78      0.80      6911
     sadness       0.95      0.95      0.95     24238
    surprise       0.79      0.73      0.76      2994

    accuracy                           0.91     83362
   macro avg       0.88      0.86      0.87     83362
weighted avg       0.91      0.91      0.91     83362

[[10472   306   210    30   442     3]
 [  338  8200   181    18   383   422]
 [  103    79 26650  1053   195   134]
 [   29    15  1392  5419    52     4]
 [  466   322   273    40 23104    33]
 [    9   554   193     6    47  2185]]


In [13]:
# 7️⃣ Save model
joblib.dump(model, "../models/logreg_emotion.pkl")
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")

print("✅ Baseline model trained & saved successfully!")

✅ Baseline model trained & saved successfully!


In [14]:
import joblib

# Load ulang model dan vectorizer
model = joblib.load("../models/logreg_emotion.pkl")
tfidf = joblib.load("../models/tfidf_vectorizer.pkl")

# Tes prediksi manual
sample_text = ["I am so happy today!"]
sample_vec = tfidf.transform(sample_text)
prediction = model.predict(sample_vec)

print("Predicted emotion:", prediction[0])


Predicted emotion: joy


In [15]:
def predict_emotion(text):
    text_vec = tfidf.transform([text])
    return model.predict(text_vec)[0]


In [16]:
print(predict_emotion("I feel very sad and lonely"))
print(predict_emotion("What a wonderful surprise!"))


sadness
joy


In [17]:
examples = [
    "I love this movie so much!",
    "I'm so scared of that noise",
    "This makes me angry!",
    "Feeling sad today",
    "What a surprise!"
]

for text in examples:
    print(f"{text} --> {predict_emotion(text)}")


I love this movie so much! --> joy
I'm so scared of that noise --> fear
This makes me angry! --> anger
Feeling sad today --> sadness
What a surprise! --> joy
