In [1]:
import pandas as pd
import joblib
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score
)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned dataset
df = pd.read_csv("../data/processed/cleaned_plots.csv")

# Fail fast on missing text. Empty fields in a CSV are read back as NaN by
# default, which would otherwise surface as an opaque TypeError when taking the
# first genre, or only later when the plots are vectorized.
missing_counts = df[["Plot", "Genre"]].isna().sum()
if missing_counts.any():
    raise ValueError(
        "cleaned_plots.csv contains missing values in required columns: "
        f"{missing_counts.to_dict()}"
    )

# Recreate single-label target: keep only the first entry of each
# "|"-separated Genre value.
plots = df["Plot"]
genres = df["Genre"].str.split("|").str[0]

# Stratify only if possible, i.e. when every genre has at least two rows
stratify_arg = genres if genres.value_counts().min() >= 2 else None

# NOTE(review): presumably this split has to mirror the one used when the
# models were trained (same test_size / random_state) - confirm against the
# training notebook.
X_train, X_test, y_train, y_test = train_test_split(
    plots,
    genres,
    test_size=0.2,
    random_state=42,
    stratify=stratify_arg
)

In [3]:
# Load the saved vectorizer and both classifiers, in that order
vec, nb_model, lr_model = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../models/tfidf_vectorizer.joblib",
        "../models/nb_model.joblib",
        "../models/lr_model.joblib",
    )
)

# Encode the held-out plots with the loaded vectorizer (transform only, no refit)
X_te = vec.transform(X_test)

In [3]:
# Predict a genre for every held-out plot with the Naive Bayes model
y_pred_nb = nb_model.predict(X_te)

# Headline accuracy first, then the per-genre precision/recall/F1 table
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("### Naive Bayes Performance ###")
print("Accuracy:", nb_accuracy)
print(classification_report(y_test, y_pred_nb))

NameError: name 'nb_model' is not defined

In [4]:
# Predict a genre for every held-out plot with the Logistic Regression model
y_pred_lr = lr_model.predict(X_te)

# Headline accuracy first, then the per-genre precision/recall/F1 table
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("### Logistic Regression Performance ###")
print("Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

NameError: name 'lr_model' is not defined

In [None]:
# Count true-vs-predicted genre pairs, ordered by the model's own class list
nb_labels = nb_model.classes_
cm_nb = confusion_matrix(y_test, y_pred_nb, labels=nb_labels)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair
fig_nb, ax_nb = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm_nb,
    annot=True,
    fmt="d",
    xticklabels=nb_labels,
    yticklabels=nb_labels,
    ax=ax_nb,
)
ax_nb.set_title("Naive Bayes Confusion Matrix")
ax_nb.set_ylabel("True Genre")
ax_nb.set_xlabel("Predicted Genre")
plt.show()

In [None]:
# Count true-vs-predicted genre pairs, ordered by the model's own class list
lr_labels = lr_model.classes_
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=lr_labels)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair
fig_lr, ax_lr = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt="d",
    xticklabels=lr_labels,
    yticklabels=lr_labels,
    ax=ax_lr,
)
ax_lr.set_title("Logistic Regression Confusion Matrix")
ax_lr.set_ylabel("True Genre")
ax_lr.set_xlabel("Predicted Genre")
plt.show()

In [5]:
# Extract and display top/bottom genres by F1-score for both models
from sklearn.metrics import classification_report
import pandas as pd

def get_f1_df(y_true, y_pred, model_name):
    """Return per-genre precision, recall, F1 and support for one model.

    Parameters
    ----------
    y_true : array-like
        True genre labels (a pandas Series, list, or array).
    y_pred : array-like
        Predicted genre labels, in the same order as ``y_true``.
    model_name : str
        Value written to the ``model`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per genre that occurs in ``y_true``; the report's summary
        rows and any genre that was only ever predicted are dropped.
        Columns: ``f1-score``, ``precision``, ``recall``, ``support``,
        ``model``.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    metrics = pd.DataFrame(report).T

    # The report is keyed by the string form of each label, so compare as
    # strings; wrapping in a Series also lets y_true be a plain list or array.
    true_genres = pd.Series(y_true).astype(str).unique()

    # .copy() yields an independent frame, so adding the column below does not
    # raise pandas' SettingWithCopyWarning on a filtered slice.
    per_genre = metrics.loc[metrics.index.isin(true_genres)].copy()
    per_genre['model'] = model_name
    return per_genre[['f1-score', 'precision', 'recall', 'support', 'model']]

# Per-genre metric tables, one per model
f1_nb = get_f1_df(y_test, y_pred_nb, 'Naive Bayes')
f1_lr = get_f1_df(y_test, y_pred_lr, 'Logistic Regression')

# Stack both tables; the 'model' column tells the rows apart
f1_all = pd.concat([f1_nb, f1_lr])

# For each model (in first-seen order), show its five best and five worst
# genres ranked by F1-score
for model, model_scores in f1_all.groupby('model', sort=False):
    print(f"\nTop 5 genres by F1-score ({model}):")
    display(model_scores.sort_values('f1-score', ascending=False).head(5))
    print(f"\nBottom 5 genres by F1-score ({model}):")
    display(model_scores.sort_values('f1-score', ascending=True).head(5))

NameError: name 'y_pred_nb' is not defined

## Model Evaluation Summary

- **Overall Accuracy**: See the `Accuracy:` line printed in the Naive Bayes and Logistic Regression performance cells above.
- **Top-performing genres**: The genres with the highest F1-scores are likely those with more samples and clearer language patterns.
- **Genres to improve**: Genres with the lowest F1-scores may be underrepresented or have more ambiguous plot descriptions.
- **Quick hypotheses**:
    - Some genres (e.g., "Horror" or "Documentary") may underperform due to fewer samples or overlapping vocabulary with other genres.
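    - Each plot is labelled with only the first genre from its `|`-separated `Genre` value, so plots that belong to multiple genres are scored against a single label; consider consolidating rare genres or using multi-label classification.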
    - Further balancing the dataset or aggregating similar genres could improve performance.