In [1]:
from models.train import train_and_save_models
from models.predict import predict_genre
import joblib

In [2]:
import pandas as pd

# Load the cleaned plot dataset and inspect how the primary genre is distributed.
df = pd.read_csv("../data/processed/cleaned_plots.csv")

# The Genre column is "|"-delimited; the first entry is used as the primary genre.
# `.str[0]` (rather than `.apply(lambda g: g[0])`) lets a missing Genre propagate
# as NaN instead of raising TypeError when indexing a float.
genres = df["Genre"].str.split("|").str[0]
print(genres.value_counts())

Genre
drama                                5909
comedy                               4348
horror                               1151
action                               1085
thriller                              955
                                     ... 
holocaust melodrama                     1
musical, biographical drama             1
summer camp comedy                      1
crime drama based on a true story       1
horror romantic comedy                  1
Name: count, Length: 2227, dtype: int64


In [3]:
# Arguments for the training helper, named so the call below reads clearly.
cleaned_csv = "../data/processed/cleaned_plots.csv"
vectorizer_out = "../models/tfidf_vectorizer.joblib"
model_out = {
    "nb": "../models/nb_model.joblib",
    "lr": "../models/lr_model.joblib",
}

# The helper's two return values are unpacked as X_test / y_test
# (presumably the held-out features and labels; verify against models/train.py).
# NOTE(review): the recorded output of this cell contains
# "AttributeError: 'LogisticRegression' object has no attribute 'n_iter_'";
# confirm the run completed and that the LR artifact on disk is from this run.
X_test, y_test = train_and_save_models(cleaned_csv, vectorizer_out, model_out)

Checking for NaNs: 0
Sample plots: 24478    farhad pastakia boman irani has a dream job as...
24685    arnab ghosh sunil grover is senior editor and ...
155      at the end of the 19th century in the far west...
5223     the brave bulls is the story of luis bello the...
15907    at the 2011 icca finals barden university s al...
Name: Plot, dtype: object
Type of first plot: <class 'str'>
Average plot length: count    22787.000000
mean      2176.981920
std       1763.690778
min         19.000000
25%        720.000000
50%       1744.000000
75%       3393.000000
max      29219.000000
Name: Plot, dtype: float64
Fitting vectorizer on full data...
Full fit complete.
Full fit complete.

=== LR Training Started (Thu Jul 10 03:39:04 2025) ===

=== LR Training Started (Thu Jul 10 03:39:04 2025) ===


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


Epoch 1, change: 1
Epoch 2, change: 0.4028258
Epoch 2, change: 0.4028258
Epoch 3, change: 0.38421786
Epoch 3, change: 0.38421786
Epoch 4, change: 0.36271033
Epoch 4, change: 0.36271033
Epoch 5, change: 0.29545914
Epoch 5, change: 0.29545914
Epoch 6, change: 0.28490378
Epoch 6, change: 0.28490378
Epoch 7, change: 0.27327535
Epoch 7, change: 0.27327535
Epoch 8, change: 0.24699657
Epoch 8, change: 0.24699657
Epoch 9, change: 0.21593488
Epoch 9, change: 0.21593488
Epoch 10, change: 0.17730139
Epoch 10, change: 0.17730139
Epoch 11, change: 0.26219258
Epoch 11, change: 0.26219258
Epoch 12, change: 0.21223017
Epoch 12, change: 0.21223017
Epoch 13, change: 0.13595284
Epoch 13, change: 0.13595284
Epoch 14, change: 0.12555936
Epoch 14, change: 0.12555936
Epoch 15, change: 0.14061628
Epoch 15, change: 0.14061628
Epoch 16, change: 0.18413719
Epoch 16, change: 0.18413719
Epoch 17, change: 0.19246155
Epoch 17, change: 0.19246155
Epoch 18, change: 0.077707935
Epoch 18, change: 0.077707935
Epoch 19, c

AttributeError: 'LogisticRegression' object has no attribute 'n_iter_'

Epoch 57, change: 0.036839743


In [4]:
# Reload the persisted vectorizer and both classifiers from disk.
vec = joblib.load("../models/tfidf_vectorizer.joblib")
nb = joblib.load("../models/nb_model.joblib")
lr = joblib.load("../models/lr_model.joblib")

# Smoke test: push a single hand-written plot through the vectorizer and each model.
sample = ["A hero saves the world"]
X_samp = vec.transform(sample)
for tag, clf in (("NB", nb), ("LR", lr)):
    print(f"{tag} predicts:", clf.predict(X_samp))

NB predicts: ['drama']
LR predicts: ['drama']


### Baseline Metrics  
Below we compare the trained Naive Bayes (NB) and Logistic Regression (LR) models against a dummy “majority‐class” baseline that always predicts the most frequent genre.

In [5]:
from sklearn.model_selection import train_test_split

# Prepare features and target.
# Rows missing a Plot or a Genre are dropped up front: a NaN Genre has no primary
# genre to extract, and a NaN Plot would fail when passed to the vectorizer below.
df = pd.read_csv("../data/processed/cleaned_plots.csv").dropna(subset=["Plot", "Genre"])
X = df["Plot"]
# Primary genre = first "|"-separated entry of the Genre column.
y = df["Genre"].str.split("|").str[0]

# Stratified splitting needs at least two samples in every class, so fall back to
# a plain random split as soon as any genre occurs only once.
stratify_arg = y if y.value_counts().min() >= 2 else None

# NOTE(review): this rebinds X_test / y_test, which the train_and_save_models cell
# also assigns. Whether this locally re-created split matches the one used for
# training cannot be told from this notebook; if it does not, the "test" rows may
# overlap the saved models' training rows. Confirm against models/train.py.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_arg
)

# Vectorize with the previously saved vectorizer (transform only, no refit).
vec = joblib.load("../models/tfidf_vectorizer.joblib")
X_tr = vec.transform(X_train)
X_te = vec.transform(X_test)


In [6]:
# Generate predictions for NB and LR on the test set
y_pred_nb = nb.predict(X_te)
y_pred_lr = lr.predict(X_te)


In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# Majority-class baseline fitted on the training split. The NB and LR test-set
# predictions (y_pred_nb, y_pred_lr) are expected to be in scope already.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_tr, y_train)
y_dummy = dummy.predict(X_te)

# Report accuracy for the baseline and both models against the same y_test.
for heading, predictions in (
    ("Dummy Accuracy:", y_dummy),
    ("NB  Accuracy:", y_pred_nb),
    ("LR  Accuracy:", y_pred_lr),
):
    print(heading, accuracy_score(y_test, predictions))

Dummy Accuracy: 0.20975952255573108
NB  Accuracy: 0.2947165174653326
LR  Accuracy: 0.061786905388801126


In [None]:
# --- Model Diagnostics: Classification Reports and Confusion Matrices ---
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Classification reports
# zero_division=0 reports 0.0 for labels with no predicted/true samples without
# emitting one undefined-metric warning per metric, which otherwise floods the output.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb, zero_division=0))
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr, zero_division=0))

# Confusion matrices
# The dataset has thousands of distinct primary genres, so an annotated heatmap over
# every class is unreadable and very expensive to draw. Only the most frequent
# test-set genres are shown; samples whose true or predicted label falls outside
# this subset are not counted in the matrices.
TOP_N_GENRES = 15
top_genres = y_test.value_counts().head(TOP_N_GENRES).index.tolist()

cm_nb = confusion_matrix(y_test, y_pred_nb, labels=top_genres)
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=top_genres)

fig, axes = plt.subplots(1, 2, figsize=(18, 7))
panels = (
    ("Naive Bayes Confusion Matrix", cm_nb),
    ("Logistic Regression Confusion Matrix", cm_lr),
)
for ax, (title, cm) in zip(axes, panels):
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        xticklabels=top_genres,
        yticklabels=top_genres,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")

fig.suptitle(f"Top {TOP_N_GENRES} genres by test-set frequency")
plt.tight_layout()
plt.show()

# --- Next Steps: Addressing Label Imbalance and Fragmentation ---
# Per-genre sample counts in the training split, computed once and reused below.
train_genre_counts = y_train.value_counts()

print("\nNumber of unique genres in y_train:", y_train.nunique())
print("Number of unique genres in y_test:", y_test.nunique())
print("\nTop 20 genres in training set:")
print(train_genre_counts.head(20))
print(
    "\nNumber of genres with fewer than 10 samples:",
    int((train_genre_counts < 10).sum()),
)

# Optionally, consolidate rare genres into 'other'
min_count = 20  # Minimum samples to keep a genre
common_genres = train_genre_counts[train_genre_counts >= min_count].index
# The keep-list is derived from the training split only and then applied to both
# splits, so test-only or rare genres are mapped to "other" as well.
y_train_reduced = y_train.where(y_train.isin(common_genres), other="other")
y_test_reduced = y_test.where(y_test.isin(common_genres), other="other")

print("\nAfter consolidation:")
print(y_train_reduced.value_counts())

# Re-train and re-evaluate NB and LR on reduced label set
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

nb_reduced = MultinomialNB()
lr_reduced = LogisticRegression(max_iter=1000, class_weight="balanced", solver="lbfgs")

nb_reduced.fit(X_tr, y_train_reduced)
lr_reduced.fit(X_tr, y_train_reduced)

y_pred_nb_reduced = nb_reduced.predict(X_te)
y_pred_lr_reduced = lr_reduced.predict(X_te)

# zero_division=0 keeps the reports free of undefined-metric warnings for any
# label that ends up with no predicted samples.
print("\nNaive Bayes (reduced labels) Classification Report:")
print(classification_report(y_test_reduced, y_pred_nb_reduced, zero_division=0))
print("\nLogistic Regression (reduced labels) Classification Report:")
print(classification_report(y_test_reduced, y_pred_lr_reduced, zero_division=0))

Naive Bayes Classification Report:
                                                                      precision    recall  f1-score   support

                                                                           0.00      0.00      0.00         2
                                                               [144]       0.00      0.00      0.00         1
                                                              action       0.26      0.11      0.15       221
                                                             action        0.00      0.00      0.00         7
                                                    action & romance       0.00      0.00      0.00         1
                                                  action / adventure       0.00      0.00      0.00         2
                                   action / adventure / martial arts       0.00      0.00      0.00         1
                                       action / adventure / thriller       0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
