In [None]:
# =====================================================================
# MOVIE GENRE CLASSIFICATION – COMPLETE END-TO-END PROJECT CODE
# Dataset: IMDB Genre Classification (Kaggle)
# Techniques: NLP + TF-IDF + Classical ML
# Models: Naive Bayes, Logistic Regression, Linear SVM
# =====================================================================

# -----------------------------
# 0. IMPORT LIBRARIES
# -----------------------------
import pandas as pd
import numpy as np
import re
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# -----------------------------
# 1. LOAD DATA
# -----------------------------
# Locations of the labelled training file and the unlabelled test file.
TRAIN_PATH = "/content/train_data.txt"
TEST_PATH = "/content/test_data.txt"

# Rows are parsed as "id ::: title ::: genre ::: plot". A separator longer
# than one character is handled by the python parsing engine.
train_df = pd.read_csv(
    TRAIN_PATH, sep=":::", engine="python", names=["id", "title", "genre", "plot"]
)

# Keep only the model input (plot) and the label (genre), and trim the
# whitespace that surrounds the label.
train_df = train_df[["plot", "genre"]].assign(
    genre=lambda frame: frame["genre"].str.strip()
)

print("Train shape:", train_df.shape)
genre_counts = train_df["genre"].value_counts()
print("\nTop genres:\n", genre_counts.head())

# -----------------------------
# 2. TEXT PREPROCESSING
# -----------------------------
def clean_text(text):
    """Normalize a plot description before TF-IDF vectorization.

    The text is lowercased, every character other than ``a``-``z`` and the
    space character (digits, punctuation, tabs, newlines, non-ASCII letters)
    is replaced by a space, runs of whitespace are collapsed into a single
    space, and leading/trailing whitespace is removed.

    Args:
        text: Raw plot text. ``None`` and float NaN (the value pandas uses
            for a missing field) are treated as an empty plot. Any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``""`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r"[^a-z ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Normalize every training plot element-wise with clean_text.
train_df["plot"] = train_df["plot"].map(clean_text)

# -----------------------------
# 3. TRAIN–TEST SPLIT
# -----------------------------
# Features are the cleaned plots, labels are the genres.
x, y = train_df["plot"], train_df["genre"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# genre label and seeded so it is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 4. TF-IDF FEATURE EXTRACTION
# -----------------------------
# Unigram + bigram TF-IDF with English stop words removed, terms seen in
# fewer than 2 documents dropped, and the vocabulary capped at 10000 entries.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_features=10000,
)

# The vocabulary and IDF weights are learned from the training split only;
# the held-out split is merely transformed with them.
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

print("TF-IDF feature size:", x_train_tfidf.shape[1])

# -----------------------------
# 5. DEFINE MODELS
# -----------------------------
# Candidate classifiers keyed by display name; insertion order is the order
# in which they are trained and reported.
models = {}
models["Naive Bayes"] = MultinomialNB()
models["Logistic Regression"] = LogisticRegression(max_iter=1500)
models["SVM"] = LinearSVC()

# Filled by the evaluation loop: display name -> fitted model and its scores.
results = {}

# -----------------------------
# 6. TRAIN & EVALUATE MODELS
# -----------------------------
for name, model in models.items():
    # Fit on the training features and score on the held-out split.
    model.fit(x_train_tfidf, y_train)
    preds = model.predict(x_test_tfidf)

    # Some genres are never predicted, which makes their precision
    # undefined. zero_division=0 keeps the value scikit-learn already falls
    # back to (0.0) but stops it from emitting an UndefinedMetricWarning on
    # every metric call.
    results[name] = {
        "model": model,
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(
            y_test, preds, average="weighted", zero_division=0
        ),
        "recall": recall_score(
            y_test, preds, average="weighted", zero_division=0
        ),
        "f1": f1_score(y_test, preds, average="weighted", zero_division=0),
    }

    print("\n" + "=" * 60)
    print("MODEL:", name)
    print("Accuracy :", results[name]["accuracy"])
    print("Precision:", results[name]["precision"])
    print("Recall   :", results[name]["recall"])
    print("F1 Score :", results[name]["f1"])
    print("\nClassification Report:\n")
    print(classification_report(y_test, preds, zero_division=0))

# -----------------------------
# 7. CONFUSION MATRIX (BEST MODEL)
# -----------------------------
# Pick the entry with the highest weighted F1; on a tie the model that was
# evaluated first wins.
best_model_name, best_entry = max(
    results.items(), key=lambda item: item[1]["f1"]
)
best_model = best_entry["model"]

print("\nBEST MODEL:", best_model_name)
print("\nConfusion Matrix:\n")
best_preds = best_model.predict(x_test_tfidf)
print(confusion_matrix(y_test, best_preds))

# -----------------------------
# 8. PREDICTION FUNCTION
# -----------------------------
def predict_genre(plot_text):
    """Predict the genre label for one raw plot description.

    The text is normalized with ``clean_text``, vectorized with the fitted
    module-level ``tfidf`` vectorizer and classified by ``best_model``.

    Args:
        plot_text: Raw plot description.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    cleaned = clean_text(plot_text)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = tfidf.transform([cleaned])
    prediction = best_model.predict(features)
    return prediction[0]

# -----------------------------
# 9. CUSTOM SAMPLE TEST
# -----------------------------
# Hand-written plot used as a quick smoke test of predict_genre.
sample_plot = (
    "\n"
    "A group of friends embark on a dangerous mission filled with\n"
    "action, explosions, betrayal, and intense fight sequences.\n"
)

sample_genre = predict_genre(sample_plot)
print("\nSample Prediction:", sample_genre)

# -----------------------------
# 10. PREDICT ON UNSEEN TEST DATA
# -----------------------------
# Only runs when the unlabelled test file is present; otherwise this step is
# skipped silently.
if os.path.exists(TEST_PATH):
    OUTPUT_FILE = "/content/final_movie_genre_predictions.csv"

    # The test file has no genre column: "id ::: title ::: plot".
    test_df = pd.read_csv(
        TEST_PATH, sep=":::", engine="python", names=["id", "title", "plot"]
    )

    # Same cleaning and the same fitted vectorizer as the training data.
    test_df["plot"] = test_df["plot"].map(clean_text)
    test_vec = tfidf.transform(test_df["plot"])
    test_df["predicted_genre"] = best_model.predict(test_vec)

    test_df.to_csv(OUTPUT_FILE, index=False)
    print("\nPredictions saved to:", OUTPUT_FILE)

# -----------------------------
# 11. SAVE MODEL (OPTIONAL)
# -----------------------------
import pickle

# Persist the winning classifier first, then the fitted vectorizer, so both
# can be reloaded together for later inference.
for artifact_path, artifact in (
    ("/content/genre_model.pkl", best_model),
    ("/content/tfidf_vectorizer.pkl", tfidf),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("\nModel and vectorizer saved successfully.")

# =====================================================================
# END OF PROJECT
# =====================================================================


Train shape: (54214, 2)

Top genres:
 genre
drama          13613
documentary    13096
comedy          7447
short           5073
horror          2204
Name: count, dtype: int64
TF-IDF feature size: 10000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



MODEL: Naive Bayes
Accuracy : 0.5173844876879092
Precision: 0.4961611192025106
Recall   : 0.5173844876879092
F1 Score : 0.4316055384767194

Classification Report:

              precision    recall  f1-score   support

      action       0.53      0.03      0.06       263
       adult       0.50      0.03      0.06       118
   adventure       0.80      0.05      0.10       155
   animation       0.00      0.00      0.00       100
   biography       0.00      0.00      0.00        53
      comedy       0.51      0.43      0.47      1490
       crime       0.00      0.00      0.00       101
 documentary       0.57      0.90      0.69      2619
       drama       0.46      0.83      0.59      2723
      family       0.00      0.00      0.00       157
     fantasy       0.00      0.00      0.00        65
   game-show       1.00      0.05      0.10        39
     history       0.00      0.00      0.00        49
      horror       0.77      0.31      0.44       441
       music       0.75 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



MODEL: Logistic Regression
Accuracy : 0.5831411970856774
Precision: 0.5612417757873027
Recall   : 0.5831411970856774
F1 Score : 0.5389245351714556

Classification Report:

              precision    recall  f1-score   support

      action       0.55      0.26      0.35       263
       adult       0.74      0.26      0.39       118
   adventure       0.57      0.10      0.17       155
   animation       0.60      0.06      0.11       100
   biography       0.00      0.00      0.00        53
      comedy       0.52      0.59      0.56      1490
       crime       0.33      0.02      0.04       101
 documentary       0.66      0.86      0.75      2619
       drama       0.54      0.78      0.64      2723
      family       0.57      0.08      0.14       157
     fantasy       0.00      0.00      0.00        65
   game-show       1.00      0.38      0.56        39
     history       0.00      0.00      0.00        49
      horror       0.68      0.57      0.62       441
       music    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



MODEL: SVM
Accuracy : 0.5647883427095822
Precision: 0.5351920556769874
Recall   : 0.5647883427095822
F1 Score : 0.5420112411998909

Classification Report:

              precision    recall  f1-score   support

      action       0.40      0.35      0.37       263
       adult       0.59      0.40      0.47       118
   adventure       0.41      0.23      0.30       155
   animation       0.29      0.17      0.21       100
   biography       0.00      0.00      0.00        53
      comedy       0.51      0.57      0.54      1490
       crime       0.23      0.07      0.11       101
 documentary       0.68      0.81      0.74      2619
       drama       0.56      0.68      0.62      2723
      family       0.31      0.16      0.21       157
     fantasy       0.18      0.05      0.07        65
   game-show       0.88      0.56      0.69        39
     history       0.44      0.08      0.14        49
      horror       0.61      0.62      0.62       441
       music       0.58      0.4