In [2]:
# ✅ STEP 1: Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
nltk.download('stopwords')

# ✅ STEP 2: Upload All 4 Files
from google.colab import files
uploaded = files.upload()

# ✅ STEP 3: Load and Clean Training Data
train_plots = []
train_genres = []

with open("train_data.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            _, _, genre, plot = parts
            train_genres.append(genre)
            train_plots.append(plot)

# Clean plot text
stop_words = set(stopwords.words("english"))
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return " ".join([word for word in text.split() if word not in stop_words])

cleaned_train_plots = [clean_text(p) for p in train_plots]

# ✅ STEP 4: Vectorize + Encode
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(cleaned_train_plots)

le = LabelEncoder()
y_train = le.fit_transform(train_genres)

# ✅ STEP 5: Train the Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("🎯 Model training completed!")

# ✅ STEP 6: Predict Test Data
test_titles, test_plots = [], []

with open("test_data.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 3:
            _, title, plot = parts
            test_titles.append(title)
            test_plots.append(plot)

cleaned_test_plots = [clean_text(p) for p in test_plots]
X_test = vectorizer.transform(cleaned_test_plots)
y_pred_test = model.predict(X_test)

# ✅ STEP 7: Load Actual Genres from test_data_solution.txt
true_genres = []

with open("test_data_solution.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 2:
            _, genre = parts
            true_genres.append(genre)

print("✅ Loaded true genres:", len(true_genres))
print("✅ Predicted genres:", len(y_pred_test))

# Evaluate if lengths match
if len(true_genres) == len(y_pred_test):
    y_true = le.transform(true_genres)
    print("🎯 Accuracy on Test Data:", accuracy_score(y_true, y_pred_test))
    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred_test, target_names=le.classes_))
else:
    print("❌ ERROR: test_data and solution length mismatch")

# ✅ STEP 8: Predict from Custom Plot Input
user_plot = input("✍️ Enter a movie plot to predict genre:\n")
cleaned_input = clean_text(user_plot)
input_vector = vectorizer.transform([cleaned_input])
predicted_genre = le.inverse_transform(model.predict(input_vector))[0]
print(f"🎬 Predicted Genre: {predicted_genre}")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving description.txt to description.txt
Saving test_data.txt to test_data.txt
Saving test_data_solution.txt to test_data_solution.txt
Saving train_data.txt to train_data.txt
🎯 Model training completed!
✅ Loaded true genres: 0
✅ Predicted genres: 54200
❌ ERROR: test_data and solution length mismatch
✍️ Enter a movie plot to predict genre:
A young boy is diagnosed with a terminal illness and forms a bond with his nurse.
🎬 Predicted Genre: drama
