In [3]:
# ✅ STEP 1: Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
nltk.download('stopwords')

# ✅ STEP 2: Load the Training File
# NOTE: hard-coded, machine-specific location of the training data.
file_path = r"C:\Users\Admin\OneDrive\Documents\ML-INTERNSHIP\Task 1\train_data.txt"

# Read the file a single time; `data` is a list with one string per line.
with open(file_path, "r", encoding="utf-8") as f:
    data = f.readlines()

print("✅ File loaded successfully!")

# Example: Print first 5 lines
# Each line still carries its trailing "\n", so strip it before printing;
# otherwise print() adds a second newline and the preview is double-spaced.
for line in data[:5]:
    print(line.rstrip("\n"))


# ✅ STEP 3: Load and Clean Training Data
train_plots = []
train_genres = []
skipped_train_lines = 0

# Each record has four " ::: "-separated fields; the last two are the genre
# and the plot. Reuse `file_path` so this step parses the same file that was
# loaded in STEP 2 instead of depending on the current working directory.
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        # maxsplit=3 keeps a plot that itself contains " ::: " in one piece
        # instead of producing extra fields and losing the record.
        parts = line.strip().split(" ::: ", 3)
        if len(parts) == 4:
            _, _, genre, plot = parts
            train_genres.append(genre)
            train_plots.append(plot)
        elif line.strip():
            # Non-blank line that does not match the expected layout.
            skipped_train_lines += 1

if skipped_train_lines:
    print(f"⚠️ Skipped {skipped_train_lines} malformed training line(s)")

# Clean plot text
# A set gives O(1) membership tests when filtering words in clean_text().
stop_words = set(stopwords.words("english"))
# Matches every character that is neither a lowercase ASCII letter nor
# whitespace. Compiled once so all calls share the same pattern object.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")


def clean_text(text, stopword_set=None):
    """Normalize a plot description before vectorization.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is replaced by a space, and stop words are dropped.

    Args:
        text: Raw plot text.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces (possibly ``""``).
    """
    if stopword_set is None:
        stopword_set = stop_words
    lowered = text.lower()
    # Substitute a space rather than the empty string: tokens separated only
    # by punctuation ("10-year-old", "end.The") must stay separate words, and
    # contractions split into pieces ("don't" -> "don", "t") that the
    # stop-word filter can still recognise.
    letters_only = _NON_LETTER_RE.sub(" ", lowered)
    return " ".join(
        word for word in letters_only.split() if word not in stopword_set
    )

# Normalize every training plot with clean_text().
cleaned_train_plots = list(map(clean_text, train_plots))

# ✅ STEP 4: Vectorize + Encode
# Genre strings -> integer class ids.
le = LabelEncoder()
y_train = le.fit_transform(train_genres)

# Cleaned plots -> TF-IDF features, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(cleaned_train_plots)

# ✅ STEP 5: Train the Model
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("🎯 Model training completed!")

# ✅ STEP 6: Predict Test Data
test_titles, test_plots = [], []

with open(r"C:\Users\Admin\OneDrive\Documents\ML-INTERNSHIP\Task 1\test_data.txt", "r", encoding="utf-8") as f:
    for line in f:
        # Each record has three " ::: "-separated fields: id, title, plot.
        # maxsplit=2 keeps a plot that itself contains " ::: " intact, so the
        # row is not dropped. Dropped rows would shift every later prediction
        # relative to the solution file, which is compared by position.
        parts = line.strip().split(" ::: ", 2)
        if len(parts) == 3:
            _, title, plot = parts
            test_titles.append(title)
            test_plots.append(plot)

# Apply the same cleaning and the already-fitted vectorizer used for training
# (transform only -- the vocabulary must not be refitted on test data).
cleaned_test_plots = [clean_text(p) for p in test_plots]
X_test = vectorizer.transform(cleaned_test_plots)
y_pred_test = model.predict(X_test)

# ✅ STEP 7: Load Actual Genres from test_data_solution.txt
from sklearn.metrics import classification_report, accuracy_score

true_genres = []
with open("test_data_solution.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split(" ::: ")
        if len(parts) == 2:
            # "<id> ::: <genre>"
            true_genres.append(parts[1])
        elif len(parts) >= 4:
            # NOTE(review): assumes a solution line with four or more fields
            # uses the training layout (id, title, genre, plot), so the genre
            # is the third field -- confirm against the actual file. Without
            # this branch such lines are silently dropped and the evaluation
            # below can never run.
            true_genres.append(parts[2])

# ✅ Evaluation with Fix
# Predictions and solutions are matched purely by position, so the two
# sequences must have the same length before they can be compared.
if len(true_genres) == len(y_pred_test):
    y_true = le.transform(true_genres)
    # Report only the classes that occur in the truth or the predictions, so
    # that `labels` and `target_names` stay the same length.
    labels_in_use = sorted(set(y_true) | set(y_pred_test))

    print("🎯 Accuracy on Test Data:", accuracy_score(y_true, y_pred_test))
    print("\n📝 Classification Report:\n")
    print(classification_report(
        y_true, y_pred_test,
        labels=labels_in_use,
        target_names=le.classes_[labels_in_use],
        zero_division=0  # ✅ suppresses recall/f1 warnings
    ))
else:
    # Name each count explicitly so the two numbers cannot be confused.
    print(f"❌ ERROR: test_data and solution length mismatch → {len(y_pred_test)} predictions vs {len(true_genres)} solution labels")




# ✅ STEP 8: Predict from Custom Plot Input
user_plot = input("✍️ Enter a movie plot to predict genre:\n")
cleaned_input = clean_text(user_plot)
input_vector = vectorizer.transform([cleaned_input])

if input_vector.nnz == 0:
    # No word of the input is in the fitted vocabulary (empty input, only
    # stop words, or only unseen words). The feature vector is all zeros, so
    # a "prediction" would not depend on the text at all.
    predicted_genre = None
    print("⚠️ No known words in the plot after cleaning; cannot predict a genre.")
else:
    # Map the predicted class id back to its genre string.
    predicted_genre = le.inverse_transform(model.predict(input_vector))[0]
    print(f"🎬 Predicted Genre: {predicted_genre}")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ File loaded successfully!
✅ File loaded successfully!
1 ::: Oscar et la dame rose (2009) ::: drama ::: Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.

2 ::: Cupid (1997) ::: thriller ::: A brother and sister with a past incestuous relationship have a current murderous relationship. He murders the women who reject him and she murders the women who get too close to him.

3 ::: Young, Wild and Wonderful (1980) ::: adult ::: As the bus empties the students for their field trip to the Museum of Natu