In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load and preprocess data
def load_data(file_path):
    """Read a ``" ::: "``-delimited movie file into a DataFrame.

    Each usable line has four fields: ``id ::: title ::: genre ::: plot``.
    Only the genre and plot are kept.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``"genre"`` and ``"plot"``, one row per well-formed line,
        in file order. Lines with fewer than four fields are skipped.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the first three separators only: the plot is free text
            # and may itself contain " ::: ". Without the limit such a line
            # would yield more than four parts and be silently discarded.
            parts = line.strip().split(" ::: ", 3)
            if len(parts) == 4:
                _, _, genre, plot = parts
                records.append((genre, plot))
    return pd.DataFrame(records, columns=["genre", "plot"])

# Labelled training set, and the test set together with its reference genres.
train_data, test_data = (
    load_data(path) for path in ("train_data.txt", "test_data_solution.txt")
)

In [4]:
# Text preprocessing (optional step for cleaning)
def preprocess_text(text):
    """Return a lowercase, letters-and-spaces-only version of ``text``.

    The string is lowercased, every character that is neither an ASCII
    letter nor whitespace is deleted, and each run of whitespace is then
    collapsed to a single space. Leading/trailing whitespace is collapsed
    but not stripped.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only)

# Clean the plot column of both frames with the same routine.
for frame in (train_data, test_data):
    frame['plot'] = frame['plot'].apply(preprocess_text)

In [5]:
# Hold out 20% of the labelled training data for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    train_data['plot'],
    train_data['genre'],
    random_state=42,
    test_size=0.2,
)

In [6]:
# TF-IDF feature extraction: the vectorizer is fitted on the training split only
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
# Validation and test plots are transformed with the already-fitted vectorizer
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(texts) for texts in (X_val, test_data['plot'])
)

In [7]:
# Model training and evaluation function
def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and print validation metrics.

    The model is fitted in place via ``model.fit``, then used to predict
    ``X_val``. The accuracy and a per-class classification report
    (with ``zero_division=0``) are printed; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    accuracy = accuracy_score(y_val, predictions)
    print(f"Accuracy: {accuracy}")

    report = classification_report(y_val, predictions, zero_division=0)
    print(report)

In [8]:
# Logistic Regression with class-balanced weights
print("Logistic Regression Results:")
lr_model = LogisticRegression(class_weight='balanced', max_iter=200)
train_and_evaluate(
    model=lr_model,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_val=X_val_tfidf,
    y_val=y_val,
)

Logistic Regression Results:
Accuracy: 0.43017924135056274
              precision    recall  f1-score   support

      action       0.26      0.35      0.30       133
       adult       0.25      0.64      0.36        50
   adventure       0.22      0.29      0.25        77
   animation       0.18      0.23      0.20        44
   biography       0.04      0.12      0.06        26
      comedy       0.55      0.35      0.43       678
       crime       0.08      0.22      0.12        41
 documentary       0.75      0.60      0.67      1179
       drama       0.66      0.34      0.45      1173
      family       0.11      0.22      0.15        65
     fantasy       0.06      0.10      0.07        31
   game-show       0.62      0.76      0.68        17
     history       0.02      0.13      0.04        15
      horror       0.50      0.60      0.55       194
       music       0.41      0.77      0.54        64
     musical       0.15      0.33      0.20        21
     mystery       0.1

In [9]:
# Support Vector Machine (linear kernel) with class-balanced weights
print("SVM Results:")
svm_model = SVC(class_weight='balanced', kernel='linear')
train_and_evaluate(
    model=svm_model,
    X_train=X_train_tfidf,
    y_train=y_train,
    X_val=X_val_tfidf,
    y_val=y_val,
)

SVM Results:
Accuracy: 0.5037515631513131
              precision    recall  f1-score   support

      action       0.27      0.41      0.32       133
       adult       0.43      0.58      0.50        50
   adventure       0.23      0.23      0.23        77
   animation       0.26      0.20      0.23        44
   biography       0.00      0.00      0.00        26
      comedy       0.50      0.47      0.49       678
       crime       0.14      0.17      0.15        41
 documentary       0.73      0.72      0.72      1179
       drama       0.62      0.47      0.54      1173
      family       0.16      0.26      0.20        65
     fantasy       0.08      0.03      0.05        31
   game-show       1.00      0.65      0.79        17
     history       0.10      0.07      0.08        15
      horror       0.53      0.57      0.55       194
       music       0.46      0.66      0.54        64
     musical       0.33      0.19      0.24        21
     mystery       0.15      0.12      

In [10]:
# Predict test-set genres with the fitted SVM (swap in lr_model to use Logistic Regression)
# and store them next to the reference labels.
test_predictions = test_data['predicted_genre'] = svm_model.predict(X_test_tfidf)

In [12]:
# Score the test predictions against the reference genres
print("\nTest Data Results:")
test_accuracy = accuracy_score(test_data['genre'], test_predictions)
print(f"Accuracy: {test_accuracy}")
test_report = classification_report(
    test_data['genre'], test_predictions, zero_division=0
)
print(test_report)


Test Data Results:
Accuracy: 0.5076923076923077
              precision    recall  f1-score   support

      action       0.27      0.42      0.33       607
       adult       0.35      0.51      0.41       254
   adventure       0.22      0.29      0.25       333
   animation       0.25      0.17      0.20       247
   biography       0.06      0.02      0.03       106
      comedy       0.52      0.50      0.51      3288
       crime       0.14      0.15      0.14       222
 documentary       0.73      0.71      0.72      5843
       drama       0.64      0.48      0.55      6100
      family       0.15      0.22      0.18       340
     fantasy       0.14      0.09      0.11       141
   game-show       0.81      0.56      0.66        84
     history       0.10      0.06      0.07       105
      horror       0.53      0.58      0.56       969
       music       0.42      0.65      0.51       343
     musical       0.09      0.05      0.06       107
     mystery       0.23      0.1

In [11]:
# Write the test frame, including the predicted_genre column, to CSV without the row index
test_data.to_csv(path_or_buf="predicted_test_data.csv", index=False)