# MOVIE GENRE CLASSIFICATION

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re

In [3]:
def load_data(file_name, data_dir='GenreClassificationDataset'):
    """Read a ' ::: '-delimited text file into a DataFrame of string fields.

    Every non-blank line is stripped of surrounding whitespace and split on
    the literal separator ' ::: '. Each resulting list of fields becomes one
    row. Columns keep pandas' default integer labels so the caller can name
    them afterwards.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory holding the dataset files. Defaults to
        'GenreClassificationDataset' (relative to the working directory),
        which is the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, one column per field.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    rows = []
    with open(f'{data_dir}/{file_name}', 'r', encoding='latin1') as file:
        for line in file:
            stripped = line.strip()
            # A blank line (for instance a trailing empty line at the end of
            # the file) would otherwise turn into a row holding one empty
            # field, so it is skipped instead of being loaded as a record.
            if not stripped:
                continue
            rows.append(stripped.split(' ::: '))
    return pd.DataFrame(rows)

# Read both splits from disk first; load_data returns frames with default
# integer column labels.
train_data = load_data('train_data.txt')
test_data = load_data('test_data.txt')

# Give the columns readable names. The training split is labelled with four
# names (including GENRE) and the test split with three; pandas raises if a
# frame's column count does not match the list assigned to it.
train_data.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
test_data.columns = ['ID', 'TITLE', 'DESCRIPTION']


In [4]:
def preprocess_text(text):
    """Return ``text`` reduced to lower-case ASCII letters and whitespace.

    Every character that is neither an ASCII letter (a-z, A-Z) nor
    whitespace is deleted (not replaced by a space), and the remainder is
    lower-cased. Digits and punctuation therefore vanish and the pieces on
    either side of them are joined together.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Filter first, lower-case second: the character class is evaluated on
    # the original characters.
    letters_and_whitespace = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_whitespace.lower()

# Clean the DESCRIPTION column of both splits with the same routine so that
# training, validation and test text all go through identical preprocessing.
for _frame in (train_data, test_data):
    _frame['DESCRIPTION'] = _frame['DESCRIPTION'].apply(preprocess_text)

# Hold out 20% of the labelled data for validation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['DESCRIPTION'],
    train_data['GENRE'],
    test_size=0.2,
    random_state=42,
)

# Vectorize text data using TF-IDF. The vectorizer is fitted on the training
# portion only and then reused, unchanged, for the validation and test text.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(test_data['DESCRIPTION'])

In [5]:
# Fit a logistic-regression classifier on the TF-IDF training features,
# allowing the solver up to 1000 iterations. fit() returns the estimator
# itself, so construction and fitting can be chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predictions on the held-out validation features, used for evaluation later.
lr_val_pred = lr_model.predict(X_val_tfidf)

# Single-description prediction helper
def genre(description, vectorizer, model):
    """Predict the genre label for one free-text description.

    The description is cleaned with ``preprocess_text``, turned into a
    one-row feature matrix by ``vectorizer.transform``, and passed to
    ``model.predict``.

    Parameters
    ----------
    description : str
        Raw description text.
    vectorizer : object
        Fitted vectorizer exposing ``transform(list_of_str)``.
    model : object
        Fitted classifier exposing ``predict(features)``.

    Returns
    -------
    The first (and only) element of the model's prediction for this text.
    """
    cleaned = preprocess_text(description)
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Try the helper on one hand-written description using the fitted
# vectorizer and the logistic-regression model.
description = (
    "A talented film director with difficulty to deal with his sickness, "
    "which is making him lose his friends and family."
)
predicted_genre = genre(description, vectorizer, lr_model)
print(f'The predicted genre is: {predicted_genre}')

The predicted genre is: drama


In [8]:
print("Logistic Regression Classification Report:")
# Some genres are never predicted on the validation split, which makes their
# precision undefined. zero_division=0 reports those scores as 0 (the same
# numbers the default setting prints) without emitting a warning per metric.
print(classification_report(y_val, lr_val_pred, zero_division=0))

# Logistic regression is used as the final model here on the assumption that
# it is the best performer; revisit this if other models are evaluated.
best_model = lr_model
# Store the predicted genre for every test description as a new column.
test_data['PREDICTED_GENRE'] = best_model.predict(X_test_tfidf)

Logistic Regression Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

      action       0.55      0.27      0.36       263
       adult       0.72      0.21      0.32       112
   adventure       0.44      0.14      0.21       139
   animation       0.67      0.10      0.17       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.58      0.54      1443
       crime       0.29      0.02      0.04       107
 documentary       0.66      0.84      0.74      2659
       drama       0.54      0.77      0.64      2697
      family       0.40      0.08      0.13       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.95      0.47      0.63        40
     history       0.00      0.00      0.00        45
      horror       0.62      0.56      0.59       431
       music       0.66      0.49      0.56       144
     musical       0.50      0.02      0.04        50
     mystery       0.00      0.00      0.00        56
        news       1.00    