# MOVIE GENRE CLASSIFICATION

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Reading the data

In [2]:
# Read and parse the data
def read_data(file_path):
    """Parse a ' ::: '-delimited text file into a genre/plot DataFrame.

    Each line is stripped and split on the literal separator ' ::: '.
    Only lines that yield exactly four fields are kept; the third field is
    stored as the genre and the fourth as the plot summary. Lines with any
    other number of fields are skipped silently.

    Args:
        file_path: Path to the text file to read.

    Returns:
        pandas.DataFrame with columns ['genre', 'plot'], one row per kept
        line (empty, with the same columns, when no line qualifies).
    """
    data = []
    # Pin the encoding so decoding does not depend on the platform's locale
    # default. NOTE(review): assumes the data files are UTF-8 — confirm.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if len(parts) == 4:
                data.append((parts[2], parts[3]))  # (genre, plot summary)
    return pd.DataFrame(data, columns=['genre', 'plot'])

# Parse both splits with the same reader: training file first, then the
# test file that also carries a genre field.
train_data, test_data = (
    read_data(source) for source in ('train_data.txt', 'test_data_solution.txt')
)

# Preview the first rows of each frame as a quick sanity check
print("Training data:\n", train_data.head())
print("Test data:\n", test_data.head())

Training data:
       genre                                               plot
0     drama  Listening in to a conversation between his doc...
1  thriller  A brother and sister with a past incestuous re...
2     adult  As the bus empties the students for their fiel...
3     drama  To help their unemployed father make ends meet...
4     drama  The film's title refers not only to the un-rec...
Test data:
          genre                                               plot
0     thriller  L.R. Brane loves his life - his car, his apart...
1       comedy  Spain, March 1964: Quico is a very naughty chi...
2  documentary  One year in the life of Albin and his family o...
3        drama  His father has died, he hasn't spoken with his...
4        drama  Before he was known internationally as a marti...


# Cleaning the data

In [3]:
# Clean the text data
def clean_text(text):
    """Normalise a plot summary before vectorisation.

    Removes every character that is not an ASCII letter or ASCII whitespace,
    lower-cases what remains and trims leading/trailing whitespace.

    Args:
        text: Raw plot summary string.

    Returns:
        The cleaned, lower-cased, stripped string.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I | re.A` positionally (value 258)
    # capped the number of removed characters instead of setting any flag.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    return text.lower().strip()

# Run the same cleaning routine over the plot column of each split,
# overwriting the raw text in place.
train_data['plot'] = train_data['plot'].map(clean_text)
test_data['plot'] = test_data['plot'].map(clean_text)

# Preview the cleaned text to confirm the transformation was applied
print("Cleaned training data:\n", train_data['plot'].head())
print("Cleaned test data:\n", test_data['plot'].head())

Cleaned training data:
 0    listening in to a conversation between his doc...
1    a brother and sister with a past incestuous re...
2    as the bus empties the students for their fiel...
3    to help their unemployed father make ends meet...
4    the films title refers not only to the unrecov...
Name: plot, dtype: object
Cleaned test data:
 0    lr brane loves his life  his car his apartment...
1    spain march  quico is a very naughty child of ...
2    one year in the life of albin and his family o...
3    his father has died he hasnt spoken with his b...
4    before he was known internationally as a marti...
Name: plot, dtype: object


# VECTORIZING TEXT USING TF-IDF

In [4]:
# Targets are the genre labels of each split
y_train, y_test = train_data['genre'], test_data['genre']

# Build TF-IDF features: the vectorizer is fitted on the training plots only
# and then reused, unchanged, to transform the test plots.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(train_data['plot'])
X_test = vectorizer.transform(test_data['plot'])

# Report the resulting matrix dimensions as a sanity check
print("Vectorized training data shape:", X_train.shape)
print("Vectorized test data shape:", X_test.shape)

Vectorized training data shape: (54214, 5000)
Vectorized test data shape: (54200, 5000)


# Naive Bayes

In [5]:
# Fit a multinomial Naive Bayes model on the TF-IDF training features
nb_model = MultinomialNB().fit(X_train, y_train)

# Score the model on the test split
y_pred_nb = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
# zero_division=0 is passed so undefined per-class metrics are reported as 0
report_nb = classification_report(y_test, y_pred_nb, zero_division=0)

print("Naive Bayes Classifier Accuracy:", accuracy_nb)
print("Naive Bayes Classifier Report:\n", report_nb)

Naive Bayes Classifier Accuracy: 0.5245940959409594
Naive Bayes Classifier Report:
               precision    recall  f1-score   support

      action       0.57      0.11      0.18      1314
       adult       0.52      0.07      0.12       590
   adventure       0.74      0.08      0.14       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.43      0.47      7446
       crime       0.00      0.00      0.00       505
 documentary       0.57      0.87      0.69     13096
       drama       0.46      0.82      0.59     13612
      family       0.67      0.00      0.01       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.97      0.33      0.49       193
     history       0.00      0.00      0.00       243
      horror       0.69      0.36      0.48      2204
       music       0.75      0.15      0.25       731
     musical       0.00      0.00      0.00       2

# Logistic regression

In [6]:
# Fit a Logistic Regression model on the TF-IDF training features,
# allowing up to 10000 solver iterations
lr_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Score the model on the test split
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
# zero_division=0 is passed so undefined per-class metrics are reported as 0
report_lr = classification_report(y_test, y_pred_lr, zero_division=0)

print("Logistic Regression Classifier Accuracy:", accuracy_lr)
print("Logistic Regression Classifier Report:\n", report_lr)

Logistic Regression Classifier Accuracy: 0.5827675276752767
Logistic Regression Classifier Report:
               precision    recall  f1-score   support

      action       0.48      0.29      0.36      1314
       adult       0.59      0.23      0.33       590
   adventure       0.57      0.16      0.26       775
   animation       0.47      0.06      0.11       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.58      0.55      7446
       crime       0.37      0.04      0.07       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.50      0.09      0.15       783
     fantasy       0.56      0.06      0.10       322
   game-show       0.91      0.51      0.66       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.60      2204
       music       0.68      0.45      0.54       731
     musical       0.24      0.02  