# Supervised Learning Project: Movie Genre Classification
This notebook covers data preprocessing, model training, evaluation, and ensemble learning using a movie genre classification dataset.


## 1. Data Loading
We start by loading the dataset.

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [65]:
# Read the movie dataset from a CSV file (path is relative to the notebook's
# working directory) and preview its first rows.
df = pd.read_csv(
    'movie_genre_classification_final.csv',
)
df.head()

Unnamed: 0,Title,Year,Director,Duration,Rating,Votes,Description,Language,Country,Budget_USD,BoxOffice_USD,Genre,Production_Company,Content_Rating,Lead_Actor,Num_Awards,Critic_Reviews
0,Winds of Fate 4,1980,R. Lee,167,4.1,182425,A touching love story with heartwarming moments.,Spanish,China,39979615,179936008,Romance,DreamWorks,R,Kangana Ranaut,8,229
1,Firestorm 11,2014,S. Chen,166,4.1,449351,A fast-paced thriller with intense action scenes.,Korean,China,116404774,802121619,Action,Netflix,R,Kangana Ranaut,20,466
2,Silent Echo 2,2016,A. Khan,170,4.1,363328,A fast-paced thriller with intense action scenes.,Korean,Japan,166261330,225526871,Action,Pixar,PG,Amitabh Bachchan,16,539
3,City Lights 4,1982,L. Zhang,170,9.9,62371,An emotional journey exploring complex charact...,Japanese,Japan,28861315,69813738,Drama,Netflix,NC-17,Natalie Portman,15,606
4,Broken Truth 1,1990,L. Zhang,91,5.3,4600,An imaginative world filled with magic and won...,Korean,USA,43890403,375136716,Fantasy,Studio Ghibli,PG,Chris Evans,6,330


## 2. Data Preprocessing
We will drop irrelevant columns, encode categorical variables, and prepare the data for training.

In [66]:
# Display the column labels of the loaded frame before any columns are dropped or encoded.
df.columns

Index(['Title', 'Year', 'Director', 'Duration', 'Rating', 'Votes',
       'Description', 'Language', 'Country', 'Budget_USD', 'BoxOffice_USD',
       'Genre', 'Production_Company', 'Content_Rating', 'Lead_Actor',
       'Num_Awards', 'Critic_Reviews'],
      dtype='object')

In [67]:
# Remove the columns that this analysis does not use for classification.
unused_columns = [
    'Language',
    'Country',
    'Content_Rating',
    'Director',
    'Production_Company',
    'Lead_Actor',
]
df = df.drop(columns=unused_columns)

In [68]:
# Encode the text feature columns as integer codes.
# Each column gets its own LabelEncoder: re-fitting one shared instance would
# overwrite the previous mapping, making it impossible to decode a column later.
categorical_cols = ['Title', 'Description']
feature_encoders = {}
for col in categorical_cols:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# Encode the target variable with a dedicated encoder so predicted class codes
# can be mapped back to genre names via genre_encoder.inverse_transform(...).
genre_encoder = LabelEncoder()
df['Genre'] = genre_encoder.fit_transform(df['Genre'])

# Backward-compatible alias: `le` has always ended this cell fitted on Genre.
le = genre_encoder

In [69]:
# Build the feature matrix X and the label vector y from the encoded frame.
# NOTE(review): in the preview rows the Description text looks genre-specific
# (both Action films share one sentence) -- confirm it does not leak the target.
feature_cols = ['Title', 'Description', 'Budget_USD']
target_col = 'Genre'
X = df[feature_cols]
y = df[target_col]

## 3. Train-Test Split
We split the data into training and testing sets.

In [70]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## 4. Model Training with Random Forest
We train a Random Forest Classifier.

In [71]:
# Fit a Random Forest on the training split.
# The seed is fixed (same value as the train/test split) so that a fresh
# Restart & Run All builds the same forest instead of a new random one each time.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

## 5. Model Evaluation
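We evaluate the Random Forest model's performance on the held-out test set. Note that a perfect score on every class is unusual: treat it as a sign of possible target leakage (for example, the `Description` text may map one-to-one to `Genre`) rather than as proof of a strong model.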

In [72]:
# Predict on the held-out rows, then compute every metric before printing.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Overall accuracy
print('Accuracy:', accuracy)

# Per-class counts of true vs. predicted labels
print('Confusion Matrix:\n', cm)

# Precision / recall / F1 for each class
print('Classification Report:\n', report)

Accuracy: 1.0
Confusion Matrix:
 [[2114    0    0    0    0    0    0]
 [   0 2161    0    0    0    0    0]
 [   0    0 2154    0    0    0    0]
 [   0    0    0 2116    0    0    0]
 [   0    0    0    0 2188    0    0]
 [   0    0    0    0    0 2152    0]
 [   0    0    0    0    0    0 2115]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2114
           1       1.00      1.00      1.00      2161
           2       1.00      1.00      1.00      2154
           3       1.00      1.00      1.00      2116
           4       1.00      1.00      1.00      2188
           5       1.00      1.00      1.00      2152
           6       1.00      1.00      1.00      2115

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000



## 6. Ensemble Learning with Voting Classifier
We combine Logistic Regression, Decision Tree, and Random Forest.

In [73]:
# Create individual models.
# The tree-based estimators are seeded so the ensemble is reproducible on a
# fresh kernel; without a seed every run grows different trees.
model1 = LogisticRegression(max_iter=200)
model2 = DecisionTreeClassifier(random_state=42)
model3 = RandomForestClassifier(random_state=42)

# Create Voting Classifier: 'hard' voting predicts the majority class label
# among the three base estimators.
voting_model = VotingClassifier(
    estimators=[('lr', model1), ('dt', model2), ('rf', model3)],
    voting='hard',
)

# Train the ensemble model (fits each base estimator on the training split)
voting_model.fit(X_train, y_train)

# Evaluate the ensemble model on the held-out test split
y_pred_voting = voting_model.predict(X_test)
voting_accuracy = accuracy_score(y_test, y_pred_voting)
print('Voting Classifier Accuracy:', voting_accuracy)

# Classification Report for Ensemble
print('Classification Report:\n', classification_report(y_test, y_pred_voting))

Voting Classifier Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2114
           1       1.00      1.00      1.00      2161
           2       1.00      1.00      1.00      2154
           3       1.00      1.00      1.00      2116
           4       1.00      1.00      1.00      2188
           5       1.00      1.00      1.00      2152
           6       1.00      1.00      1.00      2115

    accuracy                           1.00     15000
   macro avg       1.00      1.00      1.00     15000
weighted avg       1.00      1.00      1.00     15000

