In [3]:
import pandas as pd
import json

# Load the dataset
movies_df = pd.read_csv('tmdb_5000_movies.csv')

# Drop rows with missing overviews or genres
movies_df = movies_df.dropna(subset=['overview', 'genres'])

# Extract the first genre name from the genres column
def extract_main_genre(genre_str):
    try:
        genres = json.loads(genre_str.replace("'", '"'))
        if genres:
            return genres[0]['name']  # Use only the first genre as label
    except:
        return None

movies_df['main_genre'] = movies_df['genres'].apply(extract_main_genre)
movies_df = movies_df.dropna(subset=['main_genre'])

# Check class distribution
print(movies_df['main_genre'].value_counts())


Drama              1206
Comedy             1042
Action              754
Adventure           339
Horror              300
Crime               195
Thriller            194
Animation           123
Fantasy             117
Romance             106
Science Fiction      96
Documentary          87
Family               56
Mystery              41
Music                34
Western              27
History              25
War                  24
TV Movie              4
Foreign               2
Name: main_genre, dtype: int64


In [4]:
# I want to focus on the most common genres, so I'm keeping only the top 8
top_genres = movies_df['main_genre'].value_counts().nlargest(8).index.tolist()

# Now I’ll filter the dataset to include only those top genres
filtered_df = movies_df[movies_df['main_genre'].isin(top_genres)].copy()

# Just checking how many rows are left and which genres are included
print(filtered_df.shape)
print(filtered_df['main_genre'].value_counts())


(4153, 21)
Drama        1206
Comedy       1042
Action        754
Adventure     339
Horror        300
Crime         195
Thriller      194
Animation     123
Name: main_genre, dtype: int64


In [6]:
import nltk
import re
from nltk.corpus import stopwords

# Download only what's needed
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# I’m keeping this simple — just cleaning and removing stopwords, no lemmatization
def preprocess_text(text):
    text = text.lower()  # lowercase the text
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation and numbers
    words = nltk.word_tokenize(text)  # tokenize words
    words = [word for word in words if word not in stop_words]  # remove stopwords
    return ' '.join(words)  # join back to a cleaned string

# Applying this to all overviews
filtered_df['clean_overview'] = filtered_df['overview'].apply(preprocess_text)

# Let me check a few cleaned samples
filtered_df[['overview', 'clean_overview']].head(3)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\govin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\govin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,overview,clean_overview
0,"In the 22nd century, a paraplegic Marine is di...",nd century paraplegic marine dispatched moon p...
1,"Captain Barbossa, long believed to be dead, ha...",captain barbossa long believed dead come back ...
2,A cryptic message from Bond’s past sends him o...,cryptic message bonds past sends trail uncover...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# I'm limiting the max features to 5000 for performance (you can tune this)
tfidf = TfidfVectorizer(max_features=5000)

# Applying TF-IDF on the cleaned overview column
X = tfidf.fit_transform(filtered_df['clean_overview']).toarray()

# These are our input features now
print("TF-IDF shape:", X.shape)


TF-IDF shape: (4153, 5000)


In [8]:
from sklearn.preprocessing import LabelEncoder

# I'm creating the label encoder instance
le = LabelEncoder()

# Fitting it on the main_genre column and transforming it
y = le.fit_transform(filtered_df['main_genre'])

# Let’s check how classes were encoded
print("Encoded Classes:", list(le.classes_))
print("Target shape:", y.shape)


Encoded Classes: ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Horror', 'Thriller']
Target shape: (4153,)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Use X instead of X_tfidf
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.4452466907340554

Classification Report:
               precision    recall  f1-score   support

           0       0.45      0.50      0.48       151
           1       0.40      0.03      0.05        68
           2       0.00      0.00      0.00        25
           3       0.46      0.57      0.51       208
           4       0.00      0.00      0.00        39
           5       0.43      0.68      0.53       241
           6       0.62      0.13      0.22        60
           7       0.00      0.00      0.00        39

    accuracy                           0.45       831
   macro avg       0.29      0.24      0.22       831
weighted avg       0.40      0.45      0.39       831



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(4153, 5000)


* Insights
The model achieved ~44% accuracy using only plot summaries, showing that genre prediction from text is possible.
Genres like Drama and Comedy performed better due to more data.
Less frequent genres like Horror and Animation had low performance.
TF-IDF gave a decent start, but more advanced techniques like BERT could improve results.
The model is biased toward dominant genres, and multilabel classification could better reflect real-world movie genres.