SET UP

In [48]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

DATA EXPLORATION

In [5]:
data = pd.read_csv('TOP 100 IMDB MOVIES.csv')

In [19]:
data.shape

(100, 6)

In [11]:
data.head()

Unnamed: 0,rank,title,description,genre,rating,year
0,18,Spider-Man: Across the Spider-Verse,"Miles Morales catapults across the Multiverse,...","['Animation', 'Action', 'Adventure']",8.7,2023
1,32,Oppenheimer,"The story of American scientist, J. Robert Opp...","['Biography', 'Drama', 'History']",8.6,2023
2,77,Joker,A mentally troubled stand-up comedian embarks ...,"['Crime', 'Drama', 'Thriller']",8.4,2019
3,76,Avengers: Endgame,After the devastating events of Avengers: Infi...,"['Action', 'Adventure', 'Drama']",8.4,2019
4,37,Parasite,Greed and class discrimination threaten the ne...,"['Drama', 'Thriller']",8.5,2019


In [12]:
data.tail()

Unnamed: 0,rank,title,description,genre,rating,year
95,99,Citizen Kane,Following the death of publishing tycoon Charl...,"['Drama', 'Mystery']",8.3,1941
96,65,The Great Dictator,Dictator Adenoid Hynkel tries to expand his em...,"['Comedy', 'Drama', 'War']",8.4,1940
97,49,Modern Times,The Tramp struggles to live in modern industri...,"['Comedy', 'Drama', 'Romance']",8.5,1936
98,54,City Lights,"With the aid of a wealthy erratic tippler, a d...","['Comedy', 'Drama', 'Romance']",8.5,1931
99,100,M,When the police in a German city are unable to...,"['Crime', 'Mystery', 'Thriller']",8.3,1931


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rank         100 non-null    int64  
 1   title        100 non-null    object 
 2   description  100 non-null    object 
 3   genre        100 non-null    object 
 4   rating       100 non-null    float64
 5   year         100 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 4.8+ KB


In [18]:
data.isnull().sum()

rank           0
title          0
description    0
genre          0
rating         0
year           0
dtype: int64

In [20]:
data.duplicated().sum()

0

In [23]:
data.columns

Index(['rank', 'title', 'description', 'genre', 'rating', 'year'], dtype='object')

DATA PREPROCESSING

In [40]:
data.dropna(subset=['title', 'genre'], inplace=True)

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer

# Assuming the genre column is already a list
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['genre'])  # Use 'Genre' as per your column name

# Create a DataFrame with the encoded genres
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Combine the original data with the encoded genres
data_encoded = pd.concat([data[['title']], genre_df], axis=1)  # Use 'Title' as per your column name

print(data_encoded.head())

                                 title  'Action'  'Adventure'  'Adventure']  \
0  Spider-Man: Across the Spider-Verse         1            0             1   
1                          Oppenheimer         0            0             0   
2                                Joker         0            0             0   
3                    Avengers: Endgame         0            1             0   
4                             Parasite         0            0             0   

   'Biography'  'Comedy'  'Comedy']  'Crime'  'Drama'  'Drama']  ...  \
0            0         0          0        0        0         0  ...   
1            0         0          0        0        1         0  ...   
2            0         0          0        0        1         0  ...   
3            0         0          0        0        0         1  ...   
4            0         0          0        0        0         0  ...   

   ['Animation'  ['Animation']  ['Biography'  ['Comedy'  ['Crime'  ['Drama'  \
0            

DATA CLEANING AND DATASET CREATION

In [75]:
# Clean the genre column if necessary
def clean_genre_string(genre_string):
    # Check if genre_string is a string
    if isinstance(genre_string, str):
        # Remove the outer brackets and extra quotes
        genre_string = genre_string.strip("[]")
        # Split by comma and strip spaces and quotes from each genre
        genres = [genre.strip().strip("'") for genre in genre_string.split(',')]
        return genres
    else:
        # If it's already a list, return it as is
        return genre_string

# Apply the cleaning function
data['genre'] = data['genre'].apply(clean_genre_string)

# Now, check the cleaned genre column
print(data['genre'].head())  # Should show cleaned lists of genres

# Now apply MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['genre'])

# Create a DataFrame with the encoded genres
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

# Combine the original data with the encoded genres
data_encoded = pd.concat([data[['title']], genre_df], axis=1)

# Check the encoded DataFrame
print(data_encoded.head())


0    [Animation, Action, Adventure]
1       [Biography, Drama, History]
2          [Crime, Drama, Thriller]
3        [Action, Adventure, Drama]
4                 [Drama, Thriller]
Name: genre, dtype: object
                                 title  Action  Adventure  Animation  \
0  Spider-Man: Across the Spider-Verse       1          1          1   
1                          Oppenheimer       0          0          0   
2                                Joker       0          0          0   
3                    Avengers: Endgame       1          1          0   
4                             Parasite       0          0          0   

   Biography  Comedy  Crime  Drama  Family  Fantasy  ...  History  Horror  \
0          0       0      0      0       0        0  ...        0       0   
1          1       0      0      1       0        0  ...        1       0   
2          0       0      1      1       0        0  ...        0       0   
3          0       0      0      1       0        0 

In [63]:
print(data_encoded.head())  # Display the first few rows of the encoded DataFrame

                                 title  Action  Adventure  Animation  \
0  Spider-Man: Across the Spider-Verse       1          1          1   
1                          Oppenheimer       0          0          0   
2                                Joker       0          0          0   
3                    Avengers: Endgame       1          1          0   
4                             Parasite       0          0          0   

   Biography  Comedy  Crime  Drama  Family  Fantasy  ...  History  Horror  \
0          0       0      0      0       0        0  ...        0       0   
1          1       0      0      1       0        0  ...        1       0   
2          0       0      1      1       0        0  ...        0       0   
3          0       0      0      1       0        0  ...        0       0   
4          0       0      0      1       0        0  ...        0       0   

   Music  Musical  Mystery  Romance  Sci-Fi  Thriller  War  Western  
0      0        0        0        

In [64]:
data_encoded.to_csv('encoded_top_100_imdb_movies.csv', index=False)

TRAIN TEST SPLIT

In [70]:
from sklearn.model_selection import train_test_split
X = data_encoded[['title']]  # Features (you can include other relevant features)
y = genre_df  # Target (binary matrix for genres)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

VECTORIZATION

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the titles
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['title'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['title'])

In [None]:
MODEL DEFINATION AND TRAINING

In [72]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train_tfidf, y_train)


MODEL EVALUATION

In [73]:
# Make predictions
y_pred = model.predict(X_test_tfidf)

In [74]:
from sklearn.metrics import accuracy_score, f1_score

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.2
F1 Score: 0.45734843387904606


PREDICTIONS

In [77]:
def suggest_movies_by_genre(genre_input):
    # Check if the input genre is in the list of available genres
    if genre_input in mlb.classes_:
        # Get the movies that contain the specified genre
        suggested_movies = data_encoded[data_encoded[genre_input] == 1]
        return suggested_movies[['title']].values.flatten().tolist()  # Return the titles as a list
    else:
        return f"Genre '{genre_input}' not found in the dataset."

# Example usage
user_input_genre = input("Please enter a genre: ")
suggested_movies = suggest_movies_by_genre(user_input_genre)
print("Suggested Movies:", suggested_movies)

Please enter a genre:  Horror


Suggested Movies: ['The Shining', 'Alien', 'Psycho']
