In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Install a global "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings raised by the
# pandas cells below; consider a narrower filter once those are cleaned up.
warnings.simplefilter('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [7]:
# Load the IMDB CSV (path is relative to the notebook's working directory).
movies_df=pd.read_csv("IMDB_10000.csv")
# Bare last expression: render the frame as the cell output.
movies_df

Unnamed: 0,title,year,certificate,runtime,genre,desc,rating,votes
0,Freddy,2022,UA 16+,124 min,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...,7.9,16441
1,An Action Hero,2022,U,130 min,Action,Youth Icon. Superstar. Action Hero. At the age...,8.1,15690
2,Kantara,2022,UA,148 min,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...,8.7,78358
3,Khakee: The Bihar Chapter,2022–,UA 13+,45 min,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...,8.3,4464
4,Drishyam 2,2022,UA,140 min,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...,8.6,18743
...,...,...,...,...,...,...,...,...
9995,Kisan Aur Bhagwan,1974,,142 min,"Action, Comedy, Drama",,,
9996,Aadmi Sadak Ka,1977,,138 min,"Drama, Family",,,
9997,Nadodi Mannan,1958,,220 min,"Action, Adventure, Comedy",,,
9998,Njan Marykutty,2018,U,126 min,Drama,,,


In [3]:
# (rows, columns) of the frame as loaded.
movies_df.shape

(10000, 8)

In [8]:
# Number of missing values in each column (summing the boolean mask down the rows).
movies_df.isnull().sum(axis=0)

title             0
year            264
certificate    3440
runtime        2062
genre           133
desc           1041
rating         1041
votes          1041
dtype: int64

In [9]:
# Keep only the rows where every listed column is populated.
movies_df = movies_df.dropna(
    axis=0,
    how='any',
    subset=[
        'title', 'year', 'certificate', 'runtime',
        'genre', 'desc', 'rating', 'votes',
    ],
)
movies_df

Unnamed: 0,title,year,certificate,runtime,genre,desc,rating,votes
0,Freddy,2022,UA 16+,124 min,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...,7.9,16441
1,An Action Hero,2022,U,130 min,Action,Youth Icon. Superstar. Action Hero. At the age...,8.1,15690
2,Kantara,2022,UA,148 min,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...,8.7,78358
3,Khakee: The Bihar Chapter,2022–,UA 13+,45 min,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...,8.3,4464
4,Drishyam 2,2022,UA,140 min,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...,8.6,18743
...,...,...,...,...,...,...,...,...
8953,Pyaar Kii Ye Ek Kahaani,2010–2021,PG,24 min,"Fantasy, Romance",The story of a young village girl who abandons...,8.4,17
8954,Hero Gayab Mode On,2020–2021,UA 13+,23 min,"Action, Adventure, Fantasy","Ghasitaram, under pressure from his wife, deci...",6.6,23
8955,Aakashavaani,2021,UA 16+,124 min,Drama,The Nath family consists of Retired Commission...,5.5,39
8956,Aladdin - Naam Toh Suna Hoga,2018–2021,U,22 min,Fantasy,A wanderer/revolutionary is a dead ringer for ...,8.1,110


In [11]:
# drop_duplicates() returns a new frame; rebind the name so the de-duplicated
# rows are what the later cells actually work on (the original call discarded it).
movies_df = movies_df.drop_duplicates()
movies_df

Unnamed: 0,title,year,certificate,runtime,genre,desc,rating,votes
0,Freddy,2022,UA 16+,124 min,"Drama, Mystery, Thriller",The lines between love and obsession blur in t...,7.9,16441
1,An Action Hero,2022,U,130 min,Action,Youth Icon. Superstar. Action Hero. At the age...,8.1,15690
2,Kantara,2022,UA,148 min,"Action, Adventure, Drama",It involves culture of Kambala and Bhootha Kol...,8.7,78358
3,Khakee: The Bihar Chapter,2022–,UA 13+,45 min,"Action, Crime, Drama",As a righteous cop pursues a merciless crimina...,8.3,4464
4,Drishyam 2,2022,UA,140 min,"Crime, Drama, Mystery",A gripping tale of an investigation and a fami...,8.6,18743
...,...,...,...,...,...,...,...,...
8953,Pyaar Kii Ye Ek Kahaani,2010–2021,PG,24 min,"Fantasy, Romance",The story of a young village girl who abandons...,8.4,17
8954,Hero Gayab Mode On,2020–2021,UA 13+,23 min,"Action, Adventure, Fantasy","Ghasitaram, under pressure from his wife, deci...",6.6,23
8955,Aakashavaani,2021,UA 16+,124 min,Drama,The Nath family consists of Retired Commission...,5.5,39
8956,Aladdin - Naam Toh Suna Hoga,2018–2021,U,22 min,Fantasy,A wanderer/revolutionary is a dead ringer for ...,8.1,110


In [13]:
## data cleaning
# Both conversions go through astype(str) first so the cell can be re-run safely:
# after the first run the columns are already float and would no longer have a .str accessor.

# Runtime: keep the first run of digits (e.g. "124 min" -> 124.0).
# Raw string avoids the invalid "\d" escape; expand=False yields a Series rather than a 1-column frame.
movies_df['runtime'] = (
    movies_df['runtime'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
)

# Votes: strip thousands separators literally (regex=False) and convert to numeric.
movies_df['votes'] = (
    movies_df['votes'].astype(str).str.replace(',', '', regex=False).astype(float)
)


In [15]:
# Replace missing text in the two free-text columns with empty strings
for text_col in ('desc', 'genre'):
    movies_df[text_col] = movies_df[text_col].fillna('')


In [17]:

# Build the text used for vectorization: "<genre> <desc>" for each row
movies_df['features'] = movies_df['genre'].str.cat(movies_df['desc'], sep=" ")

In [19]:
# TF-IDF Vectorization
# Vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the combined genre+description text and transform it in one pass;
# each entry of movies_df['features'] is treated as one document.
tfidf_matrix = tfidf.fit_transform(movies_df['features'])

In [21]:

# Pairwise cosine similarity of every TF-IDF row against every other row
# (with a single argument the matrix is compared with itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [23]:
# Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies_df['title']``. If several rows share
        the title, the first one is used.
    cosine_sim : 2-D array-like
        Similarity matrix indexed by row *position* in ``movies_df``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        The ``title``, ``genre`` and ``rating`` columns of the ``top_n`` most
        similar rows, or a "not found" message string when ``title`` is absent.
    """
    # Work with the row *position*, not the index label: rows were dropped
    # earlier without resetting the index, so labels no longer line up with the
    # rows of cosine_sim or with .iloc.
    titles = list(movies_df['title'])
    try:
        pos = titles.index(title)
    except ValueError:
        return f"Movie '{title}' not found in dataset."

    # Highest similarity first; sorted() is stable, so ties keep row order.
    ranked = sorted(enumerate(cosine_sim[pos]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie explicitly instead of assuming it sorts first
    # (an identical-feature row could tie with it at similarity 1.0).
    top_positions = [row for row, _ in ranked if row != pos][:top_n]

    return movies_df.iloc[top_positions][['title', 'genre', 'rating']]

In [25]:
# Example query; the recorded output shows this title was not found in movies_df.
get_recommendations("The Truck of Dreams")

"Movie 'The Truck of Dreams' not found in dataset."