In [1]:
import numpy as np
import pandas as pd
import difflib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Colletion and Pre-Processing

In [2]:
# loading data from csv file
df = pd.read_csv("movies.csv")
df.head()

Unnamed: 0,filmtv_id,title,year,genre,duration,country,directors,actors,avg_vote,critics_vote,public_vote,total_votes,description,notes,humor,rhythm,effort,tension,erotism
0,2,Bugs Bunny's Third Movie: 1001 Rabbit Tales,1982,Animation,76,United States,"David Detiege, Art Davis, Bill Perez",,7.7,8.0,7.0,22,"With two protruding front teeth, a slightly sl...","These are many small independent stories, whic...",3,3,0,0,0
1,3,18 anni tra una settimana,1991,Drama,98,Italy,Luigi Perelli,"Kim Rossi Stuart, Simona Cavallari, Ennio Fant...",6.5,6.0,7.0,4,"Samantha, not yet eighteen, leaves the comfort...","Luigi Perelli, the director of the ""Piovra"", o...",0,2,0,2,0
2,17,Ride a Wild Pony,1976,Romantic,91,United States,Don Chaffey,"Michael Craig, John Meillon, Eva Griffith, Gra...",5.6,6.0,5.0,9,"In the Australia of the pioneers, a boy and a ...","""Ecological"" story with a happy ending, not wi...",1,2,1,0,0
3,18,Diner,1982,Comedy,95,United States,Barry Levinson,"Mickey Rourke, Steve Guttenberg, Ellen Barkin,...",7.0,8.0,6.0,18,Five boys from Baltimore have a habit of meeti...,A cast of will be famous for Levinson's direct...,2,2,0,1,2
4,20,A che servono questi quattrini?,1942,Comedy,85,Italy,Esodo Pratelli,"Eduardo De Filippo, Peppino De Filippo, Clelia...",5.9,5.33,7.0,15,"With a stratagem, the penniless and somewhat p...",Taken from the play by Armando Curcio that the...,3,1,1,0,0


In [4]:
df.describe()

Unnamed: 0,filmtv_id,year,duration,avg_vote,critics_vote,public_vote,total_votes,humor,rhythm,effort,tension,erotism
count,40046.0,40046.0,40046.0,40046.0,35536.0,39580.0,40046.0,40046.0,40046.0,40046.0,40046.0,40046.0
mean,55787.412026,1993.155846,100.503446,5.81097,5.806275,5.939869,36.099036,0.580657,1.362408,0.68711,0.93013,0.301578
std,57706.121659,23.624476,26.691985,1.412188,1.597825,1.488673,68.089964,0.897701,1.146867,1.111305,1.092918,0.628829
min,2.0,1897.0,41.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,15474.25,1976.0,90.0,4.9,4.67,5.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,35469.5,2000.0,96.0,5.9,6.0,6.0,11.0,0.0,2.0,0.0,1.0,0.0
75%,68946.5,2013.0,107.0,6.9,7.0,7.0,35.0,1.0,2.0,1.0,2.0,0.0
max,219729.0,2023.0,1525.0,10.0,10.0,10.0,1052.0,5.0,5.0,5.0,5.0,4.0


In [6]:
# checking rows and columns

df.shape

(40046, 19)

In [7]:
selected_features = [ "genre", "title", "directors" ]

In [8]:
for feature in selected_features:
    df[feature] = df[feature].fillna("")

# Fitting a vectorizer


In [13]:
vectorizer = TfidfVectorizer()

feature_vector = vectorizer.fit_transform(df["title"])

In [14]:
feature_vector.shape

(40046, 28728)

# calculating similarity scores using cosine similarity


In [15]:
similarity = cosine_similarity(feature_vector)

In [16]:
similarity.shape

(40046, 40046)

# Recommendation System

In [24]:
movie_name = input("Movie Name: ")

close_match = difflib.get_close_matches(movie_name, df["title"].tolist())[0]
print("Close Match:", close_match)

index_of_movie = df[df["title"] == close_match].index.values[0]
print("Index of Movie:", index_of_movie)

similarity_score = list( enumerate( similarity[index_of_movie] ) )
#print("Similarity Score:", similarity_score)

sorted_similarity_score = sorted( similarity_score, key=lambda x: x[1], reverse=True )
#print("Sorted similarity Score:", sorted_similarity_score)

# Recommending the 3 most similar movies
recommendations = sorted_similarity_score[:3]
print("Recommendations:", recommendations)

# Getting the title of the movies
recommendation_ids = [ recommendation[0] for recommendation in recommendations ]
print("Recommendation IDs:", recommendation_ids)

recommendation_titles = [ df[ df.index == id ]["title"].tolist()[0] for id in recommendation_ids ]
print("Recommended movies:", recommendation_titles)

Close Match: Iron Man
Index of Movie: 20976
Recommendations: [(20976, 1.0), (23348, 1.0), (27083, 1.0)]
Recommendation IDs: [20976, 23348, 27083]
Recommended movies: ['Iron Man', 'Iron Man 2', 'Iron Man 3']
