# Movie Recommendation System

A movie recommendation system is an application designed to suggest movies to users based on their preferences, historical data, and behavior. It utilizes cosine similarity and ratings to analyze and recommend movies to users.

Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive so the CSV files under /content/drive can be read.
# The google.colab module is only available when running inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the movie catalogue (movieId, title, genres) from the mounted Drive folder.
# The path is specific to this Colab/Drive layout.
movies=pd.read_csv("/content/drive/MyDrive/movies dataset/movies.csv")

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

Data cleaning

In [None]:
import re
def clean_title(title):
  """Return ``title`` with everything except ASCII letters, digits and spaces removed.

  Example: ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
  """
  disallowed=re.compile("[^a-zA-Z0-9 ]")
  return disallowed.sub("",title)

In [None]:
# Add a clean_title column: each title passed through clean_title, i.e. with
# punctuation and other special characters stripped.
movies["clean_title"]=movies["title"].apply(clean_title)

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


Preprocessing

In [None]:
# Build a TF-IDF representation of the cleaned titles.
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): the vocabulary contains single words and two-word sequences.
vectorizer =TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from every clean title and encode the titles;
# the resulting matrix has one row per movie.
tfidf=vectorizer.fit_transform(movies["clean_title"])

In [None]:
print(tfidf)

  (0, 138180)	0.5609151642422612
  (0, 153617)	0.5236464902527855
  (0, 763)	0.2947573407787223
  (0, 138134)	0.30818287987354687
  (0, 153609)	0.4788631896261391
  (1, 76516)	0.679914841526996
  (1, 76515)	0.6556226145512709
  (1, 763)	0.3284429867728573
  (2, 93339)	0.4587178998289233
  (2, 107075)	0.4026827592738571
  (2, 61532)	0.4587178998289233
  (2, 93306)	0.2658829644982531
  (2, 107020)	0.2945915056134832
  (2, 61531)	0.4587178998289233
  (2, 763)	0.22159051090518359
  (3, 47815)	0.4482553482876628
  (3, 151964)	0.4482553482876628
  (3, 161363)	0.4482553482876628
  (3, 47814)	0.4482553482876628
  (3, 151795)	0.1883000782500215
  (3, 161345)	0.33752574781287953
  (3, 763)	0.21653641961669368
  (4, 70008)	0.39452077294643884
  (4, 111066)	0.3080717668800027
  (4, 20729)	0.4091386155137103
  :	:
  (62419, 135163)	0.335141385640017
  (62419, 842)	0.21664294220561653
  (62419, 165119)	0.3842738783112516
  (62419, 106460)	0.1903879435867097
  (62419, 143735)	0.09431759289399541
  (6

Compute similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def search(title, top_n=5):
  """Return the movies whose titles are most similar to ``title``.

  The query is cleaned with ``clean_title``, encoded with the fitted
  ``vectorizer`` and compared against every row of ``tfidf`` using cosine
  similarity.

  Parameters
  ----------
  title : str
      Free-text movie name, e.g. typed by the user.
  top_n : int, default 5
      Maximum number of matches to return.

  Returns
  -------
  pandas.DataFrame
      Up to ``top_n`` rows of ``movies`` ordered from the most similar to the
      least similar title, so ``.iloc[0]`` is the best match.
  """
  title=clean_title(title)
  query_vec=vectorizer.transform([title])
  similarity=cosine_similarity(query_vec,tfidf).flatten()

  # Never request more rows than exist: argpartition raises when kth is out of bounds.
  top_n=min(top_n,len(similarity))
  if top_n<=0:
    return movies.iloc[[]]

  # argpartition only guarantees that the last top_n positions hold the top_n
  # largest similarities; their relative order is unspecified ...
  indices=np.argpartition(similarity,-top_n)[-top_n:]
  # ... so rank those candidates explicitly, best match first.
  indices=indices[np.argsort(similarity[indices])[::-1]]
  return movies.iloc[indices]

In [None]:
# Read a title from the user and fetch the matching rows of movies.
movie_name=input("Enter the movie name ")
res=search(movie_name)


Enter the movie name the hulk


In [None]:
res

Unnamed: 0,movieId,title,genres,clean_title
6411,6534,Hulk (2003),Action|Adventure|Sci-Fi,Hulk 2003
12425,60040,"Incredible Hulk, The (2008)",Action|Sci-Fi,Incredible Hulk The 2008
51827,183983,Hulk: Where Monsters Dwell (2016),Action|Animation|Fantasy|Sci-Fi,Hulk Where Monsters Dwell 2016
32940,142056,Iron Man & Hulk: Heroes United (2013),Action|Adventure|Animation,Iron Man Hulk Heroes United 2013
45854,171251,"Nobody Speak: Hulk Hogan, Gawker and Trials of...",Documentary,Nobody Speak Hulk Hogan Gawker and Trials of a...


Loading ratings dataset

In [None]:
# Load the user ratings (userId, movieId, rating, timestamp) from the mounted Drive folder.
# The path is specific to this Colab/Drive layout.
ratings=pd.read_csv("/content/drive/MyDrive/movies dataset/ratings.csv")

In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [None]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
# Seed movie for the walkthrough: movieId 1 is Toy Story (1995) in the movies table.
movie_id=1

Users who rated Toy Story above 4

In [None]:
# IDs of the users who rated the seed movie strictly above 4 (duplicates removed).
similar_users=ratings[(ratings["movieId"]==movie_id)&(ratings["rating"]>4)]["userId"].unique()

In [None]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

The movies that these same users rated above 4, i.e. similar movies that we can also suggest

In [None]:
# Every rating above 4 given by those similar users, across all movies.
similar_user_recs=ratings[(ratings["userId"].isin(similar_users))&(ratings["rating"]>4)]

In [None]:
similar_user_recs

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


Now we will keep only those movies that more than 10% of the users similar to us rated above 4

In [None]:
# Keep only the movieId column of those high ratings.
# NOTE(review): this rebinds similar_user_recs from a DataFrame to a Series, so
# re-running this cell on its own will likely fail - confirm before reordering cells.
similar_user_recs=similar_user_recs["movieId"]

In [None]:
similar_user_recs.value_counts()

1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: movieId, Length: 19282, dtype: int64

Dividing to find the percentage

In [None]:
# Share of the similar users who rated each movie above 4:
# number of high ratings per movie divided by the number of similar users.
similar_user_recs=similar_user_recs.value_counts() / len(similar_users)

Keep only the movies liked by more than 10% of the similar users

In [None]:
# Drop movies liked by 10% or fewer of the similar users (the comparison is strict).
similar_user_recs=similar_user_recs[similar_user_recs>0.1]

In [None]:
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

All users who rated the candidate movies above 4

In [None]:
# Ratings above 4 from any user, restricted to the movies present in similar_user_recs.
all_users=ratings[(ratings["movieId"].isin(similar_user_recs.index))& (ratings["rating"]>4)]

In [None]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [None]:
# Share of users who rated each candidate movie above 4.
# The denominator is the number of distinct users in all_users (users with at
# least one high rating on a candidate movie), not every user in ratings.
all_users_recs=all_users["movieId"].value_counts() / len(all_users["userId"].unique())

If 100% of the users similar to us like Toy Story and 100% of all users also like Toy Story, then Toy Story is not a useful recommendation, because it is simply a movie that everyone loves. A good recommendation is a movie that similar users like noticeably more often than users in general.

In [None]:
all_users_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

Recommendation

In [None]:
# Put the two shares side by side, aligned on the movieId index.
rec_percentages=pd.concat([similar_user_recs,all_users_recs],axis=1)
# First column: share among similar users; second column: share among all users.
rec_percentages.columns=["similar","all"]

In [None]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


So we want movies for which the share among similar users is much larger than the share among all users

In [None]:
# Ratio of the two shares: a value above 1 means the movie is liked more often
# by the similar users than by users in general.
rec_percentages["score"]=rec_percentages["similar"]/rec_percentages["all"]

In [None]:
# Order the candidates from the highest score to the lowest.
rec_percentages=rec_percentages.sort_values("score",ascending=False)

The higher the score, the better the recommendation

In [None]:
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


Top 10 movies to be recommended

In [None]:
rec_percentages.head(10)

Unnamed: 0,similar,all,score
1,1.0,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.15296,0.035131,4.354038
4886,0.235147,0.070811,3.320783
588,0.216618,0.067513,3.208539
6377,0.228139,0.072268,3.156862
595,0.1794,0.059977,2.99115
8961,0.203504,0.068453,2.972889
364,0.253411,0.085764,2.954762


In [None]:
# Attach title and genres to the first ten rows by matching the movieId index
# of rec_percentages against the movieId column of movies.
rec_percentages.head(10).merge(movies,left_index=True,right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


Creating a function by using the above steps

In [None]:
def find_similar_movies(movie_id,min_rating=4,min_share=0.1,top_n=10,exclude_self=False):
  """Recommend movies liked by the users who also liked ``movie_id``.

  A user "likes" a movie when their rating is strictly greater than
  ``min_rating``. Each candidate movie is scored as the share of similar users
  who like it divided by the share of general users who like it. A high score
  therefore means the movie is specifically popular among similar users.

  Parameters
  ----------
  movie_id : int
      ``movieId`` of the seed movie.
  min_rating : float, default 4
      A rating must be strictly above this value to count as a "like".
  min_share : float, default 0.1
      A candidate is kept only if more than this fraction of the similar users
      like it.
  top_n : int, default 10
      Number of best-scoring movies to return.
  exclude_self : bool, default False
      When True, drop the seed movie from the result. Every similar user likes
      the seed movie by construction, so it otherwise tends to rank first.

  Returns
  -------
  pandas.DataFrame
      Columns ``score``, ``title`` and ``genres``, best score first. Empty when
      no user likes the seed movie.
  """
  liked=ratings["rating"]>min_rating

  # Users who liked the seed movie.
  similar_users=ratings[(ratings["movieId"]==movie_id)&liked]["userId"].unique()
  if len(similar_users)==0:
    # Nothing to base a recommendation on; also avoids dividing by zero users below.
    return pd.DataFrame(columns=["score","title","genres"])

  # Share of the similar users who like each movie, keeping the frequent ones only.
  similar_user_recs=ratings[(ratings["userId"].isin(similar_users))&liked]
  similar_user_recs=similar_user_recs["movieId"].value_counts() / len(similar_users)
  similar_user_recs=similar_user_recs[similar_user_recs>min_share]

  # Share of users in general who like each candidate. The denominator is the
  # number of distinct users with at least one liked candidate movie.
  all_users=ratings[(ratings["movieId"].isin(similar_user_recs.index))&liked]
  all_users_recs=all_users["movieId"].value_counts() / len(all_users["userId"].unique())

  # Align both shares on movieId and score each candidate by their ratio.
  rec_percentages=pd.concat([similar_user_recs,all_users_recs],axis=1)
  rec_percentages.columns=["similar","all"]
  rec_percentages["score"]=rec_percentages["similar"]/rec_percentages["all"]

  if exclude_self:
    rec_percentages=rec_percentages.drop(index=movie_id,errors="ignore")

  rec_percentages=rec_percentages.sort_values("score",ascending=False)

  return rec_percentages.head(top_n).merge(movies,left_index=True,right_on="movieId")[["score","title","genres"]]

Implementation

In [None]:
# End-to-end run: ask for a title, look it up in the catalogue and recommend
# movies based on the first search hit.
movie_name=input("Enter the movie name ")
res=search(movie_name)
# Use the movieId of the first row returned by search as the seed movie.
movie_id=res.iloc[0]["movieId"]
r=find_similar_movies(movie_id)

Enter the movie name Moonlight


In [None]:
r

Unnamed: 0,score,title,genres
41769,200.064474,Moonlight,Drama
48904,72.682599,The Florida Project (2017),Drama
55938,69.773375,Roma (2018),Drama
43222,62.173883,Manchester by the Sea (2016),Drama
44566,60.646722,Call Me by Your Name (2017),Drama|Romance
51758,48.471678,The Favourite,Drama
48887,44.145682,Lady Bird (2017),Comedy
18487,42.355992,"Master, The (2012)",Drama
40153,41.454801,The Handmaiden (2016),Drama|Romance|Thriller
51773,40.287583,Hereditary (2018),(no genres listed)
