<a href="https://colab.research.google.com/github/Nandini-Reddyy/Movie-Recommendation/blob/main/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import unicodedata

import pandas as pd

In [2]:
# Load the movie catalogue; "movies.csv" is resolved relative to the working directory.
movies = pd.read_csv("movies.csv")

In [3]:
# Show the loaded frame as the cell output to check its columns and row count.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
# Preview only the first rows of the catalogue.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
import re
def clean_title(title):
    """Normalise a movie title to ASCII letters, digits and spaces.

    Accented letters are first folded to their base letter
    ("Amélie" -> "Amelie") so that non-English titles remain searchable.
    Every remaining character that is not an ASCII letter or digit
    (punctuation, brackets, existing whitespace, other scripts) is then
    replaced by a single space. Runs of spaces are not collapsed and the
    result is not stripped.

    Args:
        title: The raw title string.

    Returns:
        The cleaned title string.
    """
    # NFKD splits an accented letter into base letter + combining mark. Dropping
    # the marks keeps the base letter instead of letting the regex blank it out.
    decomposed = unicodedata.normalize("NFKD", title)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub("[^a-zA-Z0-9]", " ", folded)

In [6]:
# Force every title to str so the regex in clean_title never receives a non-string value.
movies["title"] = movies["title"].astype(str)

In [7]:
# Add a normalised copy of each title; this column is what the TF-IDF index is built from.
movies["clean_title"] = movies["title"].map(clean_title)

In [8]:
# Re-display the frame to check the newly added clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1, 2): the vocabulary contains single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix
# that search() compares each query against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

    Args:
        title: Free-text title typed by the user.
        top_n: Maximum number of rows to return (default 5). Values larger
            than the number of indexed movies are clamped.

    Returns:
        A slice of ``movies`` with at most ``top_n`` rows, ordered by
        descending similarity, so ``.iloc[0]`` is the closest match.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the k largest scores occupy the last k
    # slots; their relative order is undefined, so rank them explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [11]:
import ipywidgets as widgets
from IPython.display import display
# Text box for the query; it starts out pre-filled with 'Toy Story'.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
# Output area that on_type clears and re-renders on every change.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results when the text box value changes.

    ``data`` is the change notification delivered by ``observe``; only its
    "new" entry (the current text) is read. Queries of five characters or
    fewer just clear the results area.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Invoke on_type only for changes of the widget's 'value' trait.
movie_input.observe(on_type, names='value')


# Render the text box followed by its results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
# Look up a single catalogue row by its movieId.
# NOTE(review): `movie` is not referenced by any later cell shown here, and
# `movie_id` is reassigned to 1 before its next use. This looks like leftover exploration.
movie_id = 89745
movie = movies[movies["movieId"] == movie_id]

In [13]:
# Load the user ratings; "ratings.csv" is resolved relative to the working directory.
ratings=pd.read_csv("ratings.csv")

In [14]:
# Show the ratings frame as the cell output to check its columns and row count.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
3190004,21052,317,3.0,8.439064e+08
3190005,21052,318,5.0,8.436576e+08
3190006,21052,319,3.0,8.492056e+08
3190007,21052,322,4.0,8.444320e+08


In [15]:
# Check the dtype pandas inferred for each ratings column.
# NOTE(review): timestamp comes back as float64, presumably because some values
# are missing or non-integer. Verify against the CSV.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [16]:
# Movie used for the step-by-step walkthrough in the following cells
# (movieId 1 is "Toy Story (1995)" in the catalogue shown earlier).
movie_id=1

In [17]:
# Ids of the users who rated the chosen movie above 4 stars.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [18]:
# Inspect the array of user ids selected in the previous cell.
similar_users

array([   36,    75,    86, ..., 20993, 21002, 21051])

In [19]:
# Every movie those users rated above 4 stars (one entry per rating, so movie ids repeat).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [20]:
# Inspect the movie ids liked by the similar users.
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
3189934     91500
3189935     99114
3189936    106002
3189937    106487
3189939    135133
Name: movieId, Length: 171304, dtype: int64

In [21]:
# Turn the raw ids into "share of similar users who liked each movie" and keep
# only movies liked by more than 10% of them.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .10]
)

In [22]:
# Inspect the per-movie share among similar users (index = movieId).
similar_user_recs

1        1.000000
318      0.435338
260      0.378501
296      0.350494
356      0.348435
           ...   
5418     0.102142
4878     0.101730
48516    0.101730
8368     0.100906
953      0.100082
Name: movieId, Length: 102, dtype: float64

In [23]:
# All ratings above 4 stars, from any user, for the candidate movies kept above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [24]:
# Inspect the high ratings of the candidate movies across all users.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
29,1,4973,4.5,1.147869e+09
48,1,7361,5.0,1.147880e+09
72,2,110,5.0,1.141417e+09
76,2,260,5.0,1.141417e+09
...,...,...,...,...
3189966,21052,110,5.0,8.436576e+08
3189967,21052,111,5.0,8.464469e+08
3189991,21052,260,5.0,8.464473e+08
3190000,21052,296,5.0,8.436575e+08


In [25]:
# Share of each candidate movie among the distinct users who liked any candidate.
all_user_recs = all_users["movieId"].value_counts().div(
    len(all_users["userId"].unique())
)

In [26]:
# Inspect the baseline share of each candidate movie (index = movieId).
all_user_recs

318      0.342617
296      0.286403
2571     0.245320
356      0.234036
593      0.228497
           ...   
953      0.044212
50872    0.038980
745      0.035903
78499    0.034621
2355     0.023799
Name: movieId, Length: 102, dtype: float64

In [27]:
# Put both shares side by side, aligned on movieId, with explicit column labels.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)

In [28]:
# Inspect the combined "similar" vs "all" shares.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124532
318,0.435338,0.342617
260,0.378501,0.218085
296,0.350494,0.286403
356,0.348435,0.234036
...,...,...
5418,0.102142,0.062061
4878,0.101730,0.071088
48516,0.101730,0.076268
8368,0.100906,0.048725


In [29]:
# Score = how much more often similar users like a movie than the baseline does.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [30]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [31]:
# Inspect the candidates ranked by score.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124532,8.030066
3114,0.277595,0.053085,5.229241
2355,0.105025,0.023799,4.413075
78499,0.142504,0.034621,4.116152
588,0.221170,0.068164,3.244654
...,...,...,...
4973,0.135091,0.110068,1.227335
296,0.350494,0.286403,1.223780
79132,0.159802,0.131815,1.212321
7361,0.123970,0.102631,1.207921


In [32]:
# Attach title and genres to the ten best-scoring candidates (index = movieId).
pd.merge(rec_percentages.iloc[:10], movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124532,8.030066,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.277595,0.053085,5.229241,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.105025,0.023799,4.413075,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
14813,0.142504,0.034621,4.116152,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.22117,0.068164,3.244654,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
4780,0.219934,0.070216,3.132254,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
6258,0.205519,0.06878,2.988071,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.184102,0.062471,2.946995,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.193575,0.06719,2.881016,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
729,0.102554,0.035903,2.856409,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


In [33]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of ``movie_id`` like disproportionately often.

    Steps:
      1. "Similar users" are the users who rated ``movie_id`` strictly above
         ``rating_threshold``.
      2. Candidate movies are those liked (same threshold) by more than
         ``min_share`` of the similar users.
      3. For each candidate, compute its share among all users who liked at
         least one candidate.
      4. ``score = similar share / overall share``; higher means the movie is
         more specific to fans of ``movie_id``.

    Args:
        movie_id: ``movieId`` of the seed movie.
        rating_threshold: A rating counts as a "like" when it is strictly
            greater than this value (default 4).
        min_share: Minimum fraction of similar users that must like a movie
            for it to become a candidate (default 0.10, exclusive).
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres`` holding at
        most ``top_n`` rows in descending score order.
    """
    # Every later step only looks at "liked" ratings, so filter once.
    liked = ratings[ratings["rating"] > rating_threshold]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = (
        liked.loc[liked["userId"].isin(similar_users), "movieId"].value_counts()
        / len(similar_users)
    )
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index of rec_percentages is the movieId, hence left_index / right_on.
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]


In [35]:
# Text box for the seed movie title; it starts out pre-filled with "Toy Story".
movie_name_input=widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False
)
# Output area that the callback clears and fills with recommendations.
recommendation_list=widgets.Output()
def on_type(data):
    """Show recommendations for the first search hit of the typed title.

    ``data`` is the change notification delivered by ``observe``; only its
    "new" entry (the current text) is read. Queries of five characters or
    fewer just clear the recommendations area.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        # The first row of the search result is taken as the seed movie.
        seed_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(seed_id))
# Invoke on_type only for changes of the widget's "value" trait.
movie_name_input.observe(on_type,names="value")
# Render the text box followed by its recommendations area.
display(movie_name_input,recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()