In [1]:
import pandas as pd

In [2]:
#importing our data (movies)
movies=pd.read_csv('C://Users//VICKY//DataAnalysis//ml,gen ai//movies.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
#Cleanig movie title with regex
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [5]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:

movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
#Creating a tfifd matrix
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [9]:
#Building an interactive search box 
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [57]:
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [43]:
#importing ratings data
ratings=pd.read_csv("G://Isha//COURSES//ratings.csv")

In [44]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [58]:
#finding users who liked the same movie
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [59]:
similar_users

array([    21,    187,    208, ..., 162469, 162485, 162532], dtype=int64)

In [78]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]


In [79]:
similar_user_recs

3741           318
3742           527
3743           541
3744           589
3745           741
             ...  
24998517     91542
24998518     92259
24998522     98809
24998523    102125
24998524    112852
Name: movieId, Length: 577796, dtype: int64

In [80]:
similar_user_recs.value_counts()

89745     6036
58559     3461
59315     3203
79132     3137
2571      2998
          ... 
160402       1
161642       1
158950       1
199648       1
198609       1
Name: movieId, Length: 16553, dtype: int64

In [81]:
#we will try to find only the movies that are greater than 10% of users who are similar to us liked

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [82]:
similar_user_recs

89745    1.000000
58559    0.573393
59315    0.530649
79132    0.519715
2571     0.496687
           ...   
47610    0.103545
780      0.103380
88744    0.103048
1258     0.101226
1193     0.100895
Name: movieId, Length: 193, dtype: float64

In [91]:
#finding how much all users like movies in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [92]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [94]:
all_user_recs

318       0.346395
296       0.288146
2571      0.247010
356       0.238136
593       0.228665
            ...   
86332     0.010142
91630     0.009324
122900    0.008573
122926    0.008070
106072    0.005289
Name: movieId, Length: 193, dtype: float64

In [95]:
#Creating a recommendation score
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [96]:
rec_percentages

Unnamed: 0,similar,all
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


In [97]:
rec_percentages['score']=rec_percentages['similar']/rec_percentages['all']
rec_percentages

Unnamed: 0,similar,all,score
89745,1.000000,0.040459,24.716368
58559,0.573393,0.148256,3.867590
59315,0.530649,0.054931,9.660345
79132,0.519715,0.132987,3.908027
2571,0.496687,0.247010,2.010791
...,...,...,...
47610,0.103545,0.022770,4.547463
780,0.103380,0.054723,1.889149
88744,0.103048,0.010383,9.924843
1258,0.101226,0.083887,1.206688


In [98]:
rec_percentages.sort_values('score',ascending=False)

Unnamed: 0,similar,all,score
89745,1.000000,0.040459,24.716368
106072,0.103711,0.005289,19.610199
122892,0.241054,0.012367,19.491770
102125,0.216534,0.012119,17.867419
88140,0.215043,0.012052,17.843074
...,...,...,...
296,0.288933,0.288146,1.002730
593,0.222830,0.228665,0.974483
527,0.199967,0.217833,0.917984
1193,0.100895,0.120244,0.839081


In [99]:
rec_percentages.head(10).merge(movies,left_index=True,right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
12221,0.573393,0.148256,3.86759,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,Dark Knight The 2008
12324,0.530649,0.054931,9.660345,59315,Iron Man (2008),Action|Adventure|Sci-Fi,Iron Man 2008
14937,0.519715,0.132987,3.908027,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,Inception 2010
2480,0.496687,0.24701,2.010791,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,Matrix The 1999
7028,0.496521,0.174679,2.842477,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,Lord of the Rings The Return of the King The 2003
4887,0.48277,0.189801,2.543562,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,Lord of the Rings The Fellowship of the Ring T...
5840,0.46057,0.168365,2.735548,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,Lord of the Rings The Two Towers The 2002
21936,0.459907,0.043469,10.580206,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,Guardians of the Galaxy 2014
314,0.441683,0.346395,1.275085,318,"Shawshank Redemption, The (1994)",Crime|Drama,Shawshank Redemption The 1994


In [101]:
#Building a recommendation function 

def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()  #finding users similar to us
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)] #finding movies liked by all users
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1) #creating recommendation score
    rec_percentages.columns = ["similar", "all"]
    rec_percentages['score']=rec_percentages['similar']/rec_percentages['all']
    rec_percentages.sort_values('score',ascending=False)
    return rec_percentages.head(10).merge(movies,left_index=True,right_on='movieId')[['score','title','genres']]

In [106]:
#creating an interactive recommendation widget

movie_name_input=widgets.Text(
   value='toy story',
   description='Movie Title:',
   disbaled=False)

recommendation_list=widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title=data['new']
        if len(title)>5:
            results=search(title)
            movie_id=results.iloc[0]['movieId']
            display(find_similar_movies(movie_id))
            
movie_name_input.observe(on_type,names='value')
display(movie_name_input,recommendation_list)
            

Text(value='toy story', description='Movie Title:')

Output()