In [1]:
import pandas as pd
# Load the movie catalogue from disk and preview its first five rows.
data = pd.read_csv("movies.csv")
data.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# Summarise the movie table: column names, non-null counts, dtypes and memory use.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [3]:
#use regex to clean title
#re is the regular expression library
import re
#remove every character except those specified in the sub pattern
def clean_title(title):
    """Return ``title`` with everything but ASCII letters, digits and spaces removed."""
    # The negated character class matches every character that must be dropped.
    unwanted = re.compile("[^a-zA-Z0-9 ]")
    return unwanted.sub("", title)
# Store a cleaned copy of every title in a new "clean_title" column,
# then preview the first five rows to check the result.
data["clean_title"] = data["title"].apply(clean_title)
data.head(5)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [41]:
#change title to numbers 
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2) indexes single words as well as pairs of consecutive words,
# so multi-word titles can be matched more precisely than with single words alone.
Vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode every title as one
# row of a sparse TF-IDF matrix.
tfidf = Vectorizer.fit_transform(data["clean_title"])
tfidf

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

In [5]:
# to find similarities in title 
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, encoded with the fitted
    ``Vectorizer`` and compared with every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data``, ordered by decreasing similarity,
        so ``.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    # Encode the query in the same vector space as the indexed titles.
    query_vec = Vectorizer.transform([title])
    # One similarity score per indexed title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    k = min(top_n, similarity.size)
    if k <= 0:
        return data.iloc[:0]

    # argpartition only guarantees that the k largest scores occupy the last
    # k positions; their order inside that slice is arbitrary, so the
    # candidates are explicitly sorted by score afterwards.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return data.iloc[ranked]


In [6]:
#widgets are what allows us to enter input and use it
import ipywidgets as widgets
#function used to show display from jupyter cells
from IPython.display import display 
#creating our input text box 
# Text box for the query, pre-filled with an example title.
movie_input = widgets.Text(value="toy story", description="movie title: ", disabled=False)
# Output area the search results are rendered into.
movie_list = widgets.Output()
#called when we type something
def on_type(data):
    """Refresh ``movie_list`` with search results for the text just typed.

    ``data`` is the change payload handed to the observer callback; its
    "new" entry is read as the current text. Inside this function the name
    shadows the module-level ``data`` DataFrame, which ``search`` still
    reaches as a global.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # Queries of five characters or fewer are ignored.
        if len(title) <= 5:
            return
        display(search(title))
# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names="value")

# Render the text box with its results area underneath.
display(movie_input, movie_list)

Text(value='toy story', description='movie title: ')

Output()

In [34]:
# Load the user ratings table from disk.
ratings = pd.read_csv("ratings.csv")

In [19]:
# Display the ratings table to inspect its columns and size.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
1048570,7045,4447,3.5,1164258032
1048571,7045,4720,4.0,1164257756
1048572,7045,4857,4.0,1164242753
1048573,7045,4886,5.0,1168033506


In [22]:
# Check the dtype of every ratings column.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [36]:
#building a recommendation function 
def find_similar_movies(movie_id):
    """Recommend movies favoured by users who rated ``movie_id`` above 4.

    Steps:
      1. Collect the users who rated ``movie_id`` above 4.
      2. For every movie those users also rated above 4, compute the share
         of them who did so, keeping movies with a share above 10%.
      3. For the kept movies, compute the share among all users who rated
         any of them above 4.
      4. Score each movie as the first share divided by the second.

    Returns the ten highest-scoring movies joined with ``data``, as a frame
    with the columns "score", "title" and "genres".
    """
    liked = ratings["rating"] > 4

    # Users who rated the requested movie highly.
    rated_target = ratings["movieId"] == movie_id
    similar_users = ratings[rated_target & liked]["userId"].unique()

    # Share of those users who also rated each other movie highly.
    from_similar_users = ratings["userId"].isin(similar_users)
    liked_by_similar = ratings[from_similar_users & liked]["movieId"]
    similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # Share of all high-rating users who rated each candidate movie highly.
    is_candidate = ratings["movieId"].isin(similar_user_recs.index)
    all_users = ratings[is_candidate & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is liked far more by similar users than overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    top_recs = rec_percentages.sort_values("score", ascending=False).head(10)
    with_titles = top_recs.merge(data, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]
    

In [42]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie to base the recommendations on.
movie_name_input = widgets.Text(value="enter movie", description="Movie Title:", disabled=False)
# Output area the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` with recommendations for the typed title.

    ``data`` is the change payload handed to the observer callback; its
    "new" entry is read as the current text. The first row returned by
    ``search`` supplies the movie id passed to ``find_similar_movies``.

    This definition reuses the name ``on_type`` from the search widget cell
    and rebinds it in the notebook namespace.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Queries of five characters or fewer are ignored.
        if len(title) <= 5:
            return
        first_match = search(title).iloc[0]
        display(find_similar_movies(first_match["movieId"]))

# Call on_type whenever the text box's value changes.
movie_name_input.observe(on_type, names="value")

# Render the text box with its recommendation area underneath.
display(movie_name_input, recommendation_list)

Text(value='enter movie', description='Movie Title:')

Output()