In [544]:
import warnings
%matplotlib inline
# Ignore all warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display
# Load the movie catalogue from movies.csv into a DataFrame
data = pd.read_csv('movies.csv')

# Show the loaded frame (pandas elides the middle rows when rendering)
data


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [545]:
#Strip brackets and every other non-alphanumeric character from a title
def new_title(title):
    """Return ``title`` with only ASCII letters, digits and spaces kept.

    Every other character (brackets, commas, hyphens, accented letters, ...)
    is deleted without inserting a replacement, so "Ant-Man" becomes "AntMan".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [546]:
# Store a cleaned copy of each title (see new_title) in a new column,
# then summarise the frame and preview its first rows
data["new_title"] = data["title"].map(new_title)
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movieId    62423 non-null  int64 
 1   title      62423 non-null  object
 2   genres     62423 non-null  object
 3   new_title  62423 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.9+ MB


Unnamed: 0,movieId,title,genres,new_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [547]:
# Pipeline step: turn raw strings into lists of lower-cased word tokens
class Tokenizer(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits text into ``\\w+`` tokens.

    ``transform`` expects an object exposing ``.apply`` (e.g. a pandas Series).
    """

    def __init__(self):
        self.tokenizer = RegexpTokenizer(r'\w+')

    def fit(self, X, y=None):
        # Nothing to learn; present so the class can be used inside a Pipeline.
        return self

    def _to_tokens(self, text):
        # Non-string entries (e.g. missing values) yield an empty token list.
        if not isinstance(text, str):
            return []
        return self.tokenizer.tokenize(text.lower())

    def transform(self, X):
        return X.apply(self._to_tokens)

In [548]:
# Pipeline step: stem every token and glue the stems back into one string
class Stemmer(BaseEstimator, TransformerMixin):
    """Stateless transformer applying the Porter stemmer to token lists.

    ``transform`` expects an object exposing ``.apply`` whose elements are
    iterables of tokens; each element becomes a space-joined string of stems.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        # Nothing to learn; present so the class can be used inside a Pipeline.
        return self

    def _stem_and_join(self, tokens):
        stems = [self.stemmer.stem(word) for word in tokens]
        return ' '.join(stems)

    def transform(self, X):
        return X.apply(self._stem_and_join)


In [549]:
# TF-IDF over single words and word pairs, with English stop words dropped
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

In [550]:
# Chain the text steps: tokenize -> stem -> TF-IDF.
# The pipeline holds the `vectorizer` object itself as its last step.
pipeline = Pipeline(
    steps=[
        ('tokenize', Tokenizer()),
        ('stem', Stemmer()),
        ('tfidf', vectorizer),
    ]
)


In [551]:
# Fit the pipeline on the cleaned titles and keep the resulting TF-IDF matrix
# (one row per entry of data['new_title'])
moviespip= pipeline.fit_transform(data['new_title'])

Building the title search step by step

In [552]:
title = 'Toy Story'
title = new_title(title)
# Send the query through the whole fitted pipeline so it is tokenized and
# stemmed exactly like the indexed titles (calling the vectorizer directly
# would skip the stemming step and miss stemmed vocabulary entries).
query_vectorizer = pipeline.transform(pd.Series([title]))
similarity = cosine_similarity(query_vectorizer, moviespip).flatten()
# Positions of the 5 highest similarities, best match first
indices = np.argsort(similarity)[::-1][:5]
results = data.iloc[indices]


Turn the steps above into a reusable function.

In [553]:
# Search the movie table by (approximate) title
def search(title, top_n=5):
    """Return the rows of ``data`` whose titles are most similar to ``title``.

    Parameters
    ----------
    title : str
        Free-text query; it is cleaned with ``new_title`` before matching.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data``, ordered from the most to the least
        similar title (cosine similarity of TF-IDF vectors).
    """
    title = new_title(title)
    # Use the whole fitted pipeline so the query is tokenized and stemmed the
    # same way as the indexed titles; the bare vectorizer would skip stemming.
    query_vectorizer = pipeline.transform(pd.Series([title]))
    # Similarity between the query and every indexed title
    similarity = cosine_similarity(query_vectorizer, moviespip).flatten()
    # A full descending sort guarantees the best match is the first row
    indices = np.argsort(similarity)[::-1][:top_n]
    # Positional lookup: row i of moviespip corresponds to row i of data
    results = data.iloc[indices]

    return results




Creating the interactive search widget

In [554]:

# Text box for the movie title, pre-filled with a default query
movie_input = widgets.Text(
    description='Movie Title:',
    value='Toy Story',
    disabled=False,
)
# Output area that shows the search results
movie_list = widgets.Output()


def on_type(data):
    """Refresh the result list whenever the text box value changes.

    ``data`` is the change notification passed by the widget; its "new"
    entry holds the current text.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Wait until more than 5 characters are typed before searching
        if len(typed) <= 5:
            return
        display(search(typed))


# Call on_type on every change of the widget's `value`
movie_input.observe(on_type, names='value')

# Render the text box together with its output area
display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [555]:
# Reference movie used for the rating-based walkthrough below
movie_id = 89745
# Catalogue row(s) whose movieId equals the reference id
movie = data.loc[data["movieId"] == movie_id]
movie

Unnamed: 0,movieId,title,genres,new_title
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012


In [556]:
# Load the user ratings; later cells use its userId, movieId and rating columns
ratings = pd.read_csv("ratings.csv")

In [557]:
# Users who rated the reference movie above 4 ("close" users)
close_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [558]:
# Inspect the ids of the close users
close_users

array([    21,    187,    208, ..., 162469, 162485, 162532], dtype=int64)

In [559]:
# Movie ids of everything the close users rated above 4
# (movieId is the key shared by the ratings and movie tables)
close_user_recommendations = ratings.loc[
    ratings["userId"].isin(close_users) & (ratings["rating"] > 4),
    "movieId",
]

In [560]:
# Inspect the liked movie ids (one entry per qualifying rating)
close_user_recommendations


3741           318
3742           527
3743           541
3744           589
3745           741
             ...  
24998517     91542
24998518     92259
24998522     98809
24998523    102125
24998524    112852
Name: movieId, Length: 577796, dtype: int64

In [561]:
# Share of the close users who liked each movie
close_user_recommendations = close_user_recommendations.value_counts().div(len(close_users))

# Keep only the movies liked by more than 10% of the close users
close_user_recommendations = close_user_recommendations.loc[close_user_recommendations > .10]

In [562]:
# Diagnostic: how many movies have each distinct share value
close_user_recommendations.value_counts()

0.191186    2
0.136017    2
0.182240    2
0.131710    2
0.116634    2
           ..
0.216700    1
0.216534    1
0.215043    1
0.214049    1
0.100895    1
Name: movieId, Length: 182, dtype: int64

Determine how much all users liked each candidate movie, to check whether it is simply a universally liked movie.

In [563]:
# Ratings above 4, from any user, for the candidate movies kept above
all_users = ratings.loc[
    ratings["movieId"].isin(close_user_recommendations.index) & (ratings["rating"] > 4)
]

In [564]:
# Inspect the selected ratings
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484
25000086,162541,31658,4.5,1240953287


In [565]:
# Per movie: number of ratings in all_users divided by the number of
# distinct users appearing in all_users
all_user_recs = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))

Creating a recommendation score

In [566]:
# Put the two share series side by side, aligned on movie id
recommended_percent = pd.concat(
    [close_user_recommendations, all_user_recs],
    axis="columns",
)
recommended_percent.columns = ["similar", "all"]

In [567]:
# For each candidate movie: share of close users who liked it ("similar")
# next to the share computed over all users ("all")
recommended_percent

Unnamed: 0,similar,all
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


To check that a movie is a genuine recommendation and not just a widely liked movie, its value in the `similar` column should be much larger than its value in the `all` column.

In [568]:
# Score = how many times more popular a movie is among close users than overall
recommended_percent["score"] = recommended_percent["similar"].div(recommended_percent["all"])

In [569]:
# Rank the candidates by score, highest first
recommended_percent = recommended_percent.sort_values(by="score", ascending=False)

In [570]:
# Attach title and genres to the 10 best-scoring movie ids
pd.merge(recommended_percent.head(10), data, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,new_title
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010


In [571]:
# Bundle the rating-based recommendation steps above into one function
def find_similar(movie_id, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the reference movie.
    min_rating : float, default 4
        A rating counts as a "like" when it is strictly greater than this.
    min_share : float, default 0.10
        A movie is kept only when strictly more than this share of the
        close users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for up to ``top_n``
        movies, ordered by descending score. The reference movie itself is
        not filtered out, so it can appear in the result.
    """
    # Evaluate the "liked" filter once instead of rescanning the full
    # ratings table for every step.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the reference movie
    close_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Share of those users who liked each other movie, thresholded
    close_user_recommendations = liked.loc[liked["userId"].isin(close_users), "movieId"]
    close_user_recommendations = close_user_recommendations.value_counts() / len(close_users)
    close_user_recommendations = close_user_recommendations[close_user_recommendations > min_share]

    # Same share, computed over every user who liked at least one candidate
    all_users = liked[liked["movieId"].isin(close_user_recommendations.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    recommended_percent = pd.concat([close_user_recommendations, all_user_recs], axis=1)
    recommended_percent.columns = ["similar", "all"]

    # A high ratio means the movie is specific to the close users rather
    # than simply liked by everyone.
    recommended_percent["score"] = recommended_percent["similar"] / recommended_percent["all"]
    recommended_percent = recommended_percent.sort_values("score", ascending=False)
    return recommended_percent.head(top_n).merge(data, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [572]:

# Text box for the movie title, pre-filled with a default query
movie_name = widgets.Text(
    description='Movie Title:',
    value='Toy Story',
    disabled=False,
)
# Output area that shows the recommendations
rec_list = widgets.Output()


def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed by the widget; its "new"
    entry holds the current text.
    """
    with rec_list:
        rec_list.clear_output()
        typed = data["new"]
        # Wait until more than 5 characters are typed before searching
        if len(typed) <= 5:
            return
        # Take the first row returned by the title search as the reference movie
        matches = search(typed)
        movie_id = matches.iloc[0]["movieId"]
        # Show the rating-based recommendations for that movie
        display(find_similar(movie_id))


# Call on_type on every change of the widget's `value`
movie_name.observe(on_type, names='value')
# Render the text box together with its output area
display(movie_name, rec_list)

Text(value='Toy Story', description='Movie Title:')

Output()