# BCSF19M506 Muhammad Ahmad

# BCSF19M511 Hamza Saleem

# BCSF19M512 Hafiz Abdulmanan



Movie Recommendation System with a Search Engine and Collaborative Filtering

In [2]:
import pandas as pd

# Load the movie table from movies1.csv; later cells read its "movieId",
# "title" and "genres" columns.
movies = pd.read_csv("movies1.csv")
# A bare last expression makes the notebook render the DataFrame.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


Removing every character other than letters, digits, and spaces from the titles

In [4]:
import re

def clean_title(title):
    """Return ``title`` stripped of everything except ASCII letters, digits and spaces.

    Example: ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    # Negated character class: matches any single character that must be dropped.
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


A new column for cleaned titles

In [6]:
# Add a "clean_title" column by running clean_title over every original title.
movies["clean_title"] = movies["title"].apply(clean_title)
# Display the frame so the new column can be checked against "title".
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995


Building a sparse TF-IDF matrix with one row per document (every title) using the fit_transform function

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned titles and vectorize them in one step;
# `vectorizer` and `tfidf` are both reused by search() further down.
tfidf = vectorizer.fit_transform(movies["clean_title"])

# Term -> column index mapping learned during fitting.
print(vectorizer.vocabulary_)

# Dense view of the matrix for inspection. toarray() materialises every cell,
# so this is only sensible for a small set of titles.
print(tfidf.toarray())



{'toy': 8, 'story': 6, '1995': 0, 'jumanji': 3, 'grumpier': 2, 'old': 5, 'men': 4, 'waiting': 9, 'to': 7, 'exhale': 1}
[[0.34618161 0.         0.         0.         0.         0.
  0.66338461 0.         0.66338461 0.        ]
 [0.46263733 0.         0.         0.88654763 0.         0.
  0.         0.         0.         0.        ]
 [0.28847675 0.         0.55280532 0.         0.55280532 0.55280532
  0.         0.         0.         0.        ]
 [0.28847675 0.55280532 0.         0.         0.         0.
  0.         0.55280532 0.         0.55280532]]


Now comparing the search query with our TF-IDF matrix using cosine similarity to find the best-matching titles

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    top_n : int, optional
        Maximum number of rows to return (default 5). Fewer rows are returned
        when the catalogue holds fewer than ``top_n`` titles.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered by descending similarity, so ``.iloc[0]``
        is the closest match. Empty when there is nothing to rank.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises ValueError when kth is out of bounds, so never
    # ask for more entries than there are titles.
    top_n = max(0, min(top_n, similarity.size))
    if top_n == 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; it does not order them.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]

    # Rank the selected candidates explicitly, highest similarity first.
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]


A widget to take the input and test that the search works

In [15]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query.
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title",
    disabled=False,
)

# Output area that the search results are rendered into.
movie_list = widgets.Output()


def on_type(data):
    """Re-run the title search each time the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry is read as the current text of the box.
    """
    with movie_list:
        # Drop the previous results before showing the new ones.
        movie_list.clear_output()
        query = data["new"]
        display(search(query))


# Fire the handler only for changes to the widget's "value".
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [7]:
# Load the user ratings from ratings.csv; find_similar_movies reads its
# "userId", "movieId" and "rating" columns.
ratings = pd.read_csv("ratings.csv")

Now applying collaborative filtering to the result from the search engine and displaying the 10 best recommendations

In [8]:
def find_similar_movies(movie_id):
    """Recommend up to 10 movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than 4. For every
    movie liked by more than 10% of the users who liked ``movie_id``, the
    function computes

    * ``similar`` - the share of those users who liked it, and
    * ``all``     - the share of all users with a like on any candidate movie
      who liked it,

    and ranks the candidates by ``score = similar / all``.

    Returns a DataFrame with the columns ``score``, ``title`` and ``genres``
    for the ten highest-scoring candidates that are present in ``movies``.
    """
    # Every step below only looks at "liked" ratings, so filter once up front.
    liked = ratings[ratings["rating"] > 4]

    # Users who liked the requested movie.
    fans = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Fraction of those users who liked each movie; keep only movies above 10%.
    fan_share = liked.loc[liked["userId"].isin(fans), "movieId"].value_counts() / len(fans)
    fan_share = fan_share[fan_share > .1]

    # Same fraction, measured over everyone who liked at least one candidate.
    candidate_likes = liked[liked["movieId"].isin(fan_share.index)]
    overall_share = (
        candidate_likes["movieId"].value_counts()
        / len(candidate_likes["userId"].unique())
    )

    rec_percentages = pd.concat([fan_share, overall_share], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A high score means the movie is liked far more often by the fans of
    # `movie_id` than by the wider audience.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    top_recs = rec_percentages.sort_values("score", ascending=False).head(10)
    with_titles = top_recs.merge(movies, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]

Displaying a widget to get the resulting recommendations

In [9]:
# Text box pre-filled with an example query.
movie_name_input = widgets.Text(
    value="Toy Story",
    description="Movie Title",
    disabled=False,
)

# Output area that the recommendations are rendered into.
recommendation_list = widgets.Output()


# NOTE: this rebinds the name `on_type` that the search-widget cell also
# defines; that cell registered its own handler before this definition runs.
def on_type(data):
    """Show recommendations for the first search hit of the typed title.

    ``data`` is the change notification delivered by ``observe``; its
    ``"new"`` entry is read as the current text of the box. Nothing is shown
    until the text is longer than 5 characters.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Skip very short inputs; the output area has already been cleared.
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


# Fire the handler only for changes to the widget's "value".
movie_name_input.observe(on_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title')

Output()