In [1]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the first member of the movies archive into a DataFrame.
with zipfile.ZipFile("C:/Users/Hp Pc/rec_datasets/movies.zip") as file:
    file_name = file.namelist()
    with file.open(file_name[0], mode='r') as f:
        movies = pd.read_csv(f)

# Reached only when the CSV was parsed without raising.
print("Extraction Successful")


Extraction Successful


In [3]:
# Preview the first ten rows of the movies table.
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Read ratings.csv from the ratings archive into a DataFrame.
with zipfile.ZipFile("C:/Users/Hp Pc/rec_datasets/ratings.zip") as file:
    # Open the member in a context manager so its handle is closed
    # deterministically once pandas has finished reading it.
    with file.open("ratings.csv") as rate:
        ratings = pd.read_csv(rate)
    print("Extraction Successful")

Extaction Successful


In [5]:
# Preview the first ten rows of the ratings table.
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


DEVELOP A SEARCH

In [6]:
def clean_text(text):
    """Normalise a title or genre string for TF-IDF matching.

    Pipe separators are turned into spaces first, so a genre list such as
    "Adventure|Animation" stays two words instead of collapsing into the
    single token "AdventureAnimation". Every remaining character that is
    not an ASCII letter, digit or space is then removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.replace("|", " ")
    return re.sub(r'[^a-zA-Z0-9 ]', "", text)

In [7]:
# Replace each raw title with its cleaned form.
movies['title'] = movies['title'].map(clean_text)

In [8]:
# Display the title column after cleaning.
movies['title']

0                          Toy Story 1995
1                            Jumanji 1995
2                   Grumpier Old Men 1995
3                  Waiting to Exhale 1995
4        Father of the Bride Part II 1995
                       ...               
62418                             We 2018
62419             Window of the Soul 2001
62420                      Bad Poems 2018
62421                   A Girl Thing 2001
62422         Women of Devils Island 1962
Name: title, Length: 62423, dtype: object

In [9]:
# tags = cleaned title, one space, cleaned genres.
movies['tags'] = movies['title'].map(clean_text).str.cat(
    movies['genres'].map(clean_text), sep=" "
)

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words ='english')

# Fit the vectorizer exactly once. Calling fit_transform a second time on the
# same object replaces its vocabulary, which would leave the first matrix with
# columns that no longer correspond to what `vectorizer.transform` produces.
# The tags text is the title followed by the genres, so fitting on tags and
# only *transforming* the titles puts both matrices in one feature space.
tags_vec = vectorizer.fit_transform(movies['tags'])
title_vec = vectorizer.transform(movies['title'])

In [11]:
# Show the unigram/bigram vocabulary held by the fitted vectorizer.
print(vectorizer.get_feature_names_out())

['00' '00 awakening' '00 schneider' ... 'zyzzyx rd' 'zzero' 'zzero 1974']


In [None]:
# Manual walk-through of one query: clean it, vectorise it, and score it
# against every title vector.
title = clean_text("Toy Story")
word_vec = vectorizer.transform([title])
similarity = cosine_similarity(word_vec, title_vec)

In [21]:
#search function

def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``.

    Parameters
    ----------
    title : str
        Free-text query; it is passed through ``clean_text`` before being
        vectorised.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar.

    Notes
    -----
    Assumes ``title_vec`` has the same columns as the vocabulary currently
    held by ``vectorizer``; verify where the two are built.
    """
    title = clean_text(title)
    word_vec = vectorizer.transform([title])
    # cosine_similarity returns a (1, n_movies) array; flatten it so the
    # partition and the slice below operate over movies, not over the single
    # query row.
    similarity = cosine_similarity(word_vec, title_vec).flatten()

    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last k positions hold the k
    # largest scores, in no particular order, so sort those k explicitly.
    indices = np.argpartition(similarity, -k)[-k:]
    indices = indices[np.argsort(similarity[indices])[::-1]]
    results = movies.iloc[indices]
    return results

In [23]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(
    value = "",
    description = "Movie Title: ",
    disabled = False
)

# Output area that holds the most recent search results.
search_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its
    ``'new'`` entry is the current text. The previous results are always
    cleared, and a search runs only when the text is longer than four
    characters.
    """
    with search_list:
        search_list.clear_output()
        title = data['new']
        if len(title) > 4:
            display(search(title))

movie_input.observe(on_type, names='value')
display(movie_input, search_list)

Text(value='', description='Moive Title: ')

Output()

RECOMMENDATION SYSTEM

In [26]:
# Reminder of the ratings schema before building the recommender.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [89]:
movie_id = 1

# Users who gave the chosen movie a rating of 4.0 or higher.
similar_users = ratings.loc[
    (ratings['movieId'] == movie_id) & (ratings['rating'] >= 4.0),
    'userId',
].unique()

In [90]:
# Every movie those users rated 4.0 or higher (one entry per rating row).
similar_users_movies = ratings.loc[
    ratings['userId'].isin(similar_users) & (ratings['rating'] >= 4.0),
    'movieId',
]

In [91]:
# Number of high ratings each movie received from the similar users.
similar_users_movies.value_counts()

movieId
1         37709
318       20725
260       20043
356       19504
296       18694
          ...  
153913        1
153917        1
6501          1
41704         1
198609        1
Name: count, Length: 30595, dtype: int64

In [92]:
# Percentage of the cluster's high-rating rows that each movie accounts for.
similar_users_rec = (
    similar_users_movies.value_counts()
    .div(len(similar_users_movies))
    .mul(10**2)
)

# Keep only the movies whose share is above 0.1 percent.
similar_users_rec = similar_users_rec.loc[similar_users_rec.gt(.1)]

In [93]:
# Display the movies that passed the share threshold.
similar_users_rec

movieId
1        0.739104
318      0.406214
260      0.392847
356      0.382282
296      0.366406
           ...   
68157    0.101098
1288     0.100392
1247     0.100392
1584     0.100373
1207     0.100000
Name: count, Length: 165, dtype: float64

In [111]:
# Baseline: high ratings (>= 4) that any user gave to the candidate movies.

all_users = ratings.loc[
    ratings['movieId'].isin(similar_users_rec.index) & (ratings['rating'] >= 4)
]

In [112]:
# Fraction of the baseline users who rated each candidate movie highly.
all_users_rec = all_users['movieId'].value_counts().div(
    len(all_users['userId'].unique())
)

In [113]:
# Display the baseline popularity of each candidate movie.
all_users_rec

movieId
318      0.443192
296      0.392294
356      0.370039
593      0.364344
2571     0.350347
           ...   
1356     0.068489
2987     0.068112
2797     0.066359
1517     0.066308
78499    0.058100
Name: count, Length: 165, dtype: float64

In [114]:
# Put the cluster share and the baseline share side by side, aligned on movieId.
users_rec = pd.concat([similar_users_rec, all_users_rec], axis='columns')
users_rec = users_rec.set_axis(['similar_users', 'all_users'], axis='columns')

In [115]:
# Score = cluster share relative to baseline share; keep the 20 best.
users_rec = users_rec.assign(
    scores=users_rec['similar_users'].div(users_rec['all_users'])
)
users_rec = users_rec.sort_values(by='scores', ascending=False).iloc[:20]
users_rec

Unnamed: 0_level_0,similar_users,all_users,scores
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.739104,0.237007,3.11849
3114,0.243101,0.102932,2.361765
78499,0.119679,0.0581,2.059876
2355,0.141239,0.069445,2.033834
1073,0.171325,0.099783,1.716976
2797,0.112446,0.066359,1.694523
4886,0.228617,0.136482,1.675066
2987,0.111819,0.068112,1.641689
6377,0.217366,0.13503,1.609758
8961,0.193238,0.120587,1.60248


In [116]:
# Attach title and genres to each scored movieId.
users_rec_result = users_rec.merge(
    movies, left_index=True, right_on='movieId'
).loc[:, ['title', 'genres', 'scores']]

In [117]:
# Display the scored recommendations with their titles and genres.
users_rec_result

Unnamed: 0,title,genres,scores
0,Toy Story 1995,Adventure|Animation|Children|Comedy|Fantasy,3.11849
3021,Toy Story 2 1999,Adventure|Animation|Children|Comedy|Fantasy,2.361765
14813,Toy Story 3 2010,Adventure|Animation|Children|Comedy|Fantasy|IMAX,2.059876
2264,Bugs Life A 1998,Adventure|Animation|Children|Comedy,2.033834
1047,Willy Wonka the Chocolate Factory 1971,Children|Comedy|Fantasy|Musical,1.716976
2705,Big 1988,Comedy|Drama|Fantasy|Romance,1.694523
4780,Monsters Inc 2001,Adventure|Animation|Children|Comedy|Fantasy,1.675066
2895,Who Framed Roger Rabbit 1988,Adventure|Animation|Children|Comedy|Crime|Fant...,1.641689
6258,Finding Nemo 2003,Adventure|Animation|Children|Comedy,1.609758
8246,Incredibles The 2004,Action|Adventure|Animation|Children|Comedy,1.60248


In [None]:
def get_recommendation(movie_id, exclude_movie_ids=None):
    """Score movies liked by the users who also liked ``movie_id``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    exclude_movie_ids : iterable of int, optional
        ``movieId`` values to leave out of the result (for example, movies
        already shown to the user). ``None`` excludes nothing.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId`` with columns ``similar_users``, ``all_users``
        and ``scores``; at most 15 rows, highest score first.
    """
    similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] >= 4.0)]['userId'].unique()
    similar_users_movies = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] >= 4.0)]['movieId']

    # Percentage of the cluster's high-rating rows that each movie accounts for.
    similar_users_rec = similar_users_movies.value_counts() / len(similar_users_movies) * 10**2
    similar_users_rec = similar_users_rec[similar_users_rec > .1]

    # Fraction of all users with a high rating on a candidate movie who rated
    # each one highly.
    all_users = ratings[(ratings['movieId'].isin(similar_users_rec.index)) & (ratings['rating'] >= 4)]
    all_users_rec = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    users_rec = pd.concat([similar_users_rec, all_users_rec], axis=1)
    users_rec.columns = ['similar_users', 'all_users']

    # The two columns use different denominators, but each denominator is the
    # same for every movie, so the ordering by their ratio is unaffected.
    users_rec['scores'] = users_rec['similar_users'] / users_rec['all_users']

    # users_rec is indexed by movieId, so the exclusion must compare movieIds.
    # Filter before truncating so that excluded movies do not use up slots.
    if exclude_movie_ids is not None:
        users_rec = users_rec[~users_rec.index.isin(exclude_movie_ids)]

    return users_rec.sort_values('scores', ascending=False).head(15)

RECOMMENDATION BY GENRES

In [None]:
def recommend_genre(genres, exclude_movie_ids=None, top_n=10):
    """Return the movies whose tags are most similar to a genre string.

    Parameters
    ----------
    genres : str
        Genre text to match; it is passed through ``clean_text`` first.
    exclude_movie_ids : iterable of int, optional
        ``movieId`` values to leave out of the result (for example, movies
        already recommended). ``None`` excludes nothing.
    top_n : int, default 10
        Number of candidates taken before the exclusion is applied.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar.

    Notes
    -----
    Assumes ``tags_vec`` has the same columns as the vocabulary currently
    held by ``vectorizer``; verify where the two are built.
    """
    genre = clean_text(genres)
    genre_vec = vectorizer.transform([genre])
    # cosine_similarity returns a (1, n_movies) array; flatten it so the
    # partition and the slice below operate over movies.
    similarity = cosine_similarity(genre_vec, tags_vec).flatten()

    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition leaves the top k unordered, so sort them best-first.
    indices = np.argpartition(similarity, -k)[-k:]
    indices = indices[np.argsort(similarity[indices])[::-1]]
    genre_result = movies.iloc[indices]

    # Exclude by movieId: the row labels of `movies` are positions, not ids.
    if exclude_movie_ids is not None:
        genre_result = genre_result[~genre_result['movieId'].isin(exclude_movie_ids)]
    return genre_result

In [120]:
# Text box the user types a movie title into to get recommendations.
movie_input_name = widgets.Text(
    value = "Harry Potter",
    description = "Movie Title: ",
    disabled = False
)

# Output area that holds the most recent recommendations.
recommendation_list = widgets.Output()

def on_recommendation_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its
    ``'new'`` entry is the current text. The previous output is always
    cleared. When the text is longer than four characters, the best search
    hit is used as the seed movie and its recommendations are displayed
    with their titles and genres.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 4:
            result = search(title)
            if result.empty:
                return
            movie_id = result.iloc[0]['movieId']
            recommendations = get_recommendation(movie_id)
            # The recommendations are indexed by movieId; join on it to show
            # readable titles instead of bare ids.
            display(
                recommendations.merge(movies, left_index=True, right_on='movieId')[
                    ['title', 'genres', 'scores']
                ]
            )

# Attach the handler to, and show, the widget created in this cell.
movie_input_name.observe(on_recommendation_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='harry potter', description='Moive Title: ')

Output()