In [1]:
import pandas as pd
import numpy as np
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with zipfile.ZipFile("C:/Users/Hp Pc/rec_datasets/movies.zip") as file:
    file_name = file.namelist()
    with file.open(file_name[0], 'r') as f:
        movies = pd.read_csv(f)
        print("Extraction Successful")


Extraction Successful


In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
with zipfile.ZipFile("C:/Users/Hp Pc/rec_datasets/ratings.zip") as file:
    rate = file.open("ratings.csv")
    ratings = pd.read_csv(rate)
    print("Extaction Successful")

Extaction Successful


In [5]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


DEVELOP A SEARCH

In [6]:
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9 ]', "", text)

In [7]:
movies['title'] = movies['title'].apply(clean_text)

In [8]:
movies['title']

0                          Toy Story 1995
1                            Jumanji 1995
2                   Grumpier Old Men 1995
3                  Waiting to Exhale 1995
4        Father of the Bride Part II 1995
                       ...               
62418                             We 2018
62419             Window of the Soul 2001
62420                      Bad Poems 2018
62421                   A Girl Thing 2001
62422         Women of Devils Island 1962
Name: title, Length: 62423, dtype: object

In [9]:
movies['tags'] = movies['title'].apply(clean_text) +" " + movies['genres'].apply(clean_text)

In [10]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words ='english')

title_vec = vectorizer.fit_transform(movies['title'])
tags_vec = vectorizer.fit_transform(movies['tags'])

In [11]:
print(vectorizer.get_feature_names_out())

['00' '00 awakening' '00 schneider' ... 'zyzzyx rd' 'zzero' 'zzero 1974']


In [None]:
title = "Toy Story"

title = clean_text(title)
word_vec  = vectorizer.transform([title])
similarity = cosine_similarity(word_vec, title_vec)

In [21]:
#search function

def search(title):
    title = clean_text(title)
    word_vec = vectorizer.transform([title])
    similarity = cosine_similarity(word_vec, title_vec)
    indices = np.argpartition(similarity, -10)[-10:] #Top 10 in similarity
    results = movies.iloc[indices][::-1]
    return results

In [23]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "",
    description = "Moive Title: ",
    disabled = False
)

search_list = widgets.Output()

def on_type(data):
    with search_list:
        search_list.clear_output()
        title = data['new']
        if len(title) > 4:
            display(search(title))

movie_input.observe(on_type, names='value')
display(movie_input, search_list)

Text(value='', description='Moive Title: ')

Output()

RECOMMENDATION SYSTEM

In [26]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [29]:
movie_id = 2

similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['rating'] >= 4.0)]['userId'].unique()

In [34]:
similar_users_movies = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] >= 4.0)]['movieId']

In [37]:
similar_users_movies.value_counts()

movieId
2        8276
356      5386
480      4984
318      4496
364      4337
         ... 
25851       1
8518        1
7571        1
7479        1
7066        1
Name: count, Length: 22270, dtype: int64

In [42]:
similar_users_rec = similar_users_movies.value_counts() / len(similar_users_movies) * 10**2
#the percentage of distribution of each movie in this cluster; similar_users

similar_users_rec = similar_users_rec[similar_users_rec > .1]

In [43]:
similar_users_rec

movieId
2        0.606223
356      0.394528
480      0.365081
318      0.329335
364      0.317688
           ...   
2797     0.102551
1259     0.101818
21       0.101525
44191    0.101232
40815    0.100134
Name: count, Length: 147, dtype: float64

In [49]:
#Compare these movies above to all users watching and rating

all_users = ratings[(ratings['movieId'].isin(similar_users_rec.index)) & (ratings['rating'] >= 4)]['userId'].unique()

In [50]:
len(all_users)

157896

In [51]:
all_users_movies = ratings[(ratings['userId'].isin(all_users)) & (ratings['rating'] > 4)]['movieId']

In [62]:
all_users_rec = all_users_movies.value_counts() / len(all_users_movies) * 10**2

#all_users_rec = all_users_rec[all_users_rec > .1]

In [63]:
users_rec = pd.concat([similar_users_rec, all_users_rec], axis =1)
users_rec.columns = ['similar_users', 'all_users']

In [74]:
users_rec['scores'] = users_rec['similar_users'] / users_rec['all_users']
users_rec = users_rec.sort_values('scores', ascending=False).head(20)
users_rec

Unnamed: 0_level_0,similar_users,all_users,scores
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,0.606223,0.046112,13.146755
317,0.126138,0.024707,5.105309
153,0.104968,0.024206,4.336499
208,0.108045,0.027318,3.955087
350,0.102697,0.027456,3.740405
185,0.113246,0.032176,3.519526
367,0.19536,0.058803,3.32229
368,0.113099,0.034874,3.243116
586,0.168477,0.053685,3.138249
500,0.239163,0.080329,2.977317


In [75]:
users_rec_result = users_rec.merge(movies, left_index =True , right_on = 'movieId')[['title', 'genres', 'scores']]

In [76]:
users_rec_result

Unnamed: 0,title,genres,scores
1,Jumanji 1995,Adventure|Children|Fantasy,13.146755
313,Santa Clause The 1994,Comedy|Drama|Fantasy,5.105309
151,Batman Forever 1995,Action|Adventure|Comedy|Crime,4.336499
206,Waterworld 1995,Action|Adventure|Sci-Fi,3.955087
345,Client The 1994,Drama|Mystery|Thriller,3.740405
183,Net The 1995,Action|Crime|Thriller,3.519526
362,Mask The 1994,Action|Comedy|Crime|Fantasy,3.32229
363,Maverick 1994,Adventure|Comedy|Western,3.243116
578,Home Alone 1990,Children|Comedy,3.138249
495,Mrs Doubtfire 1993,Comedy|Drama,2.977317
