In [30]:
import pandas as pd
import random

# 1. Recommendation System with LSH

## 1.1 Data Preparation

Before we biggin let's download and explore our dataset.

In [None]:
# Loading all the data
movies_df = pd.read_csv('movie.csv')
g_scores_df = pd.read_csv('genome_scores.csv')
g_tags_df = pd.read_csv('genome_tags.csv')
link_df = pd.read_csv('link.csv')
rating_df = pd.read_csv('rating.csv')
tag_df = pd.read_csv('tag.csv')

For this part of the project we will need only the movies and rating df. So we are going to merge them and analyse them.

In [11]:
# Merge the two Data sets
titles_and_ratings_df = pd.merge(movies_df, rating_df) 

In [12]:
titles_and_ratings_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41
...,...,...,...,...,...,...
20000258,131254,Kein Bund für's Leben (2007),Comedy,79570,4.0,2015-03-30 19:32:59
20000259,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,79570,4.0,2015-03-30 19:48:08
20000260,131258,The Pirates (2014),Adventure,28906,2.5,2015-03-30 19:56:32
20000261,131260,Rentun Ruusu (2001),(no genres listed),65409,3.0,2015-03-30 19:57:46


In [15]:
# Checking for missing values
titles_and_ratings_df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [16]:
titles_and_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     int64  
 4   rating     float64
 5   timestamp  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [None]:
# Total number of movies in the data set
len(titles_and_ratings_df["title"].unique())

26729

In [24]:
# All the kinds of genres
pd.DataFrame(titles_and_ratings_df["genres"].unique(), columns= ["Kinds"])

Unnamed: 0,Kinds
0,Adventure|Animation|Children|Comedy|Fantasy
1,Adventure|Children|Fantasy
2,Comedy|Romance
3,Comedy|Drama|Romance
4,Comedy
...,...
1324,Adventure|Children|Drama|Sci-Fi
1325,Children|Documentary|Drama
1326,Action|Adventure|Animation|Fantasy|Horror
1327,Animation|Children|Comedy|Fantasy|Sci-Fi


In [25]:
# Total number of users
len(titles_and_ratings_df["userId"].unique())

138493

## 1.2 Minhash Signatures

Using the <strong> userId </strong> and <strong> movieId </strong> columns, implement your own MinHash function. This function will hash each user's watched movie list, creating a representation that allows for quick comparisons of user similarities.

To start with, we are going to create a dictionary that will contain all the movies each user has watched. So as keys we are going to use userId and as values, the movieId.

In [28]:
# Create the dictionary
users_dict = titles_and_ratings_df.groupby('userId')['movieId'].apply(set).to_dict()
users_dict

{1: {2,
  29,
  32,
  47,
  50,
  112,
  151,
  223,
  253,
  260,
  293,
  296,
  318,
  337,
  367,
  541,
  589,
  593,
  653,
  919,
  924,
  1009,
  1036,
  1079,
  1080,
  1089,
  1090,
  1097,
  1136,
  1193,
  1196,
  1198,
  1200,
  1201,
  1208,
  1214,
  1215,
  1217,
  1219,
  1222,
  1240,
  1243,
  1246,
  1249,
  1258,
  1259,
  1261,
  1262,
  1266,
  1278,
  1291,
  1304,
  1321,
  1333,
  1348,
  1350,
  1358,
  1370,
  1374,
  1387,
  1525,
  1584,
  1750,
  1848,
  1920,
  1967,
  1994,
  1997,
  2021,
  2100,
  2118,
  2138,
  2140,
  2143,
  2173,
  2174,
  2193,
  2194,
  2253,
  2288,
  2291,
  2542,
  2628,
  2644,
  2648,
  2664,
  2683,
  2692,
  2716,
  2761,
  2762,
  2804,
  2872,
  2918,
  2944,
  2947,
  2959,
  2968,
  3000,
  3030,
  3037,
  3081,
  3153,
  3265,
  3438,
  3476,
  3479,
  3489,
  3499,
  3889,
  3932,
  3996,
  3997,
  4011,
  4027,
  4105,
  4128,
  4133,
  4226,
  4306,
  4446,
  4467,
  4571,
  4720,
  4754,
  4878,
  4896,
  4911,


Now we can move on and build our own MinHash function.

In [None]:
# First we have to define the hash function
def hash_function(hashes, values):
    # It creates a number of hash functions and puts them in a list
    hashes_list = []
    prime = 150001 # in order to avoid collisions.
    for i in range(hashes):
        a = random.randint(1, values)
        b = random.randint(0, values)
        hashes_list.append(lambda x, a=a, b=b, p=prime: (a * x + b) % p)
    return hashes_list

In [35]:
# Define MinHash Function
def minhash(set, hashes_list):
    minhash_vector = []
    for i in hashes_list:
        min_hash = min(i(title) for title in set)
        minhash_vector.append(min_hash)
    return minhash_vector

After building our own MinHash function, we are going to define the number of hashes as well as the maximum values (values), in order to generate signature vectors for each user based on their rated movies

In [36]:
# Number of hash functions and values 
hashes = 150
values = 700

hashes_list = hash_function(hashes, values)

# Save each user's signature in a dictionary
users_signatures = {}
for userid, set in users_dict.items():
    minhash_vector = minhash(set, hashes_list)
    users_signatures[userid] = minhash_vector

users_signatures

{1: [1260,
  519,
  829,
  192,
  544,
  786,
  303,
  1453,
  383,
  673,
  117,
  559,
  1177,
  99,
  605,
  371,
  1108,
  226,
  1510,
  997,
  635,
  75,
  859,
  232,
  386,
  425,
  334,
  367,
  18,
  834,
  869,
  864,
  253,
  1014,
  102,
  878,
  228,
  326,
  324,
  201,
  638,
  89,
  345,
  79,
  76,
  496,
  1253,
  606,
  482,
  79,
  675,
  160,
  297,
  126,
  715,
  938,
  586,
  842,
  514,
  807,
  612,
  1634,
  618,
  705,
  1869,
  781,
  1413,
  74,
  10,
  1011,
  263,
  601,
  686,
  704,
  492,
  1290,
  294,
  617,
  658,
  325,
  970,
  1876,
  1466,
  213,
  360,
  389,
  132,
  1700,
  307,
  15,
  648,
  185,
  349,
  1237,
  243,
  860,
  463,
  373,
  716,
  1593,
  147,
  674,
  23,
  6,
  912,
  1003,
  549,
  1094,
  520,
  669,
  1615,
  894,
  964,
  284,
  206,
  1307,
  420,
  626,
  1515,
  542,
  589,
  39,
  72,
  1379,
  475,
  1089,
  198,
  273,
  1145,
  44,
  755,
  1026,
  526,
  814,
  873,
  432,
  51,
  996,
  26,
  922,
  69,
  6

Now if we want to do quick comparisons of user similarities, we have to create a <strong> Jaccard similarity </strong> function.

In [38]:
# Define Jaccard Similarity function
def jaccard_similarity(user1, user2):
    # user1 = signature of the first user
    # user2 = signature of the second user
    similarity = sum(1 for a, b in zip(user1, user2) if a == b) / len(user1)
    return similarity

In [43]:
# Testing 
user1 = users_signatures[3]
user2 = users_signatures[58]
similarity = jaccard_similarity(user1, user2)
print("The similarity between the 2 users is: ", similarity)

The similarity between the 2 users is:  0.25333333333333335
