In [1]:
import pandas as pd
import random
import csv

# 1. Recommendation System with LSH

## 1.1 Data Preparation

Before we begin, let's download and explore our dataset.

In [2]:
# Read every CSV of the dataset into its own DataFrame (same order as the names)
movies_df, g_scores_df, g_tags_df, link_df, rating_df, tag_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'movie.csv',
        'genome_scores.csv',
        'genome_tags.csv',
        'link.csv',
        'rating.csv',
        'tag.csv',
    )
)

For this part of the project we only need the movies and ratings DataFrames, so we are going to merge them and analyse the result.

In [3]:
# Merge the two data sets.
# The join key is spelled out instead of relying on pd.merge's default of
# "every column the two frames share", so an extra shared column appearing in
# either CSV cannot silently change the join. 'movieId' is the only shared
# column here, hence the result is the same inner join as before.
titles_and_ratings_df = movies_df.merge(rating_df, on='movieId', how='inner')

In [12]:
# Display the merged frame (the notebook renders a truncated preview of it)
titles_and_ratings_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41
...,...,...,...,...,...,...
20000258,131254,Kein Bund für's Leben (2007),Comedy,79570,4.0,2015-03-30 19:32:59
20000259,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,79570,4.0,2015-03-30 19:48:08
20000260,131258,The Pirates (2014),Adventure,28906,2.5,2015-03-30 19:56:32
20000261,131260,Rentun Ruusu (2001),(no genres listed),65409,3.0,2015-03-30 19:57:46


In [15]:
# Count the missing entries in every column
titles_and_ratings_df.isna().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
dtype: int64

In [16]:
# Print the column dtypes and the memory footprint of the merged frame
titles_and_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     int64  
 4   rating     float64
 5   timestamp  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 915.5+ MB


In [None]:
# Number of distinct movie titles in the data set
# (dropna=False keeps the count equal to the length of .unique())
titles_and_ratings_df["title"].nunique(dropna=False)

26729

In [24]:
# Every distinct genre combination, shown as a one-column table
pd.DataFrame({"Kinds": titles_and_ratings_df["genres"].unique()})

Unnamed: 0,Kinds
0,Adventure|Animation|Children|Comedy|Fantasy
1,Adventure|Children|Fantasy
2,Comedy|Romance
3,Comedy|Drama|Romance
4,Comedy
...,...
1324,Adventure|Children|Drama|Sci-Fi
1325,Children|Documentary|Drama
1326,Action|Adventure|Animation|Fantasy|Horror
1327,Animation|Children|Comedy|Fantasy|Sci-Fi


In [25]:
# Number of distinct users
# (dropna=False keeps the count equal to the length of .unique())
titles_and_ratings_df["userId"].nunique(dropna=False)

138493

## 1.2 Minhash Signatures

Using the <strong> userId </strong> and <strong> movieId </strong> columns, implement your own MinHash function. This function will hash each user's watched movie list, creating a representation that allows for quick comparisons of user similarities.

To start with, we are going to create a dictionary that contains all the movies each user has watched: the keys are the userIds and the values are the sets of movieIds.

In [4]:
# Create the dictionary: one entry per userId whose value is the set of
# movieIds found in that user's rows (repeated ids collapse inside the set)
users_dict = titles_and_ratings_df.groupby('userId')['movieId'].apply(set).to_dict()

Now we can move on and build our own MinHash function.

In [21]:
# First we have to define the hash function
def hash_function(hashes, values, prime=2**31 - 1, seed=None):
    """Build a family of random linear hash functions h(x) = (a*x + b) % prime.

    Parameters
    ----------
    hashes : int
        How many hash functions to generate.
    values : int
        Inclusive upper bound for the random coefficients: ``a`` is drawn
        from [1, values] and ``b`` from [0, values].
    prime : int, optional
        Modulus applied by every hash function. Defaults to the Mersenne
        prime 2**31 - 1 so that callers passing only ``hashes`` and
        ``values`` keep working.
    seed : int or None, optional
        When given, the coefficients come from a private
        ``random.Random(seed)`` so the same family can be regenerated.
        When ``None`` the module-level ``random`` generator is used.

    Returns
    -------
    list of callable
        ``hashes`` functions, each mapping an integer to an integer in
        [0, prime).
    """
    rng = random if seed is None else random.Random(seed)
    hashes_list = []
    for _ in range(hashes):
        a = rng.randint(1, values)
        b = rng.randint(0, values)
        # Bind a, b and prime as default arguments so each lambda keeps its
        # own coefficients instead of seeing the loop's final values.
        hashes_list.append(lambda x, a=a, b=b, p=prime: (a * x + b) % p)
    return hashes_list

In [6]:
# Define MinHash Function
def minhash(set, hashes_list):
    """Compute the MinHash signature of a collection of ids.

    For every hash function in ``hashes_list`` the smallest hash value
    obtained over the elements of ``set`` becomes one signature component.

    Returns a list with one entry per hash function, in the same order.
    """
    return [min(map(hash_fn, set)) for hash_fn in hashes_list]

After building our own MinHash function, we define the number of hash functions (`hashes`) and the upper bound for the random hash coefficients (`values`), and then generate a signature vector for each user based on the movies they rated.

In [9]:
# Number of hash functions, coefficient upper bound and modulus
hashes = 150
# Vectorised column maximum; int() hands the random module a plain Python int
values = int(titles_and_ratings_df['movieId'].max())
# hash_function reduces every hash modulo a prime; 2**31 - 1 is a Mersenne prime
prime = 2**31 - 1

# Seed the generator so the same hash family (and therefore the same
# signatures) is produced on every run of the notebook
random.seed(42)
hashes_list = hash_function(hashes, values, prime)

# Save each user's signature in a dictionary.
# The loop variable is deliberately not called `set`: rebinding that name at
# notebook level would shadow the builtin for every cell run afterwards.
users_signatures = {
    userid: minhash(movie_ids, hashes_list)
    for userid, movie_ids in users_dict.items()
}

In [20]:
# Persist the signatures to a CSV file
csv_filename = 'users_signatures.csv'

# Open the file in write mode
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)

    # Header first, then one row per user; each signature is flattened into
    # a single comma-separated string cell
    writer.writerow(['User', 'Signature'])
    writer.writerows(
        [user, ','.join(str(component) for component in signature)]
        for user, signature in users_signatures.items()
    )

Now if we want to do quick comparisons of user similarities, we have to create a <strong> Jaccard similarity </strong> function.

In [10]:
# Define Jaccard Similarity function
def jaccard_similarity(user1, user2):
    """Estimate the Jaccard similarity of two users from their signatures.

    The estimate is the fraction of positions at which the two signatures
    hold the same value.

    Parameters
    ----------
    user1 : sequence
        Signature of the first user.
    user2 : sequence
        Signature of the second user; must have the same length as ``user1``.

    Returns
    -------
    float
        Share of matching positions, between 0.0 and 1.0. Two empty
        signatures give 0.0 instead of a division by zero.

    Raises
    ------
    ValueError
        If the signatures differ in length. ``zip`` would otherwise drop the
        surplus positions silently and the ratio would be wrong.
    """
    if len(user1) != len(user2):
        raise ValueError(
            "signatures must have the same length, got "
            f"{len(user1)} and {len(user2)}"
        )
    if len(user1) == 0:
        return 0.0
    matches = sum(1 for a, b in zip(user1, user2) if a == b)
    return matches / len(user1)

In [19]:
# Sanity check: both operands are the signature of user 6
user1 = user2 = users_signatures[6]
similarity = jaccard_similarity(user1, user2)
print("The similarity between the 2 users is: ", similarity)

The similarity between the 2 users is:  1.0


Experiment with different hash functions and threshold values to find the most effective configurations. Report these results.

In [None]:
def experiment(users_dict, hash_configs, thresholds):
    """
    Experiment with different hash configurations and thresholds.

    For every (num_hashes, prime) configuration a fresh family of hash
    functions is generated, a MinHash signature is computed for every user in
    ``users_dict``, all user pairs are compared, and the pairs reaching each
    threshold are collected.

    - users_dict: Mapping of user id -> collection of movie ids.
    - hash_configs: List of (num_hashes, prime) tuples.
    - thresholds: List of similarity thresholds to test.

    Returns a list of dicts, one per (configuration, threshold) combination,
    with the keys 'num_hashes', 'prime', 'threshold', 'similar_pairs' and
    'num_similar_pairs'.

    Raises ValueError (from ``min`` inside ``minhash``) if a user has an
    empty movie collection.

    Note: every pair of users is compared and kept in memory, so the cost
    grows quadratically with the number of users; run it on a sample.
    """
    results = []

    # Upper bound for the random hash coefficients, taken from the data that
    # was passed in rather than from a notebook-level global.
    max_movie_id = max(
        (movie for movies in users_dict.values() for movie in movies),
        default=1,
    )
    users = list(users_dict)

    for num_hashes, prime in hash_configs:
        # Generate hash functions
        hash_funcs = hash_function(num_hashes, max_movie_id, prime)

        # Compute one MinHash signature per user with THIS configuration's
        # hash functions
        signatures = {
            user: minhash(movies, hash_funcs)
            for user, movies in users_dict.items()
        }

        # Compare all user pairs
        comparisons = []
        for i in range(len(users)):
            for j in range(i + 1, len(users)):
                user1, user2 = users[i], users[j]
                sim = jaccard_similarity(signatures[user1], signatures[user2])
                comparisons.append((user1, user2, sim))

        # Evaluate thresholds
        for threshold in thresholds:
            similar_pairs = [(u1, u2) for u1, u2, sim in comparisons if sim >= threshold]
            results.append({
                'num_hashes': num_hashes,
                'prime': prime,
                'threshold': threshold,
                'similar_pairs': similar_pairs,
                'num_similar_pairs': len(similar_pairs)
            })

    return results
