Download the dataset

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/pavka/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1


In [2]:
import numpy
import pandas as pd
import random
from sympy import isprime

## <strong> 1. Recommendation System with LSH

## <strong> 1.1 Data Preparation
Download the MovieLens dataset from here. After downloading, explore the dataset to understand the structure and identify any preprocessing steps needed.

<strong> There are in total 6 datasets, let's have a quick look at them

In [3]:
genome_scores = pd.read_csv("dataset/genome_scores.csv", index_col =0)
genome_scores.head(3)

Unnamed: 0_level_0,tagId,relevance
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,0.025
1,2,0.025
1,3,0.05775


In [4]:
genome_tags = pd.read_csv("dataset/genome_tags.csv", index_col = 0)
genome_tags.head(3)

Unnamed: 0_level_0,tag
tagId,Unnamed: 1_level_1
1,007
2,007 (series)
3,18th century


In [5]:
link = pd.read_csv("dataset/link.csv", index_col = 0)
link.head(3)

Unnamed: 0_level_0,imdbId,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,114709,862.0
2,113497,8844.0
3,113228,15602.0


In [6]:
movie = pd.read_csv("dataset/movie.csv", index_col = 0)
movie.head(3)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [7]:
rating = pd.read_csv("dataset/rating.csv", nrows=10000)
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39


In [8]:
tag = pd.read_csv("dataset/tag.csv", index_col = 0)
tag.head(3)

Unnamed: 0_level_0,movieId,tag,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
18,4141,Mark Waters,2009-04-24 18:19:40
65,208,dark hero,2013-05-10 01:41:18
65,353,dark hero,2013-05-10 01:41:19



## <strong>1.2 Minhash Signatures</strong>

Using the userId and movieId columns, implement your own MinHash function. This function will hash each user's watched movie list, creating a representation that allows for quick comparisons of user similarities.

    -Important: Implement your MinHash function from scratch—do not use any pre-built hash functions.
    -Use your MinHash function to generate signature vectors for each user based on their rated movies.
    -Experiment with different hash functions and threshold values to find the most effective configurations. Report these results.


In [9]:
#Let's first group all the users and their correspective movies
user_movies = rating.groupby('userId')['movieId'].apply(list).to_dict()

Let's define a simple modulo hash function: <br> <br>
$
h(x) = (a \cdot x + b) \mod p
$

Where:  
- $ x $: Input value  
- $ a $: Coefficient (non-zero)  
- $ b $: Coefficient  
- $ p $: Prime modulus

In [10]:
def hash_function(x, a ,b ,p): #Simple modulo hashing
    return (a*x + b) % p

We generate multiple hash functions

In [11]:
def create_hash_functions(max_value, max_prime):
    
    primes = [i for i in range(10,max_prime) if isprime(i)]

    # Generate multiple hash functions
    hash_functions = []
    for i in range(10):
        a = random.randint(1, max_value)  # Random coefficient a (non-zero)
        b = random.randint(0, max_value)  # Random coefficient b
        p = random.choice(primes)  # Predefined prime values
        
        hash_functions.append(lambda x, a=a, b=b, p=p: hash_function(x, a, b, p))
    return hash_functions

In [12]:
def minhash(user_movies, hash_functions):
    signature_matrix = {}

    for user, movies in user_movies.items():
        signature_vector = []
        for h in hash_functions:
            min_hash = min(h(movie) for movie in movies)  # Compute min hash value for the user's movies
            signature_vector.append(min_hash)
        signature_matrix[user] = signature_vector
    
    return signature_matrix

In [13]:
hash_functions1 = create_hash_functions(10,20)
hash_functions2 = create_hash_functions(50,50)
hash_functions3 = create_hash_functions(2**32, 1000000)

In [14]:
signature_matrix1 = minhash(user_movies, hash_functions1)
signature_matrix2 = minhash(user_movies, hash_functions2)
signature_matrix3 = minhash(user_movies, hash_functions3)

In [16]:
from itertools import combinations

# True Jaccard similarity
def jaccard_similarity(set1, set2):
    return len(set1.intersection(set2)) / len(set1.union(set2))

# MinHash similarity
def minhash_similarity(sig1, sig2):
    return sum(1 for a, b in zip(sig1, sig2) if a == b) / len(sig1)

# Evaluate effectiveness
def evaluate_effectiveness(user_movies, signature_matrix):
    users = list(user_movies.keys())

    tot_error = 0
    for user1, user2 in combinations(users, 2):
        set1, set2 = set(user_movies[user1]), set(user_movies[user2])
        sig1, sig2 = signature_matrix[user1], signature_matrix[user2]
        
        # Compute similarities
        true_jaccard = jaccard_similarity(set1, set2)
        approx_minhash = minhash_similarity(sig1, sig2)
        
        # Record results
        error = abs(true_jaccard - approx_minhash)
        tot_error += error
    
    return tot_error / (len(users)**2)

# Run evaluation
results1 = evaluate_effectiveness(user_movies, signature_matrix1)
results2 = evaluate_effectiveness(user_movies, signature_matrix2)
results3 = evaluate_effectiveness(user_movies, signature_matrix3)



In [None]:
results1

0.4238409394542968

In [18]:
results2

0.3254228740032653

In [19]:
results3

0.02167691854106384