In [19]:
from tqdm import tqdm
import numpy as np
from src.structures import User, Movie
from src.data_methods import read_movies,read_viewers
import kagglehub
import os

# Data downloading and preprocessing

In [20]:
# Download the latest version of the Netflix Prize dataset.
#
# SECURITY: credentials must never be hardcoded in a notebook. kagglehub reads
# them from the KAGGLE_USERNAME / KAGGLE_KEY environment variables or from
# ~/.kaggle/kaggle.json, so configure them outside of version control
# (shell profile, .env file that is git-ignored, or `kagglehub.login()`).
# The API key that used to be written in this cell has been exposed and should
# be revoked and regenerated in the Kaggle account settings.
datapath = kagglehub.dataset_download("netflix-inc/netflix-prize-data")



In [40]:
# --- Reading configuration ---------------------------------------------------
# Rating files to parse. Append "combined_data_2.txt", "combined_data_3.txt"
# and "combined_data_4.txt" to use more of the dataset.
datafiles = ["combined_data_1.txt"]
# Number of reviews to read; np.inf presumably means "no cap" (e.g. use 1000000 for a quick run).
n_lines = np.inf
# Show a progress bar while reading (reduces speed).
with_tqdm = True
# Number of reviews per user to read (not passed to the readers in this cell).
reviews_pr_user = 10

# --- Load the movies, then the viewers who rated them ------------------------
movies = read_movies(datapath)
users = read_viewers(
    datapath,
    movies,
    datafiles=datafiles,
    with_tqdm=with_tqdm,
    n_lines=n_lines,
)

100%|██████████| 24058263/24058263 [01:25<00:00, 282088.44it/s]


# Computing similarities between movies

In [22]:
# Pairwise Jaccard similarity between every two movies.
# The matrix is symmetric, so each pair is computed once (lower triangle) and
# mirrored; the diagonal is set to NaN because self-similarity is not of interest.
n_movies = len(movies)
movie_ids = list(movies)
sim_matrix = np.zeros((n_movies, n_movies))
for row, id_a in enumerate(tqdm(movie_ids)):
    for col in range(row):
        score = movies[id_a].similarity(movies[movie_ids[col]], method="jaccard")
        sim_matrix[row, col] = score
        sim_matrix[col, row] = score
    sim_matrix[row, row] = np.nan

100%|██████████| 225/225 [00:38<00:00,  5.86it/s]


In [23]:
# Heatmap of the pairwise similarity matrix.
import plotly.express as px
import plotly.graph_objects as go

# Figures are saved in the "plots" folder next to the current working directory.
parent = os.path.dirname(os.getcwd())
plots_dir = os.path.join(parent, "plots")
# write_image does not create missing folders, so make sure the target exists.
os.makedirs(plots_dir, exist_ok=True)

movie_ids = list(movies.keys())
fig = go.Figure(
    data=go.Heatmap(
        z=sim_matrix,
        x=movie_ids,
        y=movie_ids,
        # NaN cells (the diagonal) are gaps; do not show hover labels on them.
        hoverongaps=False,
    )
)
fig.update_layout(
    title='Similarity matrix',
    xaxis_title='Movie ID',
    yaxis_title='Movie ID',
)
# Save the figure as a PNG before displaying it.
fig.write_image(os.path.join(plots_dir, "similarity_matrix.png"), width=1200, height=800, scale=3)
fig.show()


In [39]:
# Similarity threshold: pairs above this value are counted as similar.
t = 0.04

# Histogram of the pairwise similarity values with the threshold marked.
import plotly.express as px
import plotly.graph_objects as go

# Use each unordered pair once: strict upper triangle (k=1 skips the NaN diagonal).
sim_values = sim_matrix[np.triu_indices(n_movies, k=1)]
fig = px.histogram(x=sim_values, nbins=100, title="Histogram of similarity values")

# Vertical line at the threshold. The y-coordinates are given relative to the
# plot area (yref="paper") so the line always spans the full height instead of
# depending on a hardcoded bin count.
fig.add_shape(
    type="line",
    x0=t,
    x1=t,
    yref="paper",
    y0=0,
    y1=1,
    line=dict(color="Red", width=3),
    name="Threshold value",
)

# Axis labels, title and legend.
fig.update_xaxes(title_text='Similarity')
fig.update_yaxes(title_text='Count')
fig.update_layout(title_text='Histogram of similarity values', showlegend=True)

# write_image does not create missing folders, so make sure the target exists.
# `parent` is the parent of the working directory, defined in the heatmap cell.
plots_dir = os.path.join(parent, "plots")
os.makedirs(plots_dir, exist_ok=True)
fig.write_image(os.path.join(plots_dir, "similarity_histogram.png"), width=1200, height=800, scale=3)
fig.show()

In [49]:
# Count the movie pairs whose similarity exceeds the threshold, and compute
# their share of all pairs. Only the count is displayed as the cell output.
above_threshold = sim_values > t
n_similar = np.sum(above_threshold)
prop = n_similar / len(sim_values)
n_similar

836

In [25]:
# LSH banding parameters: b is passed as the number of bands to create_buckets.
b = 20
# r = ceil(-ln(b) / ln(t)); presumably the rows per band chosen so that the
# banding threshold (1/b)^(1/r) lands near t -- TODO confirm.
rows_exact = -np.log(b) / np.log(t)
r = int(np.ceil(rows_exact))
# Total number of hash functions handed to compute_signatures.
n_hashes = b * r

# Finding similar movies with LSH

In [None]:
from src.similarity_methods import (
    compute_signatures,
    create_buckets,
    get_candidates,
    trim_candidates,
)

# Bucket count that is later passed to create_buckets.
n_buckets = 2 ** 16
# Compute signatures for all movies using n_hashes hash functions (progress bar on).
SIG = compute_signatures(movies, n_hashes, with_tqdm=True)

100%|██████████| 225/225 [00:10<00:00, 21.85it/s]


In [None]:
# Build the buckets from the signatures, using b bands and n_buckets buckets.
buckets = create_buckets(
    SIG,
    bands=b,
    n_buckets=n_buckets,
)
# Derive the candidate pairs from the buckets.
candidate_pairs = get_candidates(buckets)

100%|██████████| 165/165 [00:00<?, ?it/s]
100%|██████████| 181/181 [00:00<?, ?it/s]
100%|██████████| 144/144 [00:00<00:00, 144044.78it/s]
100%|██████████| 169/169 [00:00<?, ?it/s]
100%|██████████| 168/168 [00:00<00:00, 158954.00it/s]
100%|██████████| 157/157 [00:00<?, ?it/s]
100%|██████████| 170/170 [00:00<00:00, 126244.99it/s]
100%|██████████| 152/152 [00:00<?, ?it/s]
100%|██████████| 169/169 [00:00<?, ?it/s]
100%|██████████| 166/166 [00:00<?, ?it/s]
100%|██████████| 169/169 [00:00<?, ?it/s]
100%|██████████| 153/153 [00:00<00:00, 140452.73it/s]
100%|██████████| 167/167 [00:00<?, ?it/s]
100%|██████████| 153/153 [00:00<00:00, 151066.03it/s]
100%|██████████| 166/166 [00:00<00:00, 912522.23it/s]
100%|██████████| 161/161 [00:00<00:00, 161628.28it/s]
100%|██████████| 150/150 [00:00<00:00, 135650.19it/s]
100%|██████████| 170/170 [00:00<?, ?it/s]
100%|██████████| 143/143 [00:00<00:00, 143902.46it/s]
100%|██████████| 157/157 [00:00<00:00, 152396.60it/s]
100%|██████████| 20/20 [00:00<00:00, 249

In [28]:
# Candidate trimming is disabled; the code below is kept commented out.
# NOTE(review): it refers to `candidates`, which is not defined in the visible
# cells (the candidate set built above is named `candidate_pairs`) -- confirm
# the intended variable before re-enabling.
# n_removed = trim_candidates(movies, candidates, threshold = t)
# print(f"Removed {n_removed} from {len(candidates)} candidates")

In [29]:
from src.performance_metrics import (
    get_true_pairs,
    confusion_matrix,
    sensitivity_specificity,
)

# Reference pairs obtained from the exact similarity matrix and the threshold t;
# displayed as the cell output.
true_pairs = get_true_pairs(movies, sim_matrix, t)
true_pairs

[('2', '69'),
 ('2', '149'),
 ('3', '57'),
 ('3', '97'),
 ('3', '145'),
 ('3', '213'),
 ('4', '207'),
 ('5', '69'),
 ('5', '91'),
 ('5', '92'),
 ('6', '201'),
 ('7', '41'),
 ('7', '43'),
 ('7', '64'),
 ('7', '66'),
 ('7', '86'),
 ('7', '87'),
 ('7', '99'),
 ('7', '100'),
 ('7', '119'),
 ('7', '130'),
 ('7', '134'),
 ('7', '139'),
 ('7', '147'),
 ('7', '190'),
 ('7', '214'),
 ('7', '219'),
 ('7', '222'),
 ('8', '18'),
 ('8', '30'),
 ('8', '52'),
 ('8', '83'),
 ('8', '97'),
 ('8', '108'),
 ('8', '143'),
 ('8', '156'),
 ('8', '167'),
 ('8', '175'),
 ('8', '187'),
 ('8', '191'),
 ('8', '197'),
 ('8', '199'),
 ('8', '223'),
 ('9', '41'),
 ('9', '134'),
 ('10', '112'),
 ('12', '47'),
 ('12', '50'),
 ('12', '54'),
 ('12', '63'),
 ('12', '73'),
 ('12', '90'),
 ('12', '104'),
 ('12', '117'),
 ('12', '155'),
 ('12', '161'),
 ('12', '173'),
 ('14', '149'),
 ('14', '163'),
 ('14', '174'),
 ('14', '220'),
 ('15', '61'),
 ('15', '218'),
 ('16', '24'),
 ('16', '48'),
 ('16', '55'),
 ('16', '58'),
 ('

# Performance evaluation

In [30]:
# Confusion matrix comparing the LSH candidate pairs with the true pairs.
CM = confusion_matrix(
    candidate_pairs,
    true_pairs,
    len(movies),
)
CM

array([[  271,  2830],
       [  565, 21534]])

In [31]:
# Sensitivity (TPR) and specificity (TNR) derived from the confusion matrix.
rates = sensitivity_specificity(CM)
TPR, TNR = rates

In [32]:
# Evaluate the method with all four rates: the error rates are the
# complements of sensitivity and specificity.
FNR = 1 - TPR
FPR = 1 - TNR
print(f"TPR: {TPR:.2f}, FPR: {FPR:.2f}, FNR: {FNR:.2f}, TNR: {TNR:.2f}")

TPR: 0.32, FPR: 0.12, FNR: 0.68, TNR: 0.88
