In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import random
import time
from tqdm import tqdm


In [2]:
# Load the rating tables from gzip-compressed CSV files. The paths are relative
# to the notebook's working directory. The Netflix and Spotify loads are
# currently commented out.
ml_df = pd.read_csv('../datasets/movie_lens/ratings.csv.gz')
kgrec_music_df = pd.read_csv('../datasets/kgrec/music_ratings.csv.gz')
# netflix_df = pd.read_csv(f'../datasets/netflix/ratings.csv.gz')
# spotify_df = pd.read_csv(f'../datasets/spotify/ratings.csv.gz')
# spotify_df.rename(columns={'playlist_id': 'user_id'}, inplace=True)

# (display name, DataFrame) pairs that the cells below iterate over.
# Only KGRec is active; note that ml_df is still read from disk above even
# though its entry in this list is commented out.
ratings_datasets = [
    # ('MovieLens', ml_df),
    ('KGRec', kgrec_music_df),
    # ('Netflix', netflix_df),
    # ('Spotify', spotify_df),
]

In [3]:
# Report the in-memory size of every active dataset.
# memory_usage(deep=True, index=True) includes the index and inspects
# object-dtype values; the per-column byte counts are summed, divided by
# 1024 ** 3 and rounded to two decimals before printing.
for dataset_name, dataset_df in ratings_datasets:
    total_mem_usage = dataset_df.memory_usage(deep=True, index=True).sum()
    total_mem_usage_gb = round(total_mem_usage / 1024 ** 3, 2)
    print(f'{dataset_name} {total_mem_usage_gb} gb')


KGRec 0.01 gb


In [4]:
def check_if_ids_sequential(dataset_df):
    """Print id statistics and assert that user and item ids are densely numbered.

    For both the 'user_id' and the 'item_id' column, the largest id has to be
    exactly one less than the number of distinct ids in that column.

    Args:
        dataset_df: DataFrame with 'user_id' and 'item_id' columns.

    Raises:
        AssertionError: if either column violates the condition above. The
            user column is checked first, after all four statistics have
            been printed.
    """
    id_columns = ('user_id', 'item_id')
    top_user, top_item = (dataset_df[column].max() for column in id_columns)
    n_users, n_items = (dataset_df[column].nunique() for column in id_columns)

    print(f'Max user id: {top_user}')
    print(f'Max item id: {top_item}')
    print(f'Count user id: {n_users}')
    print(f'Count item id: {n_items}')

    # Largest id == (number of distinct ids - 1) for each column.
    assert (n_users - 1) == top_user
    assert (n_items - 1) == top_item

# Validate the id numbering of every active dataset. check_if_ids_sequential
# asserts internally, so the first dataset that fails the check stops the cell
# with an AssertionError.
for dataset_name, dataset_df in ratings_datasets:
    print(f'Checking if {dataset_name} dataset is sequential...')
    check_if_ids_sequential(dataset_df)

Checking if KGRec dataset is sequential...
Max user id: 5198
Max item id: 8639
Count user id: 5199
Count item id: 8640


In [5]:
from collections import defaultdict
from scipy.sparse import lil_matrix

class UserRatings:
    """Ratings of a single user.

    The containers are created per instance. Declaring them on the class
    body would make every instance share one set and one dict, so all
    users' ratings would end up merged together.
    """

    def __init__(self):
        # ids of the items this user has rated
        self.items_rated = set()
        # item id -> rating given by this user
        self.ratings = dict()

# Filled further down in this cell with one (name, DataFrame, per-user mapping)
# triple per dataset; re-created empty every time the cell runs.
ratings_datasets_w_our_sparse = []

def transform_to_sparse_representation(dataset_df):
    """Group a long-format ratings frame by user.

    Args:
        dataset_df: DataFrame with 'user_id' and 'item_id' columns and an
            optional 'rating' column. Any further columns are ignored.

    Returns:
        defaultdict mapping each user id to a UserRatings object. The item id
        of every row is added to that object's `items_rated`; when a 'rating'
        column exists, the rating is also stored in `ratings[item_id]`
        (a later row for the same pair overwrites an earlier one).
    """
    has_rating = 'rating' in dataset_df.columns
    wanted_columns = ['user_id', 'item_id'] + (['rating'] if has_rating else [])

    users_to_items_ratings = defaultdict(UserRatings)

    # Columns are picked by name so the result does not depend on their order
    # in the frame; name=None makes itertuples yield plain tuples.
    rows = dataset_df[wanted_columns].itertuples(index=False, name=None)
    for user_id, item_id, *rating in tqdm(rows, total=dataset_df.shape[0]):
        user_ratings = users_to_items_ratings[user_id]
        user_ratings.items_rated.add(item_id)
        if has_rating:
            user_ratings.ratings[item_id] = rating[0]

    return users_to_items_ratings

# Build the per-user representation of every active dataset and store it next
# to the dataset's name and source frame as a (name, DataFrame, mapping) triple.
for dataset_name, dataset_df in ratings_datasets:
    ratings_datasets_w_our_sparse.append((dataset_name, dataset_df, transform_to_sparse_representation(dataset_df)))

100%|██████████| 751531/751531 [00:01<00:00, 426049.63it/s]


In [6]:
# Sort each ratings frame by user_id. inplace=True reorders the DataFrame
# objects themselves, so the same frames referenced from the triples in
# ratings_datasets_w_our_sparse are reordered as well.
for _, dataset_df in ratings_datasets:
    dataset_df.sort_values(by='user_id', inplace=True)

In [36]:
from scipy.sparse import csr_matrix


def transform_to_sparse_matrix(dataset_df):
    """Build a (users x items) CSR matrix of ratings from a long-format frame.

    User ids index the rows and item ids index the columns directly, so both
    must be non-negative integers. The matrix has max id + 1 rows / columns.

    Args:
        dataset_df: DataFrame with 'user_id' and 'item_id' columns and an
            optional 'rating' column. Without it every (user, item) pair
            present in the frame is stored as 1.0.

    Returns:
        scipy.sparse.csr_matrix with float32 values.
    """
    # csr_matrix sums the values of repeated coordinates; keep only the last
    # rating of a repeated (user, item) pair so that it wins instead.
    pairs = dataset_df.drop_duplicates(subset=['user_id', 'item_id'], keep='last')
    user_ids = pairs['user_id'].to_numpy()
    item_ids = pairs['item_id'].to_numpy()
    if 'rating' in pairs.columns:
        values = pairs['rating'].to_numpy(dtype=np.float32)
    else:
        values = np.ones(len(pairs), dtype=np.float32)

    shape = (int(dataset_df['user_id'].max()) + 1, int(dataset_df['item_id'].max()) + 1)
    return csr_matrix((values, (user_ids, item_ids)), shape=shape)


# ratings_datasets_w_sparse was used by this notebook without being defined in
# any cell, which raises NameError on a fresh kernel. Build it here as
# (name, DataFrame, sparse user-item matrix) triples.
ratings_datasets_w_sparse = [
    (name, frame, transform_to_sparse_matrix(frame))
    for name, frame in ratings_datasets
]

sparse = ratings_datasets_w_sparse[0][2]
# Dense 1-D view of the ratings row of user 300 in the first dataset.
print(sparse[300, :].toarray().reshape(-1))

[0. 0. 0. ... 0. 0. 0.]
97.0


In [None]:
from scipy.spatial.distance import cosine
import scipy.stats as ss

def get_random_users(max):
    """Return two distinct user ids drawn uniformly from the inclusive range [0, max].

    Args:
        max: largest valid user id (the parameter name shadows the builtin
            and is kept only for compatibility with existing callers).

    Returns:
        Tuple (u1, u2) with u1 != u2.

    Raises:
        ValueError: if `max` is below 1. Fewer than two ids exist in that
            case, and a retry-until-different loop would never terminate.
    """
    if max < 1:
        raise ValueError(f'need at least two user ids to draw a pair, got max={max}')

    # Sampling without replacement yields an ordered pair of distinct ids in a
    # single step, with no retry loop.
    u1, u2 = random.sample(range(max + 1), 2)
    return u1, u2

# def transform_to_sparse_matrix(dataset_df):
#     max_user_id = dataset_df['user_id'].max()
#     max_item_id = dataset_df['item_id'].max()
#     users_to_items_ratings = lil_matrix((max_user_id + 1, max_item_id + 1), dtype=np.float32)
#     for user_id, item_id, rating, _ in tqdm(dataset_df.itertuples(index=False), total=dataset_df.shape[0]):
#         users_to_items_ratings[user_id, item_id] = rating
#     return users_to_items_ratings


def draw_random_samples(dataset_df, sparse_rating_dataset, num_samples):
    """Sample cosine similarities between randomly chosen pairs of users.

    Args:
        dataset_df: DataFrame with a 'user_id' column; its largest value is
            the upper bound for the random user ids.
        sparse_rating_dataset: matrix indexed as [user, :] whose row slices
            provide `.toarray()`.
        num_samples: number of user pairs to draw.

    Returns:
        List with one cosine similarity (1 - cosine distance) per drawn pair.
    """
    highest_user_id = dataset_df['user_id'].max()

    def dense_row(user):
        # Densify one user's row into a flat 1-D array.
        return sparse_rating_dataset[user, :].toarray().flatten()

    similarities = []
    for _ in tqdm(range(num_samples), total=num_samples):
        first, second = get_random_users(highest_user_id)
        distance = cosine(dense_row(first), dense_row(second))
        similarities.append(1 - distance)

    return similarities

def model_using_exp(samples):
    """Fit an exponential distribution to `samples` using the method of moments.

    Args:
        samples: sequence of numeric observations.

    Returns:
        The parameter tuple returned by `scipy.stats.expon.fit`, i.e.
        (loc, scale).
    """
    # Both loc and scale are fitted. An alternative that pins loc to 0 and
    # keeps only the fitted scale is not applied here.
    return ss.expon.fit(samples, method='MM')

# dataset name -> parameter tuple returned by model_using_exp for that dataset
datasets_modeled_params = {}

# Expects ratings_datasets_w_sparse to already exist in the kernel as
# (name, DataFrame, sparse user-item matrix) triples.
for dataset_name, dataset_df, ratings in ratings_datasets_w_sparse:
    # One million random user pairs per dataset; each pair densifies two rows.
    samples = draw_random_samples(dataset_df, ratings, num_samples=1_000_000)
    exp_params = model_using_exp(samples)
    datasets_modeled_params[dataset_name] = exp_params
    print(f'{dataset_name}: {exp_params}')

In [None]:
# datasets_modeled_params = {}

# for dataset_name, dataset_df, ratings in ratings_datasets_w_sparse:
#     samples = draw_random_samples(dataset_df, ratings, num_samples=1000)
#     exp_params = model_using_exp(samples)
#     datasets_modeled_params[dataset_name] = exp_params
#     print(f'{dataset_name}: {exp_params}')