In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.sparse import coo_matrix, lil_matrix, csr_matrix
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import math
import json
import torch
from torch.utils.data import DataLoader, Dataset
from matplotlib import pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# import faiss
import pyarrow.parquet as pq
import pyarrow as pa
import fastparquet
import joblib

# Move the working directory one level up; the cells below use relative 'data/...' paths.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time —
# run it once per kernel session.
os.chdir('../')
os.getcwd()

'f:\\NTU Learn\\DATA MINING\\ntu_sd6125_recsys'

In [266]:
def sample_by_user_nums(data, sample_size,data_set):
    """
    Keep only the ratings of the `sample_size` users that have the most rows.

    data: pd.DataFrame, ratings table
    sample_size: int, number of most active users to keep
    data_set: 'ml-20m' (user column 'userId') or 'book_crossing' (user column 'User-ID')
    ----
    Output:
    pd.DataFrame with the rows of `data` that belong to the selected users
    ----
    Raises:
    ValueError if data_set is not one of the supported names
    """
    raw_user_column = {'ml-20m': 'userId', 'book_crossing': 'User-ID'}
    if data_set not in raw_user_column:
        raise ValueError(f"Unknown data_set {data_set!r}; expected one of {sorted(raw_user_column)}")

    user_id = raw_user_column[data_set]
    # Other cells rename the id column to 'userId'; accept a frame that was already renamed.
    if user_id not in data.columns and 'userId' in data.columns:
        user_id = 'userId'

    # get the most active users
    user_nums = data.groupby(user_id).size().sort_values(ascending=False)
    top_users = user_nums.index[:sample_size]

    return data[data[user_id].isin(top_users)]

def sample_by_item_nums(data, sample_size,data_set):
    """
    Keep only the ratings of the `sample_size` items that have the most rows.

    data: pd.DataFrame, ratings table
    sample_size: int, number of most rated items to keep
    data_set: 'ml-20m' (item column 'movieId') or 'book_crossing' (item column 'ISBN')
    ----
    Output:
    pd.DataFrame with the rows of `data` that belong to the selected items
    ----
    Raises:
    ValueError if data_set is not one of the supported names
    """
    raw_item_column = {'ml-20m': 'movieId', 'book_crossing': 'ISBN'}
    if data_set not in raw_item_column:
        raise ValueError(f"Unknown data_set {data_set!r}; expected one of {sorted(raw_item_column)}")

    item_id = raw_item_column[data_set]
    # Other cells rename the id column to 'itemId'; accept a frame that was already renamed.
    if item_id not in data.columns and 'itemId' in data.columns:
        item_id = 'itemId'

    # get the most rated items
    item_nums = data.groupby(item_id).size().sort_values(ascending=False)
    top_items = item_nums.index[:sample_size]

    return data[data[item_id].isin(top_items)]

In [None]:
# book-crossing dataset
book_ratings = pd.read_csv('data/book_crossing/ratings_raw.csv')
# transfer itemId to integer
# map_dict is a pre-defined dictionary that maps ISBN to itemId
# NOTE(review): the mapping and the rename below are commented out, so after this cell
# book_ratings still has the raw columns 'User-ID', 'ISBN' and 'Book-Rating'.
# with open('data/book_crossing/map_dict.json', 'r') as f:
#     map_dict = json.load(f)
# book_ratings['ISBN'] = book_ratings['ISBN'].apply(lambda x: map_dict[x])
# book_ratings = book_ratings.rename(columns={'User-ID':'userId', 'Book-Rating':'rating','ISBN':'itemId'})
books_df = pd.read_csv('data/book_crossing/Books.csv')
# size of the raw table: number of rows, then distinct ISBNs and distinct users
print(book_ratings.shape[0])
print(book_ratings['ISBN'].nunique(), book_ratings['User-ID'].nunique())
# keep only the ratings whose ISBN is listed in Books.csv
books = list(pd.unique(books_df['ISBN']))
book_ratings = book_ratings[book_ratings['ISBN'].isin(books)]
# drop the rows whose rating is 0 (presumably implicit feedback in this data set — TODO confirm)
book_ratings = book_ratings[book_ratings['Book-Rating'] != 0]
# same size report after filtering
print(book_ratings.shape[0])
print(book_ratings['ISBN'].nunique(), book_ratings['User-ID'].nunique())

In [108]:
# map_dict = dict(zip(books_df['ISBN'].unique(), range(books_df['ISBN'].nunique())))
# with open('data/book_crossing/map_dict.json', 'w') as f:
#     json.dump(map_dict, f)
# # map_dict

In [None]:
# Shrink Book-Crossing: first keep the 3000 users with the most ratings,
# then, among their ratings, the 25000 books with the most ratings.
# Each print shows (distinct users, distinct books, rows) after the step.
book_ratings_subset = sample_by_user_nums(book_ratings, 3000, 'book_crossing')
print(book_ratings_subset['User-ID'].nunique(), book_ratings_subset['ISBN'].nunique(), book_ratings_subset.shape[0])
book_ratings_subset = sample_by_item_nums(book_ratings_subset, 25000, 'book_crossing')
print(book_ratings_subset['User-ID'].nunique(), book_ratings_subset['ISBN'].nunique(), book_ratings_subset.shape[0])

In [269]:
# Persist the sampled Book-Crossing ratings (without the DataFrame index) for the cells that reload this file.
book_ratings_subset.to_csv('data/book_crossing/ratings.csv', index=False)

In [None]:
# Reload the saved sample and show (max, min) of the rating column to check the rating scale.
book_rating_df = pd.read_csv('data/book_crossing/ratings.csv')
book_rating_df.loc[:,'Book-Rating'].max(), book_rating_df.loc[:,'Book-Rating'].min()

In [None]:
# Show (max, min) of the MovieLens rating column.
# NOTE(review): data/ml-20m/ratings.csv is only written by a later cell of this notebook
# (ratings_subset.to_csv), so on a fresh kernel the file must already exist on disk.
ml_df = pd.read_csv('data/ml-20m/ratings.csv')
ml_df.loc[:,'rating'].max(), ml_df.loc[:,'rating'].min()

In [None]:
# movie-lens dataset
ratings = pd.read_csv('data/ml-20m/ratings_raw.csv')
# Only 'movieId' -> 'itemId' changes a name; the other two entries map a column to itself.
ratings.rename(columns={'userId':'userId', 'movieId':'itemId', 'rating':'rating'}, inplace=True)
# print('building user-item matrix...')
# (distinct users, distinct items, rows) of the raw table
ratings['userId'].nunique(), ratings['itemId'].nunique(), ratings.shape[0]

In [None]:
# Keep the 3000 MovieLens users with the most ratings; the tuple is (distinct users, distinct items, rows).
ratings_subset = sample_by_user_nums(ratings, 3000, 'ml-20m')
ratings_subset['userId'].nunique(), ratings_subset['itemId'].nunique(), ratings_subset.shape[0]

In [271]:
# Persist the sampled MovieLens ratings (without the DataFrame index); the item column is already named 'itemId' here.
ratings_subset.to_csv('data/ml-20m/ratings.csv', index=False)

# Collaborative Filtering-based Recommendation

Original data is in the format of ['userId', 'itemId', 'rating', 'timestamp']

To perform CF, we only need the interaction records between users and items, so we prepare the data with the following process:

- train_test_split
    - Time Order split
    - Random split
    - Leave-One(K)-Out split
    - Sliding window split

- load train/test data into dict form as {user_id: [item_id1, item_id2, ..., item_idn]}

- preparing features i.e. similarities (in most cases)

1. rating.csv
2. train_ratings.csv
3. test_ratings.csv
4. user_item_rating_matrix
5. user_user_sim_matrix
6. item_item_sim_matrix

### utils

In [44]:
# utils

def convert_df_to_dict(trn_data, val_data):

    """
    Turn the train / validation rating tables into per-user lookup dicts.

    trn_data: pd.DataFrame, training data with 'userId', 'itemId' and 'rating' columns
    val_data: pd.DataFrame, validation data with the same columns
    ----
    Output (in this order):
    trn_user_items_score, val_user_items_score: {userId: {itemId: rating}}
    trn_user_items, val_user_items: {userId: set of itemIds}
    """

    def _scores_by_user(frame):
        # one {itemId: rating} dict per user
        per_user = frame.groupby('userId').apply(lambda x: dict(zip(x['itemId'], x['rating'])))
        return per_user.to_dict()

    def _items_by_user(frame):
        # one list of itemIds per user, turned into a set
        listed = frame.groupby('userId')['itemId'].apply(list).reset_index()
        return {
            user: set(item_list)
            for user, item_list in zip(listed['userId'], listed['itemId'])
        }

    trn_user_items_score = _scores_by_user(trn_data)
    val_user_items_score = _scores_by_user(val_data)

    trn_user_items = _items_by_user(trn_data)
    val_user_items = _items_by_user(val_data)

    return trn_user_items_score, val_user_items_score, trn_user_items, val_user_items

def dict_slice(adict, start, end):
    """Return a new dict with the entries of `adict` at key positions start:end (list-slice semantics, insertion order)."""
    selected_keys = list(adict)[start:end]
    return {key: adict[key] for key in selected_keys}

def get_i2u2s_reverse_dict(user_items_score):
    """
    Invert {user: {item: score}} into {item: [score of every user]}.

    Each list has one slot per user, ordered by sorted user id; users without a score
    for the item keep the placeholder 0.
    """
    n_users = len(user_items_score)
    # slot of a user = rank of its id among all user ids
    slot_of = {uid: slot for slot, uid in enumerate(sorted(user_items_score.keys()))}

    item_score_dict = {}
    for uid, scored_items in user_items_score.items():
        slot = slot_of[uid]
        for item_id, score in scored_items.items():
            if item_id not in item_score_dict:
                item_score_dict[item_id] = [0] * n_users
            item_score_dict[item_id][slot] = score

    return item_score_dict

def get_i2u_reverse_dict(user_items):
    """Invert {user: iterable of items} into {item: set of users}, showing a tqdm progress bar over the users."""
    print('Building item_users reverse dict...')
    item_users = {}
    for user_id, items in tqdm(user_items.items()):
        for item in items:
            item_users.setdefault(item, set()).add(user_id)
    return item_users

def get_user_item_dict(user_items_score):
    """
    Drop the scores from {user: {item: score}} and return {user: set of items}.

    Users whose item dict is empty do not appear in the result.
    """
    return {
        user_id: set(scored_items)
        for user_id, scored_items in user_items_score.items()
        if scored_items
    }

def convert_df_to_sparse_matrix(rating_df, flag='score'):

    """
    Build the user-item matrix from a long-format rating table.

    Input:
    rating_df: pd.DataFrame with 'userId', 'itemId' and 'rating' columns
    flag: indicator for whether to use score or binary
        'score'  -> cells hold the rating as float64 (fractional ratings are kept as they are)
        'binary' -> cells hold 1 (int8) for every observed (user, item) pair
    ----
    Output:
    user_ids: list, id of the user behind every matrix row (order of first appearance)
    item_ids: list, id of the item behind every matrix column (order of first appearance)
    sparse_matrix: scipy.sparse CSR matrix of shape (num_users, num_items)
    ----
    Raises:
    ValueError: if flag is neither 'score' nor 'binary'
    """

    if flag not in ('score', 'binary'):
        raise ValueError(f"flag must be 'score' or 'binary', got {flag!r}")

    # factorize gives each id its 0-based position in order of first appearance
    user_codes, user_ids = pd.factorize(rating_df['userId'])
    item_codes, item_ids = pd.factorize(rating_df['itemId'])
    user_ids = np.asarray(user_ids)
    item_ids = np.asarray(item_ids)

    if flag == 'score':
        values = rating_df['rating'].to_numpy(dtype=np.float64)
    else:
        values = np.ones(len(rating_df), dtype=np.int8)

    # COO -> CSR would add up repeated (user, item) pairs; keep only the last occurrence,
    # which is what a cell-by-cell assignment in row order produces.
    cells = pd.DataFrame({'row': user_codes, 'col': item_codes, 'value': values})
    cells = cells.drop_duplicates(subset=['row', 'col'], keep='last')

    sparse_matrix = coo_matrix(
        (cells['value'].to_numpy(), (cells['row'].to_numpy(), cells['col'].to_numpy())),
        shape=(len(user_ids), len(item_ids)),
    ).tocsr()
    # a rating of exactly 0 is not stored as an explicit entry
    sparse_matrix.eliminate_zeros()

    return list(user_ids), list(item_ids), sparse_matrix

def save_sparse_matrix(sim_dict, file_name):
    """
    Flatten {i: {j: similarity}} into COO triplets and store them with np.savez.

    The archive holds the arrays 'data', 'row', 'col' and 'shape' of the COO matrix
    built from the triplets (its shape is inferred from the largest indices).
    """
    triplets = [
        (i, j, similarity)
        for i, related_items in sim_dict.items()
        for j, similarity in related_items.items()
    ]

    row = np.array([t[0] for t in triplets])
    col = np.array([t[1] for t in triplets])
    data = np.array([t[2] for t in triplets])

    sparse_matrix = coo_matrix((data, (row, col)))

    np.savez(
        file_name,
        data=sparse_matrix.data,
        row=sparse_matrix.row,
        col=sparse_matrix.col,
        shape=sparse_matrix.shape,
    )

## Train_Test_split

### Random split

In [30]:
def train_test_split_random(ratings, test_size=0.2, random_state=42):
    """
    Randomly split the rating table into a training and a validation part.

    ratings: pd.DataFrame, rating records
    test_size: fraction (or absolute number) of rows that go to the validation part
    random_state: seed passed to sklearn's train_test_split; the default 42 gives the same
        split as before this parameter existed
    ----
    Output:
    trn_data, val_data: pd.DataFrame
    """

    # One array is enough: train_test_split applies the same shuffled indices to every array
    # it receives, so passing `ratings` twice only produced two extra copies that were discarded.
    trn_data, val_data = train_test_split(ratings, test_size=test_size, random_state=random_state)

    return trn_data, val_data

**PS:**

Random split does not conform to the application scenario of the recommendation system, because in real scenarios, future behaviors cannot be randomly distributed.

### Time Order split

In [31]:
def train_test_split_timeorder(ratings, test_size=0.2):
    """
    Per-user chronological split.

    For every user the records are ordered by 'timestamp'; the first
    int(n * (1 - test_size)) of the user's n records go to the training part and the
    remaining (most recent) ones to the validation part.

    ratings: pd.DataFrame with 'userId' and 'timestamp' columns
    test_size: fraction of each user's records used for validation
    ----
    Output:
    trn_data, val_data: pd.DataFrame with the columns and dtypes of `ratings`
        and a fresh 0..n-1 index, ordered by user and time
    """

    ratings_sorted = ratings.sort_values(by = ['userId','timestamp'])

    # Collect the per-user slices and concatenate once: concatenating inside the loop copies
    # the accumulated frame for every user, and starting from an empty frame can change dtypes.
    trn_parts = []
    val_parts = []

    for _, group in ratings_sorted.groupby('userId'):

        train_size = int(len(group) * (1-test_size))

        trn_parts.append(group.iloc[:train_size])
        val_parts.append(group.iloc[train_size:])

    if not trn_parts:
        # no users at all: return empty frames that still carry the input columns
        empty = ratings_sorted.iloc[:0].reset_index(drop=True)
        return empty, empty.copy()

    trn_data = pd.concat(trn_parts, ignore_index=True)
    val_data = pd.concat(val_parts, ignore_index=True)

    return trn_data, val_data

**PS:**

1. Users with less data: 

    For users with fewer interactions, the 80% split may result in no data in the test set. You can set a minimum interaction threshold or adjust the split strategy for these users. For example, for users with less than 5 interactions, put all their data in the training set, or use the leave-one-out method (the last one is used as the test set).

2. Cold start problem:

    a. New users: Users that do not appear in the training set may appear in the test set. Since our split method is based on the chronological order of each user, this situation generally does not occur.

    b. New items: items that do not appear in the training set may appear in the test set. If your model needs to predict the rating of a new item (e.g. a new movie), you need to include the feature information of that item in the training set, or use a specific cold-start processing method.

3. Data leakage: 

    Since we split according to the chronological order of users, the data in the training set occurs before the test set, avoiding the problem of data leakage.

### Leave-k-Out split

In [32]:
def train_test_split_leavekout(ratings, k=1):

    """
    Per-user leave-k-out split: each user's k most recent records (by 'timestamp')
    go to the validation part, all earlier ones to the training part.

    ratings: pd.DataFrame with 'userId' and 'timestamp' columns
    k: number of items to leave out for validation (k >= 0). With k=0 everything is
       training data; a user with k or fewer records ends up only in the validation part.
    ----
    Output:
    trn_data, val_data: pd.DataFrame with the columns and dtypes of `ratings`
        and a fresh 0..n-1 index
    ----
    Raises:
    ValueError: if k is negative
    """

    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    ratings_sorted = ratings.sort_values(by = ['userId','timestamp'])

    # Collect the per-user slices and concatenate once instead of growing a frame in the loop.
    trn_parts = []
    val_parts = []

    for _, group in ratings_sorted.groupby('userId'):

        # An explicit cut position: with negative slicing, k=0 would give iloc[:-0] (empty)
        # and iloc[-0:] (everything), i.e. the two parts swapped.
        cut = max(len(group) - k, 0)

        trn_parts.append(group.iloc[:cut])
        val_parts.append(group.iloc[cut:])

    if not trn_parts:
        # no users at all: return empty frames that still carry the input columns
        empty = ratings_sorted.iloc[:0].reset_index(drop=True)
        return empty, empty.copy()

    trn_data = pd.concat(trn_parts, ignore_index=True)
    val_data = pd.concat(val_parts, ignore_index=True)

    return trn_data, val_data

## Featuring Generation

In [None]:
# MovieLens: random 80/20 split, then per-user lookup dicts for both parts.
# NOTE(review): this splits `ratings` (the full table loaded from ratings_raw.csv),
# not `ratings_subset` — confirm that is intended.
print('train-test-split...')
trn_data, val_data = train_test_split_random(ratings, test_size=0.2)
print('convert_df_to_dict...')
trn_user_items_score, val_user_items_score, trn_user_items, val_user_items = convert_df_to_dict(trn_data, val_data)

In [None]:
# User-item score matrix of the training part; user_index / item_index list the ids
# behind the matrix rows / columns.
print('convert_df_to_sparse_matrix...')
user_index, item_index, all_ratings_matrix = convert_df_to_sparse_matrix(trn_data, flag='score')
all_ratings_matrix.shape

In [46]:
# Cast the ids to built-in int.
user_index = [int(i) for i in user_index]
item_index = [int(i) for i in item_index]

In [None]:
print('train-test-split...')
# convert_df_to_dict and convert_df_to_sparse_matrix index the frame by 'userId', 'itemId'
# and 'rating', while book_ratings is loaded with the raw Book-Crossing column names.
# rename() ignores names that are not present, so this is a no-op on an already renamed frame.
book_ratings_cf = book_ratings.rename(columns={'User-ID': 'userId', 'ISBN': 'itemId', 'Book-Rating': 'rating'})
trn_data, val_data = train_test_split_random(book_ratings_cf, test_size=0.2)
print('convert_df_to_dict...')
trn_user_items_score, val_user_items_score, trn_user_items, val_user_items = convert_df_to_dict(trn_data, val_data)

In [None]:
# Same matrix construction as for MovieLens, applied to whichever trn_data was assigned last
# (here presumably the Book-Crossing split from the previous cell).
print('convert_df_to_sparse_matrix...')
user_index, item_index, all_ratings_matrix = convert_df_to_sparse_matrix(trn_data, flag='score')
all_ratings_matrix.shape

In [101]:
# Cast the ids to built-in int.
# NOTE(review): int() only works here if the item ids are already numeric; raw ISBN strings can
# end in 'X', and the ISBN -> integer mapping in the loading cell is commented out — confirm
# that the ids were mapped before this cell runs.
user_index = [int(i) for i in user_index]
item_index = [int(i) for i in item_index]

### item CF

**Recommend items to users based on similar items:**

- Jaccard Similarity

$$
\text{sim}(item_{i}, item_{j}) = \frac{|U_{i} \cap U_{j}|}{|U_{i} \cup U_{j}|}
$$

$ U_i $ and $ U_j $ represent the sets of users that interacted with item $ i $ and item $ j $, respectively.

The numerator is the number of users that interact with both item $ i $ and $ j $, and the denominator is the total number of users that interact with item $ i $ or item $ j $.

- Cosine Similarity

2 cases:

1. With scoring:
$$
\text{sim}(item_{i}, item_{j}) = \frac{\sum_{u \in U_{i} \cap U_{j}} r_{u,i} \cdot r_{u,j}}{\sqrt{\sum_{u \in U_{i}} r_{u,i}^2} \cdot \sqrt{\sum_{u \in U_{j}} r_{u,j}^2}}
$$

$ U_i $ and $ U_j $ represent the set of users who have rated items $ i $ and $ j $ respectively.

$ r_{u,i} $ represents the rating of user $ u $ on item $ i $, and $ r_{u,j} $ represents the rating of user $ u $ on item $ j $.

The numerator is the product of the user's common ratings for the two items, and the denominator is the product of the modulus lengths of the rating vectors of the two items.

2. Without scoring:

$$
\text{sim}(i, j) = \frac{|U_{i} \cap U_{j}|}{\sqrt{|U_{i}| \cdot |U_{j}|}}
$$

where $ |U_{i} \cap U_{j}| $ represents the number of users who have interacted with both item $ i $ and item $ j $, and $ |U_{i}| $ and $ |U_{j}| $ represent the number of users who have interacted with item $ i $ and $ j $, respectively.

- Pearson Correlation
$$
\text{sim}(item_{i}, item_{j}) = \frac{\sum_{u \in U_{i} \cap U_{j}} (r_{u,i} - \bar{r}_{i}) \cdot (r_{u,j} - \bar{r}_{j})}{\sqrt{\sum_{u \in U_{i} \cap U_{j}} (r_{u,i} - \bar{r}_{i})^2} \cdot \sqrt{\sum_{u \in U_{i} \cap U_{j}} (r_{u,j} - \bar{r}_{j})^2}}
$$

In [52]:
# Absolute, machine-specific project root used to build the output path below.
ROOT_PATH = 'f:\\NTU Learn\\DATA MINING\\DMproject'

def get_itemCF_sim_batch(rating_matrix, item_index, batch_size=1000, data_set='ml-20m'):
    """
    Approximate item-item similarity with a FAISS product-quantisation (PQ) index, batch by batch.

    rating_matrix: user-item matrix; it is transposed so that each row is one item vector
    item_index: list of item ids in the row order of the transposed matrix
    batch_size: number of items handled per batch
    data_set: sub-folder of ROOT_PATH/data that receives "item_similarity.joblib"
    ----
    Output:
    None. On success a dict {item_id: {other_item_id: similarity}} (at most 100 neighbours per
    item) is dumped with joblib; errors are printed instead of being raised.

    NOTE(review): `import faiss` is commented out at the top of this file, so the first use of
    `faiss` raises NameError, which the inner `except Exception` reports as
    "Unexpected error while writing to file".
    NOTE(review): a function with the same name but a different signature is defined in a later
    cell and replaces this one once that cell has run.
    """
    item_user_matrix = rating_matrix.T  # keep the matrix sparse to avoid large memory use
    items = item_index
    num_items = len(items)

    output_file = os.path.join(ROOT_PATH, "data", data_set, "item_similarity.joblib")

    try:
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # total uses floor division, so a trailing partial batch is not counted in the total
        with tqdm(total=num_items // batch_size, desc="Calculating item similarities", unit="batch") as pbar:
            try:
                similarity_data = {}

                for start in range(0, num_items, batch_size):
                    end = min(start + batch_size, num_items)
                    batch_items = item_user_matrix[start:end, :].toarray().astype(np.float32)
                    batch_index = [items[i] for i in range(start, end)]

                    # manual uneven split for PQ
                    # NOTE(review): sub_vectors is built here but never read afterwards.
                    d = item_user_matrix.shape[1]  # number of users is the vector dimension
                    m = 8  # split into m sub-vectors
                    base_length = d // m
                    remainder = d % m  # leftover dimensions

                    if remainder > 0:
                        # the last sub-vector takes the leftover dimensions
                        sub_vectors = []
                        for i in range(m - 1):
                            sub_vectors.append(batch_items[:, i * base_length:(i + 1) * base_length])
                        sub_vectors.append(batch_items[:, (m - 1) * base_length:])  # last sub-vector (includes the remainder)
                    else:
                        sub_vectors = np.hsplit(batch_items, m)

                    # train and use PQ
                    # A new index is created, trained and filled for every batch, so it only
                    # contains the items of the current batch.
                    pq_index = faiss.IndexPQ(d, m, 256)
                    pq_index.train(batch_items)
                    gpu_res = faiss.StandardGpuResources()  # initialise GPU resources
                    gpu_index = faiss.index_cpu_to_gpu(gpu_res, 0, pq_index)

                    gpu_index.add(batch_items)

                    try:
                        # similarity search with PQ
                        D, I = gpu_index.search(batch_items, num_items)
                    except ValueError as ve:
                        print(f"ValueError in PQ similarity for batch {start}-{end}: {ve}")
                        continue  # skip this batch and continue with the next one
                    except MemoryError as me:
                        print(f"MemoryError: {me}")
                        break  # leave the loop on a memory error

                    for idx, item_id in enumerate(batch_index):
                        # convert distance to similarity (1 - distance; assumes distances are normalised to [0, 1])
                        # NOTE(review): I presumably holds positions inside the per-batch index, yet it
                        # is used to index the global `items` list — confirm for batches after the first.
                        item_similarity = {
                            int(items[other_item_id]): round(1 - float(distance), 4)
                            for other_item_id, distance in zip(I[idx][:100], D[idx][:100])
                        }

                        similarity_data[item_id] = item_similarity

                    pbar.update(1)

                # save the similarity data with joblib, compression enabled
                joblib.dump(similarity_data, output_file, compress=3)

                print(f"Item similarity results have been saved to {output_file}")

            except IOError as e:
                print(f"IOError: Failed to open or write to file {output_file}: {e}")
            except Exception as e:
                print(f"Unexpected error while writing to file: {e}")

    except OSError as e:
        print(f"OSError: Failed to create directory for {output_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


In [None]:
# Shape of the transposed rating matrix, i.e. the item-by-user view used for item similarity.
all_ratings_matrix.T.shape

In [None]:
# Call with the (rating_matrix, item_index, batch_size, data_set) signature of the FAISS-based definition.
# NOTE(review): a later cell redefines get_itemCF_sim_batch with a different signature; this call
# only works while the FAISS-based definition is the one most recently executed.
get_itemCF_sim_batch(all_ratings_matrix, item_index, batch_size=1000, data_set='ml-20m')

In [102]:
def get_itemCF_sim_batch(all_ratings_matrix, user_index, item_index, batch_size=1000, output_file='data/ml-20m/itemCF_sim.pkl'):
    """
    Compute item-item cosine similarity in batches and stream the result to a pickle file.

    all_ratings_matrix: sparse user-item matrix; it is transposed here so that each row is
        one item vector
    user_index: not used; kept so that existing calls keep working
    item_index: list of item ids, in the row order of the transposed matrix
    batch_size: number of items compared against all items per cosine_similarity call
    output_file: path of the pickle file to write
    ----
    Output:
    None. output_file receives one pickled record per item,
    {item_id: {other_item_id: similarity}}, with similarities rounded to 4 decimals.
    Read it back with repeated pickle.load calls until EOFError.
    Errors are reported with print instead of being raised.
    """
    item_user_matrix = all_ratings_matrix.T
    items = item_index
    num_items = len(items)
    # Ceiling division so that a trailing partial batch is part of the progress total.
    num_batches = math.ceil(num_items / batch_size)
    # Set only when every batch was visited and the file was closed without an error.
    completed = False

    try:

        # dirname is '' for a bare file name, and os.makedirs('') raises.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with tqdm(total=num_batches, desc="Calculating item similarities", unit="batch") as pbar:

            try:

                with open(output_file, mode='wb') as file:

                    for start in range(0, num_items, batch_size):
                        end = min(start + batch_size, num_items)
                        batch_items = item_user_matrix[start:end, :]
                        batch_index = [item_index[i] for i in range(start, end)]

                        try:
                            # calculate cosine similarity between batch items and all items
                            similarity_matrix = cosine_similarity(batch_items, item_user_matrix)
                        except ValueError as ve:
                            print(f"ValueError in cosine_similarity for batch {start}-{end}: {ve}")
                            continue  # skip this batch and continue with the next one
                        except MemoryError as me:
                            print(f"MemoryError: {me}")
                            break  # end the loop if memory error occurs

                        for idx, item_id in enumerate(batch_index):
                            item_similarity = {
                                int(other_item_id): round(float(similarity), 4)
                                for other_item_id, similarity in zip(items, similarity_matrix[idx])
                            }

                            try:
                                # save item similarity to file
                                pickle.dump({item_id: item_similarity}, file)
                            except pickle.PicklingError as pe:
                                # skip this item and continue with the next one
                                print(f"PicklingError for item {item_id}: {pe}")

                        pbar.update(1)
                    else:
                        # reached only when the loop was not left through `break`
                        completed = True

            except IOError as e:
                completed = False
                print(f"IOError: Failed to open or write to file {output_file}: {e}")
            except Exception as e:
                completed = False
                print(f"Unexpected error while writing to file: {e}")

    except OSError as e:
        print(f"OSError: Failed to create directory for {output_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    # Report success only when it happened; previously the message was printed after failures too.
    if completed:
        print(f"Item similarity results have been saved to {output_file}")
    else:
        print(f"Item similarity computation did not complete; {output_file} may be missing or partial")

In [None]:
# Item-item similarities written to the MovieLens folder.
# NOTE(review): all_ratings_matrix / user_index / item_index are whatever was built last;
# make sure the MovieLens matrix cells were the most recent ones run.
output_file = 'data/ml-20m/itemCF_sim.pkl'
get_itemCF_sim_batch(all_ratings_matrix, user_index, item_index, batch_size=1000, output_file=output_file)

In [None]:
# Item-item similarities written to the Book-Crossing folder.
# NOTE(review): uses the same global all_ratings_matrix / user_index / item_index names as the
# MovieLens run; make sure the Book-Crossing matrix cells were the most recent ones run.
output_file = 'data/book_crossing/itemCF_sim.pkl'
get_itemCF_sim_batch(all_ratings_matrix, user_index, item_index, batch_size=1000, output_file=output_file)

### UserCF

**Recommend items to users based on similar users**

- Jaccard similarity

$$
sim_{uv}=\frac{|N(u) \cap N(v)|}{|N(u) \cup N(v)|}
$$

Where $N(u)$, $N(v)$ represent the set of items interacted by user $u$ and user $v$ respectively.

For users $u$ and $v$, this formula reflects the ratio of the number of intersections of the two interacted items to the number of unions of the two users' interacted items.

- Cosine similarity

$$
sim_{uv}=\frac{|N(u) \cap N(v)|}{\sqrt{|N(u)|\cdot|N(v)|}}
$$

Described from the perspective of vectors, let the matrix $A$ be the user-item interaction matrix, the rows of the matrix represent users, and the columns represent items.

Assume that the number of users and items is $m,n$ respectively, the interaction matrix $A$ is a matrix with $m$ rows and $n$ columns.

All elements in the matrix are $0/1$. If user $i$ interacts with item $j$, then $A_{i,j}=1$, otherwise it is $0$.

The vectors $u,v$ are the binary ($0/1$, multi-hot) rows of $A$ for the two users, and $u\cdot v$ represents the vector dot product, which equals $|N(u) \cap N(v)|$.

The above user-item interaction matrix is very sparse in reality. In order to save memory, the interaction matrix is stored in a **dict**.

- Pearson correlation

$$
sim(u,v)=\frac{\sum_{i\in I}(r_{ui}-\bar r_u)(r_{vi}-\bar r_v)}{\sqrt{\sum_{i\in I }(r_{ui}-\bar r_u)^2}\sqrt{\sum_{i\in I }(r_{vi}-\bar r_v)^2}}
$$

Where $r_{ui},r_{vi}$ respectively represent whether user $u$ and user $v$ have interaction with item $i$ (or specific rating value);

$\bar r_u, \bar r_v$ respectively represent the average number of interactions or ratings of all items interacted by user $u$ and user $v$;

Compared with cosine similarity, the Pearson correlation reduces the impact of user rating bias by using the user's average score to correct each independent rating.

In [20]:
def get_userCF_sim_batch(all_ratings_matrix, user_index, item_index, batch_size=1000, output_file='data/ml-20m/userCF_sim.pkl'):
    """
    Compute user-user cosine similarity in batches and stream the result to a pickle file.

    all_ratings_matrix: sparse user-item matrix, one row per user
    user_index: list of user ids, in the row order of all_ratings_matrix
    item_index: not used; kept so that existing calls keep working
    batch_size: number of users compared against all users per cosine_similarity call
    output_file: path of the pickle file to write
    ----
    Output:
    None. output_file receives one pickled record per user,
    {user_id: {other_user_id: similarity}}, with similarities rounded to 4 decimals.
    Read it back with repeated pickle.load calls until EOFError.
    Errors are reported with print instead of being raised.
    """
    user_item_matrix = all_ratings_matrix
    users = user_index
    num_users = len(users)
    # Ceiling division so that a trailing partial batch is part of the progress total.
    num_batches = math.ceil(num_users / batch_size)
    # Set only when every batch was visited and the file was closed without an error.
    completed = False

    try:

        # dirname is '' for a bare file name, and os.makedirs('') raises.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with tqdm(total=num_batches, desc="Calculating user similarities", unit="batch") as pbar:

            try:

                with open(output_file, mode='wb') as file:

                    for start in range(0, num_users, batch_size):
                        end = min(start + batch_size, num_users)
                        batch_users = user_item_matrix[start:end, :]
                        batch_index = [user_index[i] for i in range(start, end)]

                        try:
                            # calculate cosine similarity between batch users and all users
                            similarity_matrix = cosine_similarity(batch_users, user_item_matrix)
                        except ValueError as ve:
                            print(f"ValueError in cosine_similarity for batch {start}-{end}: {ve}")
                            continue  # skip this batch and continue with the next one
                        except MemoryError as me:
                            print(f"MemoryError: {me}")
                            break  # end the loop if memory error occurs

                        for idx, user_id in enumerate(batch_index):
                            user_similarity = {
                                int(other_user_id): round(float(similarity), 4)
                                for other_user_id, similarity in zip(users, similarity_matrix[idx])
                            }

                            try:
                                # save user similarity to file
                                pickle.dump({user_id: user_similarity}, file)
                            except pickle.PicklingError as pe:
                                # skip this user and continue with the next one
                                print(f"PicklingError for user {user_id}: {pe}")

                        pbar.update(1)
                    else:
                        # reached only when the loop was not left through `break`
                        completed = True

            except IOError as e:
                completed = False
                print(f"IOError: Failed to open or write to file {output_file}: {e}")
            except Exception as e:
                completed = False
                print(f"Unexpected error while writing to file: {e}")

    except OSError as e:
        print(f"OSError: Failed to create directory for {output_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

    # Report success only when it happened; previously the message was printed after failures too.
    if completed:
        print(f"User similarity results have been saved to {output_file}")
    else:
        print(f"User similarity computation did not complete; {output_file} may be missing or partial")

In [None]:
# User-user similarities written to the MovieLens folder.
# NOTE(review): get_userCF_sim_batch has no return statement, so userCF_sim is None here;
# the similarities are only available from output_file.
output_file = 'data/ml-20m/userCF_sim.pkl'
userCF_sim = get_userCF_sim_batch(all_ratings_matrix, user_index, item_index, batch_size=1000, output_file=output_file)

### loading features

In [None]:
# reading from json
# NOTE(review): none of the cells visible in this part of the notebook writes itemCF_sim.json
# (the batch functions write .pkl / .joblib files) — confirm where this file comes from.

with open('data/ml-20m/itemCF_sim.json', 'r') as f:
    itemCF_sim1 = json.load(f)

In [None]:
# Reload the sampled Book-Crossing ratings and normalise the column names.
# Forward slashes work on every platform and match the other paths in this notebook.
df = pd.read_csv('data/book_crossing/ratings.csv')
df = df.rename(columns={'User-ID':'userId', 'ISBN':'itemId', 'Book-Rating':'rating'})
# (distinct users, distinct items)
df['userId'].nunique(), df['itemId'].nunique()

In [None]:
# Reload the sampled MovieLens ratings.
# Forward slashes work on every platform and match the other paths in this notebook.
df = pd.read_csv('data/ml-20m/ratings.csv')
# Only 'movieId' needs renaming; rename() ignores it if the file already uses 'itemId'.
df = df.rename(columns={'movieId':'itemId'})
# (distinct users, distinct items)
df['userId'].nunique(), df['itemId'].nunique()

In [None]:
# Number of rows in whichever frame was last assigned to df.
df.shape[0]

In [None]:
# reading from pkl
# The file is read as a stream of pickled dict records that are merged into one dict.
# NOTE(review): the file name differs from the 'itemCF_sim.pkl' default written by
# get_itemCF_sim_batch — confirm which file is meant. Only unpickle files you created yourself.
itemCF_sim2 = {}
with open('data/ml-20m/item_similarity.pkl', 'rb') as file:
    i = 0  # number of records loaded so far
    while True:
        try:
            itemCF_sim2.update(pickle.load(file))
        except EOFError:
            # end of the record stream
            break
        i += 1
        if i % 1000 == 0:
            # progress: last key loaded and number of records so far
            print(list(itemCF_sim2.keys())[-1])
            print(i)
# (125665, 15374, 1000013)

In [42]:
# reading from pkl
# The file is read as a stream of pickled dict records that are merged into itemCF_sim1.
# (The records used to be merged into itemCF_sim2, which left itemCF_sim1 empty and made the
# final lookup fail.)
# NOTE(review): the file name differs from the 'itemCF_sim.pkl' default written by
# get_itemCF_sim_batch — confirm which file is meant. Only unpickle files you created yourself.
itemCF_sim1 = {}
with open('data/book_crossing/item_similarity.pkl', 'rb') as file:
    i = 0  # number of records loaded so far
    while True:
        try:
            itemCF_sim1.update(pickle.load(file))
        except EOFError:
            # end of the record stream
            break
        i += 1
        if i % 1000 == 0:
            # progress: last key loaded and number of records so far
            print(list(itemCF_sim1.keys())[-1])
            print(i)
# (27653, 70403, 114978)
# (number of items, number of neighbours stored for the first item); does not assume key 1 exists
len(itemCF_sim1), len(next(iter(itemCF_sim1.values()), {}))

In [None]:
# Number of entries (top-level keys) in itemCF_sim2.
len(itemCF_sim2.keys())

In [None]:
# Number of neighbours stored for the entry with key 1 (raises KeyError if that key is absent).
len(itemCF_sim2[1].keys())

In [None]:
# Peek at the first entry only: its key and its first ten (neighbour, similarity) pairs.
for key, value in itemCF_sim2.items():
    first_pairs = list(value.items())[:10]
    print(key, first_pairs)
    break

In [None]:
# Display all keys of itemCF_sim2 (the output lists every key, so it can be very long).
itemCF_sim2.keys()

## Improvement of weights in collaborative filtering algorithm
To be implemented...

* base formula
  $$
  w_{i j}=\frac{|N(i) \bigcap N(j)|}{|N(i)|}
  $$

  + This formula represents the number of users who like both item $i$ and item $j$, as a percentage of the number of users who like item $i$.
  + Disadvantage: If item $j$ is a popular item, then its similarity to any item is very high.

* Penalize popular items
  $$
  w_{i j}=\frac{|N(i) \cap N(j)|}{\sqrt{|N(i)||N(j)|}}
  $$
  
  
  * To address the problem in the base formula, item $j$ is penalized: the denominator now also contains $|N(j)|$, the number of users who interacted with item $j$.
  * At this time, if item $j$ is a popular item, then the corresponding $N(j)$ will also be large, and the penalty will be greater.
  
* Control the severity of penalties for popular items
  $$
  w_{i j}=\frac{|N(i) \cap N(j)|}{|N(i)|^{1-\alpha}|N(j)|^{\alpha}}
  $$

  * In addition to the method mentioned in the second point, popular items can be penalized when calculating the similarity between items.
  * On this basis, the parameter $\alpha$ can be further introduced, so that the intensity of the penalty for popular items can be determined by controlling the parameter $\alpha$.

* Penalty for active users

  * When calculating the similarity between items, the user's activity can be further taken into account.
    $$
    w_{i j}=\frac{\sum_{u \in N(i) \cap N(j)} \frac{1}{\log (1+|N(u)|)}}{|N(i)|^{1-\alpha}|N(j)|^{\alpha}}
    $$

  + For an abnormally active user, his contribution should be less than that of an inactive user when calculating the similarity between items.

## Problem analysis of collaborative filtering algorithm
To be implemented...

One of the problems with collaborative filtering algorithms is that they have weak generalization capabilities:

+ That is, collaborative filtering cannot generalize the similarity between two items to the similarity between other items.
+ The resulting problem is that **hot items have a strong head effect and are easily similar to a large number of items, while tail items are rarely recommended due to sparse feature vectors**.

For example, the following example:

![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaxxhHm3BAtMfsy2AV.png!thumbnail)

+ In the matrix on the left, $A, B, C, D$ represent items.
+ It can be seen that $D$ is a hot item, and its similarity with $A, B, C$ is relatively large. Therefore, the recommendation system is more likely to recommend $D$ to users who have used $A, B, C$.
+ However, the reason why the recommendation system cannot find the similarity between $A, B, C$ is that the interaction data is too sparse and there is a lack of direct data for similarity calculation.

So this is the natural defect of collaborative filtering: **The recommendation system has a significant head effect and is weak in processing sparse vectors**.

In order to solve this problem and increase the generalization ability of the model at the same time. In 2006, **Matrix Factorization (MF**) was proposed:

+ Based on the collaborative filtering co-occurrence matrix, this method uses denser latent vectors to represent users and items, and mines the implicit interests and implicit features of users and items.

+ To a certain extent, it makes up for the problem that the collaborative filtering model is not able to process sparse matrices.

# Two-Tower-Model based Recommendation

Given a query set $Query: \left\{x_{i}\right\}_{i=1}^{N}$ and an item set $Item:\left\{y_{j}\right\}_{j=1}^{M}$.

+ $x_{i} \in \mathcal{X},\quad y_{j} \in \mathcal{Y}$ are high-dimensional mixtures of multiple features (e.g., sparse ID features and dense features).

+ The goal of recommendation is to retrieve a series of $item$ subsets for subsequent ranking recommendation tasks for a given $query$.

<img src="https://ryluo.oss-cn-chengdu.aliyuncs.com/%E5%9B%BE%E7%89%87image-20220506202824884.png" alt="image-20220506202824884" style="zoom:50%;" />


## feature generation

### Movielens dataset

1. ratings.csv
    - movieId
    - userId
    - rating
    - timestamp
2. movies.csv
    - movieId
    - title
    - genre
3. tags.csv
    - userId
    - movieId
    - tag
    - timestamp

**Generate Features for 2 tower candidate generation**

1. User tower

- sparse_feature:
    - userId
    - user_hist

- dense_feature:
    - user_mean_rating

2. Item tower

- sparse_feature:
    - movieId
    - title
    - genres

- dense_feature:
    - item_mean_rating

In [273]:
# Load the sampled MovieLens ratings and the movie metadata for the two-tower features.
movies_ratings = pd.read_csv('data/ml-20m/ratings.csv')
movies = pd.read_csv('data/ml-20m/movies.csv')

# Use 'itemId' as the item key in both frames; every other entry maps a column name to itself.
movies_ratings.rename(columns={'userId': 'userId', 'movieId': 'itemId', 'rating': 'rating', 'timestamp': 'timestamp'}, inplace=True)
movies.rename(columns={'movieId': 'itemId', 'title': 'title', 'genres': 'genres'}, inplace=True)


def process_movies_features(movies):
    """Build the per-movie (item tower) feature table.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie metadata. The columns `title` and `genres` are read; all other
        columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of `movies` where `title` has its trailing "(YYYY)" removed, plus:
        `year`, `title_tfidf_<i>` (at most 100 columns), the decade flags
        `is_erlier` / `is_80s` / `is_90s` / `is_00s` / `is_latest`, the genre
        flags `is_comedy` / `is_romance` / `is_action`, and `genre_tfidf_<i>`.

    Raises
    ------
    Exception
        Any failure is reported with a printed message and then re-raised, so the
        caller never receives `None` in place of a feature table.
    """
    year_pattern = re.compile(r'\d{4}')

    def split_title_time(title):
        """Split 'Some Title (1995)' into ('Some Title', '1995').

        The year is looked for only after the LAST '(' so that parentheses that
        belong to the title itself (alternative titles) are kept intact. When no
        4-digit year is found there, the title is returned unchanged with NaN.
        """
        title = title.strip().replace('\xa0', '')
        head, sep, tail = title.rpartition('(')
        found = year_pattern.findall(tail) if sep else []
        if not found:
            return title.strip(), np.nan
        return head.strip(), found[0]

    try:
        m_df = movies.copy()
        print('splitting title and year...')
        m_df['title'], m_df['year'] = zip(*m_df['title'].apply(split_title_time))

        m_df['title'] = m_df['title'].astype('str')
        # use mode to fill na
        m_df['year'] = m_df['year'].fillna(int(m_df['year'].mode()[0])).astype('int')

        # title_tfidf
        print('initializing title tfidf...')

        # regex=True is required: since pandas 2.0 `str.replace` treats the pattern
        # as a literal string by default, which would leave the titles uncleaned.
        title_clean = (
            m_df['title']
            .str.lower()                                  # lowercase
            .str.replace(r'[^\w\s]', '', regex=True)      # remove punctuation
            .str.replace(r'\d+', '', regex=True)          # remove digits
            .str.replace(r'\s+', ' ', regex=True)         # remove extra spaces
        )

        title_tfidf = TfidfVectorizer(stop_words='english', max_features=100)   # control dimensionality - 100 features
        title_tfidf_matrix = title_tfidf.fit_transform(title_clean)

        # index=m_df.index keeps the TF-IDF rows attached to their movies even when
        # `movies` does not carry a 0..n-1 index (concat(axis=1) aligns on labels).
        title_tfidf_df = pd.DataFrame(
            title_tfidf_matrix.toarray(),
            index=m_df.index,
            columns=[f'title_tfidf_{i}' for i in range(title_tfidf_matrix.shape[1])],
        )
        m_df = pd.concat([m_df, title_tfidf_df], axis=1)

        # year binarization
        print('generating year binarization features...')
        year = m_df['year']
        m_df['is_erlier'] = (year < 1980).astype('int64')
        m_df['is_80s'] = ((year >= 1980) & (year < 1990)).astype('int64')
        m_df['is_90s'] = ((year >= 1990) & (year < 2000)).astype('int64')
        m_df['is_00s'] = ((year >= 2000) & (year < 2010)).astype('int64')
        # years from 2020 onwards fall into none of the five buckets
        m_df['is_latest'] = ((year >= 2010) & (year < 2020)).astype('int64')

        # genre identification
        print('generating genre indentification features...')
        # plain assignment instead of a chained in-place replace on the column,
        # which is not guaranteed to write back into the frame
        m_df['genres'] = m_df['genres'].replace('(no genres listed)', '')
        m_df['is_comedy'] = m_df['genres'].str.contains('Comedy', regex=False).astype('int64')
        m_df['is_romance'] = m_df['genres'].str.contains('Romance', regex=False).astype('int64')
        m_df['is_action'] = m_df['genres'].str.contains('Action', regex=False).astype('int64')

        # genre_tfidf
        print('initializing genre tfidf...')
        genre_tfidf = TfidfVectorizer()

        # genres are '|'-separated; turn them into whitespace-separated tokens
        genres_str = m_df['genres'].str.replace('|', ' ', regex=False)
        genre_tfidf_matrix = genre_tfidf.fit_transform(genres_str)

        genre_tfidf_df = pd.DataFrame(
            genre_tfidf_matrix.toarray(),
            index=m_df.index,
            columns=[f'genre_tfidf_{i}' for i in range(genre_tfidf_matrix.shape[1])],
        )
        m_df = pd.concat([m_df, genre_tfidf_df], axis=1)

        print('-'*30+'movie feature processing completed'+'-'*30)

        return m_df

    except KeyError as e:
        print(f"During movie feature process, key error during data processing: {e}")
        raise
    except MemoryError as e:
        print(f"During movie feature process, memory error, possibly due to long history strings: {e}")
        raise
    except Exception as e:
        print(f"During movie feature process, an unexpected error occurred: {e}")
        raise

def feature_process(mr_df, m_df, MAX_HISTORY_LENGTH = 50):
    """Build the MovieLens interaction feature table (one row per rating).

    NOTE: a later cell of this notebook defines another `feature_process` with a
    different signature (Book-Crossing); re-run this cell before using the
    MovieLens pipeline if that later cell has been executed.

    Parameters
    ----------
    mr_df : pandas.DataFrame
        Ratings with the columns `userId`, `itemId`, `rating`, `timestamp`.
    m_df : pandas.DataFrame
        Item features keyed by `itemId`; left-joined onto every rating.
    MAX_HISTORY_LENGTH : int
        Maximum number of previously rated items kept in `user_hist`.
        A value <= 0 yields an empty history for every row.

    Returns
    -------
    pandas.DataFrame
        `mr_df` plus `item_mean_rating`, `user_mean_rating` (both computed on
        `mr_df` only, rounded to 2 decimals, float32), `user_hist` and the
        columns of `m_df`, sorted by (`itemId`, `userId`) with a fresh index.
        `user_hist` is the '|'-joined list of the items the same user rated
        strictly earlier in (`userId`, `timestamp`) order.

    Raises
    ------
    Exception
        Any failure is reported with a printed message and then re-raised, so a
        half-built frame is never handed back to the caller.
    """
    try:

        feature_df = mr_df.copy()

        feature_df['item_mean_rating'] = (
            feature_df.groupby('itemId')['rating'].transform('mean').round(2).astype('float32')
        )
        feature_df['user_mean_rating'] = (
            feature_df.groupby('userId')['rating'].transform('mean').round(2).astype('float32')
        )

        print('sorting by userId and timestamp...')
        feature_df = feature_df.sort_values(by=['userId', 'timestamp'])

        # Walk the (user, item) pairs in chronological order. Plain Python lists are
        # used instead of iterrows()/.at[]: same result, far less per-row overhead,
        # and no dependence on the index labels being unique.
        print('building user history...')
        seen_by_user = {}
        histories = []
        for user_id, item_id in zip(feature_df['userId'].tolist(), feature_df['itemId'].tolist()):
            seen = seen_by_user.setdefault(user_id, [])
            if MAX_HISTORY_LENGTH > 0:
                # only the most recent MAX_HISTORY_LENGTH items rated before this one
                histories.append('|'.join(seen[-MAX_HISTORY_LENGTH:]))
            else:
                histories.append('')
            seen.append(str(item_id))
        feature_df['user_hist'] = histories

        print('merging with item features...')
        feature_df = pd.merge(feature_df, m_df, on='itemId', how='left')

        print('sorting by itemId and userId...')
        feature_df = feature_df.sort_values(by=['itemId', 'userId']).reset_index(drop=True)

        print('-'*30+'interaction feature processing completed'+'-'*30)

        return feature_df

    except KeyError as e:
        print(f"During interaction feature process, key error during data processing: {e}")
        raise
    except MemoryError as e:
        print(f"During interaction feature process, memory error, possibly due to long history strings: {e}")
        raise
    except Exception as e:
        print(f"During interaction feature process, an unexpected error occurred: {e}")
        raise

# feature_df = feature_process(mr_df, m_df, MAX_HISTORY_LENGTH = 50)

In [274]:
def batch_feature_process(mr_df, m_df, batch_size=10000):
    """Run `feature_process` on the ratings in batches of users and save each batch.

    Users are taken in order of first appearance in `mr_df`; batch number k covers
    users [k*batch_size, (k+1)*batch_size) and is written to
    `data/ml-20m/feature_df_<k>.parquet`.

    Parameters
    ----------
    mr_df : pandas.DataFrame
        All ratings (`userId`, `itemId`, `rating`, `timestamp`).
    m_df : pandas.DataFrame
        Item features, passed through to `feature_process`.
    batch_size : int
        Number of users per batch (and per output file).

    Returns
    -------
    pandas.DataFrame or None
        The feature frame of the LAST batch, or None when `mr_df` has no users.

    Raises
    ------
    Exception
        Any failure is reported with a printed message and then re-raised.
    """
    feature_df = None
    try:

        users = list(pd.unique(mr_df.userId))
        num_users = len(users)

        # `item_mean_rating` is an item-level statistic: it has to be computed over
        # ALL ratings, not only over the users that happen to be in one batch.
        # (`user_mean_rating` needs no such correction: all ratings of a user are
        # always inside the same batch.)
        global_item_mean = mr_df.groupby('itemId')['rating'].mean().round(2).astype('float32')

        for batch_no, start in enumerate(range(0, num_users, batch_size)):

            print('processing users {} to {}'.format(start, start+batch_size))
            users_batch = users[start:start+batch_size]
            mr_df_batch = mr_df[mr_df.userId.isin(users_batch)]
            feature_df = feature_process(mr_df_batch, m_df, MAX_HISTORY_LENGTH = 50)
            feature_df['item_mean_rating'] = feature_df['itemId'].map(global_item_mean)

            # the file number follows `batch_size` (it used to assume 10000, which
            # made batches overwrite each other for any other batch size)
            feature_df.to_parquet(f'data/ml-20m/feature_df_{batch_no}.parquet')

    except Exception as e:
        print(f"During batch_feature_process, an unexpected error occurred: {e}")
        raise

    return feature_df

In [None]:
# Item features have to exist before the batches run, because every batch merges them in.
# NOTE: `movies` is replaced by its processed version here, so re-running this cell
# without reloading `movies` feeds already-processed titles back into the function.
mr_df = movies_ratings.copy()
movies = process_movies_features(movies)
batch_feature_process(mr_df=mr_df, m_df=movies, batch_size=10000)

In [None]:
# Largest, smallest and average number of ratings given by a single user.
user_rating_counts = movies_ratings['userId'].value_counts()
user_rating_counts.max(), user_rating_counts.min(), user_rating_counts.mean()

In [None]:
# One row per user with the number of ratings they gave, most active users first.
user_activeness = (
    movies_ratings['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='num_ratings')
    .sort_values(by='num_ratings', ascending=False)
)
user_activeness.head(5)

In [None]:
# (number of users, number of users with fewer than 3 ratings)
len(user_activeness), int((user_activeness['num_ratings'] < 3).sum())

In [None]:
user_activeness['log_num_ratings'] = np.log10(user_activeness['num_ratings'])

# log-log plot
# The histogram is only needed as numbers, so it is computed with np.histogram
# instead of drawing invisible (alpha=0) bars that still take part in the axes'
# autoscaling.
density, bins = np.histogram(user_activeness['num_ratings'], bins=20, density=True)
log_density = np.log(density + 1e-10)   # the epsilon keeps empty bins finite
log_bins = np.log(bins)
# NOTE(review): the plotted values use the natural log while `log_num_ratings`
# above is log10 and is not used by the plot - confirm which base the x-limits
# below were chosen for.

fig, ax = plt.subplots(figsize=(10, 6))
# each density value is plotted at the LEFT edge of its bin
ax.plot(log_bins[:-1], log_density, marker='o', label='PDF')
ax.set_xlabel('log(num_ratings)')
ax.set_ylabel('log(density)')
ax.set_xlim(-0.5, 7)
ax.set_title('Log-log plot of user activity')
ax.legend()

In [None]:
# Histogram of how many ratings each user gave (100 equal-width bins).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    movies_ratings['userId'].value_counts(),
    bins=100,
    color='skyblue',
    edgecolor='black',
    linewidth=1.2,
)
ax.set_title('Distribution of Number of Ratings per User')
ax.set_xlabel('Number of Ratings')
ax.set_ylabel('Number of Users')

In [None]:
# Load one parquet file written by batch_feature_process: the file with index 0,
# i.e. the first batch of users only.
feature_df = pd.read_parquet('data/ml-20m/feature_df_0.parquet')
feature_df.head()

In [14]:
# Export the item features and the interaction features as CSV (index not written).
# NOTE(review): `feature_df` is whatever is currently bound to that name; on a
# top-to-bottom run that is only the batch read from feature_df_0.parquet - confirm
# that a single batch is what should be exported here.
movies.to_csv('data/ml-20m/m_df_features.csv', index=False)
feature_df.to_csv('data/ml-20m/interaction_features.csv', index=False)

In [5]:
# Save the currently bound `feature_df` as a pickle file next to the other ml-20m artifacts.
# NOTE(review): like the CSV export, this stores whatever `feature_df` holds at the
# time the cell is run - confirm which batch is meant.
feature_df.to_pickle('data/ml-20m/feature_df.pkl')

### Book-Crossing dataset

- books_df
    - title: sparse
    - author: sparse
    - year: dense
    - publisher: sparse
    - itemId: sparse
- user_df
    - age: dense
    - userId: sparse
    - location: sparse

In [277]:
# --- ratings -----------------------------------------------------------------
# map_dict maps an ISBN string to the integer item id used everywhere else
with open('data/book_crossing/map_dict.json', 'r') as f:
    map_dict = json.load(f)
books_ratings = pd.read_csv('data/book_crossing/ratings.csv')
# a rated ISBN that is missing from map_dict raises KeyError here (kept on purpose)
books_ratings['itemId'] = books_ratings['ISBN'].apply(lambda x: map_dict[x])
books_ratings = (
    books_ratings
    .rename(columns={'User-ID':'userId', 'Book-Rating':'rating'})
    .drop(columns=['ISBN'])
)

# --- book metadata -----------------------------------------------------------
books_df = pd.read_csv('data/book_crossing/Books_modified.csv', na_values="NULL")
# .copy(): the column subset is modified below; working on an explicit copy avoids
# assigning into a slice of the frame that was read from disk
books_df = books_df[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']].copy()
books_df['itemId'] = books_df['ISBN'].map(map_dict)
# Rows with any missing field (including ISBNs without an item id) are dropped.
# The index is rebuilt afterwards: generate_book_features attaches its TF-IDF
# columns with pd.concat(axis=1), which aligns on index labels, so a gapped index
# would pair books with the wrong feature rows.
books_df = books_df.dropna(axis=0).reset_index(drop=True)
books_df['itemId'] = books_df['itemId'].astype('int')
books_df = books_df.drop(columns=['ISBN'])
books_df = books_df.rename(columns={'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'})

# --- users -------------------------------------------------------------------
user_df = pd.read_csv('data/book_crossing/Users.csv')
user_df = user_df.rename(columns={'User-ID':'userId', 'Location':'location', 'Age':'age'})
user_df['userId'] = user_df['userId'].astype('int')
# an age of 0 is treated as missing; missing ages are filled with the mean age
user_df['age'] = user_df['age'].replace(0, np.nan)
user_df['age'] = user_df['age'].fillna(user_df['age'].mean()).astype('int')

In [198]:
# books_df = pd.read_csv('data/book_crossing/Books.csv')
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Image-URL-L'] = books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Image-URL-M']
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Image-URL-M'] = books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Image-URL-S']
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Image-URL-S'] = books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Publisher']
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Publisher'] = books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Year-Of-Publication']
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Book-Author'] = np.nan
# # books_df[books_df['Year-Of-Publication']=='DK Publishing Inc']['Year-Of-Publication'] = 2000
# books_df.iloc[209538,2:] = books_df.iloc[209538,2:].shift(1)
# books_df.iloc[221678,2:] = books_df.iloc[221678,2:].shift(1)
# books_df.iloc[220731,2:] = books_df.iloc[220731,2:].shift(1)
# # books_df['Book-Author'] = books_df['Book-Author'].astype('str')
# # books_df['Year-Of-Publication'] = books_df['Year-Of-Publication'].astype('int')
# # books_df['Publisher'] = books_df['Publisher'].astype('str')
# # books_df['Image-URL-S'] = books_df['Image-URL-S'].astype('str')
# # books_df['Image-URL-M'] = books_df['Image-URL-M'].astype('str')
# # books_df['Image-URL-L'] = books_df['Image-URL-L'].astype('str')
# # books_df['Book-Title'] = books_df['Book-Title'].astype('str')
# books_df['ISBN'] = books_df['ISBN'].astype('str')
# books_df['Book-Title'] = books_df['Book-Title'].astype('str')
# books_df['Year-Of-Publication'] = books_df['Year-Of-Publication'].astype('int')

In [197]:
# books_df[books_df.isnull().any(axis=1)]

In [199]:
# books_df.to_csv('data/book_crossing/Books_modified.csv', index=False, na_rep="NULL")

In [None]:
# Preview the first rows of the user table.
user_df.head()

In [None]:
# Preview the first rows of the book table.
books_df.head()

In [278]:
def generate_book_features(books_df, reference_year=2024):
    """Build the per-book (item tower) feature table.

    Parameters
    ----------
    books_df : pandas.DataFrame
        Book metadata; the columns `title` and `year` are read, all other
        columns are carried through unchanged.
    reference_year : int
        Year that `book_age` is measured against (`reference_year - year`).

    Returns
    -------
    pandas.DataFrame
        A copy of `books_df` with `title` cast to str, `year` cast to int (missing
        years filled with the most frequent year), plus `title_tfidf_<i>` (at most
        100 columns), the decade flags `is_erlier` / `is_80s` / `is_90s` /
        `is_00s` / `is_latest`, and `book_age`.
    """
    print('processing book features...')
    b_df = books_df.copy()

    print('tfidf on book title...')
    b_df['title'] = b_df['title'].astype('str')
    title_tfidf = TfidfVectorizer(stop_words='english', max_features=100)

    # regex=True is required: since pandas 2.0 `str.replace` treats the pattern as
    # a literal string by default, which would leave the titles uncleaned.
    title_clean = (
        b_df['title']
        .str.lower()                                  # lowercase
        .str.replace(r'[^\w\s]', '', regex=True)      # remove punctuation
        .str.replace(r'\d+', '', regex=True)          # remove digits
        .str.replace(r'\s+', ' ', regex=True)         # remove extra spaces
    )
    title_tfidf_matrix = title_tfidf.fit_transform(title_clean)

    # index=b_df.index: pd.concat(axis=1) aligns on index labels. Without it the
    # TF-IDF rows (labelled 0..n-1) are paired with the wrong books - and extra
    # all-NaN rows appear - whenever `books_df` has gaps in its index, e.g. after
    # a dropna().
    title_tfidf_df = pd.DataFrame(
        title_tfidf_matrix.toarray(),
        index=b_df.index,
        columns=[f'title_tfidf_{i}' for i in range(title_tfidf_matrix.shape[1])],
    )
    b_df = pd.concat([b_df, title_tfidf_df], axis=1)

    print('binarizing year...')
    b_df['year'] = b_df['year'].fillna(int(b_df['year'].mode()[0])).astype('int')
    year = b_df['year']
    b_df['is_erlier'] = (year < 1980).astype('int64')
    b_df['is_80s'] = ((year >= 1980) & (year < 1990)).astype('int64')
    b_df['is_90s'] = ((year >= 1990) & (year < 2000)).astype('int64')
    b_df['is_00s'] = ((year >= 2000) & (year < 2010)).astype('int64')
    # years from 2020 onwards fall into none of the five buckets
    b_df['is_latest'] = ((year >= 2010) & (year < 2020)).astype('int64')

    b_df['book_age'] = reference_year - b_df['year']

    print('-'*30+'book feature processing completed'+'-'*30)
    return b_df


def generate_user_features(user_df):
    """Build the per-user (user tower) feature table.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Users with the columns `location` (comma-separated text) and `age`.

    Returns
    -------
    pandas.DataFrame
        A copy of `user_df` where
        - `location` is lower-cased; values that do not consist of 2 to 4
          comma-separated parts (including missing ones) become "Other";
        - `country` is the last comma-separated part of `location`; countries
          that occur 100 times or fewer become 'Other';
        - `generation` is a label derived from the age bracket;
        - `age` is renamed to `user_age`.
    """

    def _generation(age):
        """Map an age to its bracket label; anything outside 0-94 is 'Greatest'."""
        if 0 <= age < 25:
            return 'Gen Z'
        if 25 <= age < 40:
            return 'Millenial'
        if 40 <= age < 55:
            return 'Gen X'
        if 55 <= age < 75:
            return 'Boomer'
        if 75 <= age < 95:
            return 'Silent'
        return 'Greatest'

    print('processing user features...')
    u_df = user_df.copy()

    # lower case location
    print('pricessing location...')
    # missing locations become '' (one part) and therefore end up as "Other"
    location = u_df['location'].fillna('').astype(str).str.lower()
    n_parts = location.str.count(',') + 1
    # Keep only locations with 2 to 4 parts. The previous condition
    # (`<= 4 or >= 2`) was true for every value, so nothing was ever replaced.
    u_df['location'] = location.where(n_parts.between(2, 4), "Other")

    # extract country
    print('extracting country...')
    country = u_df['location'].str.split(',').str[-1].str.strip()
    # convert rare countries to 'Other' -- those seen 100 times or fewer
    country_counts = country.value_counts()
    u_df['country'] = country.where(country.map(country_counts) > 100, 'Other')

    print('processing age binarization...')
    # binarize age
    u_df['generation'] = u_df['age'].apply(_generation)

    u_df = u_df.rename(columns={'age':'user_age'})

    print('-'*30+'user feature processing completed'+'-'*30)
    return u_df

def feature_process(br_df, b_df, u_df):
    """Build the Book-Crossing interaction feature table (one row per rating).

    Parameters
    ----------
    br_df : pandas.DataFrame
        Ratings with the columns `userId`, `itemId`, `rating`.
    b_df : pandas.DataFrame
        Book features keyed by `itemId`; must provide `book_age`.
    u_df : pandas.DataFrame
        User features keyed by `userId`; must provide `user_age`.

    Returns
    -------
    pandas.DataFrame
        `br_df` plus `item_mean_rating`, `user_mean_rating` (both computed on
        `br_df` only, rounded to 2 decimals, float32), the columns of `b_df` and
        `u_df` (left joins) and `book-user_age_crossing` = `user_age * book_age`,
        sorted by (`itemId`, `userId`) with a fresh index.
        `book-user_age_crossing` is an integer column when every rating found its
        book and user; otherwise it stays a float column with NaN for the
        unmatched rows.

    Raises
    ------
    Exception
        Any failure is reported with a printed message and then re-raised, so a
        half-built frame is never handed back to the caller.
    """
    try:

        feature_df = br_df.copy()

        feature_df['item_mean_rating'] = (
            feature_df.groupby('itemId')['rating'].transform('mean').round(2).astype('float32')
        )
        feature_df['user_mean_rating'] = (
            feature_df.groupby('userId')['rating'].transform('mean').round(2).astype('float32')
        )

        print('merging with item features...')
        feature_df = pd.merge(feature_df, b_df, on='itemId', how='left')

        print('merging with user features...')
        feature_df = pd.merge(feature_df, u_df, on='userId', how='left')

        print('sorting by itemId and userId...')
        feature_df = feature_df.sort_values(by=['itemId', 'userId']).reset_index(drop=True)

        crossing = feature_df['user_age'] * feature_df['book_age']
        # The left joins leave NaN for ratings whose book or user has no feature
        # row, and NaN cannot be cast to a plain int. Only cast when it is safe.
        if crossing.notna().all():
            crossing = crossing.astype('int')
        feature_df['book-user_age_crossing'] = crossing

        print('-'*30+'interaction feature processing completed'+'-'*30)

        return feature_df

    except KeyError as e:
        print(f"During interaction feature process, key error during data processing: {e}")
        raise
    except MemoryError as e:
        print(f"During interaction feature process, memory error, possibly due to long history strings: {e}")
        raise
    except Exception as e:
        print(f"During interaction feature process, an unexpected error occurred: {e}")
        raise

def batch_feature_process(br_df, b_df, u_df, batch_size):
    """Run `feature_process` on the book ratings in batches of users and save each batch.

    Users are taken in order of first appearance in `br_df`; batch number k covers
    users [k*batch_size, (k+1)*batch_size) and is written to
    `data/book_crossing/feature_df_<k>.parquet`.

    Parameters
    ----------
    br_df : pandas.DataFrame
        All ratings (`userId`, `itemId`, `rating`).
    b_df, u_df : pandas.DataFrame
        Book and user features, passed through to `feature_process`.
    batch_size : int
        Number of users per batch (and per output file).

    Returns
    -------
    pandas.DataFrame or None
        The feature frame of the LAST batch, or None when `br_df` has no users.

    Raises
    ------
    Exception
        Any failure is reported with a printed message and then re-raised.
    """
    feature_df = None

    try:

        users = list(pd.unique(br_df.userId))
        num_users = len(users)

        # `item_mean_rating` is an item-level statistic: it has to be computed over
        # ALL ratings, not only over the users that happen to be in one batch.
        # (`user_mean_rating` needs no such correction: all ratings of a user are
        # always inside the same batch.)
        global_item_mean = br_df.groupby('itemId')['rating'].mean().round(2).astype('float32')

        for batch_no, start in enumerate(range(0, num_users, batch_size)):

            print('processing users {} to {}'.format(start, start+batch_size))
            users_batch = users[start:start+batch_size]
            br_df_batch = br_df[br_df.userId.isin(users_batch)]
            feature_df = feature_process(br_df_batch, b_df, u_df)
            feature_df['item_mean_rating'] = feature_df['itemId'].map(global_item_mean)

            feature_df.to_parquet(f'data/book_crossing/feature_df_{batch_no}.parquet')

    except Exception as e:
        print(f"During batch_feature_process, an unexpected error occurred: {e}")
        raise

    return feature_df

In [None]:
# Build both feature tables, then process the ratings in batches of 10000 users;
# every batch is written to its own parquet file by batch_feature_process.
br_df = books_ratings.copy()
u_df = generate_user_features(user_df)
b_df = generate_book_features(books_df)
batch_feature_process(br_df=br_df, b_df=b_df, u_df=u_df, batch_size=10000)

# Test

In [288]:
# reading from pkl
# The file holds several dicts pickled one after another; they are read until the
# end of the file and merged into a single dict.
# SECURITY: pickle.load executes arbitrary code from the file - only load files
# that were produced by this project.
# NOTE: a later cell rebinds `itemCF_sim2` to the ml-20m similarity DataFrame.
itemCF_sim2 = {}
with open('data/book_crossing/item_similarity.pkl', 'rb') as file:
    while True:
        try:
            itemCF_sim2.update(pickle.load(file))
        except EOFError:
            # no more pickled objects in the file
            break

In [None]:
# Book-Crossing similarity matrices.
# Forward slashes (as everywhere else in this notebook) so the paths also resolve
# outside Windows.
itemCF_sim1 = pd.read_parquet('data/book_crossing/item_similarity_matrix.parquet')
userCF_sim1 = pd.read_parquet('data/book_crossing/user_similarity_matrix.parquet')
print(itemCF_sim1.shape, userCF_sim1.shape)

In [None]:
# Inspect the row labels of the Book-Crossing item similarity matrix.
itemCF_sim1.index

In [None]:
# values != 0
print(f'itemCF sim matrix sparsity_rate:{itemCF_sim2[itemCF_sim2 != 0].count().sum()/itemCF_sim2.size * 100}%')
print(f'itemCF sim matrix sparsity_rate:{userCF_sim2[userCF_sim2 != 0].count().sum()/itemCF_sim2.size * 100}%')

In [None]:
# values != 0
print(f'itemCF sim matrix sparsity_rate:{itemCF_sim1[itemCF_sim1 != 0].count().sum()/itemCF_sim1.size * 100}%')
print(f'userCF sim matrix sparsity_rate:{userCF_sim1[userCF_sim1 != 0].count().sum()/itemCF_sim1.size * 100}%')

In [None]:
# MovieLens-20M similarity matrices.
# Forward slashes (as everywhere else in this notebook) so the paths also resolve
# outside Windows.
itemCF_sim2 = pd.read_parquet('data/ml-20m/item_similarity_matrix.parquet')
userCF_sim2 = pd.read_parquet('data/ml-20m/user_similarity_matrix.parquet')
print(itemCF_sim2.shape, userCF_sim2.shape)

In [None]:
# values != 0
print(f'itemCF sim matrix sparsity_rate:{itemCF_sim2[itemCF_sim2 != 0].count().sum()/itemCF_sim2.size * 100}%')
print(f'userCF sim matrix sparsity_rate:{userCF_sim2[userCF_sim2 != 0].count().sum()/itemCF_sim2.size * 100}%')