In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
from scipy.stats import pearsonr
from scipy.sparse import coo_matrix, lil_matrix, csr_matrix
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import math
import json
import torch
from torch.utils.data import DataLoader, Dataset
from matplotlib import pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import pyarrow.parquet as pq
import pyarrow as pa
import fastparquet
import joblib
from sklearn.preprocessing import MinMaxScaler

# Move from the notebook folder up to the project root so the relative
# 'data/...' paths used by the cells below resolve.
# Guarded so that re-running this cell does not climb one more directory each
# time (assumes the project root is the directory containing 'data/' — TODO confirm).
if not os.path.isdir('data'):
    os.chdir('../')
os.getcwd()

'f:\\NTU Learn\\DATA MINING\\ntu_sd6125_recsys'

# Read Data

In [49]:
# Load the feature tables for both datasets.
ml_20m_df = pd.read_parquet('data/ml-20m/feature_df_0.parquet')
book_crossing_df = pd.read_parquet('data/book_crossing/feature_df_0.parquet')
# Rescale each dataset's `rating` column onto the same 0.5–5 range.
min_max_scaler = MinMaxScaler(feature_range=(0.5, 5))
# fit_transform re-fits the scaler on every call, so each dataset is scaled with
# its own min/max. After this cell `min_max_scaler` only holds the book-crossing
# fit — do not reuse it to inverse-transform ml-20m ratings.
# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test split performed in the next section.
ml_20m_df['rating'] = min_max_scaler.fit_transform(ml_20m_df[['rating']])
book_crossing_df['rating'] = min_max_scaler.fit_transform(book_crossing_df[['rating']])
# Display (rows, columns) of both tables.
ml_20m_df.shape, book_crossing_df.shape

((3794270, 139), (124710, 120))

# Train Test Split

In [50]:
# Random 80/20 split over rows for each dataset; the fixed random_state makes
# the split reproducible across runs.
ml_20m_df_train, ml_20m_df_test = train_test_split(ml_20m_df, test_size=0.2, random_state=42)
book_crossing_df_train, book_crossing_df_test = train_test_split(book_crossing_df, test_size=0.2, random_state=42)

In [51]:
# Display (rows, columns) of every split to confirm the 80/20 proportions.
ml_20m_df_train.shape, ml_20m_df_test.shape, book_crossing_df_train.shape, book_crossing_df_test.shape

((3035416, 139), (758854, 139), (99768, 120), (24942, 120))

In [52]:
# Persist the train/test splits as parquet files next to the source tables.
ml_20m_df_train.to_parquet('data/ml-20m/ml_20m_df_train.parquet')
ml_20m_df_test.to_parquet('data/ml-20m/ml_20m_df_test.parquet')
book_crossing_df_train.to_parquet('data/book_crossing/book_crossing_df_train.parquet')
book_crossing_df_test.to_parquet('data/book_crossing/book_crossing_df_test.parquet')

# Generate User-Item-Rating Matrix

In [53]:
def convert_ratings_to_matrix(data, num_users, num_items, usr_ids_dict, item_ids_dict):
    """Build a dense user-item rating matrix from a long-format ratings table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with `userId`, `itemId` and `rating` columns. Any other column
        is ignored.
    num_users, num_items : int
        Number of rows and columns of the returned matrix.
    usr_ids_dict, item_ids_dict : dict
        Map a raw user / item id to its row / column position in the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape (num_users, num_items). Cells without a rating in
        `data` are 0. If a (user, item) pair occurs more than once, the last
        occurrence in `data` wins.

    Raises
    ------
    KeyError
        If `data` contains a user or item id that is missing from the
        corresponding mapping.
    """
    user_item_matrix = np.zeros((num_users, num_items))
    if len(data) == 0:
        return user_item_matrix

    def _positions(raw_ids, id_to_position):
        # Series.map leaves NaN where an id has no entry in the mapping;
        # surface that as the KeyError a plain dict lookup would raise.
        positions = raw_ids.map(id_to_position)
        unknown = positions.isna().to_numpy()
        if unknown.any():
            raise KeyError(raw_ids.to_numpy()[unknown][0])
        return positions.to_numpy(dtype=np.intp)

    # Only the three relevant columns are touched, instead of materialising a
    # tuple over every feature column for every row.
    cells = pd.DataFrame({
        'row': _positions(data['userId'], usr_ids_dict),
        'col': _positions(data['itemId'], item_ids_dict),
        'rating': data['rating'].to_numpy(),
    })
    # NumPy does not guarantee which value survives when the same cell is
    # assigned several times in one fancy-index assignment, so keep only the
    # last occurrence explicitly.
    cells = cells.drop_duplicates(subset=['row', 'col'], keep='last')

    user_item_matrix[cells['row'].to_numpy(), cells['col'].to_numpy()] = cells['rating'].to_numpy()

    return user_item_matrix

## ml-20m

In [54]:
# Index every user and item that appears in the full ml-20m table (train and
# test rows alike), in order of first appearance.
ml_user_ids = pd.unique(ml_20m_df['userId']).tolist()
ml_item_ids = pd.unique(ml_20m_df['itemId']).tolist()
ml_num_usrs = ml_20m_df['userId'].nunique()
ml_num_items = ml_20m_df['itemId'].nunique()

# raw id -> row / column position in the rating matrix
ml_user_ids_dict = {user_id: position for position, user_id in enumerate(ml_user_ids)}
ml_item_ids_dict = {item_id: position for position, item_id in enumerate(ml_item_ids)}

# Only the training rows are written into the matrix.
ml_20m_train_matrix = convert_ratings_to_matrix(
    ml_20m_df_train,
    ml_num_usrs,
    ml_num_items,
    ml_user_ids_dict,
    ml_item_ids_dict,
)

ml_20m_train_matrix

array([[3., 2., 2., ..., 0., 0., 0.],
       [5., 5., 2., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [55]:
# Label the dense matrix: rows are raw user ids, columns are raw item ids.
ml_20m_train_matrix_df = pd.DataFrame(ml_20m_train_matrix, columns=ml_item_ids, index=ml_user_ids)

In [56]:
# Preview the first rows of the labelled ml-20m training matrix.
ml_20m_train_matrix_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,131239,131241,131243,131248,131250,131252,131254,131256,131258,131260
116,3.0,2.0,2.0,0.0,0.0,1.5,0.0,0.0,1.5,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
156,5.0,5.0,2.0,3.0,3.0,4.0,4.0,0.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298,4.0,3.0,3.0,0.0,3.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
359,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Persist the labelled ml-20m training matrix.
ml_20m_train_matrix_df.to_parquet('data/ml-20m/ml_20m_train_matrix_df.parquet')

## Book-Crossing

In [58]:
# Index every user and item that appears in the full Book-Crossing table
# (train and test rows alike), in order of first appearance.
bc_user_ids = pd.unique(book_crossing_df['userId']).tolist()
bc_item_ids = pd.unique(book_crossing_df['itemId']).tolist()
bc_num_usrs = book_crossing_df['userId'].nunique()
bc_num_items = book_crossing_df['itemId'].nunique()

# raw id -> row / column position in the rating matrix
bc_user_ids_dict = {user_id: position for position, user_id in enumerate(bc_user_ids)}
bc_item_ids_dict = {item_id: position for position, item_id in enumerate(bc_item_ids)}

# Only the training rows are written into the matrix.
book_crossing_train_matrix = convert_ratings_to_matrix(
    book_crossing_df_train,
    bc_num_usrs,
    bc_num_items,
    bc_user_ids_dict,
    bc_item_ids_dict,
)

book_crossing_train_matrix

array([[0. , 0. , 4.5, ..., 0. , 0. , 0. ],
       [4.5, 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [59]:
# Label the dense matrix: rows are raw user ids, columns are raw item ids.
book_crossing_train_matrix_df = pd.DataFrame(book_crossing_train_matrix, columns=bc_item_ids, index=bc_user_ids)

In [60]:
# Preview the first rows of the labelled Book-Crossing training matrix.
book_crossing_train_matrix_df.head()

Unnamed: 0,1,3,5,18,19,20,26,27,28,29,...,255055,256010,256061,256500,261322,262275,262696,262753,263383,269554
11676,0.0,0.0,4.5,4.5,4.0,3.5,3.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
116866,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
219008,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35704,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# Persist the labelled Book-Crossing training matrix.
book_crossing_train_matrix_df.to_parquet('data/book_crossing/book_crossing_train_matrix_df.parquet')

# Calculate Similarity

In [90]:
def get_userCF_sim_batch(rating_matrix, user_ids, output_file=None, batch_size=1000):
    """Compute the user-user cosine similarity matrix in row batches.

    Parameters
    ----------
    rating_matrix : array-like of shape (n_users, n_items)
        Rating matrix whose i-th row belongs to ``user_ids[i]``.
    user_ids : sequence
        Labels used for both the index and the columns of the result.
    output_file : str or path-like, optional
        When given, the result is also written to this path as parquet. The
        parent directory is created if it does not exist.
    batch_size : int, default 1000
        Number of users whose similarities are computed per
        ``cosine_similarity`` call.

    Returns
    -------
    pandas.DataFrame
        (n_users, n_users) similarity matrix. Rows of a batch that failed with
        ``ValueError``, and all rows after a ``MemoryError``, are left at 0.
        The frame is returned even if writing ``output_file`` fails.
    """
    user_item_matrix = rating_matrix
    users = user_ids
    num_users = len(users)

    similarity_matrix = np.zeros((num_users, num_users))

    # Ceil division: a trailing partial batch still counts as one batch,
    # otherwise the progress bar runs past its total.
    num_batches = math.ceil(num_users / batch_size)

    with tqdm(total=num_batches, desc="Calculating batch user similarities", unit="batch") as pbar:
        for start in range(0, num_users, batch_size):
            end = min(start + batch_size, num_users)
            batch_users = user_item_matrix[start:end, :]

            try:
                batch_similarity = cosine_similarity(batch_users, user_item_matrix)
            except ValueError as ve:
                # Best effort: report and skip this batch, its rows stay 0.
                print(f"ValueError in cosine_similarity for batch {start}-{end}: {ve}")
                continue
            except MemoryError as me:
                # Stop early; remaining rows stay 0.
                print(f"MemoryError: {me}")
                break

            similarity_matrix[start:end, :] = batch_similarity

            pbar.update(1)

    similarity_df = pd.DataFrame(similarity_matrix, index=users, columns=users)

    if output_file is not None:
        try:
            if isinstance(output_file, (str, os.PathLike)):
                parent_dir = os.path.dirname(output_file)
                # dirname is '' for a bare file name; makedirs('') would fail.
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
            similarity_df.to_parquet(output_file)
            print(f"User similarity matrix has been saved to {output_file}")
        except OSError as e:
            # Saving failed, but the computed matrix is still handed back below.
            print(f"OSError: Failed to save user similarity matrix to {output_file}: {e}")

    return similarity_df

In [91]:
def get_itemCF_sim_batch(rating_matrix, item_ids, output_file=None, batch_size=1000):
    """Compute the item-item cosine similarity matrix in row batches.

    Parameters
    ----------
    rating_matrix : array-like of shape (n_users, n_items)
        Rating matrix whose j-th column belongs to ``item_ids[j]``; it is
        transposed internally so that items become rows.
    item_ids : sequence
        Labels used for both the index and the columns of the result.
    output_file : str or path-like, optional
        When given, the result is also written to this path as parquet. The
        parent directory is created if it does not exist.
    batch_size : int, default 1000
        Number of items whose similarities are computed per
        ``cosine_similarity`` call.

    Returns
    -------
    pandas.DataFrame
        (n_items, n_items) similarity matrix. Rows of a batch that failed with
        ``ValueError``, and all rows after a ``MemoryError``, are left at 0.
        The frame is returned even if writing ``output_file`` fails.
    """
    item_user_matrix = rating_matrix.T
    items = item_ids
    num_items = len(items)

    similarity_matrix = np.zeros((num_items, num_items))

    # Ceil division: a trailing partial batch still counts as one batch,
    # otherwise the progress bar runs past its total.
    num_batches = math.ceil(num_items / batch_size)

    with tqdm(total=num_batches, desc="Calculating batch item similarities", unit="batch") as pbar:
        for start in range(0, num_items, batch_size):
            end = min(start + batch_size, num_items)
            batch_items = item_user_matrix[start:end, :]

            try:
                batch_similarity = cosine_similarity(batch_items, item_user_matrix)
            except ValueError as ve:
                # Best effort: report and skip this batch, its rows stay 0.
                print(f"ValueError in cosine_similarity for batch {start}-{end}: {ve}")
                continue
            except MemoryError as me:
                # Stop early; remaining rows stay 0.
                print(f"MemoryError: {me}")
                break

            similarity_matrix[start:end, :] = batch_similarity

            pbar.update(1)

    similarity_df = pd.DataFrame(similarity_matrix, index=items, columns=items)

    if output_file is not None:
        try:
            if isinstance(output_file, (str, os.PathLike)):
                parent_dir = os.path.dirname(output_file)
                # dirname is '' for a bare file name; makedirs('') would fail.
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
            similarity_df.to_parquet(output_file)
            print(f"Item similarity matrix has been saved to {output_file}")
        except OSError as e:
            # Saving failed, but the computed matrix is still handed back below.
            print(f"OSError: Failed to save item similarity matrix to {output_file}: {e}")

    return similarity_df

## Book-Crossing

In [92]:
# User-user similarity for Book-Crossing. `bc_user_ids` is the index the matrix
# DataFrame was built with, so the labels line up with the rows of `.values`.
output_file_path = 'data/book_crossing/book_crossing_user_sim.parquet'
book_crossing_user_sim = get_userCF_sim_batch(book_crossing_train_matrix_df.values, bc_user_ids, output_file=output_file_path, batch_size=1000)

Calculating batch user similarities: 3batch [00:02,  1.18batch/s]                    


User similarity matrix has been saved to data/book_crossing/book_crossing_user_sim.parquet


In [94]:
# Sanity check: the user similarity matrix should be square.
book_crossing_user_sim.shape

(2996, 2996)

In [95]:
# Item-item similarity for Book-Crossing. `bc_item_ids` are the columns the
# matrix DataFrame was built with, so the labels line up with the columns of `.values`.
output_file_path = 'data/book_crossing/book_crossing_item_sim.parquet'
book_crossing_item_sim = get_itemCF_sim_batch(book_crossing_train_matrix_df.values, bc_item_ids, output_file=output_file_path, batch_size=1000)

Calculating batch item similarities: 100%|██████████| 25/25 [00:20<00:00,  1.23batch/s]


Item similarity matrix has been saved to data/book_crossing/book_crossing_item_sim.parquet


In [98]:
# Sanity check: the item similarity matrix should be square.
book_crossing_item_sim.shape

(25000, 25000)

## ml-20m

In [96]:
# User-user similarity for ml-20m. `ml_user_ids` is the index the matrix
# DataFrame was built with, so the labels line up with the rows of `.values`.
output_file_path = 'data/ml-20m/ml_20m_user_sim.parquet'
ml_20m_user_sim = get_userCF_sim_batch(ml_20m_train_matrix_df.values, ml_user_ids, output_file=output_file_path, batch_size=1000)

Calculating batch user similarities: 100%|██████████| 3/3 [00:02<00:00,  1.15batch/s]


User similarity matrix has been saved to data/ml-20m/ml_20m_user_sim.parquet


In [99]:
# Sanity check: the user similarity matrix should be square.
ml_20m_user_sim.shape

(3000, 3000)

In [97]:
# Item-item similarity for ml-20m. `ml_item_ids` are the columns the matrix
# DataFrame was built with, so the labels line up with the columns of `.values`.
output_file_path = 'data/ml-20m/ml_20m_item_sim.parquet'
ml_20m_item_sim = get_itemCF_sim_batch(ml_20m_train_matrix_df.values, ml_item_ids, output_file=output_file_path, batch_size=1000)

Calculating batch item similarities: 26batch [00:23,  1.11batch/s]                     


Item similarity matrix has been saved to data/ml-20m/ml_20m_item_sim.parquet


In [100]:
# Sanity check: the item similarity matrix should be square.
ml_20m_item_sim.shape

(25511, 25511)

## Check dimensions

In [101]:
# For a (users, items) rating matrix, expect (users, users) and (items, items) similarities.
ml_20m_train_matrix_df.shape, ml_20m_user_sim.shape, ml_20m_item_sim.shape

((3000, 25511), (3000, 3000), (25511, 25511))

In [102]:
# For a (users, items) rating matrix, expect (users, users) and (items, items) similarities.
book_crossing_train_matrix_df.shape, book_crossing_user_sim.shape, book_crossing_item_sim.shape

((2996, 25000), (2996, 2996), (25000, 25000))

In [104]:
# Compare the item labels of the rating matrix with those of the item similarity matrix.
ml_20m_train_matrix_df.columns, ml_20m_item_sim.columns

(Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
            10,
        ...
        131239, 131241, 131243, 131248, 131250, 131252, 131254, 131256, 131258,
        131260],
       dtype='int64', length=25511),
 Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
            10,
        ...
        131239, 131241, 131243, 131248, 131250, 131252, 131254, 131256, 131258,
        131260],
       dtype='int64', length=25511))