<font color='tomato'><font color="#CC3D3D"><p>
# Data Preprocessing for AE-based RecSys

### Global Setting & Imports

In [None]:
import numpy as np
import pandas as pd
import sys, os
from pathlib import Path
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

sys.path.append('/home/work/yhcho/2023-02/RS')
from msr.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_PREDICTION_COL,
)
from msr.split_utils import min_rating_filter_pandas
from msr.python_splitters import numpy_stratified_split
from msr.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from msr.sparse import AffinityMatrix
from msr.python_utils import binarize

In [None]:
HELDOUT_USERS = 600  # number of users held out for validation, and again for testing
SEED = 2023  # seed passed to np.random.seed and numpy_stratified_split below

### Data Loading

In [None]:
# Download the MovieLens 1M archive and extract it.
# The current working directory is used as the cache root, with 'movielens'
# as the cache sub-directory.
path = os.getcwd()
movielens_zipped_file = tf.keras.utils.get_file(
    fname=path + "/ml-1m.zip",
    origin="http://files.grouplens.org/datasets/movielens/ml-1m.zip",
    cache_dir=path,
    cache_subdir='movielens',
    extract=True,
)

In [None]:
# Load the ratings file using the project's default column names.
# The file has no header row and its fields are separated by '::'; a
# multi-character separator needs pandas' python parsing engine.
cols = [
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
]
file_name = Path(path + '/movielens/ml-1m/ratings.dat')
df = pd.read_csv(file_name, sep='::', header=None, names=cols, engine='python')
df

### Data Filtering

In [None]:
# Keep only "preferred" interactions, i.e. ratings strictly above 3.5.
# The rating column is addressed through DEFAULT_RATING_COL -- the same
# constant that named the column when the file was loaded -- so this cell
# keeps working if the constant's value is ever something other than 'rating'.
df_preferred = df[df[DEFAULT_RATING_COL] > 3.5]
print(df_preferred.shape)
# Remaining (non-preferred) interactions, kept in a separate frame.
df_low_rating = df[df[DEFAULT_RATING_COL] <= 3.5]

df_preferred.head(10)

In [None]:
# Two-stage filtering on the preferred interactions:
#   1. inner call: keep users with at least 5 interactions;
#   2. outer call: keep movies with at least 1 interaction among those users.
df = min_rating_filter_pandas(
    min_rating_filter_pandas(df_preferred, min_rating=5, filter_by="user"),
    min_rating=1,
    filter_by="item",
)

In [None]:
# Per-user and per-item interaction counts after filtering (one row per id)
usercount = df[[DEFAULT_USER_COL]].groupby(DEFAULT_USER_COL, as_index=False).size()
itemcount = df[[DEFAULT_ITEM_COL]].groupby(DEFAULT_ITEM_COL, as_index=False).size()

# Ratio of observed interactions to all possible (user, item) pairs.
# Note: this is the fraction of filled cells, reported under the label "sparsity".
sparsity = float(len(df)) / (len(usercount) * len(itemcount))

print(
    "After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)"
    % (len(df), len(usercount), len(itemcount), sparsity * 100)
)

### Data Splitting

In [None]:
# Shuffle the sorted user ids with a fixed seed so the user split is reproducible
np.random.seed(SEED)
unique_users = np.random.permutation(sorted(df[DEFAULT_USER_COL].unique()))

In [None]:
# Partition the shuffled users into train / validation / test groups.
# The last 2 * HELDOUT_USERS users are held out: the first half of them for
# validation, the second half for testing.
n_users = len(unique_users)
print("Number of unique users:", n_users)

val_start = n_users - HELDOUT_USERS * 2
test_start = n_users - HELDOUT_USERS

train_users = unique_users[:val_start]
print("\nNumber of training users:", len(train_users))

val_users = unique_users[val_start:test_start]
print("\nNumber of validation users:", len(val_users))

test_users = unique_users[test_start:]
print("\nNumber of test users:", len(test_users))

In [None]:
# Select each split's interactions by user membership.
# Every frame holds only the preferred interactions kept by the rating filter above.
user_ids = df[DEFAULT_USER_COL]

# Interactions of the training users
train_set = df[user_ids.isin(train_users)]
print("Number of training observations: ", len(train_set))

# Interactions of the validation users
val_set = df[user_ids.isin(val_users)]
print("\nNumber of validation observations: ", len(val_set))

# Interactions of the test users
test_set = df[user_ids.isin(test_users)]
print("\nNumber of test observations: ", len(test_set))

In [None]:
# Distinct movie ids that appear in the training interactions
unique_train_items = train_set[DEFAULT_ITEM_COL].unique()
print("Number of unique movies that rated in training set", len(unique_train_items))

In [None]:
# Drop validation interactions whose movie never appears in the training set
val_set = val_set[val_set[DEFAULT_ITEM_COL].isin(unique_train_items)]
print("Number of validation observations after filtering: ", len(val_set))

# Drop test interactions whose movie never appears in the training set
test_set = test_set[test_set[DEFAULT_ITEM_COL].isin(unique_train_items)]
print("\nNumber of test observations after filtering: ", len(test_set))

### Generating the Click Matrix (Binary Rating Matrix)

In [None]:
# Build one affinity matrix per split.
# All three builders receive the training item list as items_list, and only
# the first element returned by gen_affinity_matrix() is kept.
train_builder = AffinityMatrix(df=train_set, items_list=unique_train_items)
valid_builder = AffinityMatrix(df=val_set, items_list=unique_train_items)
test_builder = AffinityMatrix(df=test_set, items_list=unique_train_items)

train, _, _ = train_builder.gen_affinity_matrix()
valid, _, _ = valid_builder.gen_affinity_matrix()
test, _, _ = test_builder.gen_affinity_matrix()

In [None]:
# Split the test matrix into a "training" part (test_tr) and a "testing" part (test_te).
# NOTE(review): ratio=0.75 presumably sends 75% of each user's interactions to test_tr --
# confirm against msr.python_splitters.numpy_stratified_split.
test_tr, test_te = numpy_stratified_split(test, ratio=0.75, seed=SEED)

In [None]:
# Binarize train and validation matrices with a threshold of 3.5
train = binarize(a=train, threshold=3.5)
valid = binarize(a=valid, threshold=3.5)

# Binarize test data: training part
test_tr = binarize(a=test_tr, threshold=3.5)
# NOTE(review): the testing part (test_te) is NOT binarized in this cell, and no
# separate non-binary copy is created; test_te keeps the values produced by the split.
# Confirm that downstream evaluation (e.g. NDCG) expects non-binary test_te.

In [None]:
# Bundle the four matrices into one tuple and pickle them to a single file.
# Tuple order: (train, valid, test_tr, test_te).
import pickle

vae_data = (train, valid, test_tr, test_te)
with open('MultiVAE_data_v2.pkl', 'wb') as f:
    pickle.dump(vae_data, f)

<font color='tomato'><font color="#CC3D3D"><p>
# End