https://datascienceub.medium.com/1-3-recommendation-vanilla-pipeline-for-recommender-systems-rs-ab7425b86d9

Link notebook: https://colab.research.google.com/drive/12Nu2lDlWhkdLZcSn__lTCHFjw7Bz_g85?usp=sharing

In [1]:
from torch.utils.data import DataLoader, Dataset
from IPython import embed
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
import csv
import os
import scipy.sparse as sp
from tqdm import tqdm, trange

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os.path as osp

In [3]:
DATA_FOLDER = "../../Datasets/ml-100k/"

In [4]:
if osp.isfile(osp.join(DATA_FOLDER, "interactions.csv")):
    df = pd.read_csv(osp.join(DATA_FOLDER, "interactions.csv"))
else:
    df = pd.read_csv(osp.join(DATA_FOLDER, "u.data"), delimiter="\t", 
    names=['user', 'item', 'rating', 'timestamp'])
    df['user'] = pd.Categorical(df['user']).codes
    df['item'] = pd.Categorical(df['item']).codes
    df['rating'] = 1

    df.to_csv(osp.join(DATA_FOLDER, "interactions.csv"))

In [5]:
print(df.head())
print("df shape: ", df.shape)
print("number of user: ", len(df['user'].unique()))

   Unnamed: 0  user  item  rating  timestamp
0           0   195   241       1  881250949
1           1   185   301       1  891717742
2           2    21   376       1  878887116
3           3   243    50       1  880606923
4           4   165   345       1  886397596
df shape:  (100000, 5)
number of user:  943


       Unnamed: 0  user  item  rating  timestamp
1103         1103    61   301       1  879371909
2051         2051    61   257       1  879371909
16623       16623    61   327       1  879371909
60513       60513    61   270       1  879371909
1461         1461    61   287       1  879371909
       Unnamed: 0  user  item  rating  timestamp
74098       74098    61   166       1  879376727
2621         2621    61   400       1  879376727
(98114, 5)
(1886, 5)


In [15]:
data = df[['user', 'item', 'timestamp']].astype('int32').to_numpy()
print(data)

[[      195       241 881250949]
 [      185       301 891717742]
 [       21       376 878887116]
 ...
 [      275      1089 874795795]
 [       12       224 882399156]
 [       11       202 879959583]]


In [17]:
# Convert id of product to after user
add_dims = 0
for i in range(data.shape[1]):
    # Reset index to zero
    data[:, i] -= np.min(data[:, i])
    # re index
    data[:, i] += add_dims
    add_dims = np.max(data[:, i]) + 1

num_user = np.max(data, axis=0) + 1
print(data)

[[     195     1184  6528864]
 [     185     1244 16995657]
 [      21     1319  4165031]
 ...
 [     275     2032    73710]
 [      12     1167  7677071]
 [      11     1145  5237498]]


In [20]:
# Split dataset with time leave one method
def split_train_test(data, num_user):
    train_data = []
    test_data = []
    columns = []
    columns.append("user")
    for col in range(data.shape[1] - 1):
        columns.append(str(col))
    df = pd.DataFrame(data, columns=columns)
    unique_users = df['user'].unique()
    for idx, user in enumerate(unique_users):
        user_data = data.loc[data['user'] == user]
        user_data = user_data.sort_values(by=['timestamp'], ascending=True)
        user_data_train = user_data[:-2]
        user_data_test = user_data[-2:]
        train_data.append(user_data_train)
        test_data.append(user_data_test)
        if idx == 10:
            print(user_data_train.head())
            print(user_data_test.head())
    return pd.concat(train_data).to_numpy(), pd.concat(test_data).to_numpy()

train_data, test_data = split_train_test(df, num_user)
print(train_data.shape)
print(test_data.shape)

       Unnamed: 0  user  item  rating  timestamp
1103         1103    61   301       1  879371909
2051         2051    61   257       1  879371909
16623       16623    61   327       1  879371909
60513       60513    61   270       1  879371909
1461         1461    61   287       1  879371909
       Unnamed: 0  user  item  rating  timestamp
74098       74098    61   166       1  879376727
2621         2621    61   400       1  879376727
(98114, 5)
(1886, 5)


In [None]:
def build_adj_matrix(n_feat, data):
    train_mat = sp.dok_matrix((n_feat, n_feat), dtype=np.float32)
    for row in __annotations__