# Install libraries

In [None]:
# Install the notebook's dependencies.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
# TODO: pin the package versions once a known-good combination is recorded.
%pip install --upgrade pip setuptools
%pip install recommenders
%pip install papermill
%pip install scrapbook
%pip install --upgrade tf_slim

# Download Dataset

In [None]:
from google.colab import files

# Bind the result to a name instead of leaving the call as the cell's last
# expression: upload() returns {filename: file bytes}, and echoing that dict
# writes the contents of kaggle.json (username and API key) into the notebook output.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# -p keeps the cell re-runnable: plain mkdir fails when ~/.kaggle already exists.
!mkdir -p ~/.kaggle

In [None]:
# Put the uploaded API token where the kaggle CLI looks for credentials,
# and make it readable/writable by the owner only.
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the three competition files used below (each arrives as <name>.csv.zip).
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f articles.csv
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f customers.csv
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations -f transactions_train.csv

Downloading articles.csv.zip to /content
  0% 0.00/4.26M [00:00<?, ?B/s]
100% 4.26M/4.26M [00:00<00:00, 146MB/s]
Downloading customers.csv.zip to /content
 82% 80.0M/97.9M [00:00<00:00, 218MB/s]
100% 97.9M/97.9M [00:00<00:00, 141MB/s]
Downloading transactions_train.csv.zip to /content
 97% 569M/584M [00:03<00:00, 150MB/s]
100% 584M/584M [00:03<00:00, 165MB/s]


In [None]:
# Extract the downloaded archives into H-and-M/.
# `mkdir -p` and `unzip -o` make the cell safe to re-run: plain mkdir fails when
# the directory exists, and unzip would otherwise stop to ask before overwriting.
!mkdir -p H-and-M
!unzip -o articles.csv.zip -d H-and-M
!unzip -o customers.csv.zip -d H-and-M
!unzip -o transactions_train.csv.zip -d H-and-M

Archive:  articles.csv.zip
  inflating: H-and-M/articles.csv    
Archive:  customers.csv.zip
  inflating: H-and-M/customers.csv   
Archive:  transactions_train.csv.zip
  inflating: H-and-M/transactions_train.csv  


# Filter Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the full transaction log. No dtypes or date parsing are requested, so
# t_dat stays a plain string column.
transactions = pd.read_csv("H-and-M/transactions_train.csv")

In [None]:
# Preview the first rows (earliest transactions).
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [None]:
# Preview the last rows to find the most recent transaction date.
transactions.tail()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,833459002,0.006763,1
31788323,2020-09-22,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20...,898573003,0.033881,2


Last Day 2020-09-22

Last Week 2020-09-15

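Select the last two months of data: August 2020 is used to train the model and September 2020 (through the last day, 2020-09-22) is held out for testing.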

In [None]:
# Time-based split. t_dat holds yyyy-mm-dd strings, so plain string comparison
# orders them chronologically.
# Train: 2020-08-01 (inclusive) up to 2020-09-01 (exclusive).
df_train_orig = transactions[(transactions['t_dat'] >= '2020-08-01') & (transactions['t_dat'] < '2020-09-01')]
# Test: everything from 2020-09-01 onwards.
df_test = transactions[transactions['t_dat'] >='2020-09-01']

In [None]:
def filter_min_items(df, min_items):
  """Keep only the rows of customers that have at least ``min_items`` rows in ``df``.

  Prints the number of distinct customers before and after the filter.

  Args:
    df: DataFrame with a ``customer_id`` column.
    min_items: Minimum number of rows a customer needs in ``df`` to be kept.

  Returns:
    A new DataFrame whose first column is ``customer_id``, holding the rows of
    the kept customers, one customer after another in the sorted order
    produced by ``np.unique``.
  """
  unique_customers, row_counts = np.unique(df['customer_id'].values, return_counts=True)
  print("Original customers:", len(unique_customers))

  kept_customers = unique_customers[row_counts >= min_items]
  print("Customers after filtering:", len(kept_customers))

  # Indexing by customer_id lets .loc pull all rows of every kept customer at once.
  return df.set_index('customer_id').loc[kept_customers].reset_index()

def create_rating_table(df):
  """Collapse transactions into one row per (customer, article) pair with rating 1.

  The ``t_dat``, ``price`` and ``sales_channel_id`` columns are dropped, the
  remaining rows are grouped by ``customer_id`` and ``article_id`` (any other
  column left over is replaced by its per-pair count), a constant ``rating``
  column equal to 1 is added, and the id columns are renamed to ``userID`` and
  ``itemID``.

  Args:
    df: Transaction DataFrame containing at least the five columns named above.

  Returns:
    A new DataFrame with columns ``userID``, ``itemID`` and ``rating``.
  """
  pairs = (
      df.drop(columns=["t_dat", "price", "sales_channel_id"])
        .groupby(["customer_id", "article_id"], as_index=False)
        .count()
  )
  # Every observed pair is treated as a positive interaction.
  pairs["rating"] = 1
  return pairs.rename(columns={'customer_id': 'userID', 'article_id': 'itemID'})

def filter_by_customer(train, test):
  """Restrict both splits to their shared customers and build rating tables.

  Customers present in both ``train`` and ``test`` are kept, a ``LabelEncoder``
  is fitted on them, each split is turned into a rating table with
  ``create_rating_table``, and the ``userID`` column of each table is replaced
  by the encoder's integer labels. The number of distinct train customers
  before and after the restriction is printed.

  Args:
    train: Transaction DataFrame with a ``customer_id`` column.
    test: Transaction DataFrame with a ``customer_id`` column.

  Returns:
    Tuple ``(train_ratings, test_ratings, le)`` where ``le`` is the fitted
    encoder that maps the integer ``userID`` values back to customer ids.
  """
  train_customers = np.unique(train['customer_id'].values)
  shared_customers = np.intersect1d(train_customers, np.unique(test['customer_id'].values))

  le = LabelEncoder()
  le.fit(shared_customers)

  print("Original customers:", len(train_customers))
  train = train.set_index('customer_id').loc[shared_customers].reset_index()
  print("Customers after filtering:", len(np.unique(train['customer_id'].values)))
  test = test.set_index('customer_id').loc[shared_customers].reset_index()

  # Same post-processing for both splits: rating table, then integer user ids.
  rating_tables = []
  for split in (train, test):
    table = create_rating_table(split)
    table['userID'] = le.transform(table['userID'])
    rating_tables.append(table)

  return rating_tables[0], rating_tables[1], le

In [None]:
# Keep only customers with at least `min_items` transaction rows in the training window.
min_items = 10
df_train = filter_min_items(df_train_orig, min_items)
df_train.head()

Original customers: 254163
Customers after filtering: 29978


Unnamed: 0,customer_id,t_dat,article_id,price,sales_channel_id
0,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,2020-08-14,884319008,0.022695,2
1,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,2020-08-14,921226001,0.015136,2
2,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,2020-08-14,706016001,0.030254,2
3,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,2020-08-14,881244001,0.030254,2
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,2020-08-17,903326005,0.025407,2


In [None]:
# Build the rating tables for the customers present in both splits; `le` maps the
# integer userID values back to the original customer ids.
train, test, le = filter_by_customer(df_train, df_test)

print("\nTest dataset")
print("Before filtering:", len(np.unique(test['userID'].values)))
# Filter out any users or items in the test set that do not appear in the training set.
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]
# Number of distinct test users left after both filters.
print("After filtering:", len(np.unique(test['userID'].values)))

Original customers: 29978
Customers after filtering: 13462

Test dataset
Before filtering: 13462
After filtering: 12944


In [None]:
# Preview the training rating table (userID is the label-encoded customer id).
train.head()

Unnamed: 0,userID,itemID,rating
0,0,572998001,1
1,0,572998007,1
2,0,684824006,1
3,0,713253003,1
4,0,808685002,1


In [None]:
# Preview the filtered test rating table; its index has gaps left by the filtering above.
test.head()

Unnamed: 0,userID,itemID,rating
1,0,734592001,1
4,0,865929003,1
5,0,888024005,1
6,0,909869004,1
7,0,923134003,1


In [None]:
# Persist both rating tables for the file-based NCF dataset loader.
# index=False keeps the DataFrame index (which has gaps for `test` after the
# filtering) from being written as an extra unnamed column.
train.to_csv("Train.csv", index=False)
test.to_csv("Test.csv", index=False)

# Train Model NCF

In [None]:
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.utils.constants import SEED as DEFAULT_SEED

In [None]:
# Initial parameters for the NCF model
TOP_K = 12              # number of recommendations kept per user
EPOCHS = 5              # passed to NCF as n_epochs
BATCH_SIZE = 64
SEED = DEFAULT_SEED     # seed constant imported from recommenders.utils.constants
LEARNING_RATE = 1e-3

In [None]:
# Read the rating tables from the same relative paths they were written to
# ("Train.csv" / "Test.csv"), instead of the absolute Colab path /content/...,
# which only matches when the working directory happens to be /content.
data = NCFDataset(train_file="Train.csv", test_file="Test.csv", seed=SEED)

INFO:recommenders.models.ncf.dataset:Indexing /content/Train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing /content/Test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing /content/Test_full.csv ...


In [None]:
# Configure the NeuMF variant of NCF; user/item counts come from the indexed dataset.
model = NCF(
    model_type="NeuMF",
    n_users=data.n_users,
    n_items=data.n_items,
    n_factors=4,
    layer_sizes=[16, 8, 4],
    learning_rate=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    n_epochs=EPOCHS,
    seed=SEED,
    verbose=1,
)



In [None]:
from recommenders.utils.timer import Timer
# Train the NCF model and measure the wall-clock time of the fit.
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))
# Saving the trained model is currently disabled:
#model.save(dir_name='/content/NCF')

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 1 [355.31s]: train_loss = 0.410645 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 2 [276.16s]: train_loss = 0.375584 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 3 [277.06s]: train_loss = 0.350254 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 4 [279.34s]: train_loss = 0.335448 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 5 [277.75s]: train_loss = 0.324278 


Took 1465.6354239039993 seconds for training.


In [None]:
# Re-display the test ratings for reference before generating predictions.
test.head()

Unnamed: 0,userID,itemID,rating
1,0,734592001,1
4,0,865929003,1
5,0,888024005,1
6,0,909869004,1
7,0,923134003,1


In [None]:
def predict_k_user(model, userID, train, k):
  """Score every distinct training item for one user, best score first.

  Args:
    model: Object exposing ``predict(user, item)``.
    userID: The user to score items for.
    train: DataFrame with an ``itemID`` column that defines the candidate items.
    k: Accepted but currently not applied; every scored item is returned.

  Returns:
    DataFrame with columns ``userID``, ``itemID`` and ``prediction``, sorted by
    ``prediction`` in descending order.
  """
  scored = [
      [userID, item, model.predict(userID, item)]
      for item in train.itemID.unique()
  ]
  ranking = pd.DataFrame(scored, columns=['userID', 'itemID', 'prediction'])
  return ranking.sort_values(by='prediction', ascending=False)

def predict_perro(model, users, train, topK):
  """Score every unseen item for each user and keep each user's ``topK`` best.

  Args:
    model: Object exposing ``predict(user, item)`` that returns a numeric score.
    users: Iterable of user ids, encoded like ``train['userID']``.
    train: DataFrame with ``userID`` and ``itemID`` columns. Its distinct items
      are the candidates; items a user already has in ``train`` are skipped.
    topK: Maximum number of recommendations to keep per user.

  Returns:
    DataFrame with columns ``userID``, ``itemID``, ``prediction`` and ``rank``
    (1 = highest score), holding at most ``topK`` rows per user. The frame is
    empty when there is nothing to score.
  """
  items = train.itemID.unique()
  predictions = []
  for user in users:
    # A set makes the "already seen" check constant-time per candidate item.
    seen_items = set(train.loc[train['userID'] == user, 'itemID'])
    predictions.extend(
        [user, item, model.predict(user, item)]
        for item in items
        if item not in seen_items
    )

  predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
  if predictions.empty:
    # Nothing to rank: no users were given, or every item was already seen.
    return predictions.assign(rank=pd.Series(dtype='float64'))

  # method="first" breaks score ties by order of appearance, so a user never gets
  # more than topK rows (a dense rank would keep every item of a tied score).
  predictions["rank"] = predictions.groupby("userID")["prediction"].rank(method="first", ascending=False)
  return predictions[predictions['rank'] <= topK]

In [None]:
# Score only a small sample of users: every (user, unseen item) pair is predicted one by one.
n_users = 50

# The first n_users distinct users in train's row order.
userID = train['userID'].unique()[0:n_users]
predictions = predict_perro(model, userID, train, TOP_K)
predictions

Unnamed: 0,userID,itemID,prediction,rank
28,0,863595006,0.864354,7.0
82,0,610776002,0.842183,12.0
163,0,916468003,0.891823,1.0
315,0,865929003,0.842689,11.0
381,0,896152002,0.870590,4.0
...,...,...,...,...
979299,49,933838002,0.944550,6.0
979468,49,794468001,0.937441,10.0
979600,49,827968004,0.942302,8.0
979849,49,926387001,0.932740,11.0


In [None]:
# Restrict the test ratings to the sampled users.
# A boolean mask is used instead of .loc on a userID index because .loc raises
# KeyError for any sampled user that has no test rows left after the filtering
# of the test set; the mask simply yields no rows for such a user.
test_temp = test[test['userID'].isin(userID)].reset_index(drop=True)
test_temp.head()

Unnamed: 0,userID,itemID,rating
0,0,734592001,1
1,0,865929003,1
2,0,888024005,1
3,0,909869004,1
4,0,923134003,1


In [None]:
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

# Ranking metrics at TOP_K for the NCF recommendations, computed against the
# test ratings of the sampled users only.
eval_map = map_at_k(test_temp, predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test_temp, predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test_temp, predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.008121
Precision@K:	0.011667
Recall@K:	0.035690


## Kaggle submission

In [None]:
# Decode the label-encoded user ids back to the original customer ids.
# The decoded table gets its own name: overwriting predictions['userID'] in place
# made this cell fail on a second run (the ids are no longer encoder labels) and
# broke re-running the evaluation cell, which needs the integer ids.
submission_ncf = predictions.assign(
    userID=le.inverse_transform(predictions['userID'].values)
).reset_index(drop=True)
submission_ncf.sample(5)

Unnamed: 0,userID,itemID,prediction,rank
48,001c1f8d70782f450524d3b3f404474dbd4a7d0d2ad78a...,817361007,0.934947,5.0
73,00250644a9628fb8c4aebabf555d3caabe44c99c74fbcf...,884319006,0.814418,11.0
206,005e89af159f63c1280904d911e7f872683ab6223ecf33...,916468003,0.952398,1.0
596,00f1c3895749444bd89d21b2892f26e87efbe93464d068...,794468001,0.937441,10.0
236,0064cd1ee810d4caabd1182a8f177479b82b18961bd76b...,741040001,0.947676,10.0


# Train Model BiVAE

In [None]:
import sys
import os
import torch
import cornac
import papermill as pm
import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

# Record the interpreter and library versions this run used.
print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
PyTorch version: 1.11.0+cu113
Cornac version: 1.14.2


In [None]:
# NOTE: TOP_K, BATCH_SIZE and LEARNING_RATE rebind names already set in the NCF
# parameter cell; after this cell runs, re-running NCF cells picks up these values.

# top k items to recommend
TOP_K = 12

# Model parameters
LATENT_DIM = 50
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 5
BATCH_SIZE = 128
LEARNING_RATE = 0.001

In [None]:
# Build the cornac dataset from the training ratings; each row is passed as a
# (userID, itemID, rating) tuple, in train's row order.
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

print('Number of  customers: {}'.format(train_set.num_users))
print('Number of articles: {}'.format(train_set.num_items))

Number of  customers: 13462
Number of articles: 19962


In [None]:
# Configure the BiVAE collaborative-filtering model from the parameters above;
# the GPU is used only when torch reports one is available.
bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

In [None]:
# Train BiVAE and time the fit. The Timer object itself is formatted here
# (the NCF cell formats its .interval attribute instead).
with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/5 [00:00<?, ?it/s]

Took 30.5439 seconds for training.


In [None]:
def predict_gato(bivae, users, train, k):
  """Score every unseen item for each user with ``bivae`` and keep the top ``k``.

  The model is queried by index, not by id. The index of a user (item) is taken
  to be its position among the distinct users (items) of ``train`` in order of
  first appearance, so any subset of training users can be requested, not only
  the leading ones.
  NOTE(review): this assumes ``bivae`` was fitted on a dataset built from
  ``train`` in row order, so that the model's internal indices equal those
  positions. Confirm against the cell that builds the cornac dataset.

  Args:
    bivae: Fitted model exposing ``rank(user_idx=..., item_indices=...)`` that
      returns a pair whose second element holds one score per requested item
      index, in the same order as ``item_indices``.
    users: Iterable of user ids, encoded like ``train['userID']``.
    train: DataFrame with ``userID`` and ``itemID`` columns. Its distinct items
      are the candidates; items a user already has in ``train`` are skipped.
    k: Maximum number of recommendations to keep per user.

  Returns:
    DataFrame with columns ``userID``, ``itemID``, ``prediction`` and ``rank``
    (1 = highest score), holding at most ``k`` rows per user. The frame is
    empty when there is nothing to score.

  Raises:
    ValueError: If a requested user does not appear in ``train``.
  """
  items = train.itemID.unique()
  item_idxs = range(len(items))
  # Position of every training user in first-appearance order.
  user_positions = {user: pos for pos, user in enumerate(train.userID.unique())}

  predictions = []
  for user in users:
    if user not in user_positions:
      raise ValueError("User {!r} does not appear in the training data".format(user))
    # A set makes the "already seen" check constant-time per candidate item.
    seen_items = set(train.loc[train['userID'] == user, 'itemID'])
    _, scores = bivae.rank(user_idx=user_positions[user], item_indices=item_idxs)
    predictions.extend(
        [user, item, score]
        for item, score in zip(items, scores)
        if item not in seen_items
    )

  predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
  if predictions.empty:
    # Nothing to rank: no users were given, or every item was already seen.
    return predictions.assign(rank=pd.Series(dtype='float64'))

  # method="first" breaks score ties by order of appearance, so a user never gets
  # more than k rows (a dense rank would keep every item of a tied score).
  predictions["rank"] = predictions.groupby("userID")["prediction"].rank(method="first", ascending=False)
  return predictions[predictions['rank'] <= k]

In [None]:
# BiVAE recommendations for the same sampled users that were scored with NCF.
predictions_bivae = predict_gato(bivae, userID, train, TOP_K)
predictions_bivae

Unnamed: 0,userID,itemID,prediction,rank
326,0,456163085,0.999996,4.0
407,0,921906001,1.000000,1.0
518,0,882757003,0.999993,6.0
528,0,913340002,1.000000,2.0
570,0,850917001,0.999991,7.0
...,...,...,...,...
979181,49,921906003,0.999999,1.0
979416,49,924250001,0.999417,10.0
979499,49,894668002,0.999996,2.0
980119,49,891591001,0.999978,5.0


In [None]:
# Same ranking metrics as for NCF, now for the BiVAE recommendations.
# Note: this rebinds eval_map / eval_precision / eval_recall from the NCF evaluation.
eval_map = map_at_k(test_temp, predictions_bivae, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test_temp, predictions_bivae, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test_temp, predictions_bivae, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.004321
Precision@K:	0.006667
Recall@K:	0.030485


## Kaggle submission

In [None]:
# Decode the label-encoded user ids back to the original customer ids.
# The decoded table gets its own name: overwriting predictions_bivae['userID'] in
# place made this cell fail on a second run (the ids are no longer encoder labels)
# and broke re-running the evaluation cell, which needs the integer ids.
submission_bivae = predictions_bivae.assign(
    userID=le.inverse_transform(predictions_bivae['userID'].values)
).reset_index(drop=True)
submission_bivae.sample(5)

Unnamed: 0,userID,itemID,prediction,rank
508,00c160fd797be9c52491373b16ec194d4edb129c57266e...,850917001,0.999974,9.0
266,00754012108569f9c99871720111a2b50aa7b6ebebe2a4...,882757003,0.999657,11.0
582,00de0f442480958d4c86892efe81b5871beb79805dbdd0...,921906001,0.999873,3.0
476,00bce12709bdc1536ee25bc63e229d3746960b20cbbb75...,924250001,0.991922,11.0
400,009a85913aa6f503ed0d2b5ac02ab919d6565bbbaa934a...,850917001,0.992643,10.0
