In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
BASE_PATH = '../data/'

# Prerequisite: the parquet files read here must already have been produced by
# the Radek preprocessing notebook (see 02 FE/DataProcessingRadek.ipynb).
transactions, customers, articles = (
    pd.read_parquet(BASE_PATH + relative_path)
    for relative_path in (
        'parquet/transactions_train.parquet',
        'parquet/customers.parquet',
        'parquet/articles.parquet',
    )
)

# Preview the first rows of the articles table.
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,6290,22897,253,9,0,1010016,0,9,0,...,10,0,0,1,0,16,30,1002,2,10954
1,108775044,6290,22897,253,9,0,1010016,0,10,2,...,10,0,0,1,0,16,30,1002,2,10954
2,108775051,6290,45824,253,9,0,1010017,3,11,11,...,10,0,0,1,0,16,30,1002,2,10954
3,110065001,6291,11405,306,13,4,1010016,0,9,0,...,132,7,7,1,0,61,5,1017,4,10885
4,110065002,6291,11405,306,13,4,1010016,0,10,2,...,132,7,7,1,0,61,5,1017,4,10885


Generate an x% sample of the data for training LightGCN

15% is the largest sample that worked for me; larger samples no longer fit into VRAM.

In [None]:
%%time

transactions.to_parquet(BASE_PATH + 'parquet/transactions_train.parquet')
customers.to_parquet(BASE_PATH + 'parquet/customers.parquet')
articles.to_parquet(BASE_PATH + 'parquet/articles.parquet')

# let's create a 15% sample of all the data to speed up dev
sample = 0.15
customers_sample = customers.sample(frac=sample, replace=False)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
articles_sample_ids = set(transactions_sample["article_id"])
articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

test_week = 104

train = transactions_sample[transactions_sample.week != test_week]
test = transactions_sample[transactions_sample.week == test_week]

customers_sample.to_parquet(BASE_PATH + f'parquet/customers_sample_{sample}.parquet', index=False)
train.to_parquet(BASE_PATH + f'parquet/transactions_train_sample_{sample}.parquet', index=False)
test.to_parquet(BASE_PATH + f'parquet/transactions_test_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(BASE_PATH + f'parquet/articles_sample_{sample}.parquet', index=False)

In [None]:
!cp ../data/parquet/transactions_train_sample_0.15.parquet ../data/HMDataset/raw/train.parquet
!cp ../data/parquet/transactions_test_sample_0.15.parquet ../data/HMDataset/raw/test.parquet
!cp ../data/parquet/customers_sample_0.15.parquet ../data/HMDataset/raw/customers.parquet
!cp ../data/parquet/articles_sample_0.15.parquet ../data/HMDataset/raw/articles.parquet

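The following code belongs to an experiment that I ran.
In the experiment I made sure that every user in the training dataset was also in the test dataset and vice versa.

This didn't improve the results, so I didn't use it in the end. Note that these cells write to the same sample files as the sampling cell above, so running them overwrites that output.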

In [None]:
BASE_PATH = '../data/'

# Prerequisite: the parquet files read here must already have been produced by
# the Radek preprocessing notebook (see 02 FE/DataProcessingRadek.ipynb).
transactions, customers, articles = (
    pd.read_parquet(BASE_PATH + relative_path)
    for relative_path in (
        'parquet/transactions_train.parquet',
        'parquet/customers.parquet',
        'parquet/articles.parquet',
    )
)

# Preview the first rows of the articles table.
articles.head()

In [3]:
# Week held out as the test set; all remaining weeks form the training set.
test_week = 104

is_test_week = transactions['week'] == test_week
train = transactions.loc[~is_test_week]
test = transactions.loc[is_test_week]

In [4]:
# Order the training transactions by customer, then count rows per customer.
train = train.sort_values(by=['customer_id'], ascending=True)
train_count = (
    train
    .groupby(['customer_id'])
    .size()
    .reset_index(name='count')
)

# Ids of all customers with at least one training transaction.
has_train_rows = train_count['count'] > 0
train_customers = set(train_count.loc[has_train_rows, 'customer_id'])

In [5]:
# Order the test transactions by customer, then count rows per customer.
test = test.sort_values(by=['customer_id'], ascending=True)
test_count = (
    test
    .groupby(['customer_id'])
    .size()
    .reset_index(name='count')
)

# Ids of all customers with at least one test transaction.
has_test_rows = test_count['count'] > 0
test_customers = set(test_count.loc[has_test_rows, 'customer_id'])

In [6]:
# Keep only the customers that occur in both the train and the test customer sets.
sample_customers = train_customers & test_customers
in_both_splits = customers['customer_id'].isin(sample_customers)
customers = customers[in_both_splits]

# Number of shared customer ids vs. number of customer rows that were kept.
print(len(sample_customers), len(customers))

In [7]:
# Add a constant `purchased` column with value 1 to every training transaction.
train = train.assign(purchased=1)

In [16]:
# Draw a sample of the customers to speed up dev. `customers` is expected to
# have been restricted to the train/test intersection by the cell above.
sample = 0.15
test_frac = 0.3  # fraction of the sampled customers' test transactions that is kept
SEED = 42  # fixed seed so that a re-run draws the same customers and test rows
customers_sample = customers.sample(frac=sample, replace=False, random_state=SEED)
customers_sample_ids = set(customers_sample['customer_id'])
transactions_train_sample = train[train["customer_id"].isin(customers_sample_ids)]
transactions_test_sample = test[test["customer_id"].isin(customers_sample_ids)]

# Article ids are collected before the test transactions are sub-sampled, so
# the article sample covers every article bought by the sampled customers.
articles_sample_ids = set(transactions_train_sample["article_id"]).union(set(transactions_test_sample["article_id"]))
articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

transactions_test_sample = transactions_test_sample.sample(frac=test_frac, replace=False, random_state=SEED)

# Persist the sampled frames; the sample fraction is part of each file name.
customers_sample.to_parquet(BASE_PATH + f'parquet/customers_sample_{sample}.parquet', index=False)
transactions_train_sample.to_parquet(BASE_PATH + f'parquet/transactions_train_sample_{sample}.parquet', index=False)
transactions_test_sample.to_parquet(BASE_PATH + f'parquet/transactions_test_sample_{sample}.parquet', index=False)
articles_sample.to_parquet(BASE_PATH + f'parquet/articles_sample_{sample}.parquet', index=False)

In [17]:
!cp ../data/parquet/transactions_train_sample_0.15.parquet ../data/HMDataset/raw/train.parquet
!cp ../data/parquet/transactions_test_sample_0.15.parquet ../data/HMDataset/raw/test.parquet
!cp ../data/parquet/customers_sample_0.15.parquet ../data/HMDataset/raw/customers.parquet
!cp ../data/parquet/articles_sample_0.15.parquet ../data/HMDataset/raw/articles.parquet

# Next steps in training
Run the [train.py](./train.py) python file to train the model.

The data loading is done by the HMDataset class in [dataset.py](./Dataset.py). Between runs with different data (e.g. a different sample size) you need to delete the [processed data](../data/HMDataset/processed) in the data folder: the raw data is overwritten by the copy in the code block above, but as long as the files in `processed` exist, they are used instead of the new raw data.

# Prediction
Run the [predict.py](./predict.py) python file to generate the predictions.

# Submission and comparison to the baseline
Run the [evaluation notebook](./Evaluation.ipynb) to generate the submission file and compare it to the baseline on the metrics.