This code is from [https://github.com/radekosmulski/personalized_fashion_recs](https://github.com/radekosmulski/personalized_fashion_recs) with some preprocessing changes/options. Cells that either gave general comments on the competition or ran a code cell only to view a variable have been removed.

Added since lecture 6: simplified_colour_group_name (e.g. Light Red -> Red, Other Turquoise -> Turquoise)

In [76]:
# Selects how missing customer values are filled further down:
#   '-1'     -> FN, Active and age are filled with -1
#   'edited' -> FN and Active are filled with 0, age with the mean age
#               (truncated to an integer)
fillna_values = '-1'  # '-1' or 'edited'
assert fillna_values in ('-1', 'edited')

In [77]:
# EDITED: the original code fetched this through
# !wget https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py
# But Windows doesn't ship with wget, so I copy-pasted it.

import numpy as np

def apk(actual, predicted, k=12):
    """
    Average precision at k for a single pair of lists.

    Only the first k entries of ``predicted`` are scored. An entry counts
    as a hit when it occurs in ``actual`` and has not already appeared
    earlier in the scored predictions, so repeated predictions are only
    credited once.

    Parameters
    ----------
    actual : list
             The elements that should have been predicted (order is ignored)
    predicted : list
                The predicted elements, best guess first
    k : int, optional
        The maximum number of predicted elements to score

    Returns
    -------
    score : double
            Sum of precision-at-rank over all hits, divided by
            min(len(actual), k); 0.0 when ``actual`` is empty

    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # A duplicate of an earlier prediction earns no further credit.
        if item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

# EDITED: k=10 to k=12
def mapk(actual, predicted, k=12):
    """
    Mean average precision at k over paired lists of lists.

    ``actual`` and ``predicted`` are walked in lockstep; each pair is
    scored with ``apk`` and the scores are averaged.

    Parameters
    ----------
    actual : list
             One list of expected elements per case
             (order inside each list is ignored)
    predicted : list
                One list of predicted elements per case
                (order inside each list matters)
    k : int, optional
        The maximum number of predicted elements scored per case

    Returns
    -------
    score : double
            The mean of the per-case average precision at k

    """
    per_case_scores = []
    for expected, ranked in zip(actual, predicted):
        per_case_scores.append(apk(expected, ranked, k))
    return np.mean(per_case_scores)

In [78]:
# helper functions
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of each id are parsed.
    """
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of ``str`` as a base-16 integer."""
    last_16_digits = str[-16:]
    return int(last_16_digits, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article-id strings to 32-bit integers."""
    as_int32 = series.astype(np.int32)
    return as_int32

def article_id_int_to_str(series):
    """Render integer article ids as strings with a single '0' put in front."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column, the values whose count is strictly greater
    than ``min_examples``, in the order ``value_counts`` returns them.
    ``transform`` replaces each value by its position in that list; values
    that are not in the list (including missing values) get the code -1.

    Parameters
    ----------
    min_examples : int, optional
        A value becomes a category only if it occurs more than this many
        times in the fitted column.
    """
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of each column of ``X``.

        ``y`` is accepted and ignored so that ``fit_transform(X, y)`` works
        as it does for other scikit-learn transformers.
        """
        # Build a fresh list instead of appending to the existing one:
        # transform() looks categories up by column position, so stacking a
        # second fit on top of the first would keep using the stale entries.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a new DataFrame holding the category codes of ``X``.

        The result has the same column names as ``X`` but a default
        integer index, because it is built from plain code arrays.
        """
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [79]:
import pandas as pd

# article_id is read as text in both files that carry it
# (presumably to keep leading zeros until the explicit int conversion).
article_id_as_text = {"article_id": "str"}

transactions = pd.read_csv('../../data/transactions_train.csv', dtype=article_id_as_text)
customers = pd.read_csv('../../data/customers.csv')
articles = pd.read_csv('../../data/articles.csv', dtype=article_id_as_text)

print(articles.columns)
print(articles["perceived_colour_value_name"].unique())

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')
['Dark' 'Light' 'Dusty Light' 'Medium Dusty' 'Bright' 'Medium' 'Undefined'
 'Unknown']


In [80]:
# Show the distinct values of the descriptive article columns.
for descriptive_column in (
    "perceived_colour_value_name",
    "perceived_colour_master_name",
    "colour_group_name",
    "graphical_appearance_name",
    "garment_group_name",
):
    print(articles[descriptive_column].unique())

['Dark' 'Light' 'Dusty Light' 'Medium Dusty' 'Bright' 'Medium' 'Undefined'
 'Unknown']
['Black' 'White' 'Beige' 'Grey' 'Blue' 'Pink' 'Lilac Purple' 'Red' 'Mole'
 'Orange' 'Metal' 'Brown' 'Turquoise' 'Yellow' 'Khaki green' 'Green'
 'undefined' 'Unknown' 'Yellowish Green' 'Bluish Green']
['Black' 'White' 'Off White' 'Light Beige' 'Beige' 'Grey' 'Light Blue'
 'Light Grey' 'Dark Blue' 'Dark Grey' 'Pink' 'Dark Red' 'Greyish Beige'
 'Light Orange' 'Silver' 'Gold' 'Light Pink' 'Dark Pink' 'Yellowish Brown'
 'Blue' 'Light Turquoise' 'Yellow' 'Greenish Khaki' 'Dark Yellow'
 'Other Pink' 'Dark Purple' 'Red' 'Transparent' 'Dark Green' 'Other Red'
 'Turquoise' 'Dark Orange' 'Other' 'Orange' 'Dark Beige' 'Other Yellow'
 'Light Green' 'Other Orange' 'Purple' 'Light Red' 'Light Yellow' 'Green'
 'Light Purple' 'Dark Turquoise' 'Other Purple' 'Bronze/Copper'
 'Other Turquoise' 'Other Green' 'Other Blue' 'Unknown']
['Solid' 'Stripe' 'All over pattern' 'Melange' 'Transparent' 'Metallic'
 'Application/3D'

In [81]:
# This cell and the two below contain some unfinished code that could be used to convert "dark colour" into "colour"
# One row per distinct (colour_group_name, colour_group_code) pair.
colour_key_columns = ["colour_group_name", 'colour_group_code']
selected_columns = articles[colour_key_columns].drop_duplicates(colour_key_columns)
selected_columns.head(50)

Unnamed: 0,colour_group_name,colour_group_code
0,Black,9
1,White,10
2,Off White,11
5,Light Beige,12
7,Beige,13
11,Grey,7
12,Light Blue,71
14,Light Grey,6
15,Dark Blue,73
16,Dark Grey,8


In [82]:
# Plain Python list of the colour group names, in table order.
colour_name_column = selected_columns["colour_group_name"]
selected_list = colour_name_column.to_list()
selected_list

['Black',
 'White',
 'Off White',
 'Light Beige',
 'Beige',
 'Grey',
 'Light Blue',
 'Light Grey',
 'Dark Blue',
 'Dark Grey',
 'Pink',
 'Dark Red',
 'Greyish Beige',
 'Light Orange',
 'Silver',
 'Gold',
 'Light Pink',
 'Dark Pink',
 'Yellowish Brown',
 'Blue',
 'Light Turquoise',
 'Yellow',
 'Greenish Khaki',
 'Dark Yellow',
 'Other Pink',
 'Dark Purple',
 'Red',
 'Transparent',
 'Dark Green',
 'Other Red',
 'Turquoise',
 'Dark Orange',
 'Other',
 'Orange',
 'Dark Beige',
 'Other Yellow',
 'Light Green',
 'Other Orange',
 'Purple',
 'Light Red',
 'Light Yellow',
 'Green',
 'Light Purple',
 'Dark Turquoise',
 'Other Purple',
 'Bronze/Copper',
 'Other Turquoise',
 'Other Green',
 'Other Blue',
 'Unknown']

In [83]:
# Number of colour group names collected from the de-duplicated table.
print(len(selected_list))

50


In [84]:
# Reduce each colour group name to its second space-separated word
# (e.g. "Light Red" -> "Red"); names without a space are kept unchanged.
simplified_list = []
for colour_name in selected_list:
    words = colour_name.split(" ")
    simplified_list.append(words[1] if len(words) > 1 else colour_name)

print(simplified_list)

# Store the simplified name next to the original one in the lookup table.
selected_columns = selected_columns.assign(simplified_colour_group_name=simplified_list)
selected_columns.head(70)

['Black', 'White', 'White', 'Beige', 'Beige', 'Grey', 'Blue', 'Grey', 'Blue', 'Grey', 'Pink', 'Red', 'Beige', 'Orange', 'Silver', 'Gold', 'Pink', 'Pink', 'Brown', 'Blue', 'Turquoise', 'Yellow', 'Khaki', 'Yellow', 'Pink', 'Purple', 'Red', 'Transparent', 'Green', 'Red', 'Turquoise', 'Orange', 'Other', 'Orange', 'Beige', 'Yellow', 'Green', 'Orange', 'Purple', 'Red', 'Yellow', 'Green', 'Purple', 'Turquoise', 'Purple', 'Bronze/Copper', 'Turquoise', 'Green', 'Blue', 'Unknown']


Unnamed: 0,colour_group_name,colour_group_code,simplified_colour_group_name
0,Black,9,Black
1,White,10,White
2,Off White,11,White
5,Light Beige,12,Beige
7,Beige,13,Beige
11,Grey,7,Grey
12,Light Blue,71,Blue
14,Light Grey,6,Grey
15,Dark Blue,73,Blue
16,Dark Grey,8,Grey


In [85]:
# Attach the simplified colour name to every article through its
# colour_group_name. validate="many_to_one" makes pandas raise a MergeError
# if a colour name occurs more than once in the lookup table, instead of
# silently duplicating article rows.
articles = pd.merge(
    articles,
    selected_columns[["colour_group_name", "simplified_colour_group_name"]],
    on=["colour_group_name"],
    how="left",
    validate="many_to_one",
)

In [86]:
# Preview the first rows; simplified_colour_group_name is now the last column.
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,simplified_colour_group_name
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,Black
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,White
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,White
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",Black
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",White


In [87]:
# Replace the hex-string customer ids by integers parsed from their last 16 hex digits.
transactions['customer_id'] = customer_hex_id_to_int(transactions['customer_id'])

In [88]:
# Parse the purchase date strings (YYYY-MM-DD) into datetimes.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format='%Y-%m-%d')

In [89]:
# Week index counted back from the latest purchase date: the final 7 days
# get 104, the 7 days before that 103, and so on.
last_purchase_day = transactions['t_dat'].max()
transactions['week'] = (
    104 - (last_purchase_day - transactions['t_dat']).dt.days // 7
).astype(np.int8)
print(transactions['week'].max())
print(transactions['week'].min())

104
0


Let's do something about the `article_id` (both here and on `articles`) and let's take a closer look at `price`, `sales_channel_id` and `week`.

In [90]:
# Article ids become int32 in both frames so they can still be matched.
transactions['article_id'] = article_id_str_to_int(transactions['article_id'])
articles['article_id'] = article_id_str_to_int(articles['article_id'])

# Narrow the remaining numeric transaction columns.  # EDITED: week added
for narrow_column, narrow_dtype in (
    ('week', 'int8'),
    ('sales_channel_id', 'int8'),
    ('price', 'float32'),
):
    transactions[narrow_column] = transactions[narrow_column].astype(narrow_dtype)

In [91]:
# Report the dtypes and deep memory usage of every column except t_dat.
transactions.drop(columns='t_dat').info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   customer_id       uint64 
 1   article_id        int32  
 2   price             float32
 3   sales_channel_id  int8   
 4   week              int8   
dtypes: float32(1), int32(1), int8(2), uint64(1)
memory usage: 545.7 MB


Well, this is interesting. There are very few unique `t_dat` values hence despite it being a scary `datetime64` it takes up very little memory!

Keeping it for convenience is definitely the way to go.

Let's take a brief look at the `customers` and `articles` dfs.

In [92]:
# List the distinct club_member_status values (missing entries show up as nan).
customers['club_member_status'].unique()

array(['ACTIVE', nan, 'PRE-CREATE', 'LEFT CLUB'], dtype=object)

In [93]:
# Convert customer ids to integers, then fill missing values according to
# the mode chosen in fillna_values.
customers.customer_id = customer_hex_id_to_int(customers.customer_id)
print(fillna_values)
if fillna_values == '-1':
    customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('none','NONE')  # EDITED: added this
    for col in ['FN', 'Active', 'age']:
        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on customers[col]: the in-place call acts
        # on a temporary, which pandas deprecates and which leaves the frame
        # untouched when Copy-on-Write is enabled.
        customers[col] = customers[col].fillna(-1).astype('int8')
# EDITED: added alternate preprocessing
elif fillna_values == 'edited':
    articles['detail_desc'] = articles['detail_desc'].fillna("")
    customers['FN'] = customers['FN'].fillna(0)
    customers['Active'] = customers['Active'].fillna(0)
    # Missing ages get the mean age, truncated to an integer.
    customers['age'] = customers['age'].fillna(int(customers['age'].mean()))
    customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('NONE')
    customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('none','NONE')
    customers['club_member_status'] = customers['club_member_status'].fillna('PRE-CREATE')
else:
    # Raise rather than exit(1), which would shut down the notebook kernel.
    raise ValueError(f"fillna_values must be '-1' or 'edited', got {fillna_values!r}")


-1


In [94]:
# TODO: check out how Categorize works
# Each column is encoded by its own freshly fitted Categorize instance.
for customer_column in ('club_member_status', 'postal_code', 'fashion_news_frequency'):
    encoded = Categorize().fit_transform(customers[[customer_column]])
    customers[customer_column] = encoded[customer_column]

In [95]:
# TODO: check out how Categorize works
# Encode every object-dtype article column as integer category codes.
object_columns = [name for name in articles.columns if articles[name].dtype == 'object']
for name in object_columns:
    articles[name] = Categorize().fit_transform(articles[[name]])[name]

In [96]:
# Halve the width of every int64 article column.
int64_columns = [name for name in articles.columns if articles[name].dtype == 'int64']
for name in int64_columns:
    articles[name] = articles[name].astype('int32')

And this concludes our raw data preparation step! Let's now write everything back to disk.

In [97]:
# Order the transactions by purchase date, then by customer.
transactions = transactions.sort_values(['t_dat', 'customer_id'])

In [98]:
%%time

# Write the three prepared frames; the file names carry the fill mode.
parquet_targets = [
    (transactions, f'../../data/transactions_train_{fillna_values}.parquet'),
    (customers, f'../../data/customers_{fillna_values}.parquet'),
    (articles, f'../../data/articles_{fillna_values}.parquet'),
]
for frame, parquet_path in parquet_targets:
    frame.to_parquet(parquet_path)

CPU times: total: 2 s
Wall time: 3.15 s


Let's also generate a sample we will be able to use to speed up development.

In [99]:
# %%time
# # let's create a 5% sample of the entirety of the data to speed up dev
#
# sample = 0.05
# customers_sample = customers.sample(frac=sample, replace=False)
# customers_sample_ids = set(customers_sample['customer_id'])
# transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
# articles_sample_ids = set(transactions_sample["article_id"])
# articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]
#
# customers_sample.to_parquet(f'../../data/customers_sample_{sample}_{fillna_values}.parquet', index=False)
# transactions_sample.to_parquet(f'../../data/transactions_train_sample_{sample}_{fillna_values}.parquet', index=False)
# articles_sample.to_parquet(f'../../data/articles_train_sample_{sample}_{fillna_values}.parquet', index=False)

## Evaluation

In [100]:
from collections import defaultdict

# Articles bought by each customer during the most recent week.
last_week = transactions.week.max()
last_week_rows = transactions[transactions.week == last_week]
last_week_purchases = last_week_rows.groupby('customer_id')['article_id'].apply(list).to_dict()

# A defaultdict is used so that customers without purchases in that week
# resolve to an empty list when the submission ids are mapped below.
val_week_purchases_by_cust = defaultdict(list)
val_week_purchases_by_cust.update(last_week_purchases)

pd.to_pickle(dict(val_week_purchases_by_cust), '../../data/val_week_purchases_by_cust.pkl')

# Build a ground-truth file shaped like the sample submission: one row per
# customer, purchases as space-separated article ids with a '0' in front.
sample_sub = pd.read_csv('../../data/sample_submission.csv')
submission_customer_ids = customer_hex_id_to_int(sample_sub.customer_id)
purchases_per_row = submission_customer_ids.map(val_week_purchases_by_cust)
valid_gt = purchases_per_row.apply(lambda xx: ' '.join('0' + str(x) for x in xx))

sample_sub.prediction = valid_gt
sample_sub.to_parquet(f'../../data/validation_ground_truth_{fillna_values}.parquet', index=False)

In [101]:

def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean of ``apk`` at k=12 over paired predictions and ground truths."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer_scores = [
        apk(gt, preds, k=12)
        for preds, gt in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the saved validation ground truth.

    Parameters
    ----------
    sub_csv : str
        Path of a submission CSV with a ``prediction`` column holding
        space-separated article ids.
    skip_cust_with_no_purchases : bool, optional
        When True, rows whose ground truth is empty are left out of the
        mean instead of contributing a score of 0.

    Returns
    -------
    score : double
        Mean average precision at 12 over the scored rows.

    Notes
    -----
    Rows of the submission and of the ground-truth file are paired by
    position, so both are assumed to list customers in the same order.
    """
    sub = pd.read_csv(sub_csv)
    # Read the file the ground-truth cell actually writes; its name carries
    # the fillna_values suffix.
    validation_set = pd.read_parquet(f'../../data/validation_ground_truth_{fillna_values}.parquet')

    apks = []

    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        # An empty prediction field is read back as NaN rather than a list;
        # treat any such non-list value as "no items".
        if not isinstance(pred, list):
            pred = []
        if not isinstance(gt, list):
            gt = []
        if skip_cust_with_no_purchases and not gt:
            continue
        # pred and gt are flat lists of article ids for ONE customer, so the
        # per-customer score is apk. mapk would iterate over the individual
        # id strings and compare them character by character.
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

# Printed once the definitions in this cell have been executed.
print("Done")

Done
