In [1]:
import random
import lightgbm as lgb
import pandas as pd

from utils import *
import matplotlib.pyplot as plt
from sklearn import preprocessing


Disable warnings

In [2]:
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn

# Initialization

## Define Label encoder class

In [3]:
class MultiLabelEncoder:
    """
    Object that holds all encoders so that they can be called upon at any time.
    """

    def __init__(self):
        self.encoders = {}
        self.decoders = {}  # holds the translation dicts

    def create_encoder(self, name):
        """
        Create a new encoder, make sure that this name is unique to avoid problems
        """
        new_encoder = preprocessing.LabelEncoder()
        self.encoders[name] = new_encoder

    def encode(self, encoder_name, label, dataframe):
        """
        Label encode a column in a dataframe.
        : encoder_name: name of the encoder that is to be used
        : label: name of column
        : dataframe: the actual dataframe

        """
        # create decoder dict and store original
        decoder = pd.DataFrame(columns=['original', 'encoded'])
        decoder["original"] = dataframe[label]

        # call encoder
        encoder = self.encoders[encoder_name]
        dataframe[label] = encoder.fit_transform(dataframe[label])

        # store encoded
        decoder["encoded"] = dataframe[label]
        self.decoders[encoder_name] = decoder

        return dataframe

    def decode_df(self, encoder_name, label, dataframe):
        """
        Decode a label encoded dataframe column
        : encoder_name: name of the encoder that was used
        : label: name of column
        : dataframe: the actual dataframe containing the encoded column
        """
        decoder = self.decoders[encoder_name]
        dataframe.rename(columns={label: 'encoded'}, inplace=True)
        dataframe = dataframe.merge(decoder, how="left", on="encoded")
        dataframe.drop(columns={"encoded"}, inplace=True)
        dataframe.rename(columns={"original": label}, inplace=True)
        return dataframe


## initialise label encoder

In [4]:
multi_encoder = MultiLabelEncoder()

## Configure run parameters

In [5]:
PARAM = {
    "article": ["article_id", "colour_group_code", "department_no", "garment_group_no", "product_type_no", "section_no", "index_group_no", "detail_desc", "index_code", "graphical_appearance_no"],
    "customer": ["customer_id", "age"],
    "transaction": ['customer_id', 'article_id', 'sales_channel_id', 'price', 't_dat'],
    "samples": ["customer_id", "ordered", "t_dat", "article_id", "department_no", "colour_group_code", "garment_group_no", "product_type_no", "section_no", "index_group_no","index_code", "age", "graphical_appearance_no", "material_season", "colour_group_code_season", "price_cat_cheap", "price_cat_expensive", "price_cat_normal", "price_cat_affordable"],
    "article_ranking": ["article_id", "department_no", "colour_group_code", "garment_group_no", "product_type_no","section_no", "index_group_no", "index_code", "age", "graphical_appearance_no", "material_season", "colour_group_code_season", "price_cat_cheap", "price_cat_expensive", "price_cat_normal", "price_cat_affordable"],
    "eval": False,      # evaluate learning curve

}

# LGBMRanker parameters that were not altered during tuning
FIXED_PARAMS={
    'objective': 'lambdarank',
    'metric': 'map',
    'boosting':'dart',
    'importance_type': "gain",
    'eval_at': 12,
    'n_jobs': 2
    }

# LGBMRanker parameters that were altered during tuning
SEARCH_PARAMS = {
    'learning_rate': 0.0075,
    'depth': 15,
    'child': 20,
    'estimators': 200,
    'subsample': 0.1,
    'verbose': 0,
    'leaves': 20
    }

## load data

In [6]:
articles = pd.read_csv('data/articles_sample1.csv.gz')
customers = pd.read_csv('data/customers_sample1.csv.gz')
transactions = pd.read_csv('data/transactions_sample1.csv.gz')

# Preprocessing

## preprocess transactions

In [7]:
# select columns
transactions_processed = transactions[PARAM["transaction"]].copy()
# fill na
transactions_processed = transactions_processed.fillna(0)

multi_encoder.create_encoder("customer_id")
# encode customer_id
transactions_processed = multi_encoder.encode("customer_id", "customer_id", transactions_processed)

transactions = transactions_processed
del transactions_processed
transactions.head(10)

Unnamed: 0,customer_id,article_id,sales_channel_id,price,t_dat
0,6,688545001,2,0.134237,2018-09-20
1,36,677341001,2,0.080492,2018-09-20
2,99,619561010,2,0.022017,2018-09-20
3,99,619561015,2,0.022017,2018-09-20
4,212,616337001,2,0.016932,2018-09-20
5,212,560325001,2,0.020322,2018-09-20
6,232,668766002,2,0.042356,2018-09-20
7,232,652946001,2,0.050831,2018-09-20
8,232,691275008,2,0.06778,2018-09-20
9,309,657497006,2,0.025407,2018-09-20


## preprocess customers

In [8]:
# select columns
customers_processed = customers[PARAM["customer"]].copy()
# encode customer id
customers_processed = multi_encoder.encode("customer_id", "customer_id", customers_processed)
# fill unknown customer ages
customers_processed = customers_processed.fillna(20)

customers = customers_processed
del customers_processed
customers.head(10)

Unnamed: 0,customer_id,age
0,7644,24.0
1,2214,51.0
2,2829,54.0
3,11032,50.0
4,11685,47.0
5,2946,41.0
6,366,48.0
7,3321,32.0
8,9983,45.0
9,2910,47.0


## preprocess articles

In [9]:
articles_processed = articles[PARAM["article"]].copy()
articles_processed = articles_processed.fillna(0)

# encoding index_code
multi_encoder.create_encoder("index_code")
articles_processed = multi_encoder.encode("index_code", "index_code", articles_processed)


# extract materials
articles_processed = extract_article_material(articles_processed)
multi_encoder.create_encoder("material")

# extract seasons
articles_processed = extract_season(transactions, articles_processed)
multi_encoder.create_encoder("season")

# extract price categories
articles_processed = extract_price_category(transactions, articles_processed)
multi_encoder.create_encoder("price_cat")
# articles_processed = multi_encoder.encode("price_cat", "price_cat", articles_processed)

# combine material-season
multi_encoder.create_encoder("material_season")
articles_processed = combine_features(articles_processed, "material", "season")
articles_processed = multi_encoder.encode("material_season", "material_season", articles_processed)
articles_processed.drop(columns=["material"])

# combine color-season
multi_encoder.create_encoder("colour_group_code_season")
articles_processed = combine_features(articles_processed, "colour_group_code", "season")
articles_processed = multi_encoder.encode("colour_group_code_season", "colour_group_code_season", articles_processed)

# fill leftover unknowns
articles_processed = articles_processed.fillna(0)

# drop unused columns
articles_processed.drop(columns=["material", "season"], inplace=True)

articles = articles_processed
del articles_processed
articles.head(10)

Unnamed: 0,article_id,colour_group_code,department_no,garment_group_no,product_type_no,section_no,index_group_no,index_code,graphical_appearance_no,material_season,colour_group_code_season
0,108775015,9,1676,1002,253,16,1,0,1010016,118,194
1,108775044,10,1676,1002,253,16,1,0,1010016,119,5
2,110065001,9,1339,1017,306,61,1,1,1010016,118,194
3,110065002,10,1339,1017,306,61,1,1,1010016,120,6
4,110065011,12,1339,1017,306,61,1,1,1010016,118,12
5,111565001,9,3608,1021,304,62,1,1,1010016,166,197
6,111586001,9,3608,1021,273,62,1,1,1010016,118,194
7,111593001,9,3608,1021,304,62,1,1,1010016,80,197
8,111609001,9,3608,1021,304,62,1,1,1010016,121,197
9,114428030,6,1334,1017,306,61,1,1,1010016,120,138


## create negative samples based on popular articles and most active buyers

### Retrieve most active buyers

In [10]:
n_most_active_buyers = 1000
active_buyers = transactions["customer_id"].value_counts().rename_axis("customer_id").reset_index(name="count").sort_values(by="count", ascending=False).head(n_most_active_buyers)
total_sum = active_buyers["count"].sum()
active_buyers["s_weight"] = active_buyers["count"] / total_sum
active_buyers.drop(columns=["count"], inplace=True)
active_buyers.head(10)

Unnamed: 0,customer_id,s_weight
0,232,0.00876
1,12248,0.008086
2,5636,0.004172
3,7244,0.004028
4,5431,0.00374
5,1333,0.003581
6,11439,0.003369
7,10649,0.003347
8,365,0.003316
9,12612,0.003286


### Retrieve most popular articles together with their weights

In [11]:
n_most_popular_items = 200
# take number of most sold articles
popular_items_w_weight = transactions["article_id"].value_counts().rename_axis("article_id").reset_index(name="count").sort_values(by="count", ascending=False).head(n_most_popular_items)
# total amount of articles sold
total_sum = transactions.shape[0]
# calculate weight for each article
popular_items_w_weight["s_weight"] = popular_items_w_weight["count"] / total_sum
# Glue everything together
prices = transactions[["article_id", "price"]].merge(popular_items_w_weight[["article_id"]], on=["article_id"], how='inner').groupby(["article_id"]).min("price")
popular_items_w_weight = popular_items_w_weight.merge(articles, on=["article_id"], how="inner")
popular_items_w_weight = popular_items_w_weight.merge(prices, on=["article_id"], how='inner')
popular_items_w_weight.drop(columns=["count"], inplace=True)
popular_items_w_weight.head(10)

Unnamed: 0,article_id,s_weight,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,price
0,706016001,0.001511,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,9,...,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...,0.020322
1,372860001,0.000982,372860,7p Basic Shaftless,302,Socks,Socks & Tights,1010016,Solid,9,...,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Fine-knit trainer socks in a soft cotton blend.,0.006763
2,706016002,0.000979,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,71,...,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...,0.018627
3,759871002,0.000917,759871,Tilda tank,253,Vest top,Garment Upper body,1010016,Solid,9,...,D,Divided,2,Divided,80,Divided Complements Other,1002,Jersey Basic,"Cropped, fitted top in cotton jersey with narr...",0.001678
4,610776002,0.000867,610776,Tilly (1),255,T-shirt,Garment Upper body,1010016,Solid,9,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,T-shirt in lightweight jersey with a rounded h...,0.001678
5,156231001,0.000839,156231,Box 4p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,...,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Matt tights with an elasticated waist. 20 denier.,0.003356
6,464297007,0.000792,464297,Greta Thong Mynta Low 3p,286,Underwear bottom,Underwear,1010014,Placement print,9,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",Thong briefs in cotton jersey with a wide lace...,0.006085
7,372860002,0.000748,372860,7p Basic Shaftless,302,Socks,Socks & Tights,1010016,Solid,10,...,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Fine-knit trainer socks in a soft cotton blend.,0.003373
8,610776001,0.000698,610776,Tilly (1),255,T-shirt,Garment Upper body,1010016,Solid,10,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,T-shirt in lightweight jersey with a rounded h...,0.001678
9,568601006,0.000695,568601,Mariette Blazer,264,Blazer,Garment Upper body,1010016,Solid,9,...,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1008,Dressed,Fitted jacket in woven fabric with notch lapel...,0.032525


##  Create actual samples

In [12]:
transactions['ordered'] = 1
positive_pairs = list(map(tuple, transactions[['customer_id', 'article_id']].drop_duplicates().values))
# Extract real values
real_dates = transactions["t_dat"].unique()
real_customers = transactions["customer_id"].unique()
real_articles = transactions["article_id"].unique()
real_channels = transactions["sales_channel_id"].unique()
article_and_price = transactions[["article_id", "price"]].drop_duplicates("article_id").set_index(
    "article_id").squeeze()
num_neg_pos = transactions.shape[0]

random.seed(42)

num_neg_samples = int(num_neg_pos * 1.1)  # number of total samples

num_popular = int(num_neg_samples)

# popularity based negative sampling
popular_neg_dates = np.random.choice(real_dates, size=num_popular)
popular_neg_articles = popular_items_w_weight["article_id"].sample(n=num_popular, weights=popular_items_w_weight["s_weight"], replace=True, random_state=42)
popular_neg_channels = np.random.choice(real_channels, size=num_popular)
popular_ordered = np.array([0] * num_popular)
popular_neg_customers = active_buyers["customer_id"].sample(n=num_popular, weights=active_buyers["s_weight"], replace=True, random_state=42)
popular_article_and_price = popular_items_w_weight[["article_id", "price"]].set_index(
    "article_id").squeeze()
popular_neg_prices = popular_article_and_price[popular_neg_articles].values

indexes = pd.DataFrame({'index': np.arange(num_popular)})
popular_neg_transactions = pd.DataFrame([np.arange(num_popular),popular_neg_customers, popular_neg_articles, popular_neg_channels, popular_neg_prices, popular_neg_dates, popular_ordered],
                                        index=["index", "customer_id", "article_id", "sales_channel_id", "price", "t_dat", "ordered"]).T
popular_neg_transactions = popular_neg_transactions[["customer_id", "article_id", "sales_channel_id", "price", "t_dat", "ordered"]]


# Remove random negative samples that actually coincide with positives
df = popular_neg_transactions[
    ~popular_neg_transactions.set_index(["customer_id", "article_id"]).index.isin(positive_pairs)
]

# Remove any excess
chosen_neg_transactions = df.sample(num_neg_pos)
# Concat the negative samples to the positive samples:
samples = pd.concat([transactions, chosen_neg_transactions])
samples = samples.merge(customers, how="inner", on='customer_id')
samples = samples.merge(articles, how="inner", on='article_id')
samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 639188 entries, 0 to 639187
Data columns (total 31 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   customer_id                   639188 non-null  object 
 1   article_id                    639188 non-null  object 
 2   sales_channel_id              639188 non-null  object 
 3   price                         639188 non-null  object 
 4   t_dat                         639188 non-null  object 
 5   ordered                       639188 non-null  object 
 6   age                           639188 non-null  float64
 7   product_code                  639188 non-null  int64  
 8   prod_name                     639188 non-null  object 
 9   product_type_no               639188 non-null  int64  
 10  product_type_name             639188 non-null  object 
 11  product_group_name            639188 non-null  object 
 12  graphical_appearance_no       639188 non-nul

# Candidate Selection

## 100 most popular items in last 2 months

In [13]:
# Calculate age of transactions in months
dated_transactions = transactions.loc[transactions["ordered"] == 1][["article_id", "t_dat"]]
dated_transactions["t_dat"] = pd.to_datetime(dated_transactions["t_dat"])
latest_date = dated_transactions["t_dat"].max()
dated_transactions["transaction_age_months"] = 12 * (latest_date.year - dated_transactions["t_dat"].dt.year) + (
            latest_date.month - dated_transactions["t_dat"].dt.month)

# Getting all sold items within the last 6 months
recent_items = dated_transactions.loc[dated_transactions["transaction_age_months"] < 2]
recent_items = recent_items["article_id"].value_counts().rename_axis("article_id").reset_index(name="count")
# recent_items = recent_items.sort_values(by="count", ascending=False)["article_id"].head(100)
recent_items = recent_items.sort_values(by="count", ascending=False).head(100)
recent_items.drop(columns=["count"], inplace=True)
recent_items = recent_items.merge(articles, how="inner", on="article_id")

In [14]:
# Merge recent items with transactions and concat with samples
recent_items_samples = pd.merge(transactions, recent_items["article_id"], on=["article_id"], how="inner")
samples = pd.concat([samples, recent_items_samples])
samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 646065 entries, 0 to 6876
Data columns (total 31 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   customer_id                   646065 non-null  object 
 1   article_id                    646065 non-null  object 
 2   sales_channel_id              646065 non-null  object 
 3   price                         646065 non-null  object 
 4   t_dat                         646065 non-null  object 
 5   ordered                       646065 non-null  object 
 6   age                           639188 non-null  float64
 7   product_code                  639188 non-null  float64
 8   prod_name                     639188 non-null  object 
 9   product_type_no               639188 non-null  float64
 10  product_type_name             639188 non-null  object 
 11  product_group_name            639188 non-null  object 
 12  graphical_appearance_no       639188 non-null 

# Ranking

## Prepare data

In [15]:
# select columns
samples = samples[PARAM["samples"]].copy()
samples["article_id"] = pd.to_numeric(samples["article_id"])
samples["customer_id"] = pd.to_numeric(samples["customer_id"])
samples["ordered"] = pd.to_numeric(samples["ordered"])


# setting up data
validation_data, training_data = split_samples(samples, 1)

training_data = training_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)
samples.info()

KeyError: "['material_season', 'colour_group_code_season', 'price_cat'] not in index"

## Split training data

In [None]:
qids_train = training_data.groupby(["transaction_age_weeks", "customer_id"])["article_id"].count().values
training_data = training_data.drop(["t_dat", "transaction_age_weeks"], axis=1)
x_train = training_data.drop(columns=["ordered", "customer_id"])
y_train = training_data["ordered"]
x_train.info()

## Split validation data

In [None]:
qids_validation = validation_data.groupby(["transaction_age_weeks", "customer_id"])["article_id"].count().values
validation_data = validation_data.drop(["t_dat", "transaction_age_weeks"], axis=1)
x_test = validation_data.drop(columns=["ordered", "customer_id"])
y_test = validation_data["ordered"]
x_test.info()

## Train LGBMRanker

In [None]:
ranker = lgb.LGBMRanker(
    objective=FIXED_PARAMS["objective"],
    metric=FIXED_PARAMS["metric"],
    boosting_type=FIXED_PARAMS["boosting"],
    importance_type=FIXED_PARAMS["importance_type"],
    n_jobs=FIXED_PARAMS["n_jobs"],
    n_estimators=SEARCH_PARAMS["estimators"],
    verbose=SEARCH_PARAMS["verbose"],
    min_child_samples=SEARCH_PARAMS["child"],
    max_depth=SEARCH_PARAMS["depth"],
    learning_rate=SEARCH_PARAMS["learning_rate"],
    subsample=SEARCH_PARAMS["subsample"],
    num_leaves=SEARCH_PARAMS["leaves"],

)
# if eval parameter is true, evaluation will take place
if PARAM["eval"]:
    ranker.fit(
        X=x_train,
        y=y_train,
        group=qids_train,
        eval_set=[(x_train, y_train),(x_test, y_test)],
        eval_group=[qids_train,qids_validation],
        eval_names=["training set","validation set"],
        eval_at=12,

    )
    lgb.plot_metric(ranker)
    plt.show()
else:
    ranker.fit(
        X=x_train,
        y=y_train,
        group=qids_train,
    )

## Feature importances

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(ranker.feature_name_[i], ranker.feature_importances_[i] / ranker.feature_importances_.sum())

## Predict for customers

In [None]:
validation_data["prediction"] = ranker.predict(x_test)
predictions_for_customer = validation_data.sort_values(['customer_id', 'prediction'], ascending=False).groupby('customer_id')['article_id'].apply(list).to_dict()

## Get 12 most popular items to fill up predictions

In [None]:
popular_12 = list(recent_items["article_id"].head(12))
print(popular_12)

## Load in submission and encode customers

In [None]:
submission = pd.read_csv('data/sample_submission.csv')
# encode customer_id's to allow matching
submission = multi_encoder.encode("customer_id", "customer_id", submission)

## Store predictions in submission dataframe using correct format

In [None]:
predictions = []
for customer_id in submission["customer_id"]:
    prediction = predictions_for_customer.get(customer_id, [])  # retrieve all predictions for given customer
    prediction = prediction + popular_12    # fill up predictions with popular_12
    predictions.append(prediction[:12]) # take 12 first items

# parse to string format
predictions = [' '.join(['0' + str(p) for p in ps]) for ps in predictions]
submission["prediction"] = predictions

## Decode customer id's and store submission to csv

In [None]:
submission = multi_encoder.decode_df("customer_id", "customer_id", submission)
submission.to_csv(f'output/submission.csv.gz', index=False)