# Sales Channels
Objectives:
- Determine whether the Kaggle contest assumes that only online (sales channel 2) transactions are relevant

Method:
- Include sales channel when training model (LightGBM used)
- Use only popular items from online sales channel for candidate generation


In [1]:
import numpy as np 
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
import time

In [2]:
transactions = pd.read_csv('./data/transactions_train.csv')

Replace customer ids with label encoding for space preservation purposes.

In [3]:
def encode_transactions_customers(transactions_df):
    """Replace the string customer ids in ``transactions_df`` with integer labels.

    The encoder is fitted on the ``customer_id`` column of
    ``./data/customers.csv`` and its classes are saved to
    ``customer_ids.npy`` so the same mapping can be restored later.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Frame with a ``customer_id`` column. It is modified in place.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The fitted encoder.
    """
    customers = pd.read_csv('./data/customers.csv')
    customer_encoder = preprocessing.LabelEncoder()
    customer_encoder.fit(customers['customer_id'])
    # Encode the frame that was passed in, not the notebook-level `transactions` global.
    transactions_df['customer_id'] = customer_encoder.transform(transactions_df['customer_id'])
    np.save('customer_ids.npy', customer_encoder.classes_) 
    return customer_encoder

In [4]:
customer_encoder = encode_transactions_customers(transactions)

In [5]:
transactions['purchased'] = 1

Convert the date strings into integer week numbers, where week 0 starts on 2020-09-23 (the week used for the final calculation), so every transaction in the data falls in a negative week.

In [6]:
def transform_string_dates_to_int(transactions_df):
    """Convert the ``t_dat`` column from ``YYYY-MM-DD`` strings to week offsets.

    The offset is the floor of (days since 2020-09-23) / 7, so 2020-09-23 maps
    to week 0 and any earlier date maps to a negative week.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Frame with a string ``t_dat`` column. It is modified in place.

    Returns
    -------
    None
    """
    import datetime

    week_zero_start = datetime.datetime(2020, 9, 23)

    def str_dat_to_weeks_int(datestring):
        parsed = datetime.datetime.strptime(datestring, "%Y-%m-%d")
        return (parsed - week_zero_start).days // 7

    # Parse every distinct date string exactly once. The previous
    # `lookup.setdefault(key, <expr>)` evaluated <expr> on every row, so the
    # lookup table never avoided a strptime call.
    week_by_date = {
        datestring: str_dat_to_weeks_int(datestring)
        for datestring in transactions_df["t_dat"].unique()
    }
    transactions_df["t_dat"] = transactions_df["t_dat"].map(week_by_date)


In [7]:
transform_string_dates_to_int(transactions)

Drop all transactions which happened more than 20 weeks before the end of the data collection period

In [8]:
# Remove, in place, every row whose week number (t_dat) is below -20.
transactions.drop(transactions[transactions["t_dat"] < -20].index, inplace=True)

Perform random negative sampling; most of this code is copied from the 2nd lecture.

In [9]:
def generate_negative_samples_np_version(transactions_df, num_neg_pos, seed=42):
    """Draw random (date, customer, article, channel) combinations as negatives.

    Each field is sampled independently and uniformly, with replacement, from
    the distinct values in ``transactions_df``. The price attached to a sampled
    article is the price of that article's first row in ``transactions_df``.
    10% more rows than requested are generated so the caller has spare rows
    after filtering.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Frame with ``t_dat``, ``customer_id``, ``article_id``, ``price`` and
        ``sales_channel_id`` columns.
    num_neg_pos : int
        Requested number of negatives; ``int(num_neg_pos * 1.1)`` rows are returned.
    seed : int, optional
        Seed for the NumPy random generator, so repeated calls give the same rows.

    Returns
    -------
    numpy.ndarray
        Array with columns (date, customer, article, price, channel, label),
        where the label column is all zeros.
    """
    # Sample from the frame that was passed in, not the notebook-level global.
    real_dates = transactions_df["t_dat"].unique()
    real_customers = transactions_df["customer_id"].unique()
    real_articles = transactions_df["article_id"].unique()
    real_channels = transactions_df["sales_channel_id"].unique()
    # Select the column explicitly: `.squeeze()` on a single-row frame would
    # collapse to a scalar and break the lookup below.
    article_and_price = (
        transactions_df[["article_id", "price"]]
        .drop_duplicates("article_id")
        .set_index("article_id")["price"]
    )

    # Seeding the stdlib `random` module has no effect on NumPy sampling, so use
    # a seeded NumPy generator to make the draw reproducible.
    rng = np.random.default_rng(seed)
    num_neg_samples = int(num_neg_pos * 1.1)

    neg_dates = rng.choice(real_dates, size=num_neg_samples)
    neg_articles = rng.choice(real_articles, size=num_neg_samples)
    neg_customers = rng.choice(real_customers, size=num_neg_samples)
    neg_channels = rng.choice(real_channels, size=num_neg_samples)
    ordered = np.zeros(num_neg_samples, dtype=int)

    neg_prices = article_and_price.loc[neg_articles].to_numpy()

    return np.column_stack((neg_dates, neg_customers, neg_articles, neg_prices, neg_channels, ordered))
    
def generate_negative_samples(transactions_df):
    """Build a frame of negative samples that contains no real purchase pair.

    Random rows are generated with ``generate_negative_samples_np_version``,
    every row whose (customer_id, article_id) pair occurs in
    ``transactions_df`` is removed, and up to ``len(transactions_df)`` of the
    remaining rows are sampled.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Positive transactions. The generated rows are labelled with these
        column names, in the same order.

    Returns
    -------
    pandas.DataFrame
        Negative rows with the columns and dtypes of ``transactions_df``.
        Fewer than ``len(transactions_df)`` rows are returned (with a warning)
        when too many random rows collide with real purchases.
    """
    num_neg_pos = transactions_df.shape[0]
    key_columns = ['customer_id', 'article_id']
    positive_pairs = transactions_df[key_columns].drop_duplicates()

    # column_stack yields a single-dtype array; restore the per-column dtypes
    # so the keys compare cleanly and the later concat keeps integer columns.
    neg_transactions = pd.DataFrame(
        generate_negative_samples_np_version(transactions_df, num_neg_pos),
        columns=transactions_df.columns,
    ).astype(transactions_df.dtypes.to_dict())

    # Vectorised anti-join. `positive_pairs` is unique on the keys, so the left
    # merge keeps exactly one row per negative, in the original order.
    is_real_purchase = (
        neg_transactions[key_columns]
        .merge(positive_pairs, how="left", on=key_columns, indicator=True)["_merge"]
        .eq("both")
        .to_numpy()
    )
    neg_transactions = neg_transactions[~is_real_purchase]

    # Only 10% spare rows are generated, so filtering can leave too few; asking
    # `.sample` for more rows than exist would raise ValueError.
    available = len(neg_transactions)
    if available < num_neg_pos:
        import warnings
        warnings.warn(
            "Only {} of {} requested negative samples are available after "
            "removing real purchases.".format(available, num_neg_pos)
        )
    return neg_transactions.sample(min(num_neg_pos, available), random_state=42)

def negative_sampling(transactions_df):
    """Return ``transactions_df`` followed by its generated negative samples.

    The result is a new frame with a fresh 0..n-1 index; the input frame is
    not modified.
    """
    negatives = generate_negative_samples(transactions_df)
    return pd.concat([transactions_df, negatives], ignore_index=True)

In [10]:
transactions = negative_sampling(transactions)

In [11]:
transactions.to_feather('./data/negativesampled.feather')

The feather file above serves as a checkpoint: if the Jupyter kernel has to be restarted for any reason, the notebook can be resumed from the cells below.

In [12]:
articles = pd.read_csv('./data/articles.csv')

In [13]:
transactions = pd.read_feather("./data/negativesampled.feather")

In [14]:
customers = pd.read_csv('./data/customers.csv')

In [15]:
# Rebuild the customer label encoder from the classes saved in customer_ids.npy,
# without refitting it.
customer_encoder = preprocessing.LabelEncoder()
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code; only load a customer_ids.npy written by this notebook.
customer_encoder.classes_ = np.load("customer_ids.npy", allow_pickle=True)

Apply the label encoding to the customer table so it can be joined with transactions

In [16]:
customers['customer_id'] = customer_encoder.transform(customers['customer_id'])

In [17]:
zip_encoder = preprocessing.LabelEncoder()
customers["postal_code"] = zip_encoder.fit_transform(customers["postal_code"])

In [30]:
def correct_types_and_merge_transactions(transactions_df, customers_df, articles_df):
    """Cast id/code columns to int and join customer and article features.

    All three input frames are modified in place: missing customer ages are
    filled with 25 and cast to int, and the id/code columns of the article and
    transaction frames are cast to int.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Transactions with ``t_dat``, ``customer_id``, ``article_id``,
        ``sales_channel_id`` and ``purchased`` columns.
    customers_df : pandas.DataFrame
        Customers with ``customer_id``, ``age`` and ``postal_code`` columns.
    articles_df : pandas.DataFrame
        Articles with the id/code columns listed below.

    Returns
    -------
    pandas.DataFrame
        The transactions inner-joined with the selected customer and article
        feature columns.
    """
    article_int_columns = [
        'article_id', 'product_code', 'product_type_no', 'graphical_appearance_no',
        'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
        'department_no', 'index_group_no', 'section_no', 'garment_group_no',
    ]
    transaction_int_columns = ['t_dat', 'customer_id', 'article_id', 'sales_channel_id', 'purchased']
    customer_feature_columns = ["customer_id", "age", "postal_code"]
    article_feature_columns = [
        "article_id", "product_code", "product_type_no", "graphical_appearance_no",
        "colour_group_code", "department_no", "index_group_no", "section_no",
        "garment_group_no",
    ]

    # These assignments deliberately write back into the caller's frames.
    customers_df["age"] = customers_df["age"].fillna(25).astype(int)
    articles_df[article_int_columns] = articles_df[article_int_columns].astype(int)
    transactions_df[transaction_int_columns] = transactions_df[transaction_int_columns].astype(int)

    with_customers = transactions_df.merge(
        customers_df[customer_feature_columns], how="inner", on='customer_id'
    )
    return with_customers.merge(
        articles_df[article_feature_columns], how="inner", on='article_id'
    )

In [31]:
transactions = correct_types_and_merge_transactions(transactions, customers, articles)

In [32]:
# Random 90/10 split with a fixed seed. 'purchased' is the label; it and 'price'
# are removed from the feature columns.
X_train, X_test, y_train, y_test = train_test_split(transactions.drop(['purchased', "price"], axis=1), transactions['purchased'], test_size=0.10, random_state=42)

Calculate the popular items which I will be using as candidates for the submission.

One of the big changes from the other files occurs here: only items from sales channel 2 are considered for the popular candidates.

In [33]:
def get_popular_items(transaction_df):
    """Pick the most purchased sales-channel-2 articles as candidates.

    Only rows with ``purchased == 1`` and ``sales_channel_id == 2`` are counted.
    The candidates are the 24 most purchased articles overall, followed by up
    to 24 of the 48 most purchased articles with ``t_dat >= -4`` that are not
    already among the overall top 24.

    Returns
    -------
    tuple
        ``(popular_candidates, popular_by_month)``: a one-column DataFrame of
        candidate article ids, and the Series of up to 48 recent top articles.
    """
    online_purchase = (transaction_df["purchased"] == 1) & (transaction_df["sales_channel_id"] == 2)
    recent = transaction_df["t_dat"] >= -4

    def top_articles(row_mask, limit):
        # Article ids ordered by descending purchase count, truncated to `limit`.
        purchase_counts = transaction_df[row_mask][["article_id", "purchased"]].groupby("article_id").count()
        ranked = purchase_counts.sort_values(ascending=False, by="purchased")
        return ranked.head(limit).index.to_series().reset_index(drop=True)

    popular_all_time = top_articles(online_purchase, 24)
    popular_by_month = top_articles(online_purchase & recent, 48)

    already_selected = popular_by_month.isin(popular_all_time)
    new_this_month = popular_by_month[~already_selected].reset_index(drop=True).head(24)

    combined = pd.concat([popular_all_time, new_this_month])
    popular_candidates = pd.DataFrame(combined).astype(int).reset_index(drop=True)
    return popular_candidates, popular_by_month

popular_candidates, popular_by_month = get_popular_items(transactions)

Generate dataframe with all active customers and the candidates

This is where one of the big changes from the other files occurs: sales_channel_id: 2 gets appended to all the candidates

In [34]:
def generate_ranker_input_df(transactions_df, candidates, customers_df, articles_df):
    """Build one row per (customer, candidate) pair for the classifier.

    Every distinct ``customer_id`` in ``transactions_df`` is crossed with every
    row of ``candidates``. Each pair gets ``t_dat = 0`` and
    ``sales_channel_id = 2``, and is then inner-joined with the selected
    customer and article feature columns.
    """
    customer_feature_columns = ["customer_id", "age", "postal_code"]
    article_feature_columns = [
        "article_id", "product_code", "product_type_no", "graphical_appearance_no",
        "colour_group_code", "department_no", "index_group_no", "section_no",
        "garment_group_no",
    ]

    # Single-row frames used to attach a constant column through a cross join.
    prediction_week = pd.DataFrame(data={"t_dat": 0}, index=[0])
    online_channel = pd.DataFrame(data={"sales_channel_id": 2}, index=[0])
    active_customers = transactions_df[["customer_id"]].drop_duplicates(subset="customer_id")

    customer_candidate_pairs = (
        prediction_week
        .merge(active_customers, how="cross")
        .merge(candidates, how="cross")
        .merge(online_channel, how="cross")
    )
    with_customer_features = customer_candidate_pairs.merge(
        customers_df[customer_feature_columns], how="inner", on="customer_id"
    )
    return with_customer_features.merge(
        articles_df[article_feature_columns], how="inner", on="article_id"
    )

ranker_input = generate_ranker_input_df(transactions, popular_candidates, customers, articles)

Functions to perform predictions and write to file

In [35]:
# Copied from the radekosmulski notebook
def generate_ranking(df):
    """Map each customer_id to its article_ids ordered by descending ``p1``.

    ``df`` needs ``customer_id``, ``article_id`` and ``p1`` columns.
    """
    best_first = df.sort_values(by=["customer_id", "p1"], ascending=[True, False])
    articles_per_customer = best_first.groupby("customer_id")["article_id"].apply(list)
    return articles_per_customer.to_dict()

In [36]:
def output_preds(df, classifier, filename, filler_candidates):
    """Score ``df`` with ``classifier`` and write a submission CSV.

    The class probabilities are stored temporarily in ``p0``/``p1`` columns on
    ``df`` and removed again before returning. Customers from
    ``data/sample_submission.csv`` that have no ranking receive
    ``filler_candidates``. At most 12 articles are written per customer, each
    prefixed with '0'. The result is saved to ``./data/<filename>.csv``.

    Relies on the notebook-level ``customer_encoder`` and ``generate_ranking``.
    """
    df[["p0", "p1"]] = classifier.predict_proba(df)
    ranking = generate_ranking(df)

    # Copied from the radekosmulski notebook
    submission_df = pd.read_csv('data/sample_submission.csv')
    encoded_customers = customer_encoder.transform(submission_df.customer_id)
    top_twelve = [ranking.get(c_id, filler_candidates)[:12] for c_id in encoded_customers]
    submission_df.prediction = [
        ' '.join('0' + str(article) for article in articles) for articles in top_twelve
    ]

    submission_df.to_csv('./data/{}.csv'.format(filename), index=False)
    # Remove the temporary probability columns so `df` can be scored again.
    df.drop(['p0', 'p1'], inplace=True, axis=1)

Let's start with lightgbm binary classifier

In [37]:
# copying from https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# combined with https://github.com/angelotc/LightGBM-binary-classification-example/blob/master/CCData.ipynb

import lightgbm as lgb
print('Starting training...')

# Classifier limited to 20 boosting rounds. metric='l1' is reported on the
# validation set together with the eval_metric entries passed to fit().
# NOTE(review): 'l1' is an unusual metric for a binary classifier - confirm it is intended.
gbm = lgb.LGBMClassifier(learning_rate = 0.1, metric = 'l1', 
                        n_estimators = 20)
# The held-out split is the validation set; training stops early once the
# validation scores have not improved for 5 rounds.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=['auc', 'binary_logloss'],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[20]	valid_0's auc: 0.820167	valid_0's binary_logloss: 0.533065	valid_0's l1: 0.382037


LGBMClassifier(metric='l1', n_estimators=20)

In [38]:
output_preds(ranker_input, gbm, "sales_channel_2_only", popular_by_month)