In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import sys

import random

sys.path.append('/home/juravlik/PycharmProjects/kaggle_hnm_recsys/')

from lightgbm import LGBMRanker

from scripts.metrics.mapk import mapk

pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings("ignore")

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
path_to_set = '/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/train_set_and_labels/test_4/'
selected_customers = None #list(pd.read_parquet('/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/train_set_and_labels/test_2/week_0__part_0.parquet')['customer_id'].unique())
weeks_for_train = [2, 3, 4, 5, 6, 7, 8]
week_for_test = 1

In [4]:
for i in weeks_for_train:
    if i == weeks_for_train[0]:
        df_train = pd.read_parquet(os.path.join(path_to_set, 'week_{}.parquet'.format(i)))
    else:
        df_train = pd.concat([df_train,
                              pd.read_parquet(os.path.join(path_to_set, 'week_{}.parquet'.format(i)))],
                              ignore_index=True)
    
    if selected_customers:
        df_train = df_train[df_train['customer_id'].isin(selected_customers)]

In [5]:
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 5676.01 MB
Memory usage after optimization is: 4881.76 MB
Decreased by 14.0%


In [6]:
# df_train.fillna(-1000, inplace=True)

In [7]:
# sample_weight_dict = {2: 1, 3: 0.7, 4: 0.7, 5: 0.7, 6: 0.4, 7: 0.4, 8: 0.3}
# sample_weight_dict = {2: 1, 3: 0.8, 4: 0.7, 5: 0.65, 6: 0.5, 7: 0.4, 8: 0.3}
# sample_weight_dict = {2: 1, 3: 1, 4: 1, 5: 1, 6: 0.5, 7: 0.5, 8: 0.5}


# sample_weights = df_train['weeks_before_sub'].apply(lambda x: sample_weight_dict[x])

In [8]:
features = list(df_train.columns)
features.remove('article_id')
features.remove('customer_id')
features.remove('weeks_before_sub')
features.remove('label')

In [9]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=40, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
#     reg_lambda=0.1, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=200, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
ranker = ranker.fit(
    X=df_train[features],
    y=df_train[['label']],
#     sample_weight=sample_weights,
    group=df_train.groupby(['customer_id'])['article_id'].count().values
)

In [11]:
del df_train

df_test = pd.read_parquet(os.path.join(path_to_set, 'week_{}.parquet'.format(week_for_test)))
   
labels = pd.read_parquet('/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_labels/labels_{}.parquet'.format(week_for_test))

In [12]:
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 666.86 MB
Memory usage after optimization is: 573.54 MB
Decreased by 14.0%


In [13]:
# df_test.fillna(-1000, inplace=True)

In [14]:
df_test['predict'] = ranker.predict(
    df_test[features]
)

In [15]:
df_test = df_test.sort_values(['customer_id', 'predict'], ascending=False).groupby('customer_id').head(12)

In [16]:
df_test

Unnamed: 0,customer_id,article_id,weeks_before_sub,score_ARulesRecommender,score_GruRecommender,score_ItemsPurchasedTogetherRecommender,score_KaggleCustomerAgeRecommender,score_KaggleExponentialDecayRecommender,score_KaggleTrendingRecommender,score_KMeansRecommender,score_LastPurchasesPopularity,score_LightFMRecommender,score_PopularByGroupsRecommender,score_SVDRecommender,colour_Beige,colour_Black,colour_Blue,colour_Bluish Green,colour_Brown,colour_Green,colour_Grey,colour_Khaki green,colour_Lilac Purple,colour_Metal,colour_Mole,colour_Orange,colour_Pink,colour_Red,colour_Turquoise,colour_Unknown,colour_White,colour_Yellow,colour_Yellowish Green,colour_undefined,product_Accessories,product_Bags,product_Cosmetic,product_Fun,product_Furniture,product_Garment Full body,product_Garment Lower body,product_Garment Upper body,product_Garment and Shoe care,product_Interior textile,product_Items,product_Nightwear,product_Shoes,product_Socks & Tights,product_Stationery,product_Swimwear,product_Underwear,product_Underwear/nightwear,product_Unknown,product_name_Accessories,product_name_Bags,product_name_Cosmetic,product_name_Fun,product_name_Furniture,product_name_Garment Full body,product_name_Garment Lower body,product_name_Garment Upper body,product_name_Garment and Shoe care,product_name_Interior textile,product_name_Items,product_name_Nightwear,product_name_Shoes,product_name_Socks & Tights,product_name_Stationery,product_name_Swimwear,product_name_Underwear,product_name_Underwear/nightwear,product_name_Unknown,article__frequency_purchases,article__mean_sales_channel_id,article__num_days_from_first_purchase,article__num_days_from_last_purchase,article__num_purchased_customers,article__num_unique_purchased_customers,article__num_purchased_customers_last90days,article__num_unique_purchased_customers_last90days,article__num_purchased_customers_last30days,article__num_unique_purchased_customers_last30days,article__num_purchased_customers_last7days,article__num_unique_purchased_customers_last7days,article__num_purchased_customers_last1days,article__num_unique_purchased_customers_last1days,article__unique_ratio,article__unique_ratio_last30days,article__unique_ratio_last7days,article__unique_ratio_last1days,article__mean_price,article__last_price,article__last_price_ratio,age,club_member_status_ACTIVE,club_member_status_LEFT CLUB,club_member_status_PRE-CREATE,fashion_news_frequency_Monthly,fashion_news_frequency_Regularly,sex_Woman,sex_Man,sex_Divided,have_children,sport_person,customer__mean_price,customer__mean_sales_channel_id,customer__num_days_from_first_purchase,customer__num_days_from_last_purchase,customer__num_purchased_articles,customer__num_unique_purchased_articles,customer__num_purchased_articles_last90days,customer__num_unique_purchased_articles_last90days,customer__num_purchased_articles_last30days,customer__num_unique_purchased_articles_last30days,customer__num_purchased_articles_last7days,customer__num_unique_purchased_articles_last7days,customer__unique_ratio,customer__unique_ratio_last30days,customer__unique_ratio_last7days,customer_article__num_days_from_last_purchase,customer_article__num_purchased,customer_article__num_purchased_last90days,customer_article__num_purchased_last30days,customer_article__num_purchased_last7days,label,predict
498353,1371960,102628,1,,,0.916504,0.916504,1.000000,0.333252,108770.679688,1.000000,-376.75,,0.969727,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065857,2.000000,27,0,410,324,410,324,410,324,68,59,11,10,0.790039,0.790039,0.867676,0.909180,0.066895,0.067810,1.013672,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037048,1.904297,345.0,3.0,63.0,54.0,48.0,43.0,17.0,14.0,10.0,7.0,0.856934,0.82373,0.700195,6,2,2,2,2,0.0,1.580528
231479,1371960,66500,1,,6.953125,1.000000,1.000000,0.958496,1.000000,150140.843750,0.500000,-377.00,,0.969727,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198853,1.540039,468,1,2353,2129,216,187,115,98,26,24,2,2,0.904785,0.852051,0.922852,1.000000,0.023285,0.025040,1.075195,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037048,1.904297,345.0,3.0,63.0,54.0,48.0,43.0,17.0,14.0,10.0,7.0,0.856934,0.82373,0.700195,3,2,2,2,2,0.0,1.412874
442966,1371960,99010,1,,,0.958496,0.958496,0.916504,0.500000,150140.843750,0.333252,,,0.969727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.126465,2.000000,11,0,87,79,87,79,87,79,60,55,8,7,0.908203,0.908203,0.916504,0.875000,0.025055,0.025406,1.014648,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037048,1.904297,345.0,3.0,63.0,54.0,48.0,43.0,17.0,14.0,10.0,7.0,0.856934,0.82373,0.700195,3,2,2,2,2,0.0,1.272221
578545,1371960,80932,1,,,0.875000,0.875000,0.750000,0.142822,54385.339844,0.142822,-377.25,,0.969727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053101,1.991211,12,0,226,200,226,200,226,200,167,150,23,18,0.884766,0.884766,0.898438,0.782715,0.016708,0.016937,1.013672,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037048,1.904297,345.0,3.0,63.0,54.0,48.0,43.0,17.0,14.0,10.0,7.0,0.856934,0.82373,0.700195,6,1,1,1,1,0.0,1.231387
661618,1371960,97246,1,,,0.791504,0.791504,0.833496,0.199951,54385.339844,0.199951,-376.75,,0.969727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077087,1.437500,31,0,402,380,402,380,401,379,89,84,12,12,0.945312,0.945312,0.943848,1.000000,0.033051,0.033875,1.024414,29.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037048,1.904297,345.0,3.0,63.0,54.0,48.0,43.0,17.0,14.0,10.0,7.0,0.856934,0.82373,0.700195,6,1,1,1,1,0.0,1.029473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236282,80,94696,1,,,,,,,,0.090881,,,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031403,1.645508,24,0,764,695,764,695,764,695,552,492,106,92,0.909668,0.909668,0.891113,0.868164,0.033264,0.033875,1.018555,27.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.029114,1.093750,716.0,222.0,32.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875000,0.00000,0.000000,-1,0,0,0,0,0.0,-0.343391
857289,80,104045,1,,,0.666504,,0.750000,,,0.142822,,,0.125000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015121,1.725586,49,0,3241,2855,3241,2855,2487,2204,616,559,70,62,0.880859,0.886230,0.907227,0.885742,0.041382,0.038116,0.921387,27.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.029114,1.093750,716.0,222.0,32.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875000,0.00000,0.000000,-1,0,0,0,0,0.0,-0.361437
1876455,80,67543,1,,,,,0.541504,,,,,,0.090881,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014008,1.689453,36,0,2570,2051,2570,2051,2419,1934,522,444,87,78,0.797852,0.799316,0.850586,0.896484,0.033112,0.033875,1.023438,27.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.029114,1.093750,716.0,222.0,32.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875000,0.00000,0.000000,-1,0,0,0,0,0.0,-0.368458
2037551,80,53893,1,,,,,,0.090881,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019623,1.835938,683,0,34802,25320,2345,2052,876,788,388,336,15,15,0.727539,0.899414,0.866211,1.000000,0.032410,0.033875,1.044922,27.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.029114,1.093750,716.0,222.0,32.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875000,0.00000,0.000000,-1,0,0,0,0,0.0,-0.372589


In [17]:
mapk(
    labels.groupby(['customer_id'])['article_id'].apply(list).tolist(),
    df_test.groupby(['customer_id'])['article_id'].apply(list).tolist(),
)

0.031625322037571076

In [None]:
0.031303033532957845

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=40, #31 #40
    max_depth=10, #10 #-1
    learning_rate=0.1, #0.1
    
    reg_lambda=0, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=100, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
0.0311 w/o fillna

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=31, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
    reg_lambda=0, #0 #0.3
    
    importance_type='gain',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=100, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
0.03179736094980154, w/o fillna

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=31, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
    reg_lambda=0, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=100, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
0.03212297695843095

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=31, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
    reg_lambda=0, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=150, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
0.03216523246745132

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=40, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
    reg_lambda=0, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=200, #100 #150
    
    random_state=42,
    verbose=1
)

In [None]:
0.03216971818356945

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    eval_at=12,
    boosting_type="gbdt", #gbdt #goss
    
    force_col_wise=True,
    num_leaves=40, #31 #40
    max_depth=-1, #10 #-1
    learning_rate=0.1, #0.1
    
#     reg_lambda=0.1, #0 #0.3
    
    importance_type='split',
    
    min_split_gain=0, #0 #0.7
    
    colsample_bytree=1, #1
    
    n_estimators=200, #100 #150
    
    random_state=42,
    verbose=1
)