In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from matplotlib import pyplot as plt
import sys

from lightgbm import LGBMRanker

sys.path.append('/home/juravlik/PycharmProjects/kaggle_hnm_recsys/')

from scripts.first_stage_models.LastPurchasesPopularity import LastPurchasesPopularity
from scripts.utils import create_predictions_for_second_stage,\
prepare_dataset, create_labels_for_second_stage, combine_train_sets_and_labels


In [2]:
# Load the compressed competition tables and the id lookup tables that sit
# next to them. All inputs live in one directory relative to this notebook.
compressed_dir = '../data/compressed_dataset'

df_transactions = pd.read_parquet(f'{compressed_dir}/transactions.parquet')
df_articles = pd.read_parquet(f'{compressed_dir}/articles.parquet')
df_customers = pd.read_parquet(f'{compressed_dir}/customers.parquet')

# Lookup tables between original ids and the compact integer ids used in the
# parquet files. `int_*` is indexed by the integer id later in this notebook;
# `*_int` is presumably the reverse direction -- TODO confirm.
# NOTE: unpickling executes arbitrary code, only load pickles you produced yourself.
article_id_int = pd.read_pickle(f'{compressed_dir}/article_id_int.pickle')
int_article_id = pd.read_pickle(f'{compressed_dir}/int_article_id.pickle')

customer_id_int = pd.read_pickle(f'{compressed_dir}/customer_id_int.pickle')
int_customer_id = pd.read_pickle(f'{compressed_dir}/int_customer_id.pickle')

In [3]:
# Fit the first-stage candidate generator on the full transaction history.
# The class name must match the one imported in the first cell
# (`LastPurchasesPopularity`); the previous spelling raised NameError on a
# fresh kernel.
model = LastPurchasesPopularity()
model.fit(df_transactions)

In [4]:
# Score every customer in the customers table. With return_submit=True the
# model returns both the long-format candidates and a submission-style frame.
all_customer_ids = df_customers['customer_id'].tolist()

df_predict, df_submission = model.predict(
    all_customer_ids,
    return_submit=True,
    int_article_id=int_article_id,
    int_customer_id=int_customer_id,
)

In [5]:
# Inspect the long-format candidates: one row per (customer_id, article_id) with its score.
df_predict

Unnamed: 0,customer_id,article_id,score
0,0,16023,1.000000
1,0,104553,0.500000
2,0,104554,0.333333
3,0,104527,0.250000
4,0,104072,0.200000
...,...,...,...
16463755,1371979,103796,0.125000
16463756,1371979,103797,0.111111
16463757,1371979,3091,0.100000
16463758,1371979,71107,0.090909


In [6]:
# Inspect the submission-style frame: one row per customer with a space-separated article list.
df_submission

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0923758001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0923758001 0918522001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0923758001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0923758001 0918522001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0923758001 0918522001 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0713997002 0720125039 0740922009 0791587007 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0923758001 0918522001 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0689365050 0884081001 0794819001 0762846027 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0923758001 0918522001 09...


In [7]:
# Same call as above but without the submission frame; note that this
# overwrites the `df_predict` produced by the earlier cell.
df_predict = model.predict(
    df_customers['customer_id'].tolist(),
    return_submit=False,
    int_article_id=int_article_id,
    int_customer_id=int_customer_id,
)

## Create training set for 2nd stage

In [3]:
# Fresh, unfitted first-stage model for building the second-stage training set.
# Uses the class name as imported in the first cell (`LastPurchasesPopularity`);
# the previous spelling raised NameError on a fresh kernel.
model = LastPurchasesPopularity()

In [4]:
# Produce first-stage candidates for the ranker's training weeks and save them
# under a per-model directory. The on-disk name keeps the historical
# 'LastPuchasesPopularity' spelling because later cells read files by that name.
first_stage_name = 'LastPuchasesPopularity'
first_stage_out_dir = '/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_set/LastPuchasesPopularity'

create_predictions_for_second_stage(
    model=model,
    model_name=first_stage_name,
    transactions=df_transactions,
    all_customers=df_customers['customer_id'].unique(),
    path_to_destination_save=first_stage_out_dir,
    int_article_id=int_article_id,
    int_customer_id=int_customer_id,
    num_train_weeks=20,
)

## Create labels for 2nd stage

In [3]:
# Build the ranker labels from the transaction history for the same number of
# training weeks as the candidate sets, and save them to the labels directory.
labels_out_dir = '/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_labels'

create_labels_for_second_stage(
    transactions=df_transactions,
    path_to_destination_save=labels_out_dir,
    num_train_weeks=20,
)

## Create train set with labels

In [4]:
# Join the saved first-stage candidates with their labels into one training
# frame; the result is also written to parquet so later cells can reload it.
combine_kwargs = dict(
    path_to_train_set='/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_set/',
    path_to_train_labels='/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_labels',
    model_names=['LastPuchasesPopularity'],
    num_train_weeks=20,
    path_to_save_result='/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/train_set_and_labels/1_exp.parquet',
)
train_w_labels = combine_train_sets_and_labels(**combine_kwargs)

## Train Ranker model

In [79]:
# Reload the combined training frame saved by the previous section.
train_set_path = '/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/train_set_and_labels/1_exp.parquet'
train_w_labels = pd.read_parquet(train_set_path)

# Flip positive labels around 13 so that a smaller original label becomes a
# larger relevance grade; non-positive labels are left untouched.
# NOTE(review): the original line carried a trailing `# 1`, presumably an
# alternative flat label that was tried -- confirm.
has_positive_label = train_w_labels.label > 0
train_w_labels.loc[has_positive_label, 'label'] = 13 - train_w_labels['label']

# Rows of one (week, customer) query must be contiguous for the ranker's
# `group` argument, which is derived from the same two keys.
query_keys = ['weeks_before_sub', 'customer_id']
train_w_labels = train_w_labels.sort_values(query_keys)

In [80]:
# Candidates for the week to be predicted (file index 0), de-duplicated per
# (customer, article) pair and ordered by week then customer.
last_week_path = '/home/juravlik/PycharmProjects/kaggle_hnm_recsys/data/ranker_train_set/LastPuchasesPopularity/LastPuchasesPopularity_0.parquet'

last_week = (
    pd.read_parquet(last_week_path)
    .drop_duplicates(subset=['customer_id', 'article_id'])
    .sort_values(['weeks_before_sub', 'customer_id'])
)

In [81]:
# Second-stage model: LambdaRank objective evaluated with NDCG@12, matching the
# 12 recommendations kept per customer further down.
ranker_params = {
    'objective': "lambdarank",
    'metric': "ndcg",
    'eval_at': 12,
    'boosting_type': "dart",
    'max_depth': 7,
    'n_estimators': 100,
    'importance_type': 'gain',
    'verbose': -1,
}
ranker = LGBMRanker(**ranker_params)

In [82]:
# Number of candidate rows per (week, customer) query, in sorted key order.
# This relies on `train_w_labels` already being sorted by the same two keys so
# that each query's rows are contiguous and in the same order as these sizes.
# `.size()` counts rows regardless of nulls, unlike `.count()` on a column,
# so the sizes always add up to the number of training rows.
group_sizes = (
    train_w_labels
    .groupby(['weeks_before_sub', 'customer_id'])
    .size()
    .to_numpy()
)
assert group_sizes.sum() == len(train_w_labels), "group sizes must cover every training row"

ranker = ranker.fit(
    X=train_w_labels[['score_LastPuchasesPopularity']],
    # Pass the label as a 1-D Series rather than a one-column DataFrame.
    y=train_w_labels['label'],
    group=group_sizes,
)



In [83]:
# The ranker was fitted on a single feature named 'score_LastPuchasesPopularity',
# while the raw candidate file calls the same value 'score'. Rename it so the
# prediction frame carries the feature name the model was trained with.
last_week_features = last_week[['score']].rename(
    columns={'score': 'score_LastPuchasesPopularity'}
)
predict = ranker.predict(last_week_features)

In [84]:
# Attach the ranker scores to the candidates. The assignment is positional, so it
# relies on `last_week` not having been reordered or filtered since `predict` was computed.
last_week['predict'] = predict

In [85]:
# Keep the 12 highest-scored candidates per customer. Both sort keys are
# descending, so within each customer the best prediction comes first.
last_week = (
    last_week
    .sort_values(['customer_id', 'predict'], ascending=False)
    .groupby('customer_id')
    .head(12)
)

In [89]:
# Turn the ranked candidates into the submission layout: one row per customer
# with a space-separated list of article ids, best first.
MAX_PREDICTIONS = 12     # recommendations allowed per customer
ARTICLE_ID_WIDTH = 10    # submission article ids are 10 characters, zero-padded

# Decode the compact integer ids back to the original ids. Indexing through a
# lambda keeps the lookup working for any indexable mapping and still raises on
# an unknown id. zfill restores the leading zero if the mapping yields integers
# and is a no-op when it already yields 10-character strings.
last_week_decoded = last_week.assign(
    article_id=(
        last_week['article_id']
        .map(lambda idx: int_article_id[idx])
        .astype(str)
        .str.zfill(ARTICLE_ID_WIDTH)
    ),
    customer_id=last_week['customer_id'].map(lambda idx: int_customer_id[idx]),
)

# sort=False keeps customers in order of first appearance and rows within a
# customer in their current (ranked) order. The cap is applied per id rather
# than per character, so an id can never be cut in half.
last_week = (
    last_week_decoded
    .groupby('customer_id', sort=False)['article_id']
    .agg(lambda ids: ' '.join(ids.iloc[:MAX_PREDICTIONS]))
    .rename('prediction')
    .reset_index()
)

In [90]:
# Write the submission file (customer_id, prediction columns) without the index column.
last_week.to_csv('sub.csv', index=False)