Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)



In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/warmup/customers.parquet')
articles = pd.read_parquet('../input/warmup/articles.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 3.22 s, sys: 2.03 s, total: 5.25 s
Wall time: 6.32 s


In [5]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

In [6]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

candidates_last_purchase = transactions.copy()

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

CPU times: user 56.6 s, sys: 381 ms, total: 57 s
Wall time: 57 s


### Features

Feature 3: image captions

In [7]:
import pandas as pd # data processing
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
import os
plt.style.use("ggplot")
pd.set_option('display.max_columns', 200)

In [8]:
!pip install fastparquet
import fastparquet



In [9]:
captions = pd.read_parquet('/kaggle/input/captions/captioning_parquet_files/parquet_files/captionsAll.parquet')
captions

Unnamed: 0,article_id,caption
0,578476001,the skinny black pants are made from stretchy ...
1,573085042,the skinny jeans are made from a dark wash den...
2,572797002,a white tank top with a scoop neckline and ruf...
3,579541089,the cropped cardigan sweater is made from a so...
4,575347003,six pairs of black socks on a white surface wi...
...,...,...
991,859125001,a white shirt with long sleeves and a button d...
992,859105002,a pair of shorts in beige with buttons on the ...
993,856840001,a black and white top with a zebra print pattern
994,850259001,a top with lemon print and ruffled sleeves on ...


Now we embed the captions with sentenceTransformer

(I looked at Peter Kirby's code from feature engineering because he did this with the regular descriptions)
link here: https://github.com/LienM/ai-project-23-24/blob/main/PeterKirby/FeatureEngineering.ipynb

In [10]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=c4b11c957db4b3729d6d0bb6947ba9aeefb846a3a9760e8bf14bf19a34b6855b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [11]:
# use sentencetransform to convert captions into integers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L12-v2')
# model = SentenceTransformer('all-mpnet-base-v2')

Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [12]:
#converting to list of descriptions to be passed to encoder
descriptions = captions['caption'].tolist()
#encoding
embeddings = model.encode(descriptions)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [13]:
embeddings.shape

(996, 384)

In [14]:
#PCA so we arent adding 300+ new features - top 2 principle components
from sklearn.decomposition import PCA
caption_PCA = PCA(n_components=50)
principle_components = caption_PCA.fit_transform(embeddings)
print(principle_components.shape)
variances = caption_PCA.explained_variance_ratio_
variance = 0
for value in variances:
    variance += value
print(variance)

(996, 50)
0.843591480050236


If I take the first 100 principle components and add their variance together, I get 94,5%. This means that by removing 284 principle components, I only lose 5,5% of the information.

In [15]:
# add all principal components as feature
for i in range(principle_components.shape[1]):
    captions["pc"+str(i)] = principle_components[:,i]
captions

Unnamed: 0,article_id,caption,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49
0,578476001,the skinny black pants are made from stretchy ...,0.515711,-0.027654,-0.190305,-0.139294,-0.009907,-0.068835,-0.136067,0.027146,-0.031252,-0.016807,0.054875,-0.092974,-0.088357,-0.074815,-0.148455,-0.037319,-0.022574,-0.040412,-0.012248,-0.009295,-0.042290,0.046220,-0.101347,-0.011422,0.026391,-0.038918,0.056363,0.070302,0.029307,0.079672,-0.156431,0.008571,0.076951,-0.008258,-0.094703,-0.002241,0.002730,-0.001146,-0.048176,-0.006321,0.014307,-0.031919,0.062916,-0.020555,0.057368,-0.043883,0.021208,-0.040354,0.024065,0.020991
1,573085042,the skinny jeans are made from a dark wash den...,0.543518,0.184471,0.079495,-0.188556,0.042054,-0.147709,-0.120657,-0.025163,-0.019986,-0.091952,0.101073,-0.010325,0.155094,-0.143355,-0.195953,0.109768,-0.123807,-0.006848,-0.030577,-0.066592,-0.010800,-0.097276,-0.086110,-0.012172,0.060081,-0.063026,-0.042236,0.038052,-0.031517,-0.058493,-0.065087,0.009260,0.022669,0.008442,-0.048614,0.054878,0.013658,-0.001292,0.039855,0.017079,-0.018745,-0.090192,-0.009869,-0.041825,0.055731,0.022810,-0.047928,0.012876,0.028556,0.007815
2,572797002,a white tank top with a scoop neckline and ruf...,-0.261078,-0.044818,0.098601,-0.137210,0.294043,-0.115099,0.251523,-0.045751,-0.141905,-0.142426,-0.209978,0.034288,-0.022632,0.101718,0.079937,0.165858,0.025092,-0.008365,0.083587,0.011307,0.109200,-0.021847,-0.126312,-0.095686,-0.069920,-0.112888,0.147401,0.000497,-0.011265,-0.184086,-0.053964,-0.014114,-0.044336,0.070598,-0.030173,-0.042966,-0.033967,0.073189,-0.029538,0.097238,-0.010233,-0.066940,0.038384,0.021926,-0.050115,-0.055622,0.049283,0.008487,-0.037442,-0.005010
3,579541089,the cropped cardigan sweater is made from a so...,0.078603,0.578686,0.108701,-0.105878,-0.163558,0.214609,-0.094166,0.025330,0.082340,0.023061,-0.277417,0.099671,-0.153420,-0.076826,0.067997,-0.060581,-0.026710,-0.142784,-0.030269,-0.102267,0.062883,0.037585,0.039557,0.013074,-0.008460,0.039008,0.054842,-0.022129,-0.081990,-0.034077,-0.107469,-0.058446,-0.038799,0.046772,-0.043058,-0.016598,-0.002089,-0.006821,0.137600,-0.050285,-0.009343,0.040155,0.031049,-0.080647,-0.004886,0.025323,-0.140423,0.080382,0.030934,-0.034177
4,575347003,six pairs of black socks on a white surface wi...,0.234815,-0.052231,0.114519,0.216589,0.206344,0.507604,0.044938,0.031761,-0.099494,-0.077753,-0.026071,-0.008132,0.164475,0.045654,0.048138,0.107662,-0.197593,-0.043307,-0.022894,-0.015254,0.023573,-0.043254,-0.010667,0.062917,0.202167,-0.117436,-0.174311,-0.064676,-0.094202,0.102665,0.093779,0.008169,-0.008287,-0.014609,-0.130734,-0.012306,-0.087195,-0.079652,-0.051725,-0.082240,-0.006968,0.091700,0.020030,-0.057792,-0.082927,0.039145,0.194726,0.005335,0.055798,0.054020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,859125001,a white shirt with long sleeves and a button d...,-0.260890,0.158491,-0.109875,0.441637,0.031836,-0.168690,0.087984,0.054838,-0.223873,0.046644,-0.170008,-0.126721,-0.038846,0.065182,-0.130835,-0.085026,0.006266,0.037083,0.052132,-0.035234,0.007682,0.094923,0.069684,-0.043293,0.022966,-0.104507,0.047595,-0.048024,-0.017133,0.019767,0.055810,-0.014029,-0.000535,0.061616,-0.043647,0.047472,-0.081610,0.039828,-0.023052,-0.049487,-0.007982,-0.070649,-0.041782,-0.095863,-0.086619,0.050466,-0.044343,0.015329,-0.077488,-0.057409
992,859105002,a pair of shorts in beige with buttons on the ...,0.263939,-0.166281,-0.038916,0.254482,-0.049051,-0.091928,-0.126198,0.089481,0.002727,0.056984,-0.168851,-0.103050,-0.168665,0.130457,0.101427,0.122816,0.075769,0.116725,-0.196385,0.017274,-0.128162,-0.045755,-0.050198,-0.029065,-0.021733,0.066981,0.075096,-0.131669,0.016359,0.142100,-0.066692,0.099278,-0.098788,-0.091770,0.019280,0.149105,-0.070989,0.000374,-0.029842,-0.007812,0.076796,-0.011804,0.026530,0.006231,-0.010937,0.009582,0.024914,-0.058499,0.091240,-0.040803
993,856840001,a black and white top with a zebra print pattern,-0.286878,-0.018764,0.121034,-0.169658,0.343525,0.135330,0.089727,0.105590,0.078781,-0.078716,0.191135,0.069736,0.059270,-0.035931,0.031297,-0.004129,-0.145193,-0.076691,0.012781,-0.083024,-0.053960,0.125017,0.109594,-0.048274,-0.057122,0.026241,0.110808,-0.078234,0.021645,0.120286,-0.013907,0.047220,0.117094,0.033160,0.000984,0.111947,-0.008557,-0.016333,-0.015235,0.028779,-0.078588,-0.053530,-0.002199,-0.022978,-0.007927,-0.048680,0.014788,-0.097596,0.027583,0.060899
994,850259001,a top with lemon print and ruffled sleeves on ...,-0.285076,-0.032723,0.105439,-0.134898,0.073721,-0.003079,0.135381,0.038214,-0.017667,0.297031,-0.058980,0.089036,0.049308,-0.001898,-0.062840,0.094372,-0.147250,-0.026230,-0.060009,0.045468,-0.079068,-0.039072,0.058015,0.111100,-0.077320,-0.053692,0.177119,-0.064460,0.031478,0.022593,-0.008548,0.038744,0.009257,-0.001739,0.070678,-0.004595,-0.108173,-0.000322,0.044452,-0.028568,0.023602,-0.020606,-0.004756,0.084330,0.014442,-0.038790,-0.004023,-0.013702,0.010899,0.022329


In [16]:
captions = captions.set_index('article_id')
captions

Unnamed: 0_level_0,caption,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
578476001,the skinny black pants are made from stretchy ...,0.515711,-0.027654,-0.190305,-0.139294,-0.009907,-0.068835,-0.136067,0.027146,-0.031252,-0.016807,0.054875,-0.092974,-0.088357,-0.074815,-0.148455,-0.037319,-0.022574,-0.040412,-0.012248,-0.009295,-0.042290,0.046220,-0.101347,-0.011422,0.026391,-0.038918,0.056363,0.070302,0.029307,0.079672,-0.156431,0.008571,0.076951,-0.008258,-0.094703,-0.002241,0.002730,-0.001146,-0.048176,-0.006321,0.014307,-0.031919,0.062916,-0.020555,0.057368,-0.043883,0.021208,-0.040354,0.024065,0.020991
573085042,the skinny jeans are made from a dark wash den...,0.543518,0.184471,0.079495,-0.188556,0.042054,-0.147709,-0.120657,-0.025163,-0.019986,-0.091952,0.101073,-0.010325,0.155094,-0.143355,-0.195953,0.109768,-0.123807,-0.006848,-0.030577,-0.066592,-0.010800,-0.097276,-0.086110,-0.012172,0.060081,-0.063026,-0.042236,0.038052,-0.031517,-0.058493,-0.065087,0.009260,0.022669,0.008442,-0.048614,0.054878,0.013658,-0.001292,0.039855,0.017079,-0.018745,-0.090192,-0.009869,-0.041825,0.055731,0.022810,-0.047928,0.012876,0.028556,0.007815
572797002,a white tank top with a scoop neckline and ruf...,-0.261078,-0.044818,0.098601,-0.137210,0.294043,-0.115099,0.251523,-0.045751,-0.141905,-0.142426,-0.209978,0.034288,-0.022632,0.101718,0.079937,0.165858,0.025092,-0.008365,0.083587,0.011307,0.109200,-0.021847,-0.126312,-0.095686,-0.069920,-0.112888,0.147401,0.000497,-0.011265,-0.184086,-0.053964,-0.014114,-0.044336,0.070598,-0.030173,-0.042966,-0.033967,0.073189,-0.029538,0.097238,-0.010233,-0.066940,0.038384,0.021926,-0.050115,-0.055622,0.049283,0.008487,-0.037442,-0.005010
579541089,the cropped cardigan sweater is made from a so...,0.078603,0.578686,0.108701,-0.105878,-0.163558,0.214609,-0.094166,0.025330,0.082340,0.023061,-0.277417,0.099671,-0.153420,-0.076826,0.067997,-0.060581,-0.026710,-0.142784,-0.030269,-0.102267,0.062883,0.037585,0.039557,0.013074,-0.008460,0.039008,0.054842,-0.022129,-0.081990,-0.034077,-0.107469,-0.058446,-0.038799,0.046772,-0.043058,-0.016598,-0.002089,-0.006821,0.137600,-0.050285,-0.009343,0.040155,0.031049,-0.080647,-0.004886,0.025323,-0.140423,0.080382,0.030934,-0.034177
575347003,six pairs of black socks on a white surface wi...,0.234815,-0.052231,0.114519,0.216589,0.206344,0.507604,0.044938,0.031761,-0.099494,-0.077753,-0.026071,-0.008132,0.164475,0.045654,0.048138,0.107662,-0.197593,-0.043307,-0.022894,-0.015254,0.023573,-0.043254,-0.010667,0.062917,0.202167,-0.117436,-0.174311,-0.064676,-0.094202,0.102665,0.093779,0.008169,-0.008287,-0.014609,-0.130734,-0.012306,-0.087195,-0.079652,-0.051725,-0.082240,-0.006968,0.091700,0.020030,-0.057792,-0.082927,0.039145,0.194726,0.005335,0.055798,0.054020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859125001,a white shirt with long sleeves and a button d...,-0.260890,0.158491,-0.109875,0.441637,0.031836,-0.168690,0.087984,0.054838,-0.223873,0.046644,-0.170008,-0.126721,-0.038846,0.065182,-0.130835,-0.085026,0.006266,0.037083,0.052132,-0.035234,0.007682,0.094923,0.069684,-0.043293,0.022966,-0.104507,0.047595,-0.048024,-0.017133,0.019767,0.055810,-0.014029,-0.000535,0.061616,-0.043647,0.047472,-0.081610,0.039828,-0.023052,-0.049487,-0.007982,-0.070649,-0.041782,-0.095863,-0.086619,0.050466,-0.044343,0.015329,-0.077488,-0.057409
859105002,a pair of shorts in beige with buttons on the ...,0.263939,-0.166281,-0.038916,0.254482,-0.049051,-0.091928,-0.126198,0.089481,0.002727,0.056984,-0.168851,-0.103050,-0.168665,0.130457,0.101427,0.122816,0.075769,0.116725,-0.196385,0.017274,-0.128162,-0.045755,-0.050198,-0.029065,-0.021733,0.066981,0.075096,-0.131669,0.016359,0.142100,-0.066692,0.099278,-0.098788,-0.091770,0.019280,0.149105,-0.070989,0.000374,-0.029842,-0.007812,0.076796,-0.011804,0.026530,0.006231,-0.010937,0.009582,0.024914,-0.058499,0.091240,-0.040803
856840001,a black and white top with a zebra print pattern,-0.286878,-0.018764,0.121034,-0.169658,0.343525,0.135330,0.089727,0.105590,0.078781,-0.078716,0.191135,0.069736,0.059270,-0.035931,0.031297,-0.004129,-0.145193,-0.076691,0.012781,-0.083024,-0.053960,0.125017,0.109594,-0.048274,-0.057122,0.026241,0.110808,-0.078234,0.021645,0.120286,-0.013907,0.047220,0.117094,0.033160,0.000984,0.111947,-0.008557,-0.016333,-0.015235,0.028779,-0.078588,-0.053530,-0.002199,-0.022978,-0.007927,-0.048680,0.014788,-0.097596,0.027583,0.060899
850259001,a top with lemon print and ruffled sleeves on ...,-0.285076,-0.032723,0.105439,-0.134898,0.073721,-0.003079,0.135381,0.038214,-0.017667,0.297031,-0.058980,0.089036,0.049308,-0.001898,-0.062840,0.094372,-0.147250,-0.026230,-0.060009,0.045468,-0.079068,-0.039072,0.058015,0.111100,-0.077320,-0.053692,0.177119,-0.064460,0.031478,0.022593,-0.008548,0.038744,0.009257,-0.001739,0.070678,-0.004595,-0.108173,-0.000322,0.044452,-0.028568,0.023602,-0.020606,-0.004756,0.084330,0.014442,-0.038790,-0.004023,-0.013702,0.010899,0.022329


In [18]:
data = pd.read_parquet('/kaggle/input/captions/captioning_parquet_files/parquet_files/data_with_captions_and_pc.parquet')

In [19]:
data = data.drop(['caption', 'pc1', 'pc2'], axis=1)
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,week_nr,days_since_last_purchase
0,2020-07-22,200292573348128,880777001,0.025407,2,96,1.0,999.0,880777,10832,59,20,6,1010016,0,52,16,7,5,4,3,1645,197,1,1,2,2,51,27,1018,12,9213,1,1,0,1,25,63947,30,0.0
1,2020-07-22,200292573348128,784332002,0.025407,2,96,1.0,999.0,784332,25312,274,7,1,1010016,0,9,0,4,0,5,0,1641,0,0,0,1,0,18,12,1005,0,7303,1,1,0,1,25,63947,30,0.0
2,2020-07-22,200292573348128,827968001,0.016932,2,96,1.0,10.0,827968,1066,255,3,0,1010016,0,10,2,3,2,9,2,1676,10,0,0,1,0,16,30,1002,2,1227,1,1,0,1,25,63947,30,0.0
3,2020-07-22,200292573348128,599580086,0.011847,2,96,1.0,999.0,599580,36,59,20,6,1010016,0,91,30,3,2,19,7,4242,5,7,7,1,0,60,22,1018,12,52,1,1,0,1,25,63947,30,0.0
4,2020-07-22,248294615847351,720504008,0.031458,1,96,1.0,999.0,720504,1373,272,0,1,1010023,4,7,4,2,3,12,4,5672,82,2,2,3,3,56,46,1016,11,95,-1,-1,0,0,46,8666,30,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0,8.0,915529,7046,252,2,0,1010016,0,9,0,4,0,5,0,1626,1,0,0,1,0,15,0,1003,3,10909,1,1,0,1,71,116920,39,0.0
17991758,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0,9.0,915529,7046,252,2,0,1010016,0,13,12,1,1,1,14,1626,1,0,0,1,0,15,0,1003,3,10909,1,1,0,1,71,116920,39,0.0
17991759,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0,10.0,448509,259,272,0,1,1010016,0,72,6,3,2,2,1,1747,37,1,1,2,2,53,1,1009,5,255,1,1,0,1,71,116920,39,
17991760,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0,11.0,762846,472,259,8,0,1010016,0,13,12,1,1,1,14,1515,3,0,0,1,0,11,7,1010,6,492,1,1,0,1,71,116920,39,0.0


In [18]:
small_sample = data.head(100)
small_sample

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,week_nr,days_since_last_purchase
0,2020-07-22,200292573348128,880777001,0.025407,2,96,1.0,999.0,880777,10832,59,20,6,1010016,0,52,16,7,5,4,3,1645,197,1,1,2,2,51,27,1018,12,9213,1,1,0,1,25,63947,30,0.0
1,2020-07-22,200292573348128,784332002,0.025407,2,96,1.0,999.0,784332,25312,274,7,1,1010016,0,9,0,4,0,5,0,1641,0,0,0,1,0,18,12,1005,0,7303,1,1,0,1,25,63947,30,0.0
2,2020-07-22,200292573348128,827968001,0.016932,2,96,1.0,10.0,827968,1066,255,3,0,1010016,0,10,2,3,2,9,2,1676,10,0,0,1,0,16,30,1002,2,1227,1,1,0,1,25,63947,30,0.0
3,2020-07-22,200292573348128,599580086,0.011847,2,96,1.0,999.0,599580,36,59,20,6,1010016,0,91,30,3,2,19,7,4242,5,7,7,1,0,60,22,1018,12,52,1,1,0,1,25,63947,30,0.0
4,2020-07-22,248294615847351,720504008,0.031458,1,96,1.0,999.0,720504,1373,272,0,1,1010023,4,7,4,2,3,12,4,5672,82,2,2,3,3,56,46,1016,11,95,-1,-1,0,0,46,8666,30,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2020-07-22,39703554858650166,649440048,0.011847,2,96,1.0,999.0,649440,91,255,3,0,1010008,5,9,0,4,0,5,0,5828,9,2,2,3,3,20,23,1005,0,0,1,1,0,1,20,17024,30,0.0
96,2020-07-22,39703554858650166,585480001,0.016932,2,96,1.0,999.0,585480,11182,306,13,4,1010016,0,9,0,4,0,5,0,3710,107,7,7,1,0,61,5,1017,4,9914,1,1,0,1,20,17024,30,0.0
97,2020-07-22,43585159363207567,868161002,0.033881,1,96,1.0,999.0,868161,18005,258,5,0,1010016,0,51,3,3,2,3,10,1515,3,0,0,1,0,11,7,1010,6,11575,1,1,0,1,45,330344,30,0.0
98,2020-07-22,43585159363207567,841793003,0.027102,1,96,1.0,999.0,841793,7979,253,9,0,1010016,0,10,2,3,2,9,2,1515,3,0,0,1,0,11,7,1010,6,10756,1,1,0,1,45,330344,30,0.0


In [20]:
# join captions with data dataframe
data_captions = data.merge(captions, on='article_id', how='left')
data = data_captions

In [20]:
# data = data.drop(['caption_y', 'pc1_y', 'pc2_y'], axis=1)
# data

In [21]:
data.caption = data.caption.fillna('')
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,week_nr,days_since_last_purchase,caption,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49
0,2020-07-22,200292573348128,880777001,0.025407,2,96,1.0,999.0,880777,10832,59,20,6,1010016,0,52,16,7,5,4,3,1645,197,1,1,2,2,51,27,1018,12,9213,1,1,0,1,25,63947,30,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-07-22,200292573348128,784332002,0.025407,2,96,1.0,999.0,784332,25312,274,7,1,1010016,0,9,0,4,0,5,0,1641,0,0,0,1,0,18,12,1005,0,7303,1,1,0,1,25,63947,30,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-07-22,200292573348128,827968001,0.016932,2,96,1.0,10.0,827968,1066,255,3,0,1010016,0,10,2,3,2,9,2,1676,10,0,0,1,0,16,30,1002,2,1227,1,1,0,1,25,63947,30,0.0,a white t - shirt with a short sleeve and a sh...,-0.170963,0.125315,-0.070946,0.414099,0.011029,-0.117704,-0.007481,-0.133631,-0.303337,0.157205,-0.084690,0.155905,-0.001817,0.038022,-0.027409,0.014418,0.051000,-0.045029,-0.064564,-0.102366,0.075422,-0.034975,-0.008083,-0.012169,0.049369,0.026277,-0.068404,-0.112530,-0.053853,-0.108371,0.035925,-0.006778,0.034578,-0.014365,0.052595,0.060609,0.057311,0.013952,-0.091560,0.058054,-0.042180,-0.059691,0.038067,0.036548,0.065583,-0.049304,-0.025219,0.060136,-0.027526,-0.001357
3,2020-07-22,200292573348128,599580086,0.011847,2,96,1.0,999.0,599580,36,59,20,6,1010016,0,91,30,3,2,19,7,4242,5,7,7,1,0,60,22,1018,12,52,1,1,0,1,25,63947,30,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-07-22,248294615847351,720504008,0.031458,1,96,1.0,999.0,720504,1373,272,0,1,1010023,4,7,4,2,3,12,4,5672,82,2,2,3,3,56,46,1016,11,95,-1,-1,0,0,46,8666,30,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991757,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0,8.0,915529,7046,252,2,0,1010016,0,9,0,4,0,5,0,1626,1,0,0,1,0,15,0,1003,3,10909,1,1,0,1,71,116920,39,0.0,the cropped sweater is black and has puff slee...,-0.055074,0.306548,-0.357711,-0.046762,0.002765,0.198750,-0.111173,-0.022372,-0.021536,0.193661,-0.130308,-0.078287,0.078943,-0.218341,0.116880,-0.130597,0.003162,-0.042700,-0.018012,0.081268,0.034808,0.035914,-0.051436,0.006933,-0.002885,0.021734,-0.050018,0.134731,0.032368,-0.092394,-0.054092,0.004041,-0.004369,0.059475,0.011685,-0.038583,-0.004435,-0.008936,-0.002630,0.007446,0.011881,0.094363,-0.041230,0.064723,0.021644,0.041402,-0.001505,-0.051423,0.030196,0.021024
17991758,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0,9.0,915529,7046,252,2,0,1010016,0,13,12,1,1,1,14,1626,1,0,0,1,0,15,0,1003,3,10909,1,1,0,1,71,116920,39,0.0,the cropped sweater in beige is made from a so...,0.123746,0.565306,0.150030,-0.112724,-0.125986,0.152190,-0.098844,-0.046843,0.189867,-0.004971,-0.254307,0.055054,-0.163766,-0.110668,0.082709,0.045643,0.069483,0.025111,-0.092837,-0.032352,0.010469,-0.087498,-0.136419,-0.119536,-0.013757,-0.018385,0.037056,-0.015691,-0.063224,0.094089,0.007146,-0.062097,-0.074974,-0.019507,0.046556,0.064551,-0.086293,-0.027974,-0.028007,-0.065503,0.081180,-0.030630,0.080722,-0.016553,0.014999,0.004859,-0.018463,-0.009661,0.067485,-0.024754
17991759,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0,10.0,448509,259,272,0,1,1010016,0,72,6,3,2,2,1,1747,37,1,1,2,2,53,1,1009,5,255,1,1,0,1,71,116920,39,,the jeans are light blue and have a faded wash...,0.561379,0.050978,0.153473,-0.096111,-0.032089,-0.090965,0.004965,0.112870,-0.083268,-0.048273,0.075425,-0.089019,0.240913,0.053490,0.142149,-0.053321,-0.082436,0.067272,-0.029641,-0.142884,0.151743,-0.018854,-0.039870,0.031151,-0.048709,0.065703,-0.106118,-0.070859,0.046813,-0.029779,0.029169,-0.067374,-0.124887,0.075326,0.024499,-0.074637,-0.067315,0.018803,-0.032299,-0.072671,-0.039031,-0.090673,-0.014555,0.038745,-0.008837,-0.064849,0.040505,0.003695,-0.041064,0.008846
17991760,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0,11.0,762846,472,259,8,0,1010016,0,13,12,1,1,1,14,1515,3,0,0,1,0,11,7,1010,6,492,1,1,0,1,71,116920,39,0.0,the silk shirt is white and has buttons on the...,-0.131860,0.205244,0.082912,0.364054,0.032541,-0.106107,0.017094,0.242958,-0.032703,-0.073934,0.153515,-0.169750,-0.033079,0.086171,-0.119847,-0.116071,-0.079427,0.044902,0.002128,0.055910,-0.105985,-0.080701,-0.125137,0.064045,-0.042697,-0.099911,-0.036068,-0.011069,0.073134,-0.059484,0.047042,-0.080838,-0.025126,-0.017016,-0.023381,0.073322,-0.051821,0.046357,-0.032536,-0.001338,-0.086039,0.040983,0.012037,0.070079,0.042087,-0.042932,0.006330,-0.049830,0.041973,-0.038846


In [22]:
print(data['caption'].value_counts()[''])

3506498


In [23]:
data.to_parquet('/kaggle/working/data_with_captions_and_pc50.parquet')

  if _pandas_api.is_sparse(col):


__ __ __ __ end of features

In [9]:
data = pd.read_parquet('/kaggle/input/captions-pc50/data_with_captions_and_pc50.parquet')

In [10]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [11]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [12]:
train.to_parquet('/kaggle/working/train.parquet')
test.to_parquet('/kaggle/working/test.parquet')

  if _pandas_api.is_sparse(col):


In [8]:
train = pd.read_parquet('/kaggle/working/train.parquet')
test = pd.read_parquet('/kaggle/working/test.parquet')

In [13]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [46]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 
                  # 'bestseller_rank', 
#                   'week_nr', 'days_since_last_purchase', 
                  'pc0', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10', 'pc11', 'pc12', 'pc13', 'pc14', 'pc15', 'pc16', 'pc17', 'pc18', 'pc19', 'pc20', 'pc21', 'pc22', 'pc23', 'pc24', 'pc25', 'pc26', 'pc27', 'pc28', 'pc29', 'pc30', 'pc31', 'pc32', 'pc33', 'pc34', 'pc35', 'pc36', 'pc37', 'pc38', 'pc39', 'pc40', 'pc41', 'pc42', 'pc43', 'pc44', 'pc45', 'pc46', 'pc47', 'pc48', 'pc49'
                  #, 'pc50', 'pc51', 'pc52', 'pc53', 'pc54', 'pc55', 'pc56', 'pc57', 'pc58', 'pc59', 'pc60', 'pc61', 'pc62', 'pc63', 'pc64', 'pc65', 'pc66', 'pc67', 'pc68', 'pc69', 'pc70', 'pc71', 'pc72', 'pc73', 'pc74', 'pc75', 'pc76', 'pc77', 'pc78', 'pc79', 'pc80', 'pc81', 'pc82', 'pc83', 'pc84', 'pc85', 'pc86', 'pc87', 'pc88', 'pc89', 'pc90', 'pc91', 'pc92', 'pc93', 'pc94', 'pc95', 'pc96', 'pc97', 'pc98', 'pc99'
                  ]

In [2]:
string = "["
for i in range(100):
    string += "'pc" + str(i) + "', "
string += "]"
print(string)

['pc0', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10', 'pc11', 'pc12', 'pc13', 'pc14', 'pc15', 'pc16', 'pc17', 'pc18', 'pc19', 'pc20', 'pc21', 'pc22', 'pc23', 'pc24', 'pc25', 'pc26', 'pc27', 'pc28', 'pc29', 'pc30', 'pc31', 'pc32', 'pc33', 'pc34', 'pc35', 'pc36', 'pc37', 'pc38', 'pc39', 'pc40', 'pc41', 'pc42', 'pc43', 'pc44', 'pc45', 'pc46', 'pc47', 'pc48', 'pc49', 'pc50', 'pc51', 'pc52', 'pc53', 'pc54', 'pc55', 'pc56', 'pc57', 'pc58', 'pc59', 'pc60', 'pc61', 'pc62', 'pc63', 'pc64', 'pc65', 'pc66', 'pc67', 'pc68', 'pc69', 'pc70', 'pc71', 'pc72', 'pc73', 'pc74', 'pc75', 'pc76', 'pc77', 'pc78', 'pc79', 'pc80', 'pc81', 'pc82', 'pc83', 'pc84', 'pc85', 'pc86', 'pc87', 'pc88', 'pc89', 'pc90', 'pc91', 'pc92', 'pc93', 'pc94', 'pc95', 'pc96', 'pc97', 'pc98', 'pc99', ]


In [47]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 826 ms, sys: 736 ms, total: 1.56 s
Wall time: 1.56 s


In [14]:
train_X.to_parquet('/kaggle/working/train_X.parquet')

  if _pandas_api.is_sparse(col):


In [16]:
train_y.to_parquet('/kaggle/working/train_y.parquet')

AttributeError: 'Series' object has no attribute 'to_parquet'

In [48]:
del data

NameError: name 'data' is not defined

In [49]:
train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,week_nr,days_since_last_purchase,caption,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pc41,pc42,pc43,pc44,pc45,pc46,pc47,pc48,pc49
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,887770,727,253,9,0,1010016,0,9,0,4,0,5,0,1510,3,0,0,1,0,6,10,1010,6,3692,1,1,0,1,21,57896,30,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,762846,472,259,8,0,1010016,0,10,2,3,2,9,2,1515,3,0,0,1,0,11,7,1010,6,492,1,1,0,1,21,57896,29,,"the white shirt is made from a soft, lightweig...",-0.062508,0.39433,0.319803,0.287853,0.020013,-0.014537,-0.113232,-0.089888,-0.12859,-0.071208,0.030846,0.064495,-0.012386,-0.053274,-0.297004,-0.036746,0.041514,0.001776,0.09586,-0.06866,0.08728,-0.04274,-0.152421,-0.155637,0.001382,-0.010321,0.074692,-0.093452,0.015897,0.09715,-0.053258,-0.130203,0.010091,0.088739,0.002498,0.007673,-0.001234,-0.02587,0.062699,-0.085627,-0.033624,-0.009235,0.095653,0.01031,-0.007293,-0.072759,-0.076322,-0.026902,0.002217,-0.024311
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,829308,11402,273,15,1,1010016,0,9,0,4,0,5,0,8310,65,9,9,26,4,5,21,1005,0,9082,1,1,0,1,21,57896,29,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-07-26,28847241659200,760084003,0.025094,1,96,0.0,1.0,760084,1134,272,0,1,1010016,0,9,0,4,0,5,0,1747,37,1,1,2,2,53,1,1009,5,847,1,1,0,1,21,57896,30,0.0,a black cargo pant with pockets on the side an...,0.215926,-0.234419,-0.204199,0.023771,0.115803,-0.020772,-0.001996,-0.183122,0.206735,0.016895,-0.032551,0.17046,-0.111937,-0.027995,-0.042492,0.037994,0.125143,-0.137621,-0.096133,-0.028749,0.009071,-0.046045,-0.019346,0.058061,-0.232144,0.075161,-0.002839,0.081598,0.136534,-0.048859,0.02412,0.090085,-0.021111,0.039923,0.041653,0.009099,0.027304,0.083135,-0.057053,-0.127933,0.066184,-0.015614,-0.096699,-0.076315,-0.064174,-0.152041,-0.084394,0.025905,-0.036878,0.049437
4,2020-07-26,28847241659200,866731001,0.024919,1,96,0.0,2.0,866731,3609,273,15,1,1010016,0,9,0,4,0,5,0,8310,65,9,9,26,4,5,21,1005,0,3130,1,1,0,1,21,57896,30,0.0,a pair of black leggings with a high waist,0.342039,-0.294485,-0.24472,-0.122353,-0.012183,0.080385,-0.03723,-0.082282,-0.18688,-0.052749,-0.047501,-0.016443,0.070934,-0.011699,0.090604,0.018868,0.074009,-0.120305,-0.026558,0.021176,-0.144923,-0.010086,0.110655,0.065646,-0.096259,-0.039678,0.109467,-0.115453,-5.4e-05,0.024566,0.026289,-0.077334,-0.027632,-0.043864,0.024723,-0.040746,-0.021774,-0.096507,0.016029,-0.045121,-0.056442,0.003107,0.005402,-0.010585,-0.003753,0.005781,0.000517,0.072382,0.032343,-0.071065


# Model training

In [17]:
from lightgbm.sklearn import LGBMRanker



In [53]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain',
    verbose=10
)

In [54]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.041135
[LightGBM] [Debug] init for col-wise cost 0.210805 seconds, init for row-wise cost 4.598888 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13809
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 67
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 14
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
[LightGBM] [Debug] Trained 

In [55]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

pc3 0.3042607109928055
article_id 0.07567645226147036
pc42 0.0603813157465858
product_type_no 0.04777813633528206
pc22 0.03366403601159087
department_no 0.0318561917795939
pc1 0.02273999327453679
pc10 0.021289014606740837
pc25 0.017859285087051317
pc23 0.017693170247252324
pc7 0.01543632261181361
pc49 0.014399568811509784
pc19 0.014252469227257545
pc17 0.014054984144765336
pc39 0.013553421995536935
pc24 0.013541578746915136
pc33 0.013382269688518958
pc4 0.011902426952792934
pc14 0.011779531446606798
pc20 0.011676823946518625
pc2 0.011555203071623608
pc45 0.010866836673675535
pc38 0.010619355530605848
pc12 0.010509236180514197
pc9 0.009880743804195725
pc21 0.009782430701506206
pc29 0.009767469051375708
pc11 0.009634331784181025
colour_group_code 0.009486792954408678
pc36 0.009036037976676532
pc47 0.007933003261983994
garment_group_no 0.007614273850115753
pc13 0.007037479724845289
pc34 0.006719356856220785
pc48 0.006580050576207803
pc15 0.006571912855597672
pc41 0.006227189027691841
pc40

# Calculate predictions

In [56]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.63 µs


# Create submission

In [57]:
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [58]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 6.34 s, sys: 212 ms, total: 6.56 s
Wall time: 6.55 s


In [59]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [60]:
sub_name = 'basic_model_submission'
sub.to_csv(f'{sub_name}.csv.gz', index=False)