# Lecture 5: Experiments with Algorithms

## Setup

In [None]:
# Mount Google Drive so the files under /content/drive/MyDrive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the recpack directory from Drive into the local /content workspace.
!cp -r /content/drive/MyDrive/recpack /content

In [None]:
# Extract the three dataset archives from Drive; each one contains a single .feather file.
!unzip /content/drive/MyDrive/transactions_train.zip
!unzip /content/drive/MyDrive/articles.zip
!unzip /content/drive/MyDrive/customers.zip

Archive:  /content/drive/MyDrive/transactions_train.zip
  inflating: transactions_train.feather  
Archive:  /content/drive/MyDrive/articles.zip
  inflating: articles.feather        
Archive:  /content/drive/MyDrive/customers.zip
  inflating: customers.feather       


In [None]:
# Install sentence_transformers into the interpreter that runs this kernel
# (sys.executable), so the import further down resolves to the same environment.
import sys
!{sys.executable} -m pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.9 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 31.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 52.4 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 49.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 42.2 MB/s 
Building wheels for collected 

In [None]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import string
#import unidecode
import seaborn as sns
import nltk
# Download the NLTK data packages: Punkt tokenizer models, the stop-word corpus
# and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
from sklearn.decomposition import TruncatedSVD
from lightgbm.sklearn import LGBMRanker
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

### To reduce memory usage and running time, we convert the CSV datasets into Feather format.

In [None]:
#https://medium.com/@rajkrakesh/feather-vs-csv-829472196832
# Read the zipped CSV once and persist it as Feather. DataFrame.to_feather() returns
# None, so the frame is bound first and written in a separate statement; chaining the
# two calls would leave `transactions` set to None.
transactions = pd.read_csv('transactions_train.csv.zip')
transactions.to_feather('transactions_train.feather')

In [None]:
# Load the article and customer tables from their zipped CSV exports.
articles, customers = (
    pd.read_csv(path) for path in ('articles.csv.zip', 'customers.csv.zip')
)

In [None]:
# Persist the article table in Feather format; it is reloaded with read_feather below.
articles.to_feather('articles.feather')

In [None]:
# Persist the customer table in Feather format; it is reloaded with read_feather below.
customers.to_feather('customers.feather')

In [None]:
# Reload all three tables from their Feather files.
articles, customers, transactions = map(
    pd.read_feather,
    ('articles.feather', 'customers.feather', 'transactions_train.feather'),
)

### Preprocessing the transaction dataset


In [None]:
def customer_id_to_int(x):
    """Parse the last 16 characters of the hex string `x` as a base-16 integer."""
    hex_tail = x[-16:]
    return int(hex_tail, 16)

# Swap the string ids for compact integer keys.
# NOTE(review): customer_id_to_int parses 16 hex digits (up to 64 bits) but the result
# is cast to int32, so only part of the value survives and distinct customers can end
# up with the same key. The customers table is converted the same way, so any widening
# has to be applied to both tables together.
transactions = transactions.assign(
    customer_id=transactions['customer_id'].apply(customer_id_to_int).astype('int32'),
    article_id=transactions['article_id'].astype('int32'),
)

In [None]:
# Parse the purchase date and shrink the remaining numeric columns.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
transactions = transactions.astype({'sales_channel_id': 'int8', 'price': 'float32'})

In [None]:
# Number the weeks backwards from the most recent purchase date: that date falls in
# week 104 and every full 7 days before it lowers the index by one.
days_before_last = (transactions.t_dat.max() - transactions.t_dat).dt.days
transactions['week'] = (104 - days_before_last // 7).astype('int8')

# Keep only the most recent weeks (latest week index minus 8, inclusive).
last_month = transactions['week'].max() - 8
transactions = transactions[transactions['week'].ge(last_month)]

### Preprocessing the article dataset

In [None]:
ps = nltk.stem.porter.PorterStemmer()
punct = set(string.punctuation)


def preprocess_text(text):
    """Strip punctuation from `text`, tokenize it, stem each token and re-join with spaces."""
    #text = unidecode.unidecode(text).lower()  # remove accents and lower
    without_punct = ''.join(ch for ch in text if ch not in punct)
    tokens = nltk.word_tokenize(without_punct)
    return ' '.join(ps.stem(token) for token in tokens)

In [None]:
# No per-column NaN removal happens here: assigning `articles[c].dropna()` back to
# `articles[c]` re-aligns on the row index and restores every dropped NaN, so such a
# loop leaves the frame unchanged. `detail_desc` is passed through str() before it is
# embedded further down.
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,...,Socks Bin,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,Socks in a fine-knit cotton blend with a small...
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, A-line dress in jersey with a round nec..."
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,...,Small Accessories,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,Large plastic hair claw.


In [None]:
def _compact_int(values):
    """Return `values` in the narrowest signed integer dtype that can hold all of them."""
    return pd.to_numeric(values, downcast='integer')


# article_id uses the same explicit int32 as transactions['article_id'].
articles['article_id'] = articles['article_id'].astype('int32')

# Shrink the numeric code columns without changing their values. A fixed int8 cast
# only covers -128..127 and silently wraps anything larger (product_type_no 253 would
# become -3), which merges unrelated codes; downcasting picks a width that fits.
for column in [
    'product_code', 'product_type_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
    'index_group_no', 'section_no', 'garment_group_no',
]:
    articles[column] = _compact_int(articles[column])

#articles['prod_name'] = articles['prod_name'].apply(preprocess_text)
#articles['product_group_name'] = pd.factorize(articles['product_group_name'])[0].astype('int8')

# Label-encode the remaining categorical columns. index_code is encoded from its own
# values, not from department_no.
articles['graphical_appearance_no'] = _compact_int(pd.factorize(articles['graphical_appearance_no'])[0])
articles['index_code'] = _compact_int(pd.factorize(articles['index_code'])[0])

In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence-embedding model; see the model list at
# https://www.sbert.net/docs/pretrained_models.html
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Embed every article description (stringified first) and reduce the embeddings
# to 8 principal components.
detail_texts = articles['detail_desc'].map(str).values.tolist()
text_features = sbert_model.encode(detail_texts).tolist()
reducer = PCA(n_components=8)
text_transformation = reducer.fit_transform(text_features).tolist()

In [None]:
# Store the 8 reduced description components as columns detail_desc0..detail_desc7.
detail_desc_columns = [f'detail_desc{i}' for i in range(8)]
articles[detail_desc_columns] = pd.DataFrame(text_transformation)

In [None]:
# Same embedding + PCA reduction, this time for the product names.
product_names = articles['prod_name'].values.tolist()
text_features = sbert_model.encode(product_names).tolist()
reducer = PCA(n_components=8)
text_transformation = reducer.fit_transform(text_features).tolist()

In [None]:
# Store the 8 reduced product-name components as columns prod_name_0..prod_name_7.
prod_name_columns = [f'prod_name_{i}' for i in range(8)]
articles[prod_name_columns] = pd.DataFrame(text_transformation)

In [None]:
#data_vectorizer = TfidfVectorizer(min_df = 2)
#X = data_vectorizer.fit_transform(articles['prod_name'].map(str))
#transformed = TruncatedSVD(n_components=6).fit_transform(X).tolist()
#articles[[f'prod_name_{i}' for i in range(6)]] = pd.DataFrame(transformed)

In [None]:
#X = data_vectorizer.fit_transform(articles['detail_desc'].map(str).values.tolist())
#transformed = TruncatedSVD(n_components=6).fit_transform(X).tolist()
#articles[[f'detail_desc{i}' for i in range(6)]] = pd.DataFrame(transformed)

In [None]:
# The human-readable name/description columns are no longer needed once the numeric
# codes and text embeddings exist.
text_columns_to_drop = [
    'product_type_name', 'graphical_appearance_name', 'colour_group_name',
    'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name',
    'index_name', 'index_group_name', 'section_name', 'garment_group_name',
    'detail_desc', 'prod_name', 'product_group_name',
]
articles = articles.drop(columns=text_columns_to_drop)

In [None]:
# 'product_group_name' is already removed by the preceding drop, so tolerate its
# absence instead of raising KeyError when the notebook is run top to bottom.
articles = articles.drop(columns=['product_group_name'], errors='ignore')

In [None]:
# Inspect the article table after encoding and dropping the text columns.
articles

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,...,detail_desc6,detail_desc7,prod_name_0,prod_name_1,prod_name_2,prod_name_3,prod_name_4,prod_name_5,prod_name_6,prod_name_7
0,108775015,-25,-3,0,9,4,5,-116,0,1,...,0.041083,0.178092,0.111872,-0.027902,-0.006179,0.150907,-0.066845,-0.045066,-0.038631,0.208026
1,108775044,-25,-3,0,10,3,9,-116,0,1,...,0.041083,0.178092,0.111872,-0.027902,-0.006179,0.150907,-0.066845,-0.045066,-0.038631,0.208026
2,108775051,-25,-3,1,11,1,9,-116,0,1,...,0.041083,0.178092,0.079202,-0.075822,-0.126421,0.130819,-0.063504,-0.020413,-0.007094,0.167599
3,110065001,-15,50,0,9,4,5,59,1,1,...,0.194199,-0.197477,0.126515,-0.056410,-0.075005,0.021385,0.185929,-0.047067,-0.003078,0.110634
4,110065002,-15,50,0,10,3,9,59,1,1,...,0.194199,-0.197477,0.126515,-0.056410,-0.075005,0.021385,0.185929,-0.047067,-0.003078,0.110634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,106,46,18,9,4,5,20,13,3,...,-0.260117,0.063887,-0.298164,-0.188646,-0.333328,-0.132524,-0.120017,0.173635,-0.112967,-0.065272
105538,953763001,-93,-3,0,9,4,5,127,56,1,...,0.023042,-0.321749,-0.169315,0.013007,-0.260839,-0.063622,-0.025068,-0.148886,0.046484,0.079102
105539,956217002,57,9,0,9,4,5,105,33,1,...,-0.070490,0.149083,0.359452,0.138820,-0.132935,-0.005305,-0.136707,0.070296,-0.031907,-0.100961
105540,957375001,-65,72,0,9,4,5,106,30,2,...,-0.035293,0.035860,-0.040071,0.251877,0.078328,0.014182,-0.010682,0.105972,0.279376,-0.135249


### Preprocessing the customer dataset

In [None]:
def customer_id_to_int(x):
    """Parse the last 16 characters of the hex string `x` as a base-16 integer."""
    return int(x[-16:], 16)


# Missing FN / Active flags are treated as 0.
customers = customers.fillna({"FN": 0, "Active": 0})
# Same id conversion (including the int32 cast) as applied to the transactions table.
customers['customer_id'] = customers['customer_id'].apply(customer_id_to_int).astype('int32')

In [None]:
customers["FN"] = customers["FN"].astype('int8')
customers["Active"] = customers["Active"].astype('int8')
# Impute missing ages with the (truncated) mean age. The result is assigned back to
# the column: calling fillna(inplace=True) on `customers['age']` operates on a
# selected Series and does not reliably update the frame (no effect under pandas
# Copy-on-Write).
customers['age'] = customers['age'].fillna(int(customers['age'].mean()))
# Label-encode the categorical columns.
customers["fashion_news_frequency"] = pd.factorize(customers["fashion_news_frequency"])[0].astype('int8')
customers["club_member_status"] = pd.factorize(customers["club_member_status"])[0].astype('int8')
customers['postal_code'] = pd.factorize(customers['postal_code'])[0].astype('int32')

In [None]:
# Order the transactions by week, then by customer.
transactions = transactions.sort_values(by=['week', 'customer_id'])

In [None]:
transactions = transactions.drop_duplicates()
# Keep only transactions whose article and customer both exist in the lookup tables.
# The two inner joins are chained so the article filter is not discarded; only the
# article key is joined because no article attribute is selected below.
transactions_merge = (
    transactions
    .merge(articles[['article_id']], how="inner", on='article_id')
    .merge(customers, how="inner", on='customer_id')
)
transactions_processed = transactions_merge[['t_dat','customer_id','age','article_id', 'price', 'week']].copy()

In [None]:
# Discard any rows that still contain missing values.
transactions_processed = transactions_processed.dropna()

In [None]:
# From here on `transactions` refers to the slimmed-down, customer-joined table.
transactions = transactions_processed
transactions

Unnamed: 0,t_dat,customer_id,age,article_id,price,week
0,2020-07-22,294687682,49,778064038,0.008458,96
1,2020-07-22,294687682,49,817166007,0.006763,96
2,2020-07-22,294687682,49,840360002,0.008458,96
3,2020-07-22,294687682,49,817166003,0.008458,96
4,2020-07-22,974978159,55,624486001,0.012729,96
...,...,...,...,...,...,...
2263235,2020-09-22,-161766335,26,914404001,0.042356,104
2263236,2020-09-22,-161766335,26,751471041,0.033881,104
2263237,2020-09-22,-161766335,26,751471038,0.033881,104
2263238,2020-09-22,-473003020,46,895730002,0.045746,104


# Generating the candidates that were last purchased

In [None]:
#https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack
# The week to predict is the one right after the last observed week; only the
# 10 most recent observed weeks are kept for candidate generation.
latest_week = transactions.week.max()
test_week = latest_week + 1
transactions = transactions[transactions.week > latest_week - 10]
transactions

Unnamed: 0,t_dat,customer_id,age,article_id,price,week
0,2020-07-22,294687682,49,778064038,0.008458,96
1,2020-07-22,294687682,49,817166007,0.006763,96
2,2020-07-22,294687682,49,840360002,0.008458,96
3,2020-07-22,294687682,49,817166003,0.008458,96
4,2020-07-22,974978159,55,624486001,0.012729,96
...,...,...,...,...,...,...
2263235,2020-09-22,-161766335,26,914404001,0.042356,104
2263236,2020-09-22,-161766335,26,751471041,0.033881,104
2263237,2020-09-22,-161766335,26,751471038,0.033881,104
2263238,2020-09-22,-473003020,46,895730002,0.045746,104


In [None]:
# For every customer, map each week in which they bought something to the next week
# in which they bought something; their final purchase week maps to the test week.
last_purchased_weeks = transactions.groupby('customer_id')['week'].unique()
last_purchased_weeks_shifted = {}
for c_id, weeks in last_purchased_weeks.items():
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = test_week
    last_purchased_weeks_shifted[c_id] = next_week_of

In [None]:
# Re-issue every purchase as a candidate for the customer's next purchase week
# (or the test week, for their latest purchases).
candidates_last_purchase = transactions.copy()
weeks = [
    last_purchased_weeks_shifted[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]
candidates_last_purchase.week = weeks

# Bestseller Candidates

In [None]:
# https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb
# Average price of each article within each week.
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
# Per-week purchase counts per article, dense-ranked from most to least sold;
# only the first 12 rows of each week are kept.
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')
# Attach the mean price, then shift the week by one so that the bestsellers of week w
# are offered as candidates for week w + 1.
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
# One row per (week, customer) that made a purchase, without the article-specific columns.
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()
# Pair every active (week, customer) with that week's bestseller candidates.
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)
# One row per customer, relabelled to the test week, to receive the test-week candidates.
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)
# Training-week and test-week candidates together; the rank is re-attached later via a merge.
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combination of transactions and available candidates

In [None]:
# Real purchases are the positive examples. `assign` builds a new frame, which avoids
# writing into `transactions` while it is still a filtered slice of an earlier frame.
transactions = transactions.assign(purchased=1)
# Purchases go first so that, when the same (customer, article, week) also appears as
# a generated candidate, drop_duplicates keeps the positive row.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Candidate rows carry no label yet; mark them as negatives. Assign the result back
# instead of calling fillna(inplace=True) on `data.purchased`, which acts on a selected
# Series and has no effect on `data` under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])
data.purchased.mean()

0.13516074825980856

In [None]:
# Inspect the combined positives and candidate negatives.
data

Unnamed: 0,t_dat,customer_id,age,article_id,price,week,purchased
0,2020-07-22,294687682,49,778064038,0.008458,96,1.0
1,2020-07-22,294687682,49,817166007,0.006763,96,1.0
2,2020-07-22,294687682,49,840360002,0.008458,96,1.0
3,2020-07-22,294687682,49,817166003,0.008458,96,1.0
4,2020-07-22,974978159,55,624486001,0.012729,96,1.0
...,...,...,...,...,...,...,...
4917211,2020-09-22,-473003020,46,751471001,0.033387,105,0.0
4917212,2020-09-22,-473003020,46,915529005,0.033366,105,0.0
4917213,2020-09-22,-473003020,46,762846027,0.024979,105,0.0
4917214,2020-09-22,-473003020,46,918292001,0.041590,105,0.0


# Bestseller information

In [None]:
# Attach the bestseller rank of each (week, article); rows that are not bestseller
# candidates get NaN from the left join.
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)
# Non-bestsellers get the sentinel rank 999. This is done on the freshly merged frame
# and assigned back: fillna(inplace=True) on `data.bestseller_rank` of a filtered slice
# does not reliably update `data` (no effect under pandas Copy-on-Write).
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)
# Drop the earliest week present in the data.
data = data[data.week != data.week.min()]
# Add article and customer attributes. `data` already has an `age` column, so the
# customer merge yields `age_x` / `age_y`.
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')
# Sort by (week, customer_id) so each customer's rows within a week are contiguous,
# which the ranker's group sizes rely on.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [None]:
# Inspect the feature-enriched table.
data

Unnamed: 0,t_dat,customer_id,age_x,article_id,price,week,purchased,bestseller_rank,product_code,product_type_no,...,prod_name_4,prod_name_5,prod_name_6,prod_name_7,FN,Active,club_member_status,fashion_news_frequency,age_y,postal_code
0,2020-07-29,-2147481293,20,464297007,0.016932,97,1.0,999.0,47,30,...,0.055739,0.184584,-0.099735,0.050190,1,1,0,1,20,66225
1,2020-07-30,-2147481293,20,507909001,0.025407,97,1.0,999.0,-119,3,...,0.138754,0.170917,0.070271,0.003155,1,1,0,1,20,66225
2,2020-07-30,-2147481293,20,783346001,0.025407,97,1.0,999.0,81,16,...,-0.005821,-0.167344,-0.074249,0.080535,1,1,0,1,20,66225
3,2020-07-27,-2147481293,20,697564061,0.016932,97,0.0,999.0,-99,3,...,0.112621,0.176442,-0.017854,-0.107627,1,1,0,1,20,66225
4,2020-07-27,-2147481293,20,865594002,0.025407,97,0.0,999.0,-110,42,...,-0.006858,-0.146141,0.047321,-0.102298,1,1,0,1,20,66225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16172632,2020-07-29,2147478685,28,751471001,0.033387,105,0.0,8.0,-103,16,...,-0.096864,-0.044245,0.096277,-0.050821,-1,-1,0,0,28,190365
16172633,2020-07-29,2147478685,28,915529005,0.033366,105,0.0,9.0,45,-4,...,0.035976,-0.022066,0.093988,-0.200892,-1,-1,0,0,28,190365
16172634,2020-07-29,2147478685,28,762846027,0.024979,105,0.0,10.0,75,3,...,0.066252,0.034702,-0.058858,0.082445,-1,-1,0,0,28,190365
16172635,2020-07-29,2147478685,28,918292001,0.041590,105,0.0,11.0,33,17,...,-0.045158,-0.008782,0.087437,-0.038687,-1,-1,0,0,28,190365


In [None]:
# The customer merge duplicated the age column; keep `age_x` only.
data = data.drop(columns='age_y')

In [None]:
#data = pd.merge(data, articles, on='article_id', how='left')
#data = pd.merge(data, customers, on='customer_id', how='left')

In [None]:
#data.sort_values(['week', 'customer_id'], inplace=True)
#data.reset_index(drop=True, inplace=True)

# Splitting training and test data

In [None]:
# Rows labelled with the test week form the prediction set; everything else is training data.
is_test_week = data.week == test_week
train = data[~is_test_week]
test = data[is_test_week].drop_duplicates(subset=['customer_id', 'article_id']).copy()

In [None]:
# Inspect the training split.
train

Unnamed: 0,t_dat,customer_id,age_x,article_id,price,week,purchased,bestseller_rank,product_code,product_type_no,...,prod_name_3,prod_name_4,prod_name_5,prod_name_6,prod_name_7,FN,Active,club_member_status,fashion_news_frequency,postal_code
0,2020-07-29,-2147481293,20,464297007,0.016932,97,1.0,999.0,47,30,...,0.175057,0.055739,0.184584,-0.099735,0.050190,1,1,0,1,66225
1,2020-07-30,-2147481293,20,507909001,0.025407,97,1.0,999.0,-119,3,...,-0.057091,0.138754,0.170917,0.070271,0.003155,1,1,0,1,66225
2,2020-07-30,-2147481293,20,783346001,0.025407,97,1.0,999.0,81,16,...,0.170543,-0.005821,-0.167344,-0.074249,0.080535,1,1,0,1,66225
3,2020-07-27,-2147481293,20,697564061,0.016932,97,0.0,999.0,-99,3,...,-0.044893,0.112621,0.176442,-0.017854,-0.107627,1,1,0,1,66225
4,2020-07-27,-2147481293,20,865594002,0.025407,97,0.0,999.0,-110,42,...,-0.153173,-0.006858,-0.146141,0.047321,-0.102298,1,1,0,1,66225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9982667,2020-09-17,2147354887,23,448509014,0.041471,104,0.0,8.0,86,16,...,0.113876,0.148608,0.126384,-0.112904,-0.138671,1,1,0,1,74570
9982668,2020-09-17,2147354887,23,762846027,0.025082,104,0.0,9.0,75,3,...,-0.075774,0.066252,0.034702,-0.058858,0.082445,1,1,0,1,74570
9982669,2020-09-17,2147354887,23,809238005,0.041610,104,0.0,10.0,-11,-4,...,-0.285288,-0.024242,-0.068525,0.152685,0.053101,1,1,0,1,74570
9982670,2020-09-17,2147354887,23,673677002,0.024913,104,0.0,11.0,-54,-4,...,0.041502,0.113669,-0.223808,-0.119473,0.097020,1,1,0,1,74570


In [None]:
# Number of rows per (week, customer_id) group; passed to the ranker as `group` sizes.
# groupby returns the groups in sorted key order, matching the (week, customer_id)
# sort applied to `data` earlier.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [None]:
# Feature columns fed to the ranker, grouped by origin.
article_feature_columns = [
    'article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id', 'department_no',
    'index_code', 'index_group_no', 'section_no', 'garment_group_no',
]
customer_feature_columns = [
    'FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age_x', 'postal_code',
]
columns_to_use = (
    article_feature_columns
    + customer_feature_columns
    + ['bestseller_rank']
    + [f'detail_desc{i}' for i in range(8)]
    + [f'prod_name_{i}' for i in range(8)]
)

In [None]:
# Make sure the text-embedding columns are part of the feature list, adding only the
# ones that are missing. Appending them unconditionally would repeat names that are
# already listed, and `train[columns_to_use]` would then select duplicated columns.
text_embedding_columns = [f"prod_name_{i}" for i in range(8)] + [f"detail_desc{i}" for i in range(8)]
columns_to_use += [name for name in text_embedding_columns if name not in columns_to_use]

In [None]:
# Show the final feature list.
columns_to_use

['article_id',
 'product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_code',
 'index_group_no',
 'section_no',
 'garment_group_no',
 'FN',
 'Active',
 'club_member_status',
 'fashion_news_frequency',
 'age_x',
 'postal_code',
 'bestseller_rank',
 'detail_desc0',
 'detail_desc1',
 'detail_desc2',
 'detail_desc3',
 'detail_desc4',
 'detail_desc5',
 'detail_desc6',
 'detail_desc7',
 'prod_name_0',
 'prod_name_1',
 'prod_name_2',
 'prod_name_3',
 'prod_name_4',
 'prod_name_5',
 'prod_name_6',
 'prod_name_7']

In [None]:
# Feature matrix and labels for training, and the feature matrix to score for the test week.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

# Baseline Model Training (LGBMRanker)

In [None]:
# LambdaRank objective evaluated with NDCG, DART boosting, a single boosting round,
# gain-based feature importances and verbose logging.
ranker_params = {
    "objective": "lambdarank",
    "metric": "ndcg",
    "boosting_type": "dart",
    "n_estimators": 1,
    "importance_type": 'gain',
    "verbose": 10,
}
ranker = LGBMRanker(**ranker_params)

In [None]:
# Sanity check: the group sizes must add up to the number of training rows (next cell).
train_baskets.sum()

9982672

In [None]:
# Number of training rows; should equal the sum of the group sizes shown above.
train_X.shape[0]

9982672

In [None]:
# Fit the ranker; `group` gives the number of consecutive rows that belong to each
# (week, customer_id) query.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [None]:
# Print each feature with its share of the total importance, most important first.
importances = ranker.feature_importances_
importance_share = importances / importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importance_share[i])

bestseller_rank 0.9986552309874726
age_x 0.0003179796086764861
detail_desc5 0.0001866101583772841
detail_desc4 0.00017399814733175495
article_id 0.00016089990216255106
section_no 0.00015075491506517564
detail_desc3 9.562241091442533e-05
garment_group_no 6.81871585775318e-05
product_type_no 5.39028893218963e-05
index_code 5.153968361614544e-05
detail_desc1 4.3417098550645625e-05
colour_group_code 2.12836619255498e-05
club_member_status 2.0573378008025446e-05
perceived_colour_value_id 0.0
perceived_colour_master_id 0.0
department_no 0.0
index_group_no 0.0
graphical_appearance_no 0.0
FN 0.0
Active 0.0
prod_name_7 0.0
fashion_news_frequency 0.0
prod_name_6 0.0
detail_desc0 0.0
detail_desc2 0.0
detail_desc6 0.0
detail_desc7 0.0
prod_name_0 0.0
prod_name_1 0.0
prod_name_2 0.0
prod_name_3 0.0
prod_name_4 0.0
prod_name_5 0.0
postal_code 0.0


# Calculating the predictions

In [None]:
# Score every test-week candidate.
test['preds'] = ranker.predict(test_X)

# For each customer, list the candidate articles from highest to lowest score.
ranked_test = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = (
    ranked_test.groupby('customer_id')['article_id'].apply(list).to_dict()
)

# Bestsellers attached to the latest week, used to pad short prediction lists.
latest_bestseller_week = bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week.loc[
    bestsellers_previous_week.week == latest_bestseller_week, 'article_id'
].tolist()

# Preparing the submission file

In [None]:
# Extract sample_submission.csv from Drive into the working directory.
!unzip /content/drive/MyDrive/sample_submission.csv.zip

Archive:  /content/drive/MyDrive/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [None]:
# Load the submission template; its `prediction` column is overwritten further down.
submission = pd.read_csv('sample_submission.csv')

In [None]:
#https://github.com/radekosmulski/personalized_fashion_recs/blob/main/01_Solution_warmup.ipynb
def hex_id_to_int(str):
    """Parse the last 16 characters of the hex string `str` as a base-16 integer."""
    # The parameter name shadows the built-in `str` inside this function only.
    hex_tail = str[-16:]
    return int(hex_tail, 16)


def customer_hex_id_to_int(series):
    """Convert a Series of hex-string customer ids to integers, element by element."""
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

sub = pd.read_csv('sample_submission.csv')

# The keys of c_id2predicted_article_ids come from test['customer_id'], which went
# through the notebook's integer cast. Convert the submission ids to that same dtype;
# looking up the uncast 64-bit value would almost never match a key, and every
# customer would silently receive only the bestseller fallback.
sub_customer_ids = customer_hex_id_to_int(sub.customer_id).astype(test['customer_id'].dtype)

preds = []
for c_id in sub_customer_ids:
    pred = c_id2predicted_article_ids.get(c_id, [])
    # Pad with last week's bestsellers, keep first occurrences only so no article is
    # recommended twice, then cut to 12 items.
    pred = list(dict.fromkeys(pred + bestsellers_last_week))
    preds.append(pred[:12])
# Article ids are written with a leading '0' and separated by spaces.
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub['prediction'] = preds
#sub_name = 'lgbmr_model_submission'
#sub.to_csv(f'{sub_name}.csv.gz', index=False)

In [None]:
# `preds` already holds one space-joined prediction string per customer, so it is
# assigned as-is. Joining it again would iterate over the characters of each string.
submission['prediction'] = preds

In [None]:
# Inspect the filled-in submission table.
sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0918522001 0924243002 0923758001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0918522001 0924243002 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0918522001 0924243002 0923758001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0918522001 0924243002 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0918522001 0924243002 0923758001 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0924243001 0918522001 0924243002 0923758001 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0918522001 0924243002 0923758001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0918522001 0924243002 0923758001 08...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0918522001 0924243002 0923758001 08...


In [None]:
# Write `submission` to '<sub_name>.csv.gz' without the index column.
sub_name = 'submission2'
submission.to_csv(f'{sub_name}.csv.gz', index=False)