# Lecture 4: Experiments with Algorithms

## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/transactions_train.zip
!unzip /content/drive/MyDrive/articles.zip
!unzip /content/drive/MyDrive/customers.zip

Archive:  /content/drive/MyDrive/transactions_train.zip
  inflating: transactions_train.feather  
Archive:  /content/drive/MyDrive/articles.zip
  inflating: articles.feather        
Archive:  /content/drive/MyDrive/customers.zip
  inflating: customers.feather       


In [None]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 4.7 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.6


In [None]:
!{sys.executable} -m pip install feather-format

Collecting feather-format
  Downloading feather-format-0.4.1.tar.gz (3.2 kB)
Building wheels for collected packages: feather-format
  Building wheel for feather-format (setup.py): started
  Building wheel for feather-format (setup.py): finished with status 'done'
  Created wheel for feather-format: filename=feather_format-0.4.1-py3-none-any.whl size=2458 sha256=1f95b0ac16d0fabd87785a66eb8830fa6419e35ee4052ef1214b3ab360de9e7a
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\7b\83\4b\44aa8f2292c0ae17a550c5105454e974fc45a5b2a522d66ae1
Successfully built feather-format
Installing collected packages: feather-format
Successfully installed feather-format-0.4.1


In [3]:
import numpy as np 
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import string
#import unidecode
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [4]:
from sklearn.decomposition import TruncatedSVD
from lightgbm.sklearn import LGBMRanker

### In order to achieve a better perfomance in RAM and running time, we convert the CSV datasets into Feather format.

In [None]:
#https://medium.com/@rajkrakesh/feather-vs-csv-829472196832
transactions = pd.read_csv('transactions_train.csv.zip').to_feather('transactions_train.feather')

In [None]:
articles = pd.read_csv('articles.csv.zip')
customers = pd.read_csv('customers.csv.zip')

In [None]:
articles.to_feather('articles.feather')

In [None]:
customers.to_feather('customers.feather')

In [5]:
articles = pd.read_feather('articles.feather')
customers = pd.read_feather('customers.feather')
transactions = pd.read_feather('transactions_train.feather')

### Preprocessing the transaction dataset


In [6]:
def customer_id_to_int(x): return int(x[-16:], 16)

transactions['customer_id'] = transactions['customer_id'].apply(customer_id_to_int).astype('int32')
transactions['article_id'] = transactions['article_id'].astype('int32')

In [7]:
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7
transactions.sales_channel_id = transactions.sales_channel_id.astype('int8')
transactions.price = transactions.price.astype('float32')
#transactions.drop(columns='t_dat')

In [8]:
transactions = transactions.loc[transactions["week"] >= 96]
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29314980,2020-07-22,294687682,778064038,0.008458,2,96
29314981,2020-07-22,294687682,817166007,0.006763,2,96
29314982,2020-07-22,294687682,840360002,0.008458,2,96
29314983,2020-07-22,294687682,817166003,0.008458,2,96
29314984,2020-07-22,974978159,624486001,0.012729,2,96
...,...,...,...,...,...,...
31788319,2020-09-22,1362182998,929511001,0.059305,2,104
31788320,2020-09-22,1362182998,891322004,0.042356,2,104
31788321,2020-09-22,-324376415,918325001,0.043203,1,104
31788322,2020-09-22,2104975119,833459002,0.006763,1,104


### Preprocessing the article dataset

In [9]:
ps = nltk.stem.porter.PorterStemmer()
punct = set(string.punctuation)

def preprocess_text(text):
    #text = unidecode.unidecode(text).lower()  # remove accents and lower
    text = ''.join([char for char in text if char not in punct])  # remove punctuation 
    words = nltk.word_tokenize(text) # tokenize
    words = [ps.stem(w) for w in words] # stem words
    text = ' '.join(words) # convert back to text
    return text

In [10]:
for c in articles.columns:
      articles[c] = articles[c].dropna()
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,...,Socks Bin,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,Socks in a fine-knit cotton blend with a small...
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,...,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, A-line dress in jersey with a round nec..."
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,...,Small Accessories,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,Large plastic hair claw.


In [None]:
# In Jupyter Notebook worked perfectly well, in Colab no.
articles['article_id'] = articles['article_id'].astype('int32')
for c in articles.columns:
    if articles[c].dtype == 'object':
        articles[c] = articles[c].apply(preprocess_text)

In [12]:
data_vectorizer = TfidfVectorizer(min_df = 2)
X = data_vectorizer.fit_transform(articles['prod_name'].map(str))
transformed = TruncatedSVD(n_components=6).fit_transform(X).tolist()
articles[[f'prod_name_{i}' for i in range(6)]] = pd.DataFrame(transformed)

In [13]:
X = data_vectorizer.fit_transform(articles['detail_desc'].map(str).values.tolist())
transformed = TruncatedSVD(n_components=6).fit_transform(X).tolist()
articles[[f'detail_desc{i}' for i in range(6)]] = pd.DataFrame(transformed)

In [14]:
articles.drop(
        columns=['product_type_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name',
                 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name',
                 'garment_group_name', 'detail_desc'], inplace=True)

In [15]:
articles['graphical_appearance_no'] = pd.factorize(articles['graphical_appearance_no'])[0].astype('int8')
articles['product_group_name'] = pd.factorize(articles['product_group_name'])[0].astype('int8')
articles['index_code'] = pd.factorize(articles['department_no'])[0].astype('int8')
articles['department_no'] = articles['department_no'].astype('int16')
articles['garment_group_no'] = articles['garment_group_no'].astype('int8')

In [16]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,...,prod_name_2,prod_name_3,prod_name_4,prod_name_5,detail_desc0,detail_desc1,detail_desc2,detail_desc3,detail_desc4,detail_desc5
0,108775015,108775,strap top,253,0,0,9,4,5,1676,...,0.099267,0.399670,0.371911,-0.209661,0.228790,0.041428,-0.003849,-0.334597,0.184842,-0.124412
1,108775044,108775,strap top,253,0,0,10,3,9,1676,...,0.099267,0.399670,0.371911,-0.209661,0.228790,0.041428,-0.003849,-0.334597,0.184842,-0.124412
2,108775051,108775,strap top 1,253,0,1,11,1,9,1676,...,0.099267,0.399670,0.371911,-0.209661,0.228790,0.041428,-0.003849,-0.334597,0.184842,-0.124412
3,110065001,110065,op tshirt idro,306,1,0,9,4,5,1339,...,0.009557,0.011171,-0.012672,0.020196,0.197881,-0.080551,-0.149139,-0.075300,0.089431,-0.119313
4,110065002,110065,op tshirt idro,306,1,0,10,3,9,1339,...,0.009557,0.011171,-0.012672,0.020196,0.197881,-0.080551,-0.149139,-0.075300,0.089431,-0.119313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular placement1,302,2,18,9,4,5,7188,...,0.003882,-0.001330,-0.020137,-0.027315,0.322707,0.148015,0.196263,-0.023952,-0.141949,-0.303808
105538,953763001,953763,sport malaga tank,253,0,0,9,4,5,1919,...,0.022993,0.032678,0.007899,-0.010057,0.190535,0.036904,0.012697,0.006064,0.133524,0.019402
105539,956217002,956217,cartwheel dress,265,11,0,9,4,5,1641,...,-0.034087,-0.028636,0.014303,-0.015762,0.383275,0.098320,-0.121614,-0.103733,-0.091021,0.018774
105540,957375001,957375,clair hair claw,72,4,0,9,4,5,3946,...,0.000815,0.000424,-0.001909,-0.000910,0.005291,-0.003782,-0.005613,-0.001309,0.006278,-0.007182


### Preprocess of customer dataset

In [17]:
def customer_id_to_int(x): return int(x[-16:], 16)
customers['customer_id'] = customers['customer_id'].apply(customer_id_to_int).astype('int32')

In [18]:
for c in ['FN', 'Active', 'age']:
    customers[c].fillna(-1, inplace=True)
    customers[c] = customers[c].astype('int8')
customers["fashion_news_frequency"] = pd.factorize(customers["fashion_news_frequency"])[0].astype('int8')
customers["club_member_status"] = pd.factorize(customers["club_member_status"])[0].astype('int8')
customers['postal_code'] = pd.factorize(customers['postal_code'])[0].astype('int32')
customers

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,-1612724649,-1,-1,0,0,49,0
1,-1740365574,-1,-1,0,0,25,1
2,277996312,-1,-1,0,0,24,2
3,-16268226,-1,-1,0,0,54,3
4,-1922717606,1,1,0,1,52,4
...,...,...,...,...,...,...,...
1371975,-1940645839,-1,-1,0,0,24,62927
1371976,-1245382473,-1,-1,0,0,21,6316
1371977,-47869340,1,1,0,1,21,273671
1371978,1238687594,1,1,0,1,18,218323


# Generating the candidates that were last purchased

In [19]:
#https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29314980,2020-07-22,294687682,778064038,0.008458,2,96
29314981,2020-07-22,294687682,817166007,0.006763,2,96
29314982,2020-07-22,294687682,840360002,0.008458,2,96
29314983,2020-07-22,294687682,817166003,0.008458,2,96
29314984,2020-07-22,974978159,624486001,0.012729,2,96
...,...,...,...,...,...,...
31788319,2020-09-22,1362182998,929511001,0.059305,2,104
31788320,2020-09-22,1362182998,891322004,0.042356,2,104
31788321,2020-09-22,-324376415,918325001,0.043203,1,104
31788322,2020-09-22,2104975119,833459002,0.006763,1,104


In [20]:
#https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack
purch_weeks = transactions.groupby('customer_id')['week'].unique()
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [21]:
purch_shifted_weeks = {}
# Here we check the shifted purchased items all the available weeks in the transactions dataset within the all purchased items.
for c_id, weeks in purch_weeks.items():
    purch_shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        purch_shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    purch_shifted_weeks[c_id][weeks[-1]] = test_week

In [22]:
candidates_last_purchase = transactions.copy()
weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(purch_shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

# Bestseller Candidates

In [23]:
# Median price
median_price = transactions \
    .groupby(['week', 'article_id'])['price'].median()
median_price

week  article_id
96    108775015     0.002068
      108775044     0.008458
      111565001     0.002695
      111586001     0.011288
      111593001     0.011288
                      ...   
104   952267001     0.013441
      952938001     0.050831
      953450001     0.016932
      953763001     0.022017
      956217002     0.059305
Name: price, Length: 176327, dtype: float32

In [24]:
sales = transactions.groupby('week')['article_id'].value_counts().groupby('week').rank(method=
'dense', ascending=False).groupby('week').head(12).rename('bestseller_rank').astype('int8')
sales

week  article_id
96    827968001      1
      706016003      2
      706016001      3
      760084003      4
      717490064      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 108, dtype: int8

In [25]:
bestsellers_previous_week = pd.merge(sales, median_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

unique_transactions = transactions.groupby(['week', 'customer_id']).head(1).drop(columns=['article_id', 'price']).copy()

candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

# Combination of transactions and available candidates

In [26]:
transactions["purchased"] = 1

data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)
data["purchased"] = data["purchased"].astype("int8")

data.drop_duplicates(["customer_id", "article_id", "week"], inplace=True)
data.purchased.mean()

0.13514436388224577

# Bestseller information

In [27]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [28]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [29]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

# Splitting training and test data

In [30]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [None]:
train

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,detail_desc2,detail_desc3,detail_desc4,detail_desc5,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-29,-2147481293,464297007,0.016932,1,97,1,999.0,464297,greta thong mynta low 3p,...,-0.023156,-0.193897,0.093667,-0.133588,1,1,0,1,20,66225
1,2020-07-30,-2147481293,507909001,0.025407,1,97,1,999.0,507909,rebecca or delphin shirt,...,-0.041196,0.013745,-0.223388,0.297232,1,1,0,1,20,66225
2,2020-07-30,-2147481293,783346001,0.025407,1,97,1,999.0,783346,primo slack,...,0.067336,0.093201,-0.109777,-0.047968,1,1,0,1,20,66225
3,2020-07-27,-2147481293,697564061,0.016932,1,97,0,999.0,697564,kelli shirt s1,...,-0.105686,0.030172,-0.228471,0.417434,1,1,0,1,20,66225
4,2020-07-27,-2147481293,865594002,0.025407,1,97,0,999.0,865594,vitamin sea top,...,-0.182157,-0.084124,0.147809,-0.167377,1,1,0,1,20,66225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983879,2020-09-17,2147354887,918292001,0.042356,1,104,0,8.0,918292,strong hw seamless tight,...,0.019741,0.043474,0.043444,-0.086704,1,1,0,1,23,74570
9983880,2020-09-17,2147354887,762846027,0.025407,1,104,0,9.0,762846,luci blous,...,-0.193421,0.110309,-0.094700,0.331281,1,1,0,1,23,74570
9983881,2020-09-17,2147354887,809238005,0.042356,1,104,0,10.0,809238,atom wrap sweater,...,-0.129348,0.141752,-0.038696,-0.054752,1,1,0,1,23,74570
9983882,2020-09-17,2147354887,673677002,0.025407,1,104,0,11.0,673677,henri polo 1,...,0.029181,0.236490,-0.188987,-0.247529,1,1,0,1,23,74570


In [31]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [32]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [33]:
columns_to_use += [f"prod_name_{i}" for i in range(6)]
columns_to_use += [f"detail_desc{i}" for i in range(6)]

In [34]:
columns_to_use

['article_id',
 'product_type_no',
 'graphical_appearance_no',
 'colour_group_code',
 'perceived_colour_value_id',
 'perceived_colour_master_id',
 'department_no',
 'index_code',
 'index_group_no',
 'section_no',
 'garment_group_no',
 'FN',
 'Active',
 'club_member_status',
 'fashion_news_frequency',
 'age',
 'postal_code',
 'bestseller_rank',
 'prod_name_0',
 'prod_name_1',
 'prod_name_2',
 'prod_name_3',
 'prod_name_4',
 'prod_name_5',
 'detail_desc0',
 'detail_desc1',
 'detail_desc2',
 'detail_desc3',
 'detail_desc4',
 'detail_desc5']

In [35]:
#rain = train.reindex(columns_to_use)
train_X = train[columns_to_use]
train_y = train['purchased']
test_X = test[columns_to_use]

# Baseline Model Training (LGBMRanker)

In [36]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [37]:
train_baskets.sum()

9983884

In [38]:
train_X.shape[0]

9983884

In [39]:
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

In [40]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9983531793936743
detail_desc3 0.0005483340300577146
age 0.0003537714038157181
article_id 0.00019115232118738375
garment_group_no 0.00012928816622112408
index_code 0.00010739314499735762
section_no 9.782331091243137e-05
department_no 4.6842720652755586e-05
detail_desc2 3.427708474912043e-05
detail_desc0 2.780166700025138e-05
detail_desc5 2.6421890696499158e-05
club_member_status 2.288669756604699e-05
postal_code 2.2145697279786142e-05
prod_name_2 2.000233998197783e-05
prod_name_4 1.868013120748763e-05
prod_name_5 0.0
product_type_no 0.0
graphical_appearance_no 0.0
colour_group_code 0.0
perceived_colour_value_id 0.0
perceived_colour_master_id 0.0
detail_desc1 0.0
prod_name_0 0.0
index_group_no 0.0
prod_name_1 0.0
prod_name_3 0.0
FN 0.0
Active 0.0
detail_desc4 0.0
fashion_news_frequency 0.0


# Calculating the predictions

In [41]:
test['preds'] = ranker.predict(test_X)

predictions = test.sort_values(['customer_id', 
'preds'], ascending=False).groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = bestsellers_previous_week[bestsellers_previous_week.week
== bestsellers_previous_week.week.max()]['article_id'].tolist()

# Preparing the submission file

In [42]:
!unzip /content/drive/MyDrive/sample_submission.csv.zip

Archive:  /content/drive/MyDrive/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [43]:
submission = pd.read_csv('sample_submission.csv')

In [48]:
#https://github.com/radekosmulski/personalized_fashion_recs/blob/main/01_Solution_warmup.ipynb
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

preds = []
for c_id in customer_hex_id_to_int(submission.customer_id):
    pred = predictions.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])


In [50]:
predspreds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
submission.prediction = preds

In [52]:
sub_name = 'submission'
submission.to_csv(f'{sub_name}.csv.gz', index=False)