Based on the baseline "Radek's LGBMRanker starter pack warmup”: https://www.kaggle.com/code/marcogorelli/radek-s-lgbmranker-starter-pack-warmup


# Helper functions

In [1]:
!wget https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py

--2023-11-06 15:42:11--  https://raw.githubusercontent.com/benhamner/Metrics/master/Python/ml_metrics/average_precision.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1654 (1.6K) [text/plain]
Saving to: ‘average_precision.py’


2023-11-06 15:42:11 (23.2 MB/s) - ‘average_precision.py’ saved [1654/1654]



In [2]:
# helper functions
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from average_precision import apk

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []

    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

# Read input files

In [3]:
%%time
import pandas as pd

transactions = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', dtype={"article_id": "str"})
customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv')
articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv', dtype={"article_id": "str"})

CPU times: user 35.8 s, sys: 4.63 s, total: 40.5 s
Wall time: 1min 10s


In [4]:
%%time
transactions['customer_id'] = customer_hex_id_to_int(transactions['customer_id'])
transactions['customer_id'].nunique()

CPU times: user 23.7 s, sys: 1.92 s, total: 25.6 s
Wall time: 25.6 s


1362281

In [5]:
%%time

transactions.t_dat = pd.to_datetime(transactions.t_dat, format='%Y-%m-%d')

CPU times: user 2.99 s, sys: 139 ms, total: 3.13 s
Wall time: 3.13 s


# Add the season as feature

In [6]:
def get_season(date):
    month = date.month
    if 3 <= month <= 5:
        return 1  # Spring
    elif 6 <= month <= 8:
        return 2  # Summer
    elif 9 <= month <= 11:
        return 3  # Autumn
    else:
        return 4  # Winter
transactions['season'] = transactions['t_dat'].apply(get_season)
transactions['season'] = transactions['season'].astype('int8')


# Add month and year as features

In [7]:
transactions['year'] = transactions['t_dat'].dt.year
transactions['month'] = transactions['t_dat'].dt.month

transactions['year'] = transactions['year'].astype('int8')
transactions['month'] = transactions['month'].astype('int8')


Add week

In [8]:
transactions['week'] = 104 - (transactions.t_dat.max() - transactions.t_dat).dt.days // 7

Let's do something about the `article_id` (both here and on `articles`) and let's take a closer look at `price`, `sales_channel_id` and `week`.

In [9]:
transactions.article_id = article_id_str_to_int(transactions.article_id)
articles.article_id = article_id_str_to_int(articles.article_id)

transactions.week = transactions.week.astype('int8')
transactions.sales_channel_id = transactions.sales_channel_id.astype('int8')
transactions.price = transactions.price.astype('float32')

In [10]:
transactions.drop(columns='t_dat').info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   customer_id       uint64 
 1   article_id        int32  
 2   price             float32
 3   sales_channel_id  int8   
 4   season            int8   
 5   year              int8   
 6   month             int8   
 7   week              int8   
dtypes: float32(1), int32(1), int8(5), uint64(1)
memory usage: 636.6 MB


# Use median for empty age

In [11]:
customers.customer_id = customer_hex_id_to_int(customers.customer_id)
median_age = customers['age'].median()
customers['age'].fillna(median_age, inplace=True)
customers['age'] = customers['age'].astype('int8')

Let's take a brief look at the `customers` and `articles` dfs.

In [12]:
for col in ['FN', 'Active']:
    customers[col].fillna(0, inplace=True)
    customers[col] = customers[col].astype('int8')

In [13]:
customers.club_member_status = Categorize().fit_transform(customers[['club_member_status']]).club_member_status
customers.postal_code = Categorize().fit_transform(customers[['postal_code']]).postal_code
customers.fashion_news_frequency = Categorize().fit_transform(customers[['fashion_news_frequency']]).fashion_news_frequency

# Add fabrics as a feature

In [14]:
articles['detail_desc'] = articles['detail_desc'].fillna('')

# Extract fabric names from descriptions
fabric_names = ['jeans','cotton', 'wool', 'polyester', 'silk', 'denim', 'linen', 'spandex', 'rayon', 'nylon', 'leather', 'suede']

# Create fabric features
for fabric in fabric_names:
    articles[f'fabric_{fabric}'] = articles['detail_desc'].str.contains(fabric, case=False, regex=True).astype(int)

In [15]:
articles['section_name'].unique()

array(['Womens Everyday Basics', 'Womens Lingerie',
       'Womens Nightwear, Socks & Tigh', 'Baby Essentials & Complements',
       'Men Underwear', 'Mama', 'Womens Small accessories',
       'Men H&M Sport', 'Kids Boy', 'Divided Basics',
       'Girls Underwear & Basics', 'Mens Outerwear',
       'Womens Big accessories', 'Divided Accessories',
       'Womens Swimwear, beachwear', 'Divided Selected',
       'Boys Underwear & Basics', 'Contemporary Street',
       'Contemporary Casual', 'Men Accessories', 'Men Suits & Tailoring',
       'Womens Everyday Collection', 'Men Shoes', 'Young Boy', 'H&M+',
       'Divided Collection', 'Ladies Denim', 'Contemporary Smart',
       'Womens Trend', 'Kids Outerwear', 'Young Girl', 'Womens Shoes',
       'Womens Tailoring', 'Divided Projects', 'Denim Men', 'Men Other',
       'Womens Jackets', 'Men Other 2', 'Baby Boy', 'Womens Casual',
       'Kids Accessories, Swimwear & D', 'Ladies H&M Sport',
       'Kids & Baby Shoes', 'Baby Girl', 'Kids Girl

# Add feature if the customer buys kids, womans and/or mens clothing

In [16]:
# Step 1: Define keywords for kid and baby clothing sections
kid_baby_keywords = ['Kids', 'Baby', 'Girls', 'Girl', 'Boy', 'Boys']
mens_keywords = ['Men', 'Mens']
women_keywords = ['Womens', 'Women', 'Woman', 'Ladies', 'Mama']

# Step 2: Merge the transaction_train DataFrame with the articles DataFrame
merged_df = transactions.merge(articles[['article_id', 'section_name']], on='article_id', how='left')

# Step 3: Check if the section name contains kid or baby keywords
merged_df['buys_kid_baby_clothes'] = merged_df['section_name'].str.contains('|'.join(kid_baby_keywords), case=False)
merged_df['buys_mens_clothes'] = merged_df['section_name'].str.contains('|'.join(mens_keywords), case=False)
merged_df['buys_womens_clothing'] = merged_df['section_name'].str.contains('|'.join(women_keywords), case=False)

# Step 4: Aggregate the information for each customer
customer_buys_kid_baby = merged_df.groupby('customer_id')['buys_kid_baby_clothes'].any().reset_index()
customer_buys_mens = merged_df.groupby('customer_id')['buys_mens_clothes'].any().reset_index()
customer_buys_womens = merged_df.groupby('customer_id')['buys_womens_clothing'].any().reset_index()

# Merge this information with your customers DataFrame
customers = customers.merge(customer_buys_kid_baby, on='customer_id', how='left')
customers = customers.merge(customer_buys_mens, on='customer_id', how='left')
customers = customers.merge(customer_buys_womens, on='customer_id', how='left')

# Correct column names for converting to integers
customers['buys_mens_clothes'] = customers['buys_mens_clothes'].fillna(-1).astype(int)
customers['buys_womens_clothing'] = customers['buys_womens_clothing'].fillna(-1).astype(int)
customers['buys_kid_baby_clothes'] = customers['buys_kid_baby_clothes'].fillna(-1).astype(int)

In [17]:
articles['index_code'] = Categorize().fit_transform(articles[['index_code']])['index_code']

In [18]:
for col in articles.columns:
    if articles[col].dtype == 'int64':
        articles[col] = articles[col].astype('int32')

# And this concludes our raw data preparation step! Let's now write everything back to disk.

In [19]:
transactions.sort_values(['t_dat', 'customer_id'], inplace=True)

In [20]:
%%time
pad = ""
transactions.to_parquet(pad+'transactions_train.parquet')
customers.to_parquet(pad+'customers.parquet')
articles.to_parquet(pad+'articles.parquet')

CPU times: user 5.74 s, sys: 511 ms, total: 6.25 s
Wall time: 6.31 s
