In [1]:
import pandas as pd
import lightgbm as lgb
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.metrics import AUC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

2024-12-13 14:38:40.449959: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-13 14:38:40.450022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-13 14:38:40.457190: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-13 14:38:40.471539: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load data

user_log_format1 = pd.read_csv('../data/data_format1/user_log_format1.csv', dtype={'time_stamp':'str'})
user_info_format1 = pd.read_csv('../data/data_format1/user_info_format1.csv')
train_data_format1 = pd.read_csv('../data/data_format1/train_format1.csv')
submission_data_format1 = pd.read_csv('../data/data_format1/test_format1.csv')

data_train_format2 = pd.read_csv('../data/data_format2/train_format2.csv')
data_submission_format2 = pd.read_csv('../data/data_format2/test_format2.csv')

In [3]:
# print head of data_train_format2
print(data_train_format2.head())

   user_id  age_range  gender  merchant_id  label  \
0    34176        6.0     0.0          944     -1   
1    34176        6.0     0.0          412     -1   
2    34176        6.0     0.0         1945     -1   
3    34176        6.0     0.0         4752     -1   
4    34176        6.0     0.0          643     -1   

                                        activity_log  
0                            408895:1505:7370:1107:0  
1  17235:1604:4396:0818:0#954723:1604:4396:0818:0...  
2  231901:662:2758:0818:0#231901:662:2758:0818:0#...  
3                             174142:821:6938:1027:0  
4                             716371:1505:968:1024:3  


In [4]:
# Tag origin
train_data_format1['origin'] = 'train'
submission_data_format1['origin'] = 'test'
submission_data_format1.drop(['prob'], axis=1, inplace=True)

# Merge data
train_test_matrix = \
    pd.concat([train_data_format1, submission_data_format1], ignore_index=True, sort=False)
train_test_matrix = train_test_matrix.merge(user_info_format1, on='user_id', how='left')

# Give same name to seller_id in user_log_format1 and user_info_format1
user_log_format1.rename(columns={'seller_id':'merchant_id'}, inplace=True)

In [5]:
# Print if the column have null values
print(train_test_matrix.isnull().sum()) # Number of label to predict: 261477
print('\n')
print(user_log_format1.isnull().sum())

user_id             0
merchant_id         0
label          261477
origin              0
age_range        2578
gender           7545
dtype: int64


user_id            0
item_id            0
cat_id             0
merchant_id        0
brand_id       91015
time_stamp         0
action_type        0
dtype: int64


In [6]:
# Print range of all columns
for col in train_test_matrix.columns:
    print(f'{col}: {train_test_matrix[col].min()} - {train_test_matrix[col].max()}')
print('\n')

for col in user_log_format1.columns:
    print(f'{col}: {user_log_format1[col].min()} - {user_log_format1[col].max()}')

user_id: 1 - 424170
merchant_id: 2 - 4993
label: 0.0 - 1.0
origin: test - train
age_range: 0.0 - 8.0
gender: 0.0 - 2.0


user_id: 1 - 424170
item_id: 1 - 1113166
cat_id: 1 - 1671
merchant_id: 1 - 4995
brand_id: 1.0 - 8477.0
time_stamp: 0511 - 1112
action_type: 0 - 3


In [7]:
# Rename gender column. 0 for female, 1 for male, 2 or NULL for unknown
train_test_matrix['gender'] = train_test_matrix['gender'].map({
    0: 'female',
    1: 'male',
    2: 'unknown'
}).fillna('unknown')
# Rename age_range column. NULL for unknown
train_test_matrix['age_range'] = train_test_matrix['age_range'].map({
    1: 'first group',
    2: 'second group',
    3: 'third group',
    4: 'fourth group',
    5: 'fifth group',
    6: 'sixth group',
    7: 'seventh group',
    8: 'eighth group'
}).fillna('unknown')

# Rename action_type column. 0 for click, 1 for add-to-cart, 2 for purchase, 3 for add-to-favorite
user_log_format1['action_type'] = user_log_format1['action_type'].map({
    0: 'click',
    1: 'add-to-cart',
    2: 'purchase',
    3: 'add-to-favorite'
})
# Fill in the missing values of brand_id with 0
user_log_format1['brand_id'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_log_format1['brand_id'].fillna(0, inplace=True)


In [8]:
# Print if the column have null values
print(train_test_matrix.isnull().sum()) # Number of label to predict: 261477
print('\n')
print(user_log_format1.isnull().sum())

user_id             0
merchant_id         0
label          261477
origin              0
age_range           0
gender              0
dtype: int64


user_id        0
item_id        0
cat_id         0
merchant_id    0
brand_id       0
time_stamp     0
action_type    0
dtype: int64


In [9]:
# Print range of all columns
for col in train_test_matrix.columns:
    print(f'{col}: {train_test_matrix[col].min()} - {train_test_matrix[col].max()}')
print('\n')

for col in user_log_format1.columns:
    print(f'{col}: {user_log_format1[col].min()} - {user_log_format1[col].max()}')

user_id: 1 - 424170
merchant_id: 2 - 4993
label: 0.0 - 1.0
origin: test - train
age_range: eighth group - unknown
gender: female - unknown


user_id: 1 - 424170
item_id: 1 - 1113166
cat_id: 1 - 1671
merchant_id: 1 - 4995
brand_id: 0.0 - 8477.0
time_stamp: 0511 - 1112
action_type: add-to-cart - purchase


In [10]:
# Convert data types
train_test_matrix['user_id'] = train_test_matrix['user_id'].astype('uint32')
train_test_matrix['merchant_id'] = train_test_matrix['merchant_id'].astype('uint16')
train_test_matrix['label'] = train_test_matrix['label'].astype('float64')
train_test_matrix['origin'] = train_test_matrix['origin'].astype('category')
train_test_matrix['age_range'] = train_test_matrix['age_range'].astype('category')
train_test_matrix['gender'] = train_test_matrix['gender'].astype('category')

user_log_format1['user_id'] = user_log_format1['user_id'].astype('uint32')
user_log_format1['item_id'] = user_log_format1['item_id'].astype('uint32')
user_log_format1['cat_id'] = user_log_format1['cat_id'].astype('uint16')
user_log_format1['merchant_id'] = user_log_format1['merchant_id'].astype('uint16')
user_log_format1['brand_id'] = user_log_format1['brand_id'].astype('int16')
user_log_format1['time_stamp'] = pd.to_datetime('2016' + user_log_format1['time_stamp'], format='%Y%m%d')
user_log_format1['action_type'] = user_log_format1['action_type'].astype('category')

In [11]:
# Print range of all columns after conversion
for col in train_test_matrix.columns:
    if train_test_matrix[col].dtype.name == 'category' and not train_test_matrix[col].cat.ordered:
        print(f'{col}: Cannot compute range on unordered categorical data')
    elif train_test_matrix[col].dtype.kind in 'biufc':  # Numeric columns
        print(f'{col}: {train_test_matrix[col].min()} - {train_test_matrix[col].max()}')
    else:
        print(f'{col}: Non-numeric or unsupported type')
print('\n')

for col in user_log_format1.columns:
    if user_log_format1[col].dtype.name == 'category' and not user_log_format1[col].cat.ordered:
        print(f'{col}: Cannot compute range on unordered categorical data')
    elif user_log_format1[col].dtype.kind in 'biufc':  # Numeric columns
        print(f'{col}: {user_log_format1[col].min()} - {user_log_format1[col].max()}')
    else:
        print(f'{col}: Non-numeric or unsupported type')

user_id: 1 - 424170
merchant_id: 2 - 4993
label: 0.0 - 1.0
origin: Cannot compute range on unordered categorical data
age_range: Cannot compute range on unordered categorical data
gender: Cannot compute range on unordered categorical data


user_id: 1 - 424170
item_id: 1 - 1113166
cat_id: 1 - 1671
merchant_id: 1 - 4995
brand_id: 0 - 8477
time_stamp: Non-numeric or unsupported type
action_type: Cannot compute range on unordered categorical data


In [12]:
# Print head of train_test_matrix
print(train_test_matrix.head())

   user_id  merchant_id  label origin    age_range  gender
0    34176         3906    0.0  train  sixth group  female
1    34176          121    0.0  train  sixth group  female
2    34176         4356    1.0  train  sixth group  female
3    34176         2217    0.0  train  sixth group  female
4   230784         4818    0.0  train      unknown  female


In [13]:
user_grouped_by_user_id = user_log_format1.groupby('user_id')

train_test_matrix = train_test_matrix.merge(
    user_grouped_by_user_id['item_id'].nunique().reset_index().rename(columns={'item_id': 'u_iid'}),
    on='user_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_user_id['cat_id'].nunique().reset_index().rename(columns={'cat_id': 'u_cid'}),
    on='user_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_user_id['merchant_id'].nunique().reset_index().rename(columns={'merchant_id': 'u_mid'}),
    on='user_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_user_id['brand_id'].nunique().reset_index().rename(columns={'brand_id': 'u_bid'}),
    on='user_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_user_id['action_type'].value_counts().unstack().reset_index().rename(
        columns={'click': 'u_click', 'add-to-cart': 'u_cart', 'purchase': 'u_purchase', 'add-to-favorite': 'u_fav'}),
    on='user_id', how='left'
)
# Number of days between the first and the last action
user_time = user_grouped_by_user_id['time_stamp'].agg(['min', 'max']).reset_index()
user_time['u_days_between'] = (user_time['max'] - user_time['min']).dt.days
train_test_matrix = train_test_matrix.merge(
    user_time[['user_id', 'u_days_between']], 
    on='user_id', 
    how='left'
)

del user_grouped_by_user_id, user_time

In [15]:
# Print head of train_test_matrix
print(train_test_matrix.head())

   user_id  merchant_id  label origin    age_range  gender  u_iid  u_cid  \
0    34176         3906    0.0  train  sixth group  female    256     45   
1    34176          121    0.0  train  sixth group  female    256     45   
2    34176         4356    1.0  train  sixth group  female    256     45   
3    34176         2217    0.0  train  sixth group  female    256     45   
4   230784         4818    0.0  train      unknown  female     31     17   

   u_mid  u_bid  u_cart  u_fav  u_click  u_purchase  u_days_between  
0    109    108       0      7      410          34             174  
1    109    108       0      7      410          34             174  
2    109    108       0      7      410          34             174  
3    109    108       0      7      410          34             174  
4     20     19       0      0       47           7             163  


In [16]:
user_grouped_by_merchant_id = user_log_format1.groupby('merchant_id')

train_test_matrix = train_test_matrix.merge(
    user_grouped_by_merchant_id['user_id'].nunique().reset_index().rename(columns={'user_id': 'm_uid'}),
    on='merchant_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_merchant_id['item_id'].nunique().reset_index().rename(columns={'item_id': 'm_iid'}),
    on='merchant_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_merchant_id['cat_id'].nunique().reset_index().rename(columns={'cat_id': 'm_cid'}),
    on='merchant_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_merchant_id['brand_id'].nunique().reset_index().rename(columns={'brand_id': 'm_bid'}),
    on='merchant_id', how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_grouped_by_merchant_id['action_type'].value_counts().unstack().reset_index().rename(
        columns={'click': 'm_click', 'add-to-cart': 'm_cart', 'purchase': 'm_purchase', 'add-to-favorite': 'm_fav'}),
    on='merchant_id', how='left'
)
# Number of days between the first and the last action
merchant_time = user_grouped_by_merchant_id['time_stamp'].agg(['min', 'max']).reset_index()
merchant_time['m_days_between'] = (merchant_time['max'] - merchant_time['min']).dt.days
train_test_matrix = train_test_matrix.merge(
    merchant_time[['merchant_id', 'm_days_between']], 
    on='merchant_id', 
    how='left'
)

del user_grouped_by_merchant_id, merchant_time

In [17]:
# Print head of train_test_matrix
print(train_test_matrix.head())

   user_id  merchant_id  label origin    age_range  gender  u_iid  u_cid  \
0    34176         3906    0.0  train  sixth group  female    256     45   
1    34176          121    0.0  train  sixth group  female    256     45   
2    34176         4356    1.0  train  sixth group  female    256     45   
3    34176         2217    0.0  train  sixth group  female    256     45   
4   230784         4818    0.0  train      unknown  female     31     17   

   u_mid  u_bid  ...  u_days_between  m_uid  m_iid  m_cid  m_bid  m_cart  \
0    109    108  ...             174   5819    308     20      2      28   
1    109    108  ...             174  10931   1179     26      2     121   
2    109    108  ...             174   2281     67     15      2      16   
3    109    108  ...             174  16870    377      5      2     101   
4     20     19  ...             163   7500    461     27      2     129   

   m_fav  m_click  m_purchase  m_days_between  
0    961    14870         410         

In [18]:
user_merchant_group = user_log_format1.groupby(['user_id', 'merchant_id'])

train_test_matrix = train_test_matrix.merge(
    user_merchant_group.size().reset_index().rename(columns={0: 'um_action_count'}),
    on=['user_id', 'merchant_id'], how='left'
)
train_test_matrix = train_test_matrix.merge(
    user_merchant_group[['item_id', 'cat_id', 'brand_id']].nunique().reset_index().rename(
        columns={'item_id': 'um_iid', 'cat_id': 'um_cid', 'brand_id': 'um_bid'},
    ),
    on=['user_id', 'merchant_id'], how='left'
)
um_time = user_merchant_group['time_stamp'].agg(['min', 'max']).reset_index()
um_time['um_days_between'] = (um_time['max'] - um_time['min']).dt.days
train_test_matrix = train_test_matrix.merge(
    um_time[['user_id', 'merchant_id', 'um_days_between']],
    on=['user_id', 'merchant_id'], how='left'
)

del user_merchant_group, um_time

In [19]:
# User buy click ratio
train_test_matrix['u_bcr'] = \
    train_test_matrix['u_purchase'] / train_test_matrix['u_click']
# Merchant buy click ratio
train_test_matrix['m_bcr'] = \
    train_test_matrix['m_purchase'] / train_test_matrix['m_click']

In [27]:
# Merge data_train_format2 and data_submission_format2
data_format2 = pd.concat([data_train_format2, data_submission_format2], ignore_index=True, sort=False)

In [28]:
# decode the action_type
def parse_activity_log(log):
    log = str(log)
    actions = log.split('#')
    seq = []
    for action in actions:
        item = action.split(':')
        if len(item) == 5:
            item_id, cat_id, brand_id, time_stamp, action_type = item
            seq.append({
                'item_id': int(item_id),
                'cat_id': int(cat_id),
                'brand_id': int(brand_id),
                'time_stamp': int(time_stamp),
                'action_type': int(action_type)
            })

    return seq

def count_actions(log):
    return len(log)

data_format2['parsed_log'] = data_format2['activity_log'].apply(parse_activity_log)
data_format2['action_count'] = data_format2['parsed_log'].apply(count_actions)

# Add number of actions to train_test_matrix on user_id
data_format2_grouped_by_user_id = data_format2.groupby('user_id')

train_test_matrix = train_test_matrix.merge(
    data_format2_grouped_by_user_id['action_count'].agg(['min', 'max', 'mean', 'std']).reset_index().rename(
        columns={'min': 'u_ac_min', 'max': 'u_ac_max', 'mean': 'u_ac_mean', 'std': 'u_ac_std'}
    ),
    on='user_id', how='left'
)

del data_format2_grouped_by_user_id

# Add number of actions to train_test_matrix on merchant_id
data_format2_grouped_by_merchant_id = data_format2.groupby('merchant_id')

train_test_matrix = train_test_matrix.merge(
    data_format2_grouped_by_merchant_id['action_count'].agg(['min', 'max', 'mean', 'std']).reset_index().rename(
        columns={'min': 'm_ac_min', 'max': 'm_ac_max', 'mean': 'm_ac_mean', 'std': 'm_ac_std'}
    ),
    on='merchant_id', how='left'
)

del data_format2_grouped_by_merchant_id

# Add number of actions to train_test_matrix on user meerchant pair
user_merchant_group_format2 = data_format2.groupby(['user_id', 'merchant_id'])

train_test_matrix = train_test_matrix.merge(
    user_merchant_group_format2['action_count'].agg(['min', 'max', 'mean', 'std']).reset_index().rename(
        columns={'min': 'um_ac_min', 'max': 'um_ac_max', 'mean': 'um_ac_mean', 'std': 'um_ac_std'}
    ),
    on=['user_id', 'merchant_id'],
    how='left'
)

del user_merchant_group_format2

In [29]:
# Print head of train_test_matrix
print(train_test_matrix.head())

   user_id  merchant_id  label origin    age_range  gender  u_iid  u_cid  \
0    34176         3906    0.0  train  sixth group  female    256     45   
1    34176          121    0.0  train  sixth group  female    256     45   
2    34176         4356    1.0  train  sixth group  female    256     45   
3    34176         2217    0.0  train  sixth group  female    256     45   
4   230784         4818    0.0  train      unknown  female     31     17   

   u_mid  u_bid  ...  u_ac_mean  u_ac_std  m_ac_min  m_ac_max  m_ac_mean  \
0    109    108  ...   4.137615  8.318284         0       300   2.791201   
1    109    108  ...   4.137615  8.318284         0       311   7.295215   
2    109    108  ...   4.137615  8.318284         0       146   3.179746   
3    109    108  ...   4.137615  8.318284         0       694   3.562596   
4     20     19  ...   2.700000  2.364207         0       266   6.394800   

    m_ac_std  um_ac_min  um_ac_max  um_ac_mean  um_ac_std  
0   7.001489         39   

In [30]:
train_data = train_test_matrix[train_test_matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = train_test_matrix[train_test_matrix['origin'] == 'test'].drop(['label', 'origin'], axis=1)
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

In [31]:
print(train_X.dtypes)

user_id              uint32
merchant_id          uint16
age_range          category
gender             category
u_iid                 int64
u_cid                 int64
u_mid                 int64
u_bid                 int64
u_cart                int64
u_fav                 int64
u_click               int64
u_purchase            int64
u_days_between        int64
m_uid                 int64
m_iid                 int64
m_cid                 int64
m_bid                 int64
m_cart                int64
m_fav                 int64
m_click               int64
m_purchase            int64
m_days_between        int64
um_action_count       int64
um_iid                int64
um_cid                int64
um_bid                int64
um_days_between       int64
u_bcr               float64
m_bcr               float64
ac_min_x              int64
ac_max_x              int64
ac_mean_x           float64
ac_std_x            float64
ac_min_y              int64
ac_max_y              int64
ac_mean_y           

In [32]:
print(test_data.dtypes)

user_id              uint32
merchant_id          uint16
age_range          category
gender             category
u_iid                 int64
u_cid                 int64
u_mid                 int64
u_bid                 int64
u_cart                int64
u_fav                 int64
u_click               int64
u_purchase            int64
u_days_between        int64
m_uid                 int64
m_iid                 int64
m_cid                 int64
m_bid                 int64
m_cart                int64
m_fav                 int64
m_click               int64
m_purchase            int64
m_days_between        int64
um_action_count       int64
um_iid                int64
um_cid                int64
um_bid                int64
um_days_between       int64
u_bcr               float64
m_bcr               float64
ac_min_x              int64
ac_max_x              int64
ac_mean_x           float64
ac_std_x            float64
ac_min_y              int64
ac_max_y              int64
ac_mean_y           

In [33]:
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

In [34]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'early_stopping_rounds': 10
}

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_eval]
)

y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
auc_score = roc_auc_score(y_valid, y_pred)
print(f"LightGBM AUC: {auc_score}")

[LightGBM] [Info] Number of positive: 12805, number of negative: 195886
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 9709
[LightGBM] [Info] Number of data points in the train set: 208691, number of used features: 51
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 43 dense feature groups (8.76 MB) transferred to GPU in 0.058829 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.061359 -> initscore=-2.727697
[LightGBM] [Info] Start training from score -2.727697
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[42]	training's auc: 0.72228	valid_1's auc: 0.680728
LightGBM AUC: 0.6807277095604388


In [None]:
# Generate submission
submission = submission_data_format1[['user_id', 'merchant_id']]
submission['prob'] = gbm.predict(test_data, num_iteration=gbm.best_iteration)
submission.to_csv('./submission/submission.csv', index=False)