In [None]:
import os

import lightgbm as lgb
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Read the raw CSV tables.

# Format-1 tables. time_stamp is read as text because a later cell
# concatenates it with a year string before parsing it as a date.
user_log_format1 = pd.read_csv(
    '../data/data_format1/user_log_format1.csv',
    dtype={'time_stamp': 'str'},
)
user_info_format1 = pd.read_csv(
    '../data/data_format1/user_info_format1.csv',
)
train_data_format1 = pd.read_csv(
    '../data/data_format1/train_format1.csv',
)
submission_data_format1 = pd.read_csv(
    '../data/data_format1/test_format1.csv',
)

# Format-2 tables (train and test).
data_train_format2 = pd.read_csv(
    '../data/data_format2/train_format2.csv',
)
data_submission_format2 = pd.read_csv(
    '../data/data_format2/test_format2.csv',
)

In [None]:
# Preview the first five rows of the format-2 training table
print(data_train_format2.head(n=5))

In [None]:
# Tag origin so the combined frame can later be split back into train and test rows
train_data_format1['origin'] = 'train'
submission_data_format1['origin'] = 'test'
# Drop the `prob` column from the test pairs. Rebinding with errors='ignore'
# (instead of an in-place drop) keeps this cell re-runnable: a second run no
# longer raises KeyError because `prob` is already gone.
submission_data_format1 = submission_data_format1.drop(columns=['prob'], errors='ignore')

# Merge data: stack the train and test pairs, then attach the user profile columns
train_test_matrix = pd.concat(
    [train_data_format1, submission_data_format1], ignore_index=True, sort=False
)
train_test_matrix = train_test_matrix.merge(user_info_format1, on='user_id', how='left')

# Rename seller_id to merchant_id in user_log_format1 so the log shares the
# key name used by train_test_matrix (a no-op if the column was already renamed)
user_log_format1.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

In [None]:
# Count missing values per column in both working tables
print(train_test_matrix.isna().sum())  # Number of label to predict: 261477
print('\n')
print(user_log_format1.isna().sum())

In [None]:
# Show the minimum and maximum of every column in both tables
for name, values in train_test_matrix.items():
    print(f'{name}: {values.min()} - {values.max()}')
print('\n')

for name, values in user_log_format1.items():
    print(f'{name}: {values.min()} - {values.max()}')

In [None]:
# Replace the integer codes with readable labels.
# NOTE: this cell is meant to run once per kernel. Re-running it maps the
# already-decoded labels (which are not keys of the dictionaries below) to NaN.

# Rename gender column. 0 for female, 1 for male, 2 or NULL for unknown.
# Any value that is not a key of the mapping also ends up as 'unknown'.
train_test_matrix['gender'] = train_test_matrix['gender'].map({
    0: 'female',
    1: 'male',
    2: 'unknown'
}).fillna('unknown')
# Rename age_range column. NULL (and any code outside 1-8) becomes 'unknown'.
train_test_matrix['age_range'] = train_test_matrix['age_range'].map({
    1: 'first group',
    2: 'second group',
    3: 'third group',
    4: 'fourth group',
    5: 'fifth group',
    6: 'sixth group',
    7: 'seventh group',
    8: 'eighth group'
}).fillna('unknown')

# Rename action_type column. 0 for click, 1 for add-to-cart, 2 for purchase, 3 for add-to-favorite
user_log_format1['action_type'] = user_log_format1['action_type'].map({
    0: 'click',
    1: 'add-to-cart',
    2: 'purchase',
    3: 'add-to-favorite'
})
# Fill in the missing values of brand_id with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form operates on a temporary object,
# which pandas 2.x warns about and which leaves the frame unchanged under
# Copy-on-Write.
user_log_format1['brand_id'] = user_log_format1['brand_id'].fillna(0)

In [None]:
# Re-check missing values per column after the label mapping and brand fill
print(train_test_matrix.isna().sum())  # Number of label to predict: 261477
print('\n')
print(user_log_format1.isna().sum())

In [None]:
# Show the minimum and maximum of every column again, after cleaning
for name, values in train_test_matrix.items():
    print(f'{name}: {values.min()} - {values.max()}')
print('\n')

for name, values in user_log_format1.items():
    print(f'{name}: {values.min()} - {values.max()}')

In [None]:
# Convert data types: compact unsigned integers for ids, category for the
# low-cardinality label columns.
# NOTE: meant to run once per kernel; after it runs, time_stamp is no longer
# a string column, so the date parsing below cannot be repeated.
train_test_matrix['user_id'] = train_test_matrix['user_id'].astype('uint32')
train_test_matrix['merchant_id'] = train_test_matrix['merchant_id'].astype('uint16')
train_test_matrix['label'] = train_test_matrix['label'].astype('float64')
train_test_matrix['origin'] = train_test_matrix['origin'].astype('category')
train_test_matrix['age_range'] = train_test_matrix['age_range'].astype('category')
train_test_matrix['gender'] = train_test_matrix['gender'].astype('category')

user_log_format1['user_id'] = user_log_format1['user_id'].astype('uint32')
user_log_format1['item_id'] = user_log_format1['item_id'].astype('uint32')
user_log_format1['cat_id'] = user_log_format1['cat_id'].astype('uint16')
user_log_format1['merchant_id'] = user_log_format1['merchant_id'].astype('uint16')
user_log_format1['brand_id'] = user_log_format1['brand_id'].astype('int16')
# time_stamp holds month+day as text. Zero-pad it to four characters before
# prefixing the year so every value is an unambiguous 8-digit %Y%m%d string
# (an unpadded value such as '829' would otherwise depend on lenient parsing).
user_log_format1['time_stamp'] = pd.to_datetime(
    '2016' + user_log_format1['time_stamp'].str.zfill(4), format='%Y%m%d'
)
user_log_format1['action_type'] = user_log_format1['action_type'].astype('category')

In [None]:
# Print range of all columns after conversion.
# Unordered categoricals have no min/max, and only numeric dtype kinds
# (bool, int, unsigned, float, complex) are reported; everything else is
# flagged as unsupported.
for table_index, table in enumerate((train_test_matrix, user_log_format1)):
    if table_index > 0:
        # Separator between the two tables
        print('\n')
    for col in table.columns:
        column = table[col]
        if column.dtype.name == 'category' and not column.cat.ordered:
            print(f'{col}: Cannot compute range on unordered categorical data')
        elif column.dtype.kind in 'biufc':  # Numeric columns
            print(f'{col}: {column.min()} - {column.max()}')
        else:
            print(f'{col}: Non-numeric or unsupported type')

In [None]:
# Preview the first five rows of train_test_matrix after the dtype conversion
print(train_test_matrix.head(n=5))

In [None]:
# Per-user features computed from the activity log and joined on user_id
by_user = user_log_format1.groupby('user_id')

# Number of distinct items, categories, merchants and brands per user
for log_column, feature_name in (
    ('item_id', 'u_iid'),
    ('cat_id', 'u_cid'),
    ('merchant_id', 'u_mid'),
    ('brand_id', 'u_bid'),
):
    distinct_per_user = by_user[log_column].nunique().rename(feature_name).reset_index()
    train_test_matrix = train_test_matrix.merge(distinct_per_user, on='user_id', how='left')

# Number of log rows per action type, one column per action
user_action_counts = by_user['action_type'].value_counts().unstack().reset_index()
user_action_counts = user_action_counts.rename(columns={
    'click': 'u_click',
    'add-to-cart': 'u_cart',
    'purchase': 'u_purchase',
    'add-to-favorite': 'u_fav',
})
train_test_matrix = train_test_matrix.merge(user_action_counts, on='user_id', how='left')

# Number of days between the first and the last action
user_first_last = by_user['time_stamp'].agg(['min', 'max']).reset_index()
user_first_last['u_days_between'] = (user_first_last['max'] - user_first_last['min']).dt.days
train_test_matrix = train_test_matrix.merge(
    user_first_last[['user_id', 'u_days_between']],
    on='user_id',
    how='left',
)

# Release the intermediates
del by_user, log_column, feature_name, distinct_per_user, user_action_counts, user_first_last

In [None]:
# Preview the first five rows of train_test_matrix with the user features attached
print(train_test_matrix.head(n=5))

In [None]:
# Per-merchant features computed from the activity log and joined on merchant_id
by_merchant = user_log_format1.groupby('merchant_id')

# Number of distinct users, items, categories and brands per merchant
merchant_distinct_features = {
    'user_id': 'm_uid',
    'item_id': 'm_iid',
    'cat_id': 'm_cid',
    'brand_id': 'm_bid',
}
for log_column, feature_name in merchant_distinct_features.items():
    distinct_per_merchant = by_merchant[log_column].nunique().rename(feature_name).reset_index()
    train_test_matrix = train_test_matrix.merge(distinct_per_merchant, on='merchant_id', how='left')

# Number of log rows per action type, one column per action
merchant_action_counts = by_merchant['action_type'].value_counts().unstack().reset_index()
merchant_action_counts = merchant_action_counts.rename(columns={
    'click': 'm_click',
    'add-to-cart': 'm_cart',
    'purchase': 'm_purchase',
    'add-to-favorite': 'm_fav',
})
train_test_matrix = train_test_matrix.merge(merchant_action_counts, on='merchant_id', how='left')

# Number of days between the first and the last action
merchant_first_last = by_merchant['time_stamp'].agg(['min', 'max']).reset_index()
merchant_first_last['m_days_between'] = (
    merchant_first_last['max'] - merchant_first_last['min']
).dt.days
train_test_matrix = train_test_matrix.merge(
    merchant_first_last[['merchant_id', 'm_days_between']],
    on='merchant_id',
    how='left',
)

# Release the intermediates
del by_merchant, merchant_distinct_features, log_column, feature_name
del distinct_per_merchant, merchant_action_counts, merchant_first_last

In [None]:
# Preview the first five rows of train_test_matrix with the merchant features attached
print(train_test_matrix.head(n=5))

In [None]:
# Features per (user, merchant) pair, joined on both keys
pair_keys = ['user_id', 'merchant_id']
by_pair = user_log_format1.groupby(pair_keys)

# Total number of log rows for the pair
pair_action_count = by_pair.size().reset_index(name='um_action_count')
train_test_matrix = train_test_matrix.merge(pair_action_count, on=pair_keys, how='left')

# Number of distinct items, categories and brands within the pair
pair_distinct = by_pair[['item_id', 'cat_id', 'brand_id']].nunique().reset_index()
pair_distinct = pair_distinct.rename(
    columns={'item_id': 'um_iid', 'cat_id': 'um_cid', 'brand_id': 'um_bid'}
)
train_test_matrix = train_test_matrix.merge(pair_distinct, on=pair_keys, how='left')

# Number of days between the pair's first and last action
pair_first_last = by_pair['time_stamp'].agg(['min', 'max']).reset_index()
pair_first_last['um_days_between'] = (pair_first_last['max'] - pair_first_last['min']).dt.days
train_test_matrix = train_test_matrix.merge(
    pair_first_last[['user_id', 'merchant_id', 'um_days_between']],
    on=pair_keys,
    how='left',
)

# Release the intermediates
del by_pair, pair_keys, pair_action_count, pair_distinct, pair_first_last

In [None]:
# Buy/click ratios: purchase count divided by click count.
# User buy click ratio
train_test_matrix['u_bcr'] = train_test_matrix['u_purchase'].div(train_test_matrix['u_click'])
# Merchant buy click ratio
train_test_matrix['m_bcr'] = train_test_matrix['m_purchase'].div(train_test_matrix['m_click'])

In [None]:
# Stack the format-2 train and test tables into one frame with a fresh index
data_format2 = pd.concat(
    objs=[data_train_format2, data_submission_format2],
    ignore_index=True,
    sort=False,
)

In [None]:
# Decode a format-2 activity_log string into structured action records
def parse_activity_log(log):
    """Parse an ``activity_log`` value into a list of action dictionaries.

    The value is converted to ``str`` and split on ``#`` into actions; each
    action is split on ``:`` and is expected to hold five fields in the order
    ``item_id:cat_id:brand_id:time_stamp:action_type``.

    Entries that do not have exactly five fields are skipped, so a missing
    log (which stringifies to ``'nan'``) yields an empty list.

    An empty ``brand_id`` field is stored as 0 instead of raising
    ``ValueError``, matching how missing ``brand_id`` values are filled with 0
    in ``user_log_format1``.

    Args:
        log: Raw ``activity_log`` cell value (string or NaN).

    Returns:
        list[dict]: One dict per well-formed action with integer values under
        the keys ``item_id``, ``cat_id``, ``brand_id``, ``time_stamp`` and
        ``action_type``.

    Raises:
        ValueError: If any other field of a five-field entry is not an integer.
    """
    records = []
    for entry in str(log).split('#'):
        fields = entry.split(':')
        if len(fields) != 5:
            # Malformed or placeholder entry (e.g. 'nan'): ignore it
            continue
        item_id, cat_id, brand_id, time_stamp, action_type = fields
        records.append({
            'item_id': int(item_id),
            'cat_id': int(cat_id),
            # A missing brand appears as an empty field; int('') would raise
            'brand_id': int(brand_id) if brand_id.strip() else 0,
            'time_stamp': int(time_stamp),
            'action_type': int(action_type)
        })

    return records

def count_actions(log):
    """Return the number of entries in ``log`` (its ``len``)."""
    n_actions = len(log)
    return n_actions

# Parse every activity log and record how many actions it contains
data_format2['parsed_log'] = data_format2['activity_log'].map(parse_activity_log)
data_format2['action_count'] = data_format2['parsed_log'].map(count_actions)

# Summarise action_count (min / max / mean / std) at three levels and join each
# summary onto train_test_matrix: per user, per merchant, per user-merchant pair.
action_count_levels = (
    (
        'user_id',
        {'min': 'u_ac_min', 'max': 'u_ac_max', 'mean': 'u_ac_mean', 'std': 'u_ac_std'},
    ),
    (
        'merchant_id',
        {'min': 'm_ac_min', 'max': 'm_ac_max', 'mean': 'm_ac_mean', 'std': 'm_ac_std'},
    ),
    (
        ['user_id', 'merchant_id'],
        {'min': 'um_ac_min', 'max': 'um_ac_max', 'mean': 'um_ac_mean', 'std': 'um_ac_std'},
    ),
)
for group_keys, stat_names in action_count_levels:
    action_count_stats = data_format2.groupby(group_keys)['action_count'].agg(
        ['min', 'max', 'mean', 'std']
    )
    action_count_stats = action_count_stats.reset_index().rename(columns=stat_names)
    train_test_matrix = train_test_matrix.merge(action_count_stats, on=group_keys, how='left')

# Release the intermediates
del action_count_levels, group_keys, stat_names, action_count_stats

In [None]:
# Preview the first five rows of train_test_matrix with the format-2 features attached
print(train_test_matrix.head(n=5))

In [None]:
# Split the combined matrix back by origin. The helper column `origin` is
# dropped from both parts; the test part also drops `label`.
train_data = train_test_matrix.loc[train_test_matrix['origin'] == 'train'].drop(columns=['origin'])
test_data = train_test_matrix.loc[train_test_matrix['origin'] == 'test'].drop(
    columns=['label', 'origin']
)

# Separate the training features from the target column
train_y = train_data['label']
train_X = train_data.drop(columns=['label'])

In [None]:
# Show the dtype of every training feature column
print(train_X.dtypes)

In [None]:
# Show the dtype of every test feature column
print(test_data.dtypes)

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X,
    train_y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap the split in LightGBM datasets; the validation set references the
# training set
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_valid, label=y_valid, reference=lgb_train)

# Binary classifier evaluated with AUC, trained on the GPU device, with
# early stopping after 10 rounds
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': True,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'early_stopping_rounds': 10,
}

gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_eval],
)

# Score the hold-out set using the best iteration
y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
auc_score = roc_auc_score(y_valid, y_pred)
print(f"LightGBM AUC: {auc_score}")

In [None]:
# Generate submission
# Take an explicit copy of the key columns: adding `prob` to a bare column
# selection triggers SettingWithCopyWarning (and is lost under Copy-on-Write).
submission = submission_data_format1[['user_id', 'merchant_id']].copy()
# NOTE(review): assumes test_data rows are still in the same order as
# submission_data_format1 (the feature joins are left merges) - verify.
submission['prob'] = gbm.predict(test_data, num_iteration=gbm.best_iteration)
# Create the output directory first so to_csv does not fail after training
os.makedirs('./submission', exist_ok=True)
submission.to_csv('./submission/submission.csv', index=False)