In [21]:
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.utils.class_weight import compute_class_weight
import os
import xgboost as xgb
import gc
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import polars as pl
from sklearn.metrics import roc_auc_score
import shap
from pathlib import Path
from tqdm import tqdm
import pickle

In [5]:
root = Path('/Users/ignat/Desktop/Projects/dodo promocodes')
def prepare_dataset():
    df = pd.read_csv(root / 'data/train_target.csv')
    orders = pd.read_csv(root / 'data/orders.csv')
    events = pd.read_csv(root / 'data/mobile_events.csv')
    promocodes = pd.read_csv(root / 'data/clients_promo_october.csv')

    df.drop(['LocalBeginDate', 'LocalEndDate'], axis=1, inplace=True)

    orders['SaleDate'] = pd.to_datetime(orders['SaleDate'])
    orders['Date'] = pd.to_datetime(orders['Date'])

    events['Timestamp'] = pd.to_datetime(events['Timestamp'])
    promocodes['LocalBeginDate'] = pd.to_datetime(promocodes['LocalBeginDate'])
    promocodes['LocalEndDate'] = pd.to_datetime(promocodes['LocalEndDate'])
    events['Timestamp'] = events['Timestamp'].dt.floor('s')
    
    num_promocodes = promocodes.groupby('ClientUUId').agg({'Id': 'count'}).reset_index().rename(columns={'Id': 'num_promocodes'})
    df_october = orders[orders.Date.dt.month == 10]
    test_1 = df_october.groupby(['ClientUUId', 'OrderUUId']).agg(
        {'apply_promo': 'first'}
    ).reset_index().drop('OrderUUId', axis=1).groupby('ClientUUId') \
    .agg({'apply_promo': 'sum'}).reset_index().rename(columns={'apply_promo': 'apply_promo_used_last_month'})

    test_1 = test_1.merge(num_promocodes, on='ClientUUId')
    test_1['ratio_apply_promo_to_num_promocodes'] = test_1['apply_promo_used_last_month'] / test_1['num_promocodes']
    test_1.drop(columns=['apply_promo_used_last_month', 'num_promocodes'], inplace=True)
    df = df.merge(test_1, on='ClientUUId', how='left')
    events['hour'] = events['Timestamp'].dt.hour
    events['weekday'] = events['Timestamp'].dt.weekday
    df_events = pl.from_pandas(events).group_by('ClientUUId').agg(
        pl.col('VisitToken').n_unique().alias('VisitToken_n_unique'),
        pl.col('hour').median().alias('hour_events_median'),
        pl.col('Platform').first(),
        pl.col('VisitToken').filter(pl.col('weekday') > 3).n_unique().alias('num_visits_from_thursday'),
        pl.col('VisitToken').filter(pl.col('weekday') < 4).n_unique().alias('num_visits_not_in_promo'),
    ).to_pandas()

    ohe = OneHotEncoder(sparse_output=False, dtype=np.int8)
    ohe.fit(events[['EventName']])

    new_features = pd.DataFrame(ohe.transform(events[['EventName']]), columns=ohe.get_feature_names_out())
    events = pd.concat([events, new_features], axis=1)
    events = pl.from_pandas(events)
    df_events2 = events.group_by('ClientUUId').agg(
        *[pl.col(col).mean().alias(f'{col}_mean') for col in ohe.get_feature_names_out()],
        *[pl.col(col).sum().alias(f'{col}_sum') for col in ohe.get_feature_names_out()],
        ((pl.datetime(2023, 11, 2, time_unit='ns', time_zone='UTC') - pl.col('Timestamp').last()).dt.total_seconds() / 3600).alias('last_online')
    )
    
    df_events = df_events.merge(df_events2.to_pandas(), on='ClientUUId', how='left')
    events = events.to_pandas()
    
    
    df_events['Platform'] = df_events['Platform'].map({'ios': 1, 'android': 0})
    df = df.merge(df_events, on='ClientUUId', how='left')
    
    id_ohe = OneHotEncoder(sparse_output=False, dtype=np.int8)
    id_features = pd.DataFrame(id_ohe.fit_transform(promocodes[['Id']]), columns=id_ohe.get_feature_names_out())
    promocodes = pd.concat([promocodes, id_features], axis=1)
    promocodes['promocode_duration'] = ((promocodes['LocalEndDate'] - promocodes['LocalBeginDate']).dt.total_seconds() / 3600)
    promocodes['OrderType'] = promocodes['OrderType'].map({'2,3': 0, '1,2,3': 1})
    promocodes['Discount_percenteges'] = np.where(promocodes['Discount'] <= 100, promocodes['Discount'], 0)
    promocodes['Discount_usual'] = np.where(promocodes['Discount'] > 100, promocodes['Discount'], 0)
    promocodes = pl.from_pandas(promocodes)
    promocodes = promocodes.group_by('ClientUUId').agg(
        pl.col('Id').count().alias('num_promocodes'),
        *[pl.col(col).mean().alias(f'{col}_mean') for col  in id_ohe.get_feature_names_out()],
        *[pl.col(col).sum().alias(f'{col}_sum') for col  in id_ohe.get_feature_names_out()],
        pl.col('OrderPrice').max().alias('OrderPrice_max'),
        pl.col('OrderPrice').min().alias('OrderPrice_min'),
        pl.col('OrderPrice').median().alias('OrderPrice_median'),
        pl.col('Discount').max().alias('Discount_max'),
        pl.col('Discount').min().alias('Discount_min'),
        pl.col('Discount').median().alias('Discount_median'),
    )
    promocodes = promocodes.to_pandas()
    df = df.merge(promocodes, on='ClientUUId', how='left')
    
    orders['in_restaurant'] = orders['addressId'].isnull().astype(np.int8)
    
    df_orders = orders.groupby('ClientUUId').agg({'NewClient': 'max', 'Date': ['max', 'min'], 
                                  'ClientOrderNumber': ['max', 'min'],
                                  }).reset_index()

    df_orders.columns = ['_'.join(col).strip() for col in df_orders.columns.values]
    df_orders.rename(columns={'ClientUUId_': 'ClientUUId'}, inplace=True)

    df_orders['orders_in_this_year'] = df_orders['ClientOrderNumber_max'] - df_orders['ClientOrderNumber_min']
    df_orders['last_order_was'] = (datetime(2023, 11, 1) - df_orders['Date_max']).dt.days 

    orders.drop(columns=['NewClient'], inplace=True, errors='ignore')

    orders['discount_for_product'] = orders['MenuPrice'] - orders['ProductTotalPrice']
    

    enc_category = OneHotEncoder(sparse_output=False)
    enc_category.fit(orders[['CategoryId']])
    categories_orders  = pd.DataFrame(enc_category.transform(orders[['CategoryId']]), columns=enc_category.get_feature_names_out())

    orders = pd.concat([orders, categories_orders], axis=1) 
    orders = pl.from_pandas(orders)
    orders_agg = orders.group_by(['ClientUUId', 'OrderUUId']).agg(
        pl.col('in_restaurant').first(),
        pl.col('OrderTotalPrice').first(),
        pl.col('OrderPaymentType').first(),
        pl.col('OrderType').first(),
        pl.col('OrderState').first(),
        pl.col('ClientOrderNumber').first(),
        pl.col('apply_promo').first(),
        pl.col('Date').first(),
        pl.col('addressId').first(),
        pl.col('deliverySectorId').first(),
        pl.col('SaleDate').first(),
        pl.col('UnitUUId').first(),
        pl.col('ProductUUId').count().alias('ProductUUId_count'),
        pl.col('ProductUUId').unique().count().alias('ProductUUId_unique_count'),
        pl.col('discount_for_product').mean().alias('discount_for_product_mean'),
        pl.col('discount_for_product').sum().alias('discount_for_product_sum'),
        pl.col('CategoryId').mode().get(0).alias('CategoryId_mode'),
        pl.col('MenuPrice').max().alias('MenuPrice_max'),
        pl.col('MenuPrice').min().alias('MenuPrice_min'),
        pl.col('MenuPrice').median().alias('MenuPrice_median'),
        pl.col('ProductTotalPrice').max().alias('ProductTotalPrice_max'),
        pl.col('ProductTotalPrice').min().alias('ProductTotalPrice_min'),
        pl.col('ProductTotalPrice').median().alias('ProductTotalPrice_median'),
        pl.col('ProductTotalPrice').sum().alias('ProductTotalPrice_sum'),
        pl.col('ProductTotalPrice').mean().alias('ProductTotalPrice_mean'),
        *[pl.col(col).first() for col in enc_category.get_feature_names_out()],
        
    )
    orders_agg = orders_agg.sort(by=['ClientUUId', 'ClientOrderNumber'])

    orders_agg = orders_agg.group_by('ClientUUId').agg(
        *[pl.col(col).mean().alias(f'{col}_mean') for col in enc_category.get_feature_names_out()],
        *[pl.col(col).sum().alias(f'{col}_sum') for col in enc_category.get_feature_names_out()], 
        *[func('OrderTotalPrice').alias(f'OrderTotalPrice_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last]],
        *[func('apply_promo').alias(f'apply_promo_{func.__name__}') for func in [pl.sum, pl.mean, pl.last]],
        pl.last('ClientOrderNumber').alias('ClientOrderNumber_last'),
        pl.col('deliverySectorId').unique().count().alias(f'deliverySectorId_unq_count'),
        *[func('MenuPrice_max').alias(f'MenuPrice_max_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('MenuPrice_min').alias(f'MenuPrice_min_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('MenuPrice_median').alias(f'MenuPrice_median_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('ProductTotalPrice_max').alias(f'ProductTotalPrice_max_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('ProductTotalPrice_min').alias(f'ProductTotalPrice_min_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('ProductTotalPrice_median').alias(f'ProductTotalPrice_median_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        *[func('ProductUUId_unique_count').alias(f'ProductUUId_unique_count_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.mean, pl.last, pl.sum]],
        pl.col('apply_promo').filter(pl.col('Date').dt.month() == 10).mean().alias('apply_promo_mean_last_month'),
        pl.col('apply_promo').filter(pl.col('Date').dt.month() == 10).sum().alias('apply_promo_sum_last_month'),
        pl.col('in_restaurant').filter(pl.col('Date').dt.month() == 10).mean().alias('in_restaurant_mean_last_month'),
        pl.col('in_restaurant').filter(pl.col('Date').dt.month() == 10).sum().alias('in_restaurant_sum_last_month'),
        pl.col('in_restaurant').mean().alias('in_restaurant_mean'),
        pl.col('in_restaurant').sum().alias('in_restaurant_sum'),
        *[func('ProductTotalPrice_sum').alias(f'ProductTotalPrice_sum_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.sum]],
        *[func('ProductTotalPrice_mean').alias(f'ProductTotalPrice_mean_{func.__name__}') for func in [pl.max, pl.min, pl.median, pl.sum]],
        pl.col('OrderTotalPrice').filter(pl.col('Date').dt.month() == 10).mean().alias('OrderTotalPrice_mean_last_month'),
        pl.col('OrderTotalPrice').filter(pl.col('Date').dt.month() == 10).sum().alias('OrderTotalPrice_sum_last_month'),
    )
    
    df_orders = df_orders.merge(orders_agg.to_pandas(), on='ClientUUId', how='left')
    df = df.merge(df_orders, on='ClientUUId', how='left')
    
    le = LabelEncoder()
    df['groups'] = le.fit_transform(df[['ClientUUId']])
    test = pd.read_csv(root / 'data/test-2.csv')
    
    promocodes_copy = pd.read_csv(root / 'data/clients_promo_october.csv')
    promocodes_copy['LocalBeginDate'] = pd.to_datetime(promocodes_copy['LocalBeginDate']).dt.tz_convert(None)
    promocodes_copy['LocalEndDate'] = pd.to_datetime(promocodes_copy['LocalEndDate']).dt.tz_convert(None)
    orders_agg = orders.group_by(['ClientUUId', 'OrderUUId']).agg(
        pl.col('apply_promo').first(),
        pl.col('Date').first(),
        pl.col('OrderTotalPrice').first(),
        pl.col('OrderType').first(),
    ).filter((pl.col('apply_promo') == 1) & (pl.col('Date').dt.month() == 10))
    orders_agg = orders_agg.join(pl.from_pandas(promocodes_copy), on='ClientUUId', how='left')
    orders_agg = orders_agg.filter((pl.col('OrderTotalPrice')>=pl.col('OrderPrice')) & (pl.col('LocalBeginDate') <= pl.col('Date')) 
                    & (pl.col('Date') <= pl.col('LocalEndDate')) & (pl.col('OrderType_right').str.find(pl.col('OrderType')) > -1)).to_pandas()

    idx = orders_agg.groupby(['ClientUUId', 'Date'])['OrderPrice'].idxmax()
    orders_agg = orders_agg.loc[idx].sort_values(by='ClientUUId')
    orders_agg.sort_values(by=['ClientUUId', 'Date'])
    orders_agg = pl.from_pandas(orders_agg)
    orders_agg = orders_agg.group_by(['ClientUUId']).agg(
        pl.col('OrderTotalPrice').last().alias('OrderTotalPrice_last_fp'),
        pl.col('OrderType').last().alias('OrderType_last_fp'),
        pl.col('Id').last().alias('Id_last_fp'),
        pl.col('OrderPrice').last().alias('OrderPrice_last_fp'),
        pl.col('Discount').last().alias('Discount_last_fp'),
    ).to_pandas()
    df = df.merge(orders_agg , on='ClientUUId', how='left')
    columns = ['OrderTotalPrice_last_fp', 'OrderType_last_fp', 'Id_last_fp', 'OrderPrice_last_fp', 'Discount_last_fp']
    df[columns] = df[columns].fillna(-1)
    promocodes_equality = df[df.apply(lambda row: row['OrderType'].find(str(row['OrderType_last_fp'])) > -1 
                                      and row['Id'] == row['Id_last_fp'], axis=1)][['ClientUUId', 'Id', 'OrderType']]
    promocodes_equality['promocodes_equality'] = 1
    df = df.merge(promocodes_equality, on=['ClientUUId', 'Id', 'OrderType'], how='left')
    df['promocodes_equality'] = df['promocodes_equality'].fillna(0)

    test = test.merge(orders_agg , on='ClientUUId', how='left')
    columns = ['OrderTotalPrice_last_fp', 'OrderType_last_fp', 'Id_last_fp', 'OrderPrice_last_fp', 'Discount_last_fp']
    test[columns] = test[columns].fillna(-1)
    test = test.merge(promocodes_equality, on=['ClientUUId', 'Id', 'OrderType'], how='left')
    test['promocodes_equality'] = test['promocodes_equality'].fillna(0)
    
    test = test.merge(df_orders, on='ClientUUId', how='left')
    test = test.merge(df_events, on='ClientUUId', how='left')
    test = test.merge(promocodes, on='ClientUUId', how='left')
    test = test.merge(test_1, on='ClientUUId', how='left')
    
    df.drop(columns, inplace=True, axis=1)
    test.drop(columns, inplace=True, axis=1)
    
    ohe = OneHotEncoder(sparse_output=False, dtype=np.int8)
    ohe.fit(df[['Id']])

    new_features = pd.DataFrame(ohe.transform(df[['Id']]), columns=ohe.get_feature_names_out())
    df = pd.concat([df, new_features], axis=1)
    
    new_features = pd.DataFrame(ohe.transform(test[['Id']]), columns=ohe.get_feature_names_out())
    test = pd.concat([test, new_features], axis=1)
    
    test['Is_id_7_mean'] = test['Id_7_mean'] * test['Id_7']
    test['Is_id_7_sum'] = test['Id_7_sum'] * test['Id_7']

    test['Is_id_6_mean'] = test['Id_6_mean'] * test['Id_6']
    test['Is_id_6_sum'] = test['Id_6_sum'] * test['Id_6']

    test['Is_id_5_mean'] = test['Id_5_mean'] * test['Id_5']
    test['Is_id_5_sum'] = test['Id_5_sum'] * test['Id_5']

    df['Is_id_7_mean'] = df['Id_7_mean'] * df['Id_7']
    df['Is_id_7_sum'] = df['Id_7_sum'] * df['Id_7']

    df['Is_id_6_mean'] = df['Id_6_mean'] * df['Id_6']
    df['Is_id_6_sum'] = df['Id_6_sum'] * df['Id_6']

    df['Is_id_5_mean'] = df['Id_5_mean'] * df['Id_5']
    df['Is_id_5_sum'] = df['Id_5_sum'] * df['Id_5']
    return df, test

In [6]:
df, test = prepare_dataset()

  y = column_or_1d(y, warn=True)


In [7]:
def find_high_correlation_features(df):
    corr_matrix = df.select_dtypes([float, int]).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.99)]
    return to_drop
to_drop = find_high_correlation_features(df)
df.drop(columns=to_drop, inplace=True)
test.drop(columns=to_drop, inplace=True)

df.shape, to_drop

((27741, 135),
 ['ClientOrderNumber_min',
  'CategoryId_2_sum',
  'CategoryId_3_sum',
  'CategoryId_4_sum',
  'CategoryId_5_sum',
  'CategoryId_6_sum',
  'CategoryId_7_sum',
  'apply_promo_mean',
  'ClientOrderNumber_last',
  'in_restaurant_sum',
  'ProductTotalPrice_sum_max',
  'ProductTotalPrice_sum_min',
  'ProductTotalPrice_sum_median'])

In [8]:
index_for_model_1 = df.dropna(axis=0).index
X = df.iloc[index_for_model_1].select_dtypes([float, int]).drop(columns=['apply_promo', 'groups'])
y = df.iloc[index_for_model_1]['apply_promo']
groups = df.iloc[index_for_model_1]['groups']
result_df = pd.DataFrame({'score': 0.0, 'ClientUUId': df['ClientUUId'], 'OrderType' : df['OrderType'], 'target': df['apply_promo']
                          }, index=df.iloc[index_for_model_1].index)

In [9]:
models = []
num_iter = []
skf = StratifiedGroupKFold(n_splits=10)
for i, (train_index, test_index) in enumerate(skf.split(X, y, groups)):
    print(f"Fold {i + 1}:")
    
    dtrain = xgb.DMatrix(X.iloc[train_index], label=y.iloc[train_index])
    dtest = xgb.DMatrix(X.iloc[test_index], label=y.iloc[test_index])
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.05,
        "n_estimators": 3000,
        "tree_method": "hist",
        "verbosity": 0,
        'max_depth': 5,
        'colsample_bytree': 0.6489131779549984,
        'colsample_bynode': 0.6009610046445969,
        'colsample_bylevel': 0.7957273275573362,
        'subsample': 0.6805161691122295,
        'alpha': 8.371930284576981,
        'lambda': 5.649133354153159,
        'gamma': 7.103762168902399,
        'min_child_weight': 4.101573431623777,
        'max_delta_step': 1,
        'sampling_method': 'uniform'
    }

    evals = [(dtrain, 'train'), (dtest, 'eval')]

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=3000,
        evals=evals,
        early_stopping_rounds=90,
        verbose_eval=50,
    )

    result_df.iloc[test_index, 0] = model.predict(dtest)
    models.append(model)
    num_iter.append(model.best_iteration)

Fold 1:
[0]	train-auc:0.68836	eval-auc:0.72625
[50]	train-auc:0.82253	eval-auc:0.75794
[100]	train-auc:0.86073	eval-auc:0.76082
[110]	train-auc:0.86409	eval-auc:0.76000
Fold 2:
[0]	train-auc:0.64673	eval-auc:0.63596
[50]	train-auc:0.82435	eval-auc:0.78086
[100]	train-auc:0.85148	eval-auc:0.78872
[150]	train-auc:0.86610	eval-auc:0.79191
[200]	train-auc:0.87207	eval-auc:0.79600
[250]	train-auc:0.87546	eval-auc:0.79951
[300]	train-auc:0.87749	eval-auc:0.80039
[350]	train-auc:0.87921	eval-auc:0.80126
[400]	train-auc:0.88205	eval-auc:0.80286
[450]	train-auc:0.88467	eval-auc:0.80435
[500]	train-auc:0.88599	eval-auc:0.80453
[550]	train-auc:0.88775	eval-auc:0.80656
[600]	train-auc:0.88856	eval-auc:0.80732
[650]	train-auc:0.88959	eval-auc:0.80767
[700]	train-auc:0.89067	eval-auc:0.80829
[750]	train-auc:0.89255	eval-auc:0.80859
[794]	train-auc:0.89298	eval-auc:0.80880
Fold 3:
[0]	train-auc:0.71942	eval-auc:0.63254
[50]	train-auc:0.82505	eval-auc:0.76671
[100]	train-auc:0.85515	eval-auc:0.77017
[

In [10]:
roc_auc_score(result_df['target'].astype(float), result_df['score'].astype(float))

0.7820448943704266

In [11]:
dtrain = xgb.DMatrix(X, label=y)
params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.05,
        "n_estimators": 3000,
        "tree_method": "hist",
        "verbosity": 0,
        'max_depth': 5,
        'colsample_bytree': 0.6489131779549984,
        'colsample_bynode': 0.6009610046445969,
        'colsample_bylevel': 0.7957273275573362,
        'subsample': 0.6805161691122295,
        'alpha': 8.371930284576981,
        'lambda': 5.649133354153159,
        'gamma': 7.103762168902399,
        'min_child_weight': 4.101573431623777,
        'max_delta_step': 1,
        'sampling_method': 'uniform'
}
watchlist = [(dtrain, 'train')]
meta_model = xgb.train(
    params,
    dtrain,
    num_boost_round=int(np.median(num_iter)),
    evals=watchlist,
    verbose_eval=50,
)

[0]	train-auc:0.69688
[50]	train-auc:0.82463
[100]	train-auc:0.85400
[150]	train-auc:0.87119
[200]	train-auc:0.87842
[227]	train-auc:0.88121


### Train model without orders

In [12]:
df = df.dropna(axis=1,thresh=18_000)

X = df.select_dtypes([float, int]).drop(columns=['apply_promo', 'groups'])
y = df['apply_promo']
groups = df['groups']
result_df2 = pd.DataFrame({'score': 0.0, 'ClientUUId': df['ClientUUId'], 'OrderType' : df['OrderType'], 'target': df['apply_promo']
                          }, index=df.index)
X.shape

(27741, 50)

In [13]:
models_2 = []
num_iter_2 = []
skf = StratifiedGroupKFold(n_splits=10)
for i, (train_index, test_index) in enumerate(skf.split(X, y, groups)):
    print(f"Fold {i + 1}:")
    
    dtrain = xgb.DMatrix(X.iloc[train_index], label=y.iloc[train_index])
    dtest = xgb.DMatrix(X.iloc[test_index], label=y.iloc[test_index])
    
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "learning_rate": 0.05,
        "tree_method": "hist",
        "verbosity": 0,
        'max_depth': 4,
        'colsample_bytree': 0.7055286391212441,
        'colsample_bynode': 0.6915897982585256,
        'colsample_bylevel': 0.7855187905031771,
        'subsample': 0.7684073221158629,
        'alpha': 5.250995916757242,
        'lambda': 0.7316373694506648,
        'gamma': 7.108685845995447,
        'min_child_weight': 6.354648841169535,
        'max_delta_step': 0,
        # 'sampling_method': 'gradient_based'
 }
        

    watchlist = [(dtrain, 'train'), (dtest, 'eval')]

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=3000,
        evals=watchlist,
        early_stopping_rounds=90,
        verbose_eval=100,
    )

    result_df2.iloc[test_index, 0] = model.predict(dtest)
    models_2.append(model)
    num_iter_2.append(model.best_iteration + 90)

Fold 1:
[0]	train-auc:0.63550	eval-auc:0.60419
[100]	train-auc:0.77726	eval-auc:0.75021
[200]	train-auc:0.79472	eval-auc:0.75724
[300]	train-auc:0.79689	eval-auc:0.75824
[400]	train-auc:0.79827	eval-auc:0.75835
[431]	train-auc:0.79843	eval-auc:0.75856
Fold 2:
[0]	train-auc:0.67530	eval-auc:0.65722
[100]	train-auc:0.79175	eval-auc:0.68705
[200]	train-auc:0.80467	eval-auc:0.68739
[239]	train-auc:0.80596	eval-auc:0.68865
Fold 3:
[0]	train-auc:0.67240	eval-auc:0.69539
[100]	train-auc:0.78205	eval-auc:0.77130
[200]	train-auc:0.79606	eval-auc:0.77498
[300]	train-auc:0.79858	eval-auc:0.77810
[400]	train-auc:0.80073	eval-auc:0.77851
[500]	train-auc:0.80166	eval-auc:0.78033
[592]	train-auc:0.80197	eval-auc:0.78040
Fold 4:
[0]	train-auc:0.68222	eval-auc:0.69391
[100]	train-auc:0.78747	eval-auc:0.74119
[169]	train-auc:0.79633	eval-auc:0.73838
Fold 5:
[0]	train-auc:0.68213	eval-auc:0.62162
[100]	train-auc:0.79340	eval-auc:0.69747
[200]	train-auc:0.80541	eval-auc:0.70794
[300]	train-auc:0.80834	eva

In [14]:
result = pd.DataFrame(columns=result_df2.columns)
for row in result_df2.iterrows():
    if row[0] in result_df.index:
        result.loc[row[0], :] = result_df.loc[row[0]]
    else:
        result.loc[row[0], :] = result_df2.iloc[row[0]]

In [15]:
roc_auc_score(result['target'].astype(float), result['score'].astype(float))

0.7751348309124527

In [16]:
dtrain = xgb.DMatrix(X, label=y)
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.05,
    "tree_method": "hist",
    "verbosity": 0,
    'max_depth': 4,
    'colsample_bytree': 0.7055286391212441,
    'colsample_bynode': 0.6915897982585256,
    'colsample_bylevel': 0.7855187905031771,
    'subsample': 0.7684073221158629,
    'alpha': 5.250995916757242,
    'lambda': 0.7316373694506648,
    'gamma': 7.108685845995447,
    'min_child_weight': 6.354648841169535,
    'max_delta_step': 0,
}
watchlist = [(dtrain, 'train')]
meta_model_2 = xgb.train(
    params,
    dtrain,
    num_boost_round=int(np.median(num_iter_2)) - 90,
    evals=watchlist,
    verbose_eval=50,
)

[0]	train-auc:0.68411
[50]	train-auc:0.76100
[100]	train-auc:0.79308
[135]	train-auc:0.79955


### Predict test

In [17]:
test = test.select_dtypes([float, int])
test_result = pd.read_csv('data/test-2.csv')
test_result['apply_promo'] = 0

In [18]:
final_scores = []

for i in tqdm(range(len(test))):
    scores = []
    data = test.iloc[i]
    if data.isnull().sum() > 40:
        data = pd.DataFrame(data.to_frame().T)
        data = data[models_2[0].feature_names]
        data_dmatrix = xgb.DMatrix(data)
        scores.append(meta_model_2.predict(data_dmatrix))
        # for model in models_2:
        #     scores.append(model.predict(data_dmatrix))
    else:
        data = pd.DataFrame(data.to_frame().T)
        data = data[models[0].feature_names]
        data_dmatrix = xgb.DMatrix(data)
        scores.append(meta_model.predict(data_dmatrix))
        # for model in models:
        #     scores.append(model.predict(data_dmatrix))
            
    final_score = np.mean(scores)
    final_scores.append(final_score)

test_result.iloc[:, -1] = final_scores


100%|██████████| 6806/6806 [01:27<00:00, 77.49it/s] 


In [19]:
test_result.to_csv('sub.csv', index=False)

In [20]:
test_result

Unnamed: 0,ClientUUId,Id,OrderType,LocalBeginDate,LocalEndDate,OrderPrice,Discount,apply_promo
0,000D3A20F23EA95811E7C0A95563344E,7,23,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,0.027065
1,000D3A20F23EA95811E7C7892A0CE261,5,23,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,699,200,0.183698
2,000D3A20F23EA95811E7CD686C396528,6,23,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,20,0.022180
3,000D3A20F23EA95911E7CEA8C574EDAE,5,23,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,0.043432
4,000D3A20F23EA95911E7D4F05C59C978,7,23,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,799,200,0.054016
...,...,...,...,...,...,...,...,...
6801,E25501F0CF189F4711ECF161D396AAEB,5,123,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,1249,250,0.011764
6802,000D3AAC977BBB2F11ECDE319AE8B765,7,123,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,1249,200,0.016657
6803,000D3A39D824A81611E922DAAA472ACF,6,123,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,1249,20,0.045454
6804,000D3A39D824A82E11E983DB973D46C8,7,123,2023-11-02T00:00:00Z,2023-11-05T23:59:00Z,1149,200,0.061874


In [22]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
    
with open('model_2.pkl', 'wb') as f:
    pickle.dump(models_2, f)
    
with open('meta_model.pkl', 'wb') as f:
    pickle.dump(meta_model, f)
    
with open('meta_model_2.pkl', 'wb') as f:
    pickle.dump(meta_model_2, f)