# Model selection

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import root_mean_squared_error as rmse
from xgboost import XGBRegressor

from utils import get_gain_ranking

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.3f}'.format)

## 1. Data import

In [3]:
DATA_DIR = 'data'

In [4]:
TRAIN_FILE_NAME = 'train_dev.csv'

In [5]:
train_file_path = os.path.join(DATA_DIR, TRAIN_FILE_NAME)

In [6]:
df = pd.read_csv(train_file_path).set_index('user_id')

In [7]:
df.shape

(4862, 83)

In [8]:
df.head()

Unnamed: 0_level_0,entry_point=android,country_e,country=US,battles_won_d0,battles_won_d1,battles_won_d3,battles_won_d7,battles_lost_d0,battles_lost_d1,battles_lost_d3,battles_lost_d7,battles_win_ratio_d0,battles_win_ratio_d1,battles_win_ratio_d3,battles_win_ratio_d7,session_time_d0,session_time_d1,session_time_d3,session_time_d7,session_mean_d0,session_mean_d1,session_mean_d3,session_mean_d7,n_sessions_d0,n_sessions_d1,n_sessions_d3,n_sessions_d7,inactive_d1,n_active_days,wealth_on_login_max_d0,wealth_on_login_max_d1,wealth_on_login_max_d3,wealth_on_login_max_d7,wealth_on_login_min_d0,wealth_on_login_min_d1,wealth_on_login_min_d3,wealth_on_login_min_d7,wealth_growth_d0_d1,wealth_growth_d1_d3,wealth_growth_d3_d7,wealth_growth_ratio_d0_d1,wealth_growth_ratio_d1_d3,wealth_growth_ratio_d3_d7,finish_quest_sum_d0,finish_quest_sum_d1,finish_quest_sum_d3,finish_quest_sum_d7,n_finish_quest_40_d0,n_finish_quest_40_d1,n_finish_quest_40_d3,n_finish_quest_40_d7,n_finish_quest_50_d0,n_finish_quest_50_d1,n_finish_quest_50_d3,n_finish_quest_50_d7,level_up_max_d0,level_up_max_d1,level_up_max_d3,level_up_max_d7,levels_unique_d0,levels_unique_d1,levels_unique_d3,levels_unique_d7,payment_sum_d0,payment_sum_d1,payment_sum_d3,payment_sum_d7,payment_max_d0,payment_max_d1,payment_max_d3,payment_max_d7,payment_mean_d0,payment_mean_d1,payment_mean_d3,payment_mean_d7,n_payments_d0,n_payments_d1,n_payments_d3,n_payments_d7,wealth_on_login_max_d0=802,wealth_on_login_max_d7=802,segment,ltv_30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1
1,1,3,1,47.0,172.0,214.0,232.0,1.0,1.0,2.0,8.0,0.0,0.0,0.0,0.0,341.0,761.0,1192.0,1921.0,20.059,19.025,12.817,10.977,17,40,93,175,0,8,20695.0,186656.0,249194.0,419827.0,802.0,802.0,802.0,802.0,165961.0,62538.0,170633.0,0.183,0.21,0.336,270,400,1150,1740,3,4,9,13,0,0,1,1,10,17,17,19,9.0,16.0,16.0,18.0,122.255,195.608,195.608,391.363,24.451,24.451,24.451,97.902,24.451,24.451,24.451,32.614,5.0,8.0,8.0,12.0,0,0,TRAIN,1468.187
2,1,8,0,0.0,34.0,69.0,153.0,0.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,33.0,188.0,598.0,1453.0,6.6,18.8,26.0,25.946,5,10,23,56,0,8,806.0,9045.0,28680.0,90769.0,802.0,802.0,802.0,802.0,8239.0,19635.0,62089.0,0.009,0.028,0.088,0,0,390,1230,0,0,2,9,0,0,1,2,2,9,14,15,1.0,8.0,13.0,14.0,0.0,0.0,30.648,30.648,0.0,0.0,30.648,30.648,0.0,0.0,30.648,30.648,,,1.0,1.0,0,0,TRAIN,619.849
3,1,3,1,41.0,65.0,78.0,113.0,13.0,20.0,20.0,23.0,0.0,0.0,0.0,0.0,505.0,883.0,1106.0,1567.0,72.143,55.188,46.083,38.22,7,16,24,41,0,8,4684.0,15214.0,32761.0,58584.0,802.0,802.0,802.0,802.0,10530.0,17547.0,25823.0,0.015,0.032,0.057,230,340,600,970,2,2,3,4,0,0,1,1,9,13,14,16,8.0,12.0,13.0,15.0,0.0,48.902,48.902,73.353,0.0,24.451,24.451,24.451,0.0,24.451,24.451,24.451,,2.0,2.0,3.0,0,0,TRAIN,97.804
4,1,13,0,26.0,47.0,89.0,125.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,126.0,222.0,631.0,860.0,42.0,44.4,45.071,45.263,3,5,14,19,0,7,8940.0,12900.0,40856.0,82547.0,802.0,802.0,802.0,802.0,3960.0,27956.0,41691.0,0.013,0.04,0.079,100,260,780,1240,1,2,6,7,0,0,0,1,9,9,14,16,8.0,8.0,13.0,15.0,0.0,0.0,71.662,71.662,0.0,0.0,23.906,23.906,0.0,0.0,23.887,23.887,,,3.0,3.0,0,0,TRAIN,71.662
5,1,8,0,44.0,67.0,94.0,127.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,535.0,1075.0,1766.0,2523.0,53.5,43.0,30.982,25.745,10,25,57,98,0,8,17689.0,25283.0,32983.0,39425.0,802.0,802.0,802.0,802.0,7594.0,7700.0,6442.0,0.025,0.032,0.038,350,590,980,1830,1,4,5,8,1,1,1,3,9,10,10,10,8.0,9.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0,0,TRAIN,0.0


## 2. Datasets preparation

In [9]:
TARGET = 'ltv_30'

In [10]:
(df[TARGET] == 0).mean()

np.float64(0.637186343068696)

In [11]:
features = [x for x in df.columns.tolist() if x not in ['segment', TARGET]]

In [12]:
train_df = df[df['segment'] == 'TRAIN']
dev_df = df[df['segment'] == 'DEV']

In [13]:
print(train_df.shape)
print(dev_df.shape)

(3889, 83)
(973, 83)


In [14]:
X_train = train_df[features]#.values
y_train = train_df[TARGET].values

X_dev = dev_df[features]#.values
y_dev = dev_df[TARGET].values

In [15]:
print(X_train.shape)
print(y_train.shape)
print()
print(X_dev.shape)
print(y_dev.shape)

(3889, 81)
(3889,)

(973, 81)
(973,)


## 3. Model selection

In [16]:
n_xgb = 20

In [17]:
XGB_HYPERPARAMS_FIXED = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'n_jobs': -1,
    
    'random_state': 42,
}

In [18]:
xgb_df = pd.DataFrame()

In [19]:
xgb_df['n_estimators'] = np.random.randint(low=100, high=1001, size=n_xgb)
xgb_df['max_depth'] = np.random.randint(low=3, high=11, size=n_xgb)
xgb_df['learning_rate'] = 3 * 10 ** np.random.uniform(low=-2.477, high=-1, size=n_xgb)
xgb_df['subsample'] = 10 ** np.random.uniform(low=-0.181, high=0, size=n_xgb)
xgb_df['colsample_bytree'] = np.random.uniform(low=0.5, high=1.0, size=n_xgb)
xgb_df['min_child_weight'] = np.random.uniform(low=1, high=10, size=n_xgb)
xgb_df['gamma'] = np.random.uniform(low=0, high=5, size=n_xgb)
xgb_df['reg_alpha'] = np.random.uniform(low=0, high=10, size=n_xgb)
xgb_df['reg_lambda'] = np.random.uniform(low=0, high=10, size=n_xgb)

In [20]:
xgb_df

Unnamed: 0,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda
0,773,10,0.012,0.949,0.779,3.301,4.875,6.134,0.285
1,726,10,0.025,0.75,0.842,5.392,1.306,7.814,1.446
2,876,9,0.169,0.768,0.741,1.338,0.116,5.268,4.803
3,859,9,0.161,0.703,0.738,4.853,2.751,0.194,9.082
4,715,3,0.088,0.896,0.837,8.347,4.479,6.481,4.331
5,275,4,0.122,0.897,0.641,3.841,0.222,0.69,6.905
6,909,3,0.017,0.715,0.517,2.159,3.189,9.972,8.499
7,157,6,0.032,0.898,0.902,1.459,1.673,7.059,1.429
8,257,8,0.097,0.738,0.947,3.854,3.986,8.882,1.553
9,950,5,0.026,0.952,0.681,2.476,4.713,6.476,3.612


In [21]:
for index, item in xgb_df.iterrows():

    hp_var = {
        'n_estimators': int(item['n_estimators']),
        'max_depth': int(item['max_depth']),
        'learning_rate': item['learning_rate'],
        'subsample': item['subsample'],
        'colsample_bytree': item['colsample_bytree'],
        'min_child_weight': item['min_child_weight'],
        'gamma': item['gamma'],
        'reg_alpha': item['reg_alpha'],
        'reg_lambda': item['reg_lambda'],
    }

    print(hp_var)

    hp = XGB_HYPERPARAMS_FIXED | hp_var
    
    model = XGBRegressor(**hp)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_dev_pred = model.predict(X_dev)
    
    xgb_df.loc[index, 'rmse_train'] = rmse(y_train, y_train_pred)
    xgb_df.loc[index, 'rmse_dev'] = rmse(y_dev, y_dev_pred)

{'n_estimators': 773, 'max_depth': 10, 'learning_rate': np.float64(0.011888508106264763), 'subsample': np.float64(0.9492620533826412), 'colsample_bytree': np.float64(0.7788803502410688), 'min_child_weight': np.float64(3.3008528622135387), 'gamma': np.float64(4.874800927214214), 'reg_alpha': np.float64(6.133881409754791), 'reg_lambda': np.float64(0.2853806254999569)}
{'n_estimators': 726, 'max_depth': 10, 'learning_rate': np.float64(0.02470642552246917), 'subsample': np.float64(0.7498149858304566), 'colsample_bytree': np.float64(0.8419787314731617), 'min_child_weight': np.float64(5.392489381355348), 'gamma': np.float64(1.3059287331070473), 'reg_alpha': np.float64(7.814389979837656), 'reg_lambda': np.float64(1.4458586924703576)}
{'n_estimators': 876, 'max_depth': 9, 'learning_rate': np.float64(0.16904524992078435), 'subsample': np.float64(0.7684227966285097), 'colsample_bytree': np.float64(0.7410646435895123), 'min_child_weight': np.float64(1.3384372409424414), 'gamma': np.float64(0.1162

In [22]:
xgb_df = xgb_df.sort_values(by='rmse_dev', ascending=True).reset_index(drop=True)

In [23]:
xgb_df.style.format({
    'learning_rate': '{:.4f}',
    'subsample': '{:.3f}',
    'colsample_bytree': '{:.3f}',
    'min_child_weight': '{:.3f}',
    'gamma': '{:.3f}',
    'reg_alpha': '{:.3f}',
    'reg_lambda': '{:.3f}',
    'scale_pos_weight': '{:.3f}',
    
    'rmse_train': '{:.2f}',
    'rmse_dev': '{:.2f}',
}).background_gradient(axis=0, cmap='YlOrRd', subset=['learning_rate', 'subsample'])

Unnamed: 0,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda,rmse_train,rmse_dev
0,157,6,0.0321,0.898,0.902,1.459,1.673,7.059,1.429,130.15,429.81
1,950,5,0.026,0.952,0.681,2.476,4.713,6.476,3.612,70.1,437.67
2,452,8,0.0135,0.76,0.647,4.101,3.518,0.665,8.633,278.23,442.11
3,909,3,0.0173,0.715,0.517,2.159,3.189,9.972,8.499,259.64,450.67
4,595,8,0.0174,0.915,0.863,7.393,4.527,9.336,6.456,185.47,457.99
5,726,10,0.0247,0.75,0.842,5.392,1.306,7.814,1.446,79.52,467.8
6,257,8,0.0966,0.738,0.947,3.854,3.986,8.882,1.553,32.91,468.95
7,376,8,0.0503,0.705,0.587,6.541,0.615,6.456,3.521,125.75,468.99
8,884,5,0.0142,0.785,0.617,8.974,1.376,3.646,2.408,210.49,472.91
9,234,9,0.0581,0.699,0.731,4.422,4.041,7.072,4.177,137.07,473.24


In [26]:
xgb_df.loc[0].to_dict()

{'n_estimators': 157.0,
 'max_depth': 6.0,
 'learning_rate': 0.03210921617155426,
 'subsample': 0.8979614161067629,
 'colsample_bytree': 0.9024503388694571,
 'min_child_weight': 1.4590048269858573,
 'gamma': 1.6730631959514086,
 'reg_alpha': 7.0586731716217646,
 'reg_lambda': 1.4294052894285947,
 'rmse_train': 130.15102571385802,
 'rmse_dev': 429.81041807913454}

In [25]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

## 4. Feature selection

In [27]:
BEST_HP = {
    'n_estimators': 157,
    'max_depth': 6,
    'learning_rate': 0.03210921617155426,
    'subsample': 0.8979614161067629,
    'colsample_bytree': 0.9024503388694571,
    'min_child_weight': 1.4590048269858573,
    'gamma': 1.6730631959514086,
    'reg_alpha': 7.0586731716217646,
    'reg_lambda': 1.4294052894285947,
}

In [28]:
hp = XGB_HYPERPARAMS_FIXED | BEST_HP

In [29]:
best_model = XGBRegressor(**hp)

In [30]:
best_model.fit(X_train, y_train)

In [32]:
get_gain_ranking(model).head(20)

Unnamed: 0,gain
n_payments_d7,0.295
wealth_on_login_min_d0,0.142
payment_sum_d7,0.062
payment_max_d7,0.061
n_payments_d0,0.04
levels_unique_d1,0.036
n_finish_quest_50_d3,0.035
n_payments_d3,0.025
payment_mean_d3,0.023
payment_mean_d7,0.023


In [35]:
get_gain_ranking(model).head(30).index.tolist()

['n_payments_d7',
 'wealth_on_login_min_d0',
 'payment_sum_d7',
 'payment_max_d7',
 'n_payments_d0',
 'levels_unique_d1',
 'n_finish_quest_50_d3',
 'n_payments_d3',
 'payment_mean_d3',
 'payment_mean_d7',
 'payment_mean_d1',
 'levels_unique_d7',
 'session_mean_d1',
 'wealth_growth_d3_d7',
 'n_finish_quest_50_d7',
 'n_finish_quest_40_d7',
 'battles_lost_d7',
 'level_up_max_d0',
 'wealth_on_login_max_d3',
 'levels_unique_d3',
 'wealth_growth_ratio_d1_d3',
 'session_mean_d7',
 'n_sessions_d1',
 'wealth_on_login_max_d0',
 'n_finish_quest_40_d0',
 'session_mean_d0',
 'battles_lost_d3',
 'wealth_on_login_max_d1',
 'n_finish_quest_50_d1',
 'battles_won_d7']