# Model selection

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.metrics import root_mean_squared_error as rmse
from xgboost import XGBRegressor

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.3f}'.format)

## 1. Data import

In [3]:
DATA_DIR = 'data'

In [4]:
TRAIN_FILE_NAME = 'train_dev.csv'

In [5]:
train_file_path = os.path.join(DATA_DIR, TRAIN_FILE_NAME)

In [6]:
df = pd.read_csv(train_file_path).set_index('user_id')

In [7]:
df.shape

(4862, 36)

In [8]:
df.head()

Unnamed: 0_level_0,entry_point=android,country_e,battles_won_d0,battles_won_d1,battles_won_d3,battles_won_d7,battles_lost_d0,battles_lost_d1,battles_lost_d3,battles_lost_d7,session_time_d0,session_time_d1,session_time_d3,session_time_d7,inactive_d1,n_active_days,wealth_on_login_max_d0,wealth_on_login_max_d1,wealth_on_login_max_d3,wealth_on_login_max_d7,finish_quest_sum_d0,finish_quest_sum_d1,finish_quest_sum_d3,finish_quest_sum_d7,level_up_max_d0,level_up_max_d1,level_up_max_d3,level_up_max_d7,payment_sum_d0,payment_sum_d1,payment_sum_d3,payment_sum_d7,wealth_on_login_max_d0=802,wealth_on_login_max_d7=802,segment,ltv_30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
1,1,3,47.0,172.0,214.0,232.0,1.0,1.0,2.0,8.0,341.0,761.0,1192.0,1921.0,0,8,20695.0,186656.0,249194.0,419827.0,270,400,1150,1740,10,17,17,19,122.255,195.608,195.608,391.363,0,0,TRAIN,1468.187
2,1,8,0.0,34.0,69.0,153.0,0.0,1.0,1.0,10.0,33.0,188.0,598.0,1453.0,0,8,806.0,9045.0,28680.0,90769.0,0,0,390,1230,2,9,14,15,0.0,0.0,30.648,30.648,0,0,TRAIN,619.849
3,1,3,41.0,65.0,78.0,113.0,13.0,20.0,20.0,23.0,505.0,883.0,1106.0,1567.0,0,8,4684.0,15214.0,32761.0,58584.0,230,340,600,970,9,13,14,16,0.0,48.902,48.902,73.353,0,0,TRAIN,97.804
4,1,13,26.0,47.0,89.0,125.0,1.0,1.0,3.0,3.0,126.0,222.0,631.0,860.0,0,7,8940.0,12900.0,40856.0,82547.0,100,260,780,1240,9,9,14,16,0.0,0.0,71.662,71.662,0,0,TRAIN,71.662
5,1,8,44.0,67.0,94.0,127.0,2.0,2.0,2.0,2.0,535.0,1075.0,1766.0,2523.0,0,8,17689.0,25283.0,32983.0,39425.0,350,590,980,1830,9,10,10,10,0.0,0.0,0.0,0.0,0,0,TRAIN,0.0


## 2. Datasets preparation

In [9]:
TARGET = 'ltv_30'

In [10]:
(df[TARGET] == 0).mean()

np.float64(0.637186343068696)

In [11]:
features = [x for x in df.columns.tolist() if x not in ['segment', TARGET]]

In [12]:
train_df = df[df['segment'] == 'TRAIN']
dev_df = df[df['segment'] == 'DEV']

In [13]:
print(train_df.shape)
print(dev_df.shape)

(3889, 36)
(973, 36)


In [14]:
X_train = train_df[features].values
y_train = train_df[TARGET].values

X_dev = dev_df[features].values
y_dev = dev_df[TARGET].values

In [15]:
print(X_train.shape)
print(y_train.shape)
print()
print(X_dev.shape)
print(y_dev.shape)

(3889, 34)
(3889,)

(973, 34)
(973,)


## 3. Model selection

In [16]:
n_xgb = 20

In [17]:
XGB_HYPERPARAMS_FIXED = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'n_jobs': -1,
    
    'random_state': 42,
}

In [18]:
xgb_df = pd.DataFrame()

In [19]:
xgb_df['n_estimators'] = np.random.randint(low=100, high=1001, size=n_xgb)
xgb_df['max_depth'] = np.random.randint(low=3, high=11, size=n_xgb)
xgb_df['learning_rate'] = 3 * 10 ** np.random.uniform(low=-2.477, high=-1, size=n_xgb)
xgb_df['subsample'] = 10 ** np.random.uniform(low=-0.181, high=0, size=n_xgb)
xgb_df['colsample_bytree'] = np.random.uniform(low=0.5, high=1.0, size=n_xgb)
xgb_df['min_child_weight'] = np.random.uniform(low=1, high=10, size=n_xgb)
xgb_df['gamma'] = np.random.uniform(low=0, high=5, size=n_xgb)
xgb_df['reg_alpha'] = np.random.uniform(low=0, high=10, size=n_xgb)
xgb_df['reg_lambda'] = np.random.uniform(low=0, high=10, size=n_xgb)

In [20]:
xgb_df

Unnamed: 0,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda
0,810,7,0.012,0.688,0.92,1.49,2.752,9.679,2.468
1,947,6,0.114,0.971,0.629,2.341,0.314,5.836,0.69
2,655,4,0.012,0.84,0.851,6.762,1.587,4.399,2.689
3,148,8,0.084,0.745,0.783,1.821,0.265,1.607,9.811
4,462,9,0.035,0.894,0.514,8.299,0.45,6.526,8.064
5,949,10,0.012,0.933,0.801,6.331,3.378,1.291,5.173
6,172,10,0.034,0.795,0.81,4.472,1.942,3.984,5.209
7,313,7,0.014,0.694,0.745,2.009,1.155,2.759,3.421
8,259,5,0.104,0.997,0.791,7.925,2.481,6.127,3.41
9,232,4,0.114,0.928,0.724,2.145,4.475,5.24,3.272


In [21]:
for index, item in xgb_df.iterrows():

    hp_var = {
        'n_estimators': int(item['n_estimators']),
        'max_depth': int(item['max_depth']),
        'learning_rate': item['learning_rate'],
        'subsample': item['subsample'],
        'colsample_bytree': item['colsample_bytree'],
        'min_child_weight': item['min_child_weight'],
        'gamma': item['gamma'],
        'reg_alpha': item['reg_alpha'],
        'reg_lambda': item['reg_lambda'],
    }

    print(hp_var)

    hp = XGB_HYPERPARAMS_FIXED | hp_var
    
    model = XGBRegressor(**hp)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_dev_pred = model.predict(X_dev)
    
    xgb_df.loc[index, 'rmse_train'] = rmse(y_train, y_train_pred)
    xgb_df.loc[index, 'rmse_dev'] = rmse(y_dev, y_dev_pred)

{'n_estimators': 810, 'max_depth': 7, 'learning_rate': np.float64(0.01179351185409908), 'subsample': np.float64(0.6876324569760308), 'colsample_bytree': np.float64(0.919534212286234), 'min_child_weight': np.float64(1.4904504989574718), 'gamma': np.float64(2.751711687437453), 'reg_alpha': np.float64(9.678725963933196), 'reg_lambda': np.float64(2.46757633609839)}
{'n_estimators': 947, 'max_depth': 6, 'learning_rate': np.float64(0.11440852220248116), 'subsample': np.float64(0.9714334862301303), 'colsample_bytree': np.float64(0.6294976968274536), 'min_child_weight': np.float64(2.3409541397681912), 'gamma': np.float64(0.3138191962754), 'reg_alpha': np.float64(5.836031270602295), 'reg_lambda': np.float64(0.6897639070904249)}
{'n_estimators': 655, 'max_depth': 4, 'learning_rate': np.float64(0.0118532001204543), 'subsample': np.float64(0.8403651569443816), 'colsample_bytree': np.float64(0.8507061160988264), 'min_child_weight': np.float64(6.762491416537665), 'gamma': np.float64(1.58683949904319

In [22]:
xgb_df = xgb_df.sort_values(by='rmse_dev', ascending=True).reset_index(drop=True)

In [23]:
xgb_df.style.format({
    'learning_rate': '{:.4f}',
    'subsample': '{:.3f}',
    'colsample_bytree': '{:.3f}',
    'min_child_weight': '{:.3f}',
    'gamma': '{:.3f}',
    'reg_alpha': '{:.3f}',
    'reg_lambda': '{:.3f}',
    'scale_pos_weight': '{:.3f}',
    
    'rmse_train': '{:.2f}',
    'rmse_dev': '{:.2f}',
}).background_gradient(axis=0, cmap='YlOrRd', subset=['learning_rate', 'subsample'])

Unnamed: 0,n_estimators,max_depth,learning_rate,subsample,colsample_bytree,min_child_weight,gamma,reg_alpha,reg_lambda,rmse_train,rmse_dev
0,655,4,0.0119,0.84,0.851,6.762,1.587,4.399,2.689,350.62,488.76
1,313,7,0.0144,0.694,0.745,2.009,1.155,2.759,3.421,288.8,493.25
2,148,8,0.0845,0.745,0.783,1.821,0.265,1.607,9.811,161.53,503.37
3,949,10,0.0118,0.933,0.801,6.331,3.378,1.291,5.173,193.57,506.27
4,330,5,0.0272,0.721,0.997,2.549,4.823,2.614,6.363,248.73,506.61
5,947,6,0.1144,0.971,0.629,2.341,0.314,5.836,0.69,0.89,506.8
6,462,9,0.0347,0.894,0.514,8.299,0.45,6.526,8.064,189.96,507.01
7,335,10,0.089,0.97,0.61,4.855,3.265,0.035,9.779,48.98,507.7
8,810,7,0.0118,0.688,0.92,1.49,2.752,9.679,2.468,121.39,508.14
9,120,8,0.031,0.878,0.994,4.854,0.644,0.068,6.261,365.05,511.29


In [25]:
xgb_df.loc[0].to_dict()

{'n_estimators': 655.0,
 'max_depth': 4.0,
 'learning_rate': 0.0118532001204543,
 'subsample': 0.8403651569443816,
 'colsample_bytree': 0.8507061160988264,
 'min_child_weight': 6.762491416537665,
 'gamma': 1.5868394990431955,
 'reg_alpha': 4.399205251373468,
 'reg_lambda': 2.689178745716033,
 'rmse_train': 350.62242824160074,
 'rmse_dev': 488.76240543146446}