# Predictions

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

from xgboost import XGBRegressor

from utils import get_gain_ranking
from feature_engineering import generate_event_features

In [2]:
# Widen pandas' display limits so the wide feature table renders without
# truncation, and show floats with three decimals.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.options.display.float_format = '{:.3f}'.format

## 1. Data import

In [3]:
# Directory (relative to the notebook) holding every input and output file used below.
DATA_DIR = 'data'

In [4]:
# CSV with the labelled rows; TRAIN and DEV rows are told apart by its 'segment' column.
TRAIN_FILE_NAME = 'train_dev.csv'

In [5]:
# Full path of the labelled train/dev CSV.
train_file_path = os.path.join(DATA_DIR, TRAIN_FILE_NAME)

In [6]:
# Load the labelled table and index it by user_id so rows can be addressed per user.
df = pd.read_csv(train_file_path).set_index('user_id')

In [7]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(4862, 83)

In [8]:
# Preview the first rows to eyeball the available columns.
df.head()

Unnamed: 0_level_0,entry_point=android,country_e,country=US,battles_won_d0,battles_won_d1,battles_won_d3,battles_won_d7,battles_lost_d0,battles_lost_d1,battles_lost_d3,battles_lost_d7,battles_win_ratio_d0,battles_win_ratio_d1,battles_win_ratio_d3,battles_win_ratio_d7,session_time_d0,session_time_d1,session_time_d3,session_time_d7,session_mean_d0,session_mean_d1,session_mean_d3,session_mean_d7,n_sessions_d0,n_sessions_d1,n_sessions_d3,n_sessions_d7,inactive_d1,n_active_days,wealth_on_login_max_d0,wealth_on_login_max_d1,wealth_on_login_max_d3,wealth_on_login_max_d7,wealth_on_login_min_d0,wealth_on_login_min_d1,wealth_on_login_min_d3,wealth_on_login_min_d7,wealth_growth_d0_d1,wealth_growth_d1_d3,wealth_growth_d3_d7,wealth_growth_ratio_d0_d1,wealth_growth_ratio_d1_d3,wealth_growth_ratio_d3_d7,finish_quest_sum_d0,finish_quest_sum_d1,finish_quest_sum_d3,finish_quest_sum_d7,n_finish_quest_40_d0,n_finish_quest_40_d1,n_finish_quest_40_d3,n_finish_quest_40_d7,n_finish_quest_50_d0,n_finish_quest_50_d1,n_finish_quest_50_d3,n_finish_quest_50_d7,level_up_max_d0,level_up_max_d1,level_up_max_d3,level_up_max_d7,levels_unique_d0,levels_unique_d1,levels_unique_d3,levels_unique_d7,payment_sum_d0,payment_sum_d1,payment_sum_d3,payment_sum_d7,payment_max_d0,payment_max_d1,payment_max_d3,payment_max_d7,payment_mean_d0,payment_mean_d1,payment_mean_d3,payment_mean_d7,n_payments_d0,n_payments_d1,n_payments_d3,n_payments_d7,wealth_on_login_max_d0=802,wealth_on_login_max_d7=802,segment,ltv_30
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1
1,1,3,1,47.0,172.0,214.0,232.0,1.0,1.0,2.0,8.0,0.0,0.0,0.0,0.0,341.0,761.0,1192.0,1921.0,20.059,19.025,12.817,10.977,17,40,93,175,0,8,20695.0,186656.0,249194.0,419827.0,802.0,802.0,802.0,802.0,165961.0,62538.0,170633.0,0.183,0.21,0.336,270,400,1150,1740,3,4,9,13,0,0,1,1,10,17,17,19,9.0,16.0,16.0,18.0,122.255,195.608,195.608,391.363,24.451,24.451,24.451,97.902,24.451,24.451,24.451,32.614,5.0,8.0,8.0,12.0,0,0,TRAIN,1468.187
2,1,8,0,0.0,34.0,69.0,153.0,0.0,1.0,1.0,10.0,0.0,0.0,0.0,0.0,33.0,188.0,598.0,1453.0,6.6,18.8,26.0,25.946,5,10,23,56,0,8,806.0,9045.0,28680.0,90769.0,802.0,802.0,802.0,802.0,8239.0,19635.0,62089.0,0.009,0.028,0.088,0,0,390,1230,0,0,2,9,0,0,1,2,2,9,14,15,1.0,8.0,13.0,14.0,0.0,0.0,30.648,30.648,0.0,0.0,30.648,30.648,0.0,0.0,30.648,30.648,,,1.0,1.0,0,0,TRAIN,619.849
3,1,3,1,41.0,65.0,78.0,113.0,13.0,20.0,20.0,23.0,0.0,0.0,0.0,0.0,505.0,883.0,1106.0,1567.0,72.143,55.188,46.083,38.22,7,16,24,41,0,8,4684.0,15214.0,32761.0,58584.0,802.0,802.0,802.0,802.0,10530.0,17547.0,25823.0,0.015,0.032,0.057,230,340,600,970,2,2,3,4,0,0,1,1,9,13,14,16,8.0,12.0,13.0,15.0,0.0,48.902,48.902,73.353,0.0,24.451,24.451,24.451,0.0,24.451,24.451,24.451,,2.0,2.0,3.0,0,0,TRAIN,97.804
4,1,13,0,26.0,47.0,89.0,125.0,1.0,1.0,3.0,3.0,0.0,0.0,0.0,0.0,126.0,222.0,631.0,860.0,42.0,44.4,45.071,45.263,3,5,14,19,0,7,8940.0,12900.0,40856.0,82547.0,802.0,802.0,802.0,802.0,3960.0,27956.0,41691.0,0.013,0.04,0.079,100,260,780,1240,1,2,6,7,0,0,0,1,9,9,14,16,8.0,8.0,13.0,15.0,0.0,0.0,71.662,71.662,0.0,0.0,23.906,23.906,0.0,0.0,23.887,23.887,,,3.0,3.0,0,0,TRAIN,71.662
5,1,8,0,44.0,67.0,94.0,127.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,535.0,1075.0,1766.0,2523.0,53.5,43.0,30.982,25.745,10,25,57,98,0,8,17689.0,25283.0,32983.0,39425.0,802.0,802.0,802.0,802.0,7594.0,7700.0,6442.0,0.025,0.032,0.038,350,590,980,1830,1,4,5,8,1,1,1,3,9,10,10,10,8.0,9.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0,0,TRAIN,0.0


## 2. Datasets preparation

In [9]:
# Name of the column the model is trained to predict.
TARGET = 'ltv_30'

In [10]:
# Disabled alternative to the curated feature list defined in the next cell:
# use every column except the segment label and the target.
# features = [x for x in df.columns.tolist() if x not in ['segment', TARGET]]

In [11]:
# Curated model inputs, one column name per line. The order is significant:
# it fixes the column order of every feature matrix built from this list.
features = """
    n_payments_d7
    wealth_on_login_min_d0
    payment_sum_d7
    payment_max_d7
    n_payments_d0
    levels_unique_d1
    n_finish_quest_50_d3
    n_payments_d3
    payment_mean_d3
    payment_mean_d7
    payment_mean_d1
    levels_unique_d7
    session_mean_d1
    wealth_growth_d3_d7
    n_finish_quest_50_d7
    n_finish_quest_40_d7
    battles_lost_d7
    level_up_max_d0
    wealth_on_login_max_d3
    levels_unique_d3
    wealth_growth_ratio_d1_d3
    session_mean_d7
    n_sessions_d1
    wealth_on_login_max_d0
    n_finish_quest_40_d0
""".split()

In [12]:
# Split the labelled table using its precomputed 'segment' label.
train_df = df.loc[df['segment'].eq('TRAIN')]
dev_df = df.loc[df['segment'].eq('DEV')]

In [13]:
# Row/column counts of the two segments, one per line.
print(train_df.shape, dev_df.shape, sep='\n')

(3889, 83)
(973, 83)


In [14]:
# Feature matrices are kept as DataFrames (column names preserved);
# targets are extracted as plain NumPy arrays.
X_train, y_train = train_df[features], train_df[TARGET].values

X_dev, y_dev = dev_df[features], dev_df[TARGET].values

In [15]:
# Shapes of the train matrices, a blank line, then the dev matrices.
print(X_train.shape, y_train.shape, '', X_dev.shape, y_dev.shape, sep='\n')

(3889, 25)
(3889,)

(973, 25)
(973,)


## 3. Best model training
### 3.1. Training

In [16]:
# Settings that are held constant (not tuned): tree booster, squared-error
# objective with RMSE as eval metric, all CPU cores, and a fixed seed.
XGB_HYPERPARAMS_FIXED = dict(
    booster='gbtree',
    objective='reg:squarederror',
    eval_metric='rmse',
    n_jobs=-1,
    random_state=42,
)

In [17]:
# Tuned hyperparameter values, hard-coded here so the model can be rebuilt
# without re-running a search.
BEST_HP = dict(
    n_estimators=157,
    max_depth=6,
    learning_rate=0.03210921617155426,
    subsample=0.8979614161067629,
    colsample_bytree=0.9024503388694571,
    min_child_weight=1.4590048269858573,
    gamma=1.6730631959514086,
    reg_alpha=7.0586731716217646,
    reg_lambda=1.4294052894285947,
)

In [18]:
# Merge fixed and tuned settings; on a key clash the tuned value wins.
hp = {**XGB_HYPERPARAMS_FIXED, **BEST_HP}

In [19]:
# Instantiate the regressor with the merged fixed + tuned hyperparameters.
best_model = XGBRegressor(**hp)

In [20]:
# Fit on the TRAIN segment only; the DEV rows are kept out for evaluation.
best_model.fit(X_train, y_train)

In [21]:
# Per-feature 'gain' ranking of the fitted model, computed by the project helper from utils.
get_gain_ranking(best_model)

Unnamed: 0,gain
payment_sum_d7,0.183
n_payments_d7,0.18
session_mean_d1,0.077
payment_mean_d7,0.067
wealth_growth_ratio_d1_d3,0.055
session_mean_d7,0.046
levels_unique_d3,0.042
wealth_on_login_max_d3,0.041
payment_mean_d3,0.034
payment_max_d7,0.033


In [22]:
# Score every labelled row (TRAIN and DEV alike) with the fitted model.
# The DataFrame is passed as-is, matching how the model was fitted, so the
# column names travel with the data and XGBoost can check them against the
# features it was trained on; `.values` would strip them.
df['y_pred'] = best_model.predict(df[features])

In [23]:
# Rule-based override of the model output: wherever payment_sum_d0 equals
# payment_sum_d7, the prediction is replaced by payment_sum_d7; all other rows
# keep the model's prediction. Rows where either value is NaN never match.
# NOTE(review): presumably this targets users with no additional payments
# after day 0 — confirm against the feature definitions.
df['y_pred'] = np.where(df['payment_sum_d0'] == df['payment_sum_d7'], df['payment_sum_d7'], df['y_pred'])

### 3.2. Performance measuring

In [24]:
# Final (post-override) predictions for the DEV rows, looked up by dev_df's
# index so they follow the same row order as y_dev.
# NOTE(review): assumes user_id values are unique — confirm.
y_pred_dev = df.loc[dev_df.index, 'y_pred']

In [45]:
# Evaluate the final DEV predictions with each regression metric; results are
# cast to built-in floats so they format and serialise cleanly.
DEV_METRICS = {
    label: float(score_fn(y_dev, y_pred_dev))
    for label, score_fn in (('RMSE', rmse), ('MAE', mae), ('R2', r2_score))
}

In [47]:
# One "NAME: value" line per metric, four decimals.
for metric_name, metric_value in DEV_METRICS.items():
    print(f'{metric_name}: {metric_value:.4f}')

RMSE: 412.1940
MAE: 79.3300
R2: 0.7005


## 4. Test set prediction
### 4.1. Dataset import

In [26]:
# Test-set inputs: one file of user profiles and one file of raw events.
TEST_USERS_FILE_NAME = 'user_profile_test.csv'
TEST_EVENTS_FILE_NAME = 'events_test.csv'

In [27]:
# Full paths of the two test-set files inside DATA_DIR.
test_users_file_path = os.path.join(DATA_DIR, TEST_USERS_FILE_NAME)
test_events_file_path = os.path.join(DATA_DIR, TEST_EVENTS_FILE_NAME)

In [28]:
# Load the test user profiles, indexed by user_id like the labelled table.
test_df = pd.read_csv(test_users_file_path).set_index('user_id')

In [29]:
# Load the raw test events; kept with the default integer index.
test_events_df = pd.read_csv(test_events_file_path)

In [30]:
# Sanity check: (rows, columns) of the test profile table.
test_df.shape

(604, 3)

In [31]:
# Sanity check: (rows, columns) of the test events table.
test_events_df.shape

(57076, 4)

### 4.2. Preprocessing

In [32]:
# Parse the registration timestamp; unit='s' reads the raw numbers as seconds
# since the Unix epoch.
test_df['reg_ts'] = pd.to_datetime(test_df['reg_ts'], unit='s')

In [33]:
# Parse the event timestamp the same way (seconds since the Unix epoch).
test_events_df['event_ts'] = pd.to_datetime(test_events_df['event_ts'], unit='s')

In [34]:
# 0/1 indicator: 1 when the user's entry point is 'android', else 0.
test_df['entry_point=android'] = test_df['entry_point'].eq('android').astype(int)

In [35]:
# 0/1 indicator: 1 when the user's country is 'US', else 0.
test_df['country=US'] = test_df['country'].eq('US').astype(int)

In [36]:
# Ordinal encoding of country codes. Rank 13 is intentionally absent: it is
# the 'OTHER' bucket, applied to any country not listed here.
COUNTRY_RANK_DICT = dict(
    CH=1,
    GB=2,
    US=3,
    DE=4,
    RO=5,
    CA=6,
    AU=7,
    PL=8,
    IT=9,
    ES=10,
    FR=11,
    ZA=12,
    # OTHER=13,
    TR=14,
    BR=15,
    CL=16,
    AR=17,
    KZ=18,
    IN=19,
    PK=20,
    MX=21,
)

In [37]:
# Rank given to every country missing from COUNTRY_RANK_DICT (the 'OTHER'
# bucket, which the mapping deliberately leaves out).
OTHER_COUNTRY_RANK = 13

# Ordinal-encode the country: known codes take their rank from the mapping,
# anything else (including missing values) falls back to the 'OTHER' rank.
test_df['country_e'] = (
    test_df['country']
    .map(COUNTRY_RANK_DICT)
    .fillna(OTHER_COUNTRY_RANK)
    .astype(int)
)

In [38]:
# Build the event-based features with the project helper and attach them to
# the profile table; how='left' keeps every row of test_df.
# NOTE(review): this cell rebinds test_df from itself, so running it a second
# time without reloading test_df will likely fail on overlapping column
# names — confirm, and re-run from the import cell if needed.
test_df = test_df.join(generate_event_features(test_df, test_events_df), how='left')

### 4.3. Prediction

In [39]:
# Test feature matrix, in the same column order used for training. Kept as a
# DataFrame (like X_train / X_dev) so the column names reach the model and can
# be checked against the fitted feature names; `.values` would strip them.
X_test = test_df[features]

In [40]:
# Raw model predictions for the test users.
test_df['target'] = best_model.predict(X_test)

In [41]:
# Same rule-based override as applied to the labelled data: wherever
# payment_sum_d0 equals payment_sum_d7, the prediction is replaced by
# payment_sum_d7; all other rows keep the model output. Rows where either
# value is NaN never match.
test_df['target'] = np.where(test_df['payment_sum_d0'] == test_df['payment_sum_d7'], test_df['payment_sum_d7'], test_df['target'])

### 4.4. Export

In [42]:
# Name of the CSV that receives the test-set predictions.
EXPORT_FILE_NAME = 'user_profile_test_export.csv'

In [43]:
# The export is written next to the input files, inside DATA_DIR.
export_file_path = os.path.join(DATA_DIR, EXPORT_FILE_NAME)

In [44]:
# Write only the predicted 'target' column, sorted by the user_id index;
# to_csv includes the index, so each row is (user_id, target).
test_df[['target']].sort_index().to_csv(export_file_path)