In [1]:
### Essential
import numpy as np
import pandas as pd
### Tools
import warnings
import gc
from joblib import dump, load
### ML
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from lightgbm import Booster
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,mean_absolute_error
## Config
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format
warnings.filterwarnings('ignore')


# Data Loading

In [2]:
test = pd.read_csv("data/test.csv")
print('Shape: ', test.shape)
test.install_date = pd.to_datetime(test.install_date)
test.dropna(inplace=True)
test.reset_index(inplace=True,drop=True)
test.head(5)

Shape:  (73392, 63)


Unnamed: 0,total_sessions_day0,total_sessions_day1,total_sessions_day3,total_sessions_day7,chapters_finished_day0,chapters_finished_day1,chapters_finished_day3,chapters_finished_day7,chapters_opened_day0,chapters_opened_day1,chapters_opened_day3,chapters_opened_day7,chapters_closed_day0,chapters_closed_day1,chapters_closed_day3,chapters_closed_day7,diamonds_received_day0,diamonds_received_day1,diamonds_received_day3,diamonds_received_day7,diamonds_spent_day0,diamonds_spent_day1,diamonds_spent_day3,diamonds_spent_day7,tickets_spent_day0,tickets_spent_day1,tickets_spent_day3,tickets_spent_day7,retained_day1,retained_day3,retained_day7,chapters_finished_session1,chapters_finished_session3,chapters_finished_session9,chapters_opened_session1,chapters_opened_session3,chapters_opened_session9,chapters_closed_session1,chapters_closed_session3,chapters_closed_session9,diamonds_spent_session1,diamonds_spent_session3,diamonds_spent_session9,tickets_spent_session1,tickets_spent_session3,tickets_spent_session9,app_sub_ltv_day0,app_sub_ltv_day1,app_sub_ltv_day3,app_iap_ltv_day0,app_iap_ltv_day1,app_iap_ltv_day3,media_source,install_date,country_code,ad_ltv_day0,ad_ltv_day1,ad_ltv_day3,platform,target_sub_ltv_day30,target_iap_ltv_day30,target_ad_ltv_day30,target_full_ltv_day30
0,1.0,1.0,1.0,1.0,0,0,0,0,1,1,1,1,1,1,1,1,20,20,20,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_4,2021-12-30,COUNTRY_13,0.0,0.0,0.0,android,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1,1,1,1,3,3,3,3,2,2,2,2,27,27,27,27,0,0,0,0,0,0,0,0,0,0,0,1,1,1,3,3,3,2,2,2,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_17,2021-12-21,COUNTRY_141,0.0,0.0,0.0,ios,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,0,0,0,0,1,1,1,1,0,0,0,0,20,20,20,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_9,2021-12-13,COUNTRY_141,0.0,0.0,0.0,ios,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,0,0,0,0,1,1,1,1,0,0,0,0,25,25,25,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_23,2022-01-22,COUNTRY_141,0.0,0.0,0.0,ios,0.0,0.0,0.0,0.0
4,9.0,14.0,14.0,14.0,6,12,12,12,7,13,13,13,0,4,4,4,34,76,76,76,17,70,70,70,6,11,11,11,1,0,0,5,5,6,6,6,7,0,0,0,17,17,17,5,5,6,0.0,0.0,0.0,0.0,0.0,0.0,SOURCE_17,2021-12-30,COUNTRY_208,0.0373,0.0373,0.0373,ios,0.0,0.0,0.0373,0.0373


In [3]:
# remove very rare negative values
numerical_cols = [col for col in test.columns if test[col].dtype in ['float64','int64']]
print('Total negative values', (test[numerical_cols]<0).astype(int).sum().sum() )
print('Size before: ', test.shape[0])
for col in numerical_cols:
    test.drop(test[test[col]<0].index,inplace=True)
print('Size after: ', test.shape[0])

Total negative values 0
Size before:  73313
Size after:  73313


# Feature Engineering

In [4]:
top_sources = test.media_source.value_counts().iloc[:20]
test['media_source'] = test.media_source.apply(lambda source: source if source in top_sources[:10].index else 'NOT_COMMON')
top_countries = test.country_code.value_counts().iloc[:20]
test['country_code'] = test.country_code.apply(lambda country: country if country in top_countries[:15].index else 'NOT_COMMON')

In [5]:
test['active'] = test[['retained_day1','retained_day3','retained_day7']].sum(axis=1) == 3

In [6]:
test['days_since_install'] = (test.install_date.max() - test.install_date).dt.days

In [7]:
test['no_chapters_finished'] = test['chapters_finished_day7']==0
test['no_chapters_closed'] = test['chapters_closed_day7']==0
test['no_chapters_opened'] = test['chapters_opened_day7']==0
test['afk'] = test['no_chapters_opened'] & test['no_chapters_finished'] & test['no_chapters_closed']
test['diamonds_zero'] = test.diamonds_spent_session9 == 0
test['tickets_zero'] = test.tickets_spent_session9 == 0
test['all_zero'] = test['diamonds_zero'] & test['tickets_zero']

# Preparing Data

In [8]:
drop_columns = [
'target_iap_ltv_day30', 
'target_ad_ltv_day30',
'target_sub_ltv_day30',
'target_full_ltv_day30',
'install_date',
]

y = test.target_full_ltv_day30
X = test.drop(columns=drop_columns)

In [9]:

class BoolTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = None
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X*1
        self.columns = X.columns
        return X
    def get_feature_names_out(self,names=None):
        return self.columns

In [10]:

def get_transformer(df):

    numerical_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64','int32','float32']]
    categorical_cols = [cname for cname in df.columns if df[cname].dtype == "object"]
    bool_cols = [cname for cname in df.columns if df[cname].dtype == "bool"]

    # QuantileTransformer повинен зменшити скос даних, що зробить модель більш робастною
    numerical_transformer = Pipeline(steps=[('scaler',QuantileTransformer(output_distribution='normal'))])
    categorical_transformer = Pipeline(steps=[('one hot',OneHotEncoder(drop='if_binary',handle_unknown='ignore',sparse=False))])
    bool_transformer = Pipeline(steps=[('bool',BoolTransformer())])

    ct = ColumnTransformer(
        transformers=[
            ('bool',bool_transformer,bool_cols),
            ('num',numerical_transformer,numerical_cols),
            ('cat',categorical_transformer,categorical_cols)
        ],
        remainder='passthrough'
    )
    return ct

ct = get_transformer(X)
# можна використати пайплайн, підігнаний за тренувальним датасетом. Трохи покращує результат
# ct = load('models/custom_transformer.pkl')

X = ct.fit_transform(X)

В умові запропонували 3 метрики: MAPE, RMSE, MAE

- MAPE &mdash; одразу ні. Відбувається ділення на 0, якщо y_true=0, а в наших даних цих нулів просто тони. Тому ця метрика просто не буде працювати правильно і видавати великі числа.
- MAE &mdash; слід задуматися. З одного боку вона не чутлива до викидів, яких якраз в нас багато. Але з іншого боку ці викиди для нас важливі, бо це ті гравці, які створюють основний прибуток.
- RMSE &mdash; ця метрика буде головною. Вона метрика досить чутлива до викидів, а це для нас важливо, як стало зрозумілим у попередньому пункті.

Отже, основною метрикою стане RMSE


# Loading and measuring the model

In [11]:
def measure_model(y_true,y_pred):
    print('RMSE:', mean_squared_error(y_true,y_pred))
    print('MAPE:', mean_absolute_percentage_error(y_true,y_pred))
    print('MAE:', mean_absolute_error(y_true,y_pred))

In [12]:
FOLDS = 10
predictions = np.zeros(len(X))
# load cross-fold models
for i in range(FOLDS):
    model = Booster(model_file=f'models/lgbm_regressor_fold_{i}.txt')
    predictions += model.predict(X)
    
predictions = predictions/FOLDS
measure_model(y,predictions)

RMSE: 0.938777986630971
MAPE: 21387325219214.137
MAE: 0.08386739924100974


In [13]:
# improving results by setting threshold on low values
final_preds = np.where(predictions<0.01,0,predictions)
print('Final test predictions with threshold')
measure_model(y,final_preds)

Final test predictions with threshold
RMSE: 0.9379296104224866
MAPE: 6610524349892.934
MAE: 0.08032824488733463


In [14]:
# predicted vs true sum
pd.DataFrame(data={'pred':final_preds,'true':y}).sum()

pred   18791.3373
true   19326.1195
dtype: float64