In [47]:
import time
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [48]:
data = pd.read_csv("iasa_df.csv")

In [49]:
data = data.sample(n = 100000)

In [50]:
# id column
data.insert(0, 'id', list(np.arange(1,len(data)+1)))

In [51]:
# data.info()

In [52]:
# percentage of missing data
data.isnull().sum() / data.shape[0]*100

id                       0.000
total_sessions_day0      0.084
total_sessions_day1      0.084
total_sessions_day3      0.084
total_sessions_day7      0.084
                         ...  
platform                 0.000
target_sub_ltv_day30     0.000
target_iap_ltv_day30     0.000
target_ad_ltv_day30      0.000
target_full_ltv_day30    0.000
Length: 64, dtype: float64

In [53]:
# list of columns with missing data
nan_columns = data.columns[data.isna().any()]

In [54]:
# correlation for columns with missing data
# data.corr(numeric_only = True)[nan_columns]

In [55]:
# filling missing data
def clean(data, nan_columns):
    null_data = data[data.isnull().any(axis=1)]
    
    for col in nan_columns:
        null_data[col] = null_data.apply(
            lambda row: data[col].median()
            if np.isnan(row[col])
            else row[col], axis=1)
        
    for i in null_data['id']:
        data[data['id'] == i] = null_data
    
    return data

In [56]:
data = clean(data, nan_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [57]:
# data.info()

In [58]:
y = data[['target_full_ltv_day30',
         'target_sub_ltv_day30',
         'target_iap_ltv_day30',
         'target_ad_ltv_day30']].copy().astype(int)

X = data.drop(['id',
               'target_full_ltv_day30',
               'target_sub_ltv_day30',
               'target_iap_ltv_day30',
               'target_ad_ltv_day30',
               'install_date',
               'country_code'], axis=1).copy()

In [59]:
# Test-train-validation split for all targets
# train - 98%
# test - 2%

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.02,
                                                    random_state=69) 

In [60]:
# Three independent targets
y_train_sub = y_train['target_sub_ltv_day30'].copy().astype(int)
y_test_sub = y_test['target_sub_ltv_day30'].copy().astype(int)

y_train_iap = y_train['target_iap_ltv_day30'].copy().astype(int)
y_test_iap = y_test['target_iap_ltv_day30'].copy().astype(int)

y_train_ad = y_train['target_ad_ltv_day30'].copy().astype(int)
y_test_ad = y_test['target_ad_ltv_day30'].copy().astype(int)

y_train.drop(['target_sub_ltv_day30',
         'target_iap_ltv_day30',
         'target_ad_ltv_day30'], axis=1, inplace = True)

y_test.drop(['target_sub_ltv_day30',
         'target_iap_ltv_day30',
         'target_ad_ltv_day30'], axis=1, inplace = True)

In [79]:
# sub doesn't really need iap and ltv features and so on

X_train_sub = X_train.drop(['app_iap_ltv_day0',
                'app_iap_ltv_day1',
                'app_iap_ltv_day3',
                'ad_ltv_day0',
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_test_sub = X_test.drop(['app_iap_ltv_day0',
                'app_iap_ltv_day1',
                'app_iap_ltv_day3',
                'ad_ltv_day0',
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_train_iap = X_train.drop(['app_sub_ltv_day0',
                'app_sub_ltv_day1',
                'app_sub_ltv_day3',
                'ad_ltv_day0', 
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_test_iap = X_test.drop(['app_sub_ltv_day0',
                'app_sub_ltv_day1',
                'app_sub_ltv_day3',
                'ad_ltv_day0', 
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_train_ad = X_train.drop(['app_sub_ltv_day0',
               'app_sub_ltv_day1',
               'app_sub_ltv_day3',
               'app_iap_ltv_day0', 
               'app_iap_ltv_day1',
               'app_iap_ltv_day3'], axis=1).copy()

X_test_ad = X_test.drop(['app_sub_ltv_day0',
               'app_sub_ltv_day1',
               'app_sub_ltv_day3',
               'app_iap_ltv_day0', 
               'app_iap_ltv_day1',
               'app_iap_ltv_day3'], axis=1).copy()

In [80]:
# Indentify numerical and categorical columns
numerical_cols_sub = [cname for cname in X_train_sub.columns if X_train_sub[cname].dtype in ['int64', 'float64']]
categorical_cols_sub = [cname for cname in X_train_sub.columns if X_train_sub[cname].dtype == "object"]

numerical_cols_iap = [cname for cname in X_train_iap.columns if X_train_iap[cname].dtype in ['int64', 'float64']]
categorical_cols_iap = [cname for cname in X_train_iap.columns if X_train_iap[cname].dtype == "object"]

numerical_cols_ad = [cname for cname in X_train_ad.columns if X_train_ad[cname].dtype in ['int64', 'float64']]
categorical_cols_ad = [cname for cname in X_train_ad.columns if X_train_ad[cname].dtype == "object"]

# Scale numerical data to have mean=0 and variance=1
numerical_transformer = Pipeline(steps=[('scaler',
                                         StandardScaler())])

# One-hot encode categorical data
categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(drop='if_binary',
                                            handle_unknown='ignore',
                                            sparse_output=False))])

# Combine preprocessing
ct_sub = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_sub),
        ('cat', categorical_transformer, categorical_cols_sub)],
        remainder='passthrough')

ct_iap = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_iap),
        ('cat', categorical_transformer, categorical_cols_iap)],
        remainder='passthrough')

ct_ad = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_ad),
        ('cat', categorical_transformer, categorical_cols_ad)],
        remainder='passthrough')

# Apply preprocessing
X_train_sub = ct_sub.fit_transform(X_train_sub)
X_test_sub = ct_sub.fit_transform(X_test_sub)

X_train_iap = ct_iap.fit_transform(X_train_iap)
X_test_iap = ct_iap.fit_transform(X_test_iap)

X_train_ad = ct_ad.fit_transform(X_train_ad)
X_test_ad = ct_ad.fit_transform(X_test_ad)

# Print new shape
print('Training set shapes:',
      X_train_sub.shape,
      X_train_iap.shape,
      X_train_ad.shape)
print('Test set shapes:',
      X_test_sub.shape,
      X_test_iap.shape,
      X_test_ad.shape)

Training set shapes: (98000, 71) (98000, 71) (98000, 71)
Test set shapes: (2000, 64) (2000, 64) (2000, 64)


In [18]:
model_sub = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

model_iap = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

model_ad = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

In [19]:
'''
cv = RepeatedKFold(n_splits=10,
                   n_repeats=3,
                   random_state=69)

scores_sub = cross_val_score(model_sub,
                             X_train_sub,
                             y_train_sub,
                             scoring='neg_mean_absolute_error',
                             cv=cv,
                             n_jobs=-1)

scores_sub = np.absolute(scores_sub)
print('Mean MAE: %.3f (%.3f)' % (scores_sub.mean(), scores_sub.std()))

scores_iap = cross_val_score(model_iap,
                             X_train_iap,
                             y_train_iap,
                             scoring='neg_mean_absolute_error',
                             cv=cv,
                             n_jobs=-1)

scores_siap = np.absolute(scores_iap)
print('Mean MAE: %.3f (%.3f)' % (scores_iap.mean(), scores_iap.std()))

scores_ad = cross_val_score(model_ad,
                             X_train_ad,
                             y_train_ad,
                             scoring='neg_mean_absolute_error',
                             cv=cv,
                             n_jobs=-1)

scores_ad = np.absolute(scores_ad)
print('Mean MAE: %.3f (%.3f)' % (scores_ad.mean(), scores_ad.std()))
'''

"\ncv = RepeatedKFold(n_splits=10,\n                   n_repeats=3,\n                   random_state=69)\n\nscores_sub = cross_val_score(model_sub,\n                             X_train_sub,\n                             y_train_sub,\n                             scoring='neg_mean_absolute_error',\n                             cv=cv,\n                             n_jobs=-1)\n\nscores_sub = np.absolute(scores_sub)\nprint('Mean MAE: %.3f (%.3f)' % (scores_sub.mean(), scores_sub.std()))\n\nscores_iap = cross_val_score(model_iap,\n                             X_train_iap,\n                             y_train_iap,\n                             scoring='neg_mean_absolute_error',\n                             cv=cv,\n                             n_jobs=-1)\n\nscores_siap = np.absolute(scores_iap)\nprint('Mean MAE: %.3f (%.3f)' % (scores_iap.mean(), scores_iap.std()))\n\nscores_ad = cross_val_score(model_ad,\n                             X_train_ad,\n                             y_train_ad,\n

In [20]:
model_sub.fit(X_train_sub, y_train_sub)

In [21]:
model_iap.fit(X_train_iap, y_train_iap)

In [22]:
model_ad.fit(X_train_ad, y_train_ad)

In [23]:
predictions_sub = model_sub.predict(X_test_sub)
print(mean_absolute_error(predictions_sub, y_test_sub))

ValueError: Feature shape mismatch, expected: 73, got 66

In [None]:
predictions_iap = model_iap.predict(X_test_iap)
print(mean_absolute_error(predictions_iap, y_test_iap))

In [None]:
predictions_ad = model_ad.predict(X_test_ad)
print(mean_absolute_error(predictions_ad, y_test_ad))

In [None]:
predictions = predictions_sub + predictions_iap + predictions_ad

In [None]:
mean_absolute_error(predictions, y_test)