In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error

from xgboost import XGBRegressor

In [2]:
data = pd.read_csv("iasa_df.csv")

In [3]:
# id column
data.insert(0, 'id', list(np.arange(1,len(data)+1)))

In [4]:
# percentage of missing data
data.isnull().sum() / data.shape[0]*100

id                       0.000000
total_sessions_day0      0.092994
total_sessions_day1      0.093131
total_sessions_day3      0.093199
total_sessions_day7      0.093607
                           ...   
platform                 0.000000
target_sub_ltv_day30     0.000000
target_iap_ltv_day30     0.000000
target_ad_ltv_day30      0.000000
target_full_ltv_day30    0.000000
Length: 64, dtype: float64

In [5]:
# list of columns with missing data
nan_columns = data.columns[data.isna().any()]

In [6]:
# correlation for columns with missing data
# data.corr(numeric_only = True)[nan_columns]

In [7]:
# filling missing data
def clean(data, nan_columns):
    null_data = data[data.isnull().any(axis=1)]
    
    for col in nan_columns:
        null_data[col] = null_data.apply(
            lambda row: data[col].median()
            if np.isnan(row[col])
            else row[col], axis=1)
        
    for i in null_data['id']:
        data[data['id'] == i] = null_data
    
    return data

In [8]:
data = clean(data, nan_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [9]:
y = data[['target_full_ltv_day30',
         'target_sub_ltv_day30',
         'target_iap_ltv_day30',
         'target_ad_ltv_day30']].copy().astype(float)

X = data.drop(['id',
               'target_full_ltv_day30',
               'target_sub_ltv_day30',
               'target_iap_ltv_day30',
               'target_ad_ltv_day30',
               'install_date',
               'country_code'], axis=1).copy()

In [10]:
# Three independent targets
y_sub = y['target_sub_ltv_day30'].copy().astype(float)

y_iap = y['target_iap_ltv_day30'].copy().astype(float)

y_ad = y['target_ad_ltv_day30'].copy().astype(float)

y_full = y['target_full_ltv_day30'].copy().astype(float)

In [11]:
# sub doesn't really need iap and ltv features and so on

X_sub = X.drop(['app_iap_ltv_day0',
                'app_iap_ltv_day1',
                'app_iap_ltv_day3',
                'ad_ltv_day0',
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_iap = X.drop(['app_sub_ltv_day0',
                'app_sub_ltv_day1',
                'app_sub_ltv_day3',
                'ad_ltv_day0', 
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_ad = X.drop(['app_sub_ltv_day0',
               'app_sub_ltv_day1',
               'app_sub_ltv_day3',
               'app_iap_ltv_day0', 
               'app_iap_ltv_day1',
               'app_iap_ltv_day3'], axis=1).copy()

In [12]:
print(X_sub.shape, X_iap.shape, X_ad.shape)

(1467832, 51) (1467832, 51) (1467832, 51)


In [13]:
# Indentify numerical and categorical columns
numerical_cols_sub = [cname for cname in X_sub.columns if X_sub[cname].dtype in ['int64', 'float64']]
categorical_cols_sub = [cname for cname in X_sub.columns if X_sub[cname].dtype == "object"]

numerical_cols_iap = [cname for cname in X_iap.columns if X_iap[cname].dtype in ['int64', 'float64']]
categorical_cols_iap = [cname for cname in X_iap.columns if X_iap[cname].dtype == "object"]

numerical_cols_ad = [cname for cname in X_ad.columns if X_ad[cname].dtype in ['int64', 'float64']]
categorical_cols_ad = [cname for cname in X_ad.columns if X_ad[cname].dtype == "object"]


# Scale numerical data to have mean=0 and variance=1
numerical_transformer = Pipeline(steps=[('scaler',
                                         StandardScaler())])

# # One-hot encode categorical data
categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(drop='if_binary',
                                            handle_unknown='ignore',
                                            sparse=False))])

# # Combine preprocessing
ct_sub = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_sub),
        ('cat', categorical_transformer, categorical_cols_sub)
        ], remainder='passthrough')

ct_iap = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_iap),
        ('cat', categorical_transformer, categorical_cols_iap)
        ], remainder='passthrough')

ct_ad = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_ad),
        ('cat', categorical_transformer, categorical_cols_ad)
        ], remainder='passthrough')

# Apply preprocessing
X_sub = ct_sub.fit_transform(X_sub)

X_iap = ct_iap.fit_transform(X_iap)

X_ad = ct_ad.fit_transform(X_ad)

# # Print new shape
print('Training set shapes:',
      X_sub.shape,
      X_iap.shape,
      X_ad.shape)



Training set shapes: (1467832, 74) (1467832, 74) (1467832, 74)


In [17]:
pivot = int((X.shape[0]) * 0.98)

X_sub_train = X_sub[:pivot]
X_sub_test = X_sub[pivot:]
y_sub_train = y_sub[:pivot]
y_sub_test = y_sub[pivot:]

X_iap_train = X_iap[:pivot]
X_iap_test = X_iap[pivot:]
y_iap_train = y_iap[:pivot]
y_iap_test = y_iap[pivot:]

X_ad_train = X_ad[:pivot]
X_ad_test = X_ad[pivot:]
y_ad_train = y_ad[:pivot]
y_ad_test = y_ad[pivot:]

y_full_test = y_full[pivot:]

In [18]:
model_sub = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

model_iap = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

model_ad = XGBRegressor(n_estimators=5000,
                     max_depth=7,
                     eta=0.05,
                     subsample=1,
                     colsample_bytree=1,
                     tree_method='hist',
                     random_state=69)

In [19]:
model_sub.fit(X_sub_train, y_sub_train)

In [20]:
model_iap.fit(X_iap_train, y_iap_train)

In [21]:
model_ad.fit(X_ad_train, y_ad_train)

In [22]:
predictions_sub = model_sub.predict(X_sub_test)
print(mean_absolute_error(predictions_sub, y_sub_test))

0.03447284903922754


In [23]:
predictions_iap = model_iap.predict(X_iap_test)
print(mean_absolute_error(predictions_iap, y_iap_test))

0.08107050488389671


In [24]:
predictions_ad = model_ad.predict(X_ad_test)
print(mean_absolute_error(predictions_ad, y_ad_test))

0.009201803607474971


In [25]:
predictions = predictions_sub + predictions_iap + predictions_ad

In [26]:
mean_absolute_error(predictions, y_full_test)

0.1097266506607088