In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report,confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.python.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

In [2]:
data = pd.read_csv("iasa_df.csv")

In [3]:
# id column
data.insert(0, 'id', list(np.arange(1,len(data)+1)))

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467832 entries, 0 to 1467831
Data columns (total 64 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   id                          1467832 non-null  int64  
 1   total_sessions_day0         1466467 non-null  float64
 2   total_sessions_day1         1466465 non-null  float64
 3   total_sessions_day3         1466464 non-null  float64
 4   total_sessions_day7         1466458 non-null  float64
 5   chapters_finished_day0      1467832 non-null  int64  
 6   chapters_finished_day1      1467832 non-null  int64  
 7   chapters_finished_day3      1467832 non-null  int64  
 8   chapters_finished_day7      1467832 non-null  int64  
 9   chapters_opened_day0        1467832 non-null  int64  
 10  chapters_opened_day1        1467832 non-null  int64  
 11  chapters_opened_day3        1467832 non-null  int64  
 12  chapters_opened_day7        1467832 non-null  int64  
 1

In [5]:
# percentage of missing data
data.isnull().sum() / data.shape[0]*100

id                       0.000000
total_sessions_day0      0.092994
total_sessions_day1      0.093131
total_sessions_day3      0.093199
total_sessions_day7      0.093607
                           ...   
platform                 0.000000
target_sub_ltv_day30     0.000000
target_iap_ltv_day30     0.000000
target_ad_ltv_day30      0.000000
target_full_ltv_day30    0.000000
Length: 64, dtype: float64

In [6]:
# list of columns with missing data
nan_columns = data.columns[data.isna().any()]

In [7]:
# correlation for columns with missing data
data.corr(numeric_only = True)[nan_columns]

Unnamed: 0,total_sessions_day0,total_sessions_day1,total_sessions_day3,total_sessions_day7
id,-0.03544,-0.041747,-0.04361,-0.04511
total_sessions_day0,1.0,0.893938,0.76955,0.64585
total_sessions_day1,0.893938,1.0,0.921799,0.803469
total_sessions_day3,0.76955,0.921799,1.0,0.932322
total_sessions_day7,0.64585,0.803469,0.932322,1.0
chapters_finished_day0,0.468981,0.496851,0.472214,0.427647
chapters_finished_day1,0.524887,0.608194,0.579884,0.522936
chapters_finished_day3,0.499239,0.6004,0.634031,0.599333
chapters_finished_day7,0.465746,0.571321,0.630265,0.654373
chapters_opened_day0,0.476621,0.504584,0.480074,0.435721


In [8]:
# filling missing data
def clean(data, nan_columns):
    
    null_data = data[data.isnull().any(axis=1)]
    
    for col in nan_columns:
        null_data[col] = null_data.apply(
            lambda row: data[col].median()
            if np.isnan(row[col])
            else row[col], axis=1)
        
    for i in null_data['id']:
        data[data['id'] == i] = null_data
    
    return data

In [9]:
data = clean(data, nan_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  null_data[col] = null_data.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467832 entries, 0 to 1467831
Data columns (total 64 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   id                          1467832 non-null  int64  
 1   total_sessions_day0         1467832 non-null  float64
 2   total_sessions_day1         1467832 non-null  float64
 3   total_sessions_day3         1467832 non-null  float64
 4   total_sessions_day7         1467832 non-null  float64
 5   chapters_finished_day0      1467832 non-null  int64  
 6   chapters_finished_day1      1467832 non-null  int64  
 7   chapters_finished_day3      1467832 non-null  int64  
 8   chapters_finished_day7      1467832 non-null  int64  
 9   chapters_opened_day0        1467832 non-null  int64  
 10  chapters_opened_day1        1467832 non-null  int64  
 11  chapters_opened_day3        1467832 non-null  int64  
 12  chapters_opened_day7        1467832 non-null  int64  
 1

In [11]:
# full = sub + iap + ad

y = data['target_full_ltv_day30'].copy().astype(int)

# Three independent targets
y_sub = data['target_sub_ltv_day30'].copy().astype(int)
y_iap = data['target_iap_ltv_day30'].copy().astype(int)
y_ad = data['target_ad_ltv_day30'].copy().astype(int)

# Maybe we shouldn't drop country_code and use it instead???

# TODO: deal with install_date

X = data.drop(['id',
               'target_full_ltv_day30',
               'target_sub_ltv_day30',
               'target_iap_ltv_day30',
               'target_ad_ltv_day30',
               'country_code',
               'install_date'], axis=1).copy()

In [12]:
# sub doesn't really need iap and ltv features and so on

X_sub = X.drop(['app_iap_ltv_day0',
                'app_iap_ltv_day1',
                'app_iap_ltv_day3',
                'ad_ltv_day0',
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_iap = X.drop(['app_sub_ltv_day0',
                'app_sub_ltv_day1',
                'app_sub_ltv_day3',
                'ad_ltv_day0', 
                'ad_ltv_day1',
                'ad_ltv_day3'], axis=1).copy()

X_ad = X.drop(['app_sub_ltv_day0',
               'app_sub_ltv_day1',
               'app_sub_ltv_day3',
               'app_iap_ltv_day0', 
               'app_iap_ltv_day1',
               'app_iap_ltv_day3'], axis=1).copy()

In [13]:
# Indentify numerical and categorical columns
numerical_cols_sub = [cname for cname in X_sub.columns if X_sub[cname].dtype in ['int64', 'float64']]
categorical_cols_sub = [cname for cname in X_sub.columns if X_sub[cname].dtype == "object"]

numerical_cols_iap = [cname for cname in X_iap.columns if X_iap[cname].dtype in ['int64', 'float64']]
categorical_cols_iap = [cname for cname in X_iap.columns if X_iap[cname].dtype == "object"]

numerical_cols_ad = [cname for cname in X_ad.columns if X_ad[cname].dtype in ['int64', 'float64']]
categorical_cols_ad = [cname for cname in X_ad.columns if X_ad[cname].dtype == "object"]

# Scale numerical data to have mean=0 and variance=1
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One-hot encode categorical data
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore',sparse_output=False))])

# Combine preprocessing
ct_sub = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_sub),
        ('cat', categorical_transformer, categorical_cols_sub)],
        remainder='passthrough')

ct_iap = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_iap),
        ('cat', categorical_transformer, categorical_cols_iap)],
        remainder='passthrough')

ct_ad = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols_ad),
        ('cat', categorical_transformer, categorical_cols_ad)],
        remainder='passthrough')

# Apply preprocessing
X_sub = ct_sub.fit_transform(X_sub)
X_iap = ct_iap.fit_transform(X_iap)
X_ad = ct_ad.fit_transform(X_ad)

# Print new shape
print('Training set shape:', X_sub.shape)

Training set shape: (1467832, 75)


In [14]:
# Test-train-validation split for all targets
# train - 98%
# test - 1.5% 
# valid - 0.5% 

# sub 
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, test_size = 0.02, random_state=69)
X_valid_sub, X_test_sub, y_valid_sub, y_test_sub = train_test_split(X_test_sub, y_test_sub, test_size=0.25, random_state=69)

# iap
X_train_iap, X_test_iap, y_train_iap, y_test_iap = train_test_split(X_iap, y_iap, test_size = 0.02, random_state=69)
X_valid_iap, X_test_iap, y_valid_iap, y_test_iap = train_test_split(X_test_iap, y_test_iap, test_size=0.25, random_state=69)

# ad
X_train_ad, X_test_ad, y_train_ad, y_test_ad = train_test_split(X_ad, y_ad, test_size = 0.02, random_state=69)
X_valid_ad, X_test_ad, y_valid_ad, y_test_ad = train_test_split(X_test_ad, y_test_ad, test_size=0.25, random_state=69)

In [None]:
# SVC model 
# Try grid search ???
# It may take a lot of time

# Maybe we should train model on a small
# sample to decide which is better

svc_model_sub = SVC()
svc_model_sub.fit(X_train_sub,y_train_sub)

svc_model_iap = SVC()
svc_model_iap.fit(X_train_iap,y_train_iap)

svc_model_ad = SVC()
svc_model_ad.fit(X_train_ad,y_train_ad)

sub = svc_model_sub.predict(X_test_sub)
iap = svc_model_iap.predict(X_test_iap)
ad = svc_model_ad.predict(X_test_ad)

predictions = sub + iap + ad

In [None]:
print("Sub confusion matrix: ")
print(confusion_matrix(y_test, sub))

print("Iap confusion matrix: ")
print(confusion_matrix(y_test, iap))

print("Ad confusion matrix: ")
print(confusion_matrix(y_test, ad))

In [None]:
# Here I tried to create a simple neural network,
# doesn't work for now
def create_model():
    model = Sequential()

    model.add(Dense(53,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(27,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(14,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(7,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4,activation='relu'))
    model.add(Dense(1))
    
    model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=[RootMeanSquaredError()])
    
    return model

In [None]:
model_sub = create_model()
model_iap = create_model()
model_ad = create_model()

In [None]:
model_sub.fit(x=X_train_sub,
              y=y_train_sub,
              batch_size=32,
              epochs=3,
              validation_data=(X_valid_sub, y_valid_sub),
              verbose=1)

model_iap.fit(x=X_train_iap,
              y=y_train_iap.values,
              batch_size=32,
              epochs=3,
              validation_data=(X_valid_iap, y_valid_iap),
              verbose=1)

model_ad.fit(x=X_train_ad,
              y=y_train_ad,
              batch_size=32,
              epochs=3,
              validation_data=(X_valid_ad, y_valid_ad),
              verbose=1)