In [1]:
# Jupyter notebook with EDA process and model training

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467832 entries, 0 to 1467831
Data columns (total 63 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   total_sessions_day0         1466467 non-null  float64
 1   total_sessions_day1         1466465 non-null  float64
 2   total_sessions_day3         1466464 non-null  float64
 3   total_sessions_day7         1466458 non-null  float64
 4   chapters_finished_day0      1467832 non-null  int64  
 5   chapters_finished_day1      1467832 non-null  int64  
 6   chapters_finished_day3      1467832 non-null  int64  
 7   chapters_finished_day7      1467832 non-null  int64  
 8   chapters_opened_day0        1467832 non-null  int64  
 9   chapters_opened_day1        1467832 non-null  int64  
 10  chapters_opened_day3        1467832 non-null  int64  
 11  chapters_opened_day7        1467832 non-null  int64  
 12  chapters_closed_day0        1467832 non-null  int64  
 1

In [4]:
df.isnull().sum()

total_sessions_day0       1365
total_sessions_day1       1367
total_sessions_day3       1368
total_sessions_day7       1374
chapters_finished_day0       0
                          ... 
platform                     0
target_sub_ltv_day30         0
target_iap_ltv_day30         0
target_ad_ltv_day30          0
target_full_ltv_day30        0
Length: 63, dtype: int64

In [5]:
def clean_data(path: str, sample_size = 50000):
    df = pd.read_csv(path)
    
    # Get rid of missing some values for columns
    # total_sessions_day, total_sessions_day1, total_sessions_day3, total_sessions_day7
    # We have got relatively not much missing data, so it won't affect the result
    
    df = df.dropna()
    df.describe()
    
    # Convert platform, media_source and country_code columns into dummie variables columns
    
    platform_dummies = pd.get_dummies(df['platform'], drop_first=True)
    df = df.drop(['platform'],axis=1)
    df = pd.concat([df, platform_dummies],axis=1)
    
    media_source_dummies = pd.get_dummies(df['media_source'], drop_first=True)
    df = df.drop(['media_source'], axis=1)
    df = pd.concat([df, media_source_dummies], axis=1)
   
    country_code_dummies = pd.get_dummies(df['country_code'], drop_first=True)
    df = df.drop(['country_code'], axis=1)
    df = pd.concat([df, country_code_dummies], axis=1)
    
    # Get rid of install_date because we don't need it
    
    df = df.drop(['install_date'], axis=1)
    if (sample_size):
        df = df.sample(sample_size)
    Y = df[['target_full_ltv_day30']]
    
    df.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30', 'target_full_ltv_day30'], axis = 1, inplace = True)
    
    return df, Y

In [6]:
X, Y = clean_data('data.csv', 50000)

In [7]:
from sklearn.model_selection import train_test_split

def split(X, y):
    return train_test_split(X, y, test_size=0.001)
X_train, X_test, y_train, y_test = split(X, Y)

In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Regression problem is considered one of the most common Machine Learning tasks.
# There are various approaches, for example, using a standalone model
# of the Linear Regression or the Decision Tree. However, if you work
# with a single model you will probably not get any good results.

In [10]:
# Therefore, ensemble learning and Random Forests may be quite a good aproach

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(max_depth=500)
model.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
prediction = model.predict(X_test)
prediction = np.where(prediction > 0, prediction, 0) 

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [None]:
print("MAE: ", mean_absolute_error(prediction, y_test))
print("MAPE: ", mean_absolute_percentage_error(prediction, y_test))
print("RMSE: ", np.sqrt(mean_squared_error(prediction, y_test)))

In [None]:
# Our main metric here was a RMSE
# itâ€™s better to use the RMSE to measure error because
# the RMSE is more sensitive to observations that are further from the mean.
# As we can see, the model works quite well 
# and gives relatively good results for all metrics
# A large depth of the forest also helped to get more accurate target prediction

In [None]:
import joblib
joblib.dump(model, "Model.joblib")