In [1]:
import gdown
url = 'https://drive.google.com/uc?id=1mL9uvzYPuk_mNabWbDIUGpGxd0pFWuBo' 
output = 'data.csv'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1mL9uvzYPuk_mNabWbDIUGpGxd0pFWuBo
To: /content/data.csv
100%|██████████| 322M/322M [00:03<00:00, 99.9MB/s]


'data.csv'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [4]:
def get_classification_data(path: str, sample_size = 50000):
    df = pd.read_csv(path)
    
    df = df.dropna()
    df.describe()
    
    platform_dummies = pd.get_dummies(df['platform'], drop_first=True)
    df = df.drop(['platform'],axis=1)
    df = pd.concat([df, platform_dummies],axis=1)
    
    media_source_dummies = pd.get_dummies(df['media_source'], drop_first=True)
    df = df.drop(['media_source'], axis=1)
    df = pd.concat([df, media_source_dummies], axis=1)
   
    country_code_dummies = pd.get_dummies(df['country_code'], drop_first=True)
    df = df.drop(['country_code'], axis=1)
    df = pd.concat([df, country_code_dummies], axis=1)
    
    df = df.drop(['install_date'], axis=1)
    df = df.sample(sample_size)
    Y = df[['target_full_ltv_day30']]
    

    
    df.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30', 'target_full_ltv_day30'], axis = 1, inplace = True)
    
    return df, Y

In [5]:
X, Y = get_classification_data('data.csv')

In [6]:
from sklearn.model_selection import train_test_split

def split(X, y):
    return train_test_split(X, y, test_size=0.001)
X_train, X_test, y_train, y_test = split(X, Y)

In [7]:
X_train.shape

(49950, 302)

In [8]:
y_train.shape

(49950, 1)

### **Linear Regression**

In [9]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
fitted = reg.fit(X_train, y_train)

In [10]:
fitted.score(X_train, y_train)

0.8477400552128159

In [11]:
prediction = fitted.predict(X_test)
prediction = np.where(prediction > 0, prediction, 0)

In [12]:
print("MAE: ", mean_absolute_error(prediction, y_test))
print("MAPE: ", mean_absolute_percentage_error(prediction, y_test))
print("RMSE: ", mean_squared_error(prediction, y_test, squared=False))

MAE:  0.12341260604099384
MAPE:  28317303649179.656
RMSE:  0.5799096508063304


## Support Machine Regresion

In [13]:
from tables.tests.common import verbosePrint
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

n_samples, n_features = 10, 5
rng = np.random.RandomState(0)

X = rng.randn(n_samples, n_features)
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))

regr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [14]:
prediction = regr.predict(X_test)
prediction = np.where(prediction > 0, prediction, 0)


In [15]:
print("MAE: ", mean_absolute_error(prediction, y_test))
print("MAPE: ", mean_absolute_percentage_error(prediction, y_test))
print("RMSE: ", mean_squared_error(prediction, y_test, squared=False))

MAE:  0.19636675954045044
MAPE:  1278836536780.3887
RMSE:  0.6948330550569647


### **Random Forest**

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
regr = RandomForestRegressor(max_depth=200)

In [18]:
regr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(max_depth=200)

In [19]:
prediction = regr.predict(X_test)
prediction = np.where(prediction > 0, prediction, 0)


In [20]:
print("MAE: ", mean_absolute_error(prediction, y_test))
print("MAPE: ", mean_absolute_percentage_error(prediction, y_test))
print("RMSE: ", mean_squared_error(prediction, y_test, squared=False))

MAE:  0.11181994491413594
MAPE:  0.14056582348564806
RMSE:  0.6314756599889025


### **Increasing depth of tree**

In [21]:
regr = RandomForestRegressor(max_depth=500)

In [22]:
regr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(max_depth=500)

In [23]:
prediction = regr.predict(X_test)
prediction = np.where(prediction > 0, prediction, 0)


In [24]:
print("MAE: ", mean_absolute_error(prediction, y_test))
print("MAPE: ", mean_absolute_percentage_error(prediction, y_test))
print("RMSE: ", mean_squared_error(prediction, y_test, squared=False))

MAE:  0.1081600275801501
MAPE:  0.15402044533772355
RMSE:  0.6177095718580563


### **Gausian Regression**

In [25]:
from sklearn.datasets import make_friedman2
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel

In [None]:
kernel = DotProduct() + WhiteKernel()
# NOT ENOUGH RAM  !!!
#gpr = GaussianProcessRegressor(kernel=kernel,
#         random_state=0).fit(X_train, y_train)

In [None]:
#prediction = gpr.predict(X_test)
#prediction = np.where(prediction > 0, prediction, 0)