In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

def clean_data(path: str, sample_size: int = 50000):
    """Load the CSV at *path* and split it into model features and target.

    Steps, in order:
      1. read the CSV and drop every row containing a missing value;
      2. one-hot encode ``platform``, ``media_source`` and ``country_code``
         (first category of each dropped), appending the dummy columns
         to the right of the frame in that order;
      3. drop ``install_date``;
      4. optionally draw a random sample of rows;
      5. separate the target from the features.

    Args:
        path: Location of the CSV file to read.
        sample_size: Number of rows to sample at random. A falsy value
            (``0`` or ``None``) keeps every row. A value larger than the
            number of available rows is capped at that number instead of
            raising ``ValueError``.

    Returns:
        A ``(features, Y)`` pair of DataFrames sharing the same index.
        ``Y`` holds the single column ``target_full_ltv_day30``;
        ``features`` holds every remaining column except the four
        ``target_*_ltv_day30`` columns.
    """
    # NOTE(review): read_csv's default NA handling parses the literal string
    # "NA" as missing, so such a country_code would be removed by dropna()
    # -- confirm whether that matters for this dataset.
    df = pd.read_csv(path).dropna()

    # Encode one column at a time so the resulting column order is:
    # remaining original columns, then platform / media_source /
    # country_code dummies.
    # NOTE(review): dummy columns are named after the raw category values
    # (no prefix), so a value shared by two of these columns would yield
    # duplicate column names -- confirm the categories are disjoint.
    for column in ('platform', 'media_source', 'country_code'):
        dummies = pd.get_dummies(df[column], drop_first=True)
        df = pd.concat([df.drop(columns=[column]), dummies], axis=1)

    df = df.drop(columns=['install_date'])

    if sample_size:
        # DataFrame.sample raises when asked for more rows than exist
        # (without replacement), so never request more than are available.
        df = df.sample(min(sample_size, len(df)))

    target_columns = [
        'target_sub_ltv_day30',
        'target_iap_ltv_day30',
        'target_ad_ltv_day30',
        'target_full_ltv_day30',
    ]
    Y = df[['target_full_ltv_day30']]
    features = df.drop(columns=target_columns)
    return features, Y

# Load the complete dataset; a sample_size of 0 skips the random sub-sampling.
X, y = clean_data('data.csv', 0)

# NOTE(review): the scaler is fitted on the evaluation data itself.
# Presumably the model was trained on features scaled by a scaler fitted on
# the training set; if so, that fitted scaler should be persisted and loaded
# here instead of re-fitting -- TODO confirm against the training code.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
model = joblib.load("Model.joblib")

prediction = model.predict(X)
# Replace non-positive predictions with 0 (presumably because a 30-day LTV
# cannot be negative -- confirm).
prediction = np.where(prediction > 0, prediction, 0)

# Our main metric is Root Mean Squared Error (RMSE).
# Because the residuals are squared before averaging, RMSE is more sensitive
# to large individual errors than e.g. mean absolute error.

# scikit-learn's signature is mean_squared_error(y_true, y_pred).
print("RMSE: ", np.sqrt(mean_squared_error(y, prediction)))

RMSE:  2.8037448735049324
