In [1]:
import pandas as pd
import numpy as np


In [2]:
raw_data = pd.read_csv("data.csv")
raw_data.shape

(1467832, 63)

In [3]:
raw_data.columns

Index(['total_sessions_day0', 'total_sessions_day1', 'total_sessions_day3',
       'total_sessions_day7', 'chapters_finished_day0',
       'chapters_finished_day1', 'chapters_finished_day3',
       'chapters_finished_day7', 'chapters_opened_day0',
       'chapters_opened_day1', 'chapters_opened_day3', 'chapters_opened_day7',
       'chapters_closed_day0', 'chapters_closed_day1', 'chapters_closed_day3',
       'chapters_closed_day7', 'diamonds_received_day0',
       'diamonds_received_day1', 'diamonds_received_day3',
       'diamonds_received_day7', 'diamonds_spent_day0', 'diamonds_spent_day1',
       'diamonds_spent_day3', 'diamonds_spent_day7', 'tickets_spent_day0',
       'tickets_spent_day1', 'tickets_spent_day3', 'tickets_spent_day7',
       'retained_day1', 'retained_day3', 'retained_day7',
       'chapters_finished_session1', 'chapters_finished_session3',
       'chapters_finished_session9', 'chapters_opened_session1',
       'chapters_opened_session3', 'chapters_opened_session9'

In [4]:
platform_dummies = pd.get_dummies(raw_data['platform'], prefix = 'platform')
media_dummies = pd.get_dummies(raw_data['media_source'], prefix = '')
country_dummies = pd.get_dummies(raw_data['country_code'], prefix = '')

In [5]:
dummy_data = pd.concat([raw_data, platform_dummies, media_dummies, country_dummies], axis = 1)
dummy_data.shape

(1467832, 313)

In [6]:
dummy_data.drop('platform', axis = 1, inplace = True)
dummy_data.drop('country_code', axis = 1, inplace = True)
dummy_data.drop('media_source', axis = 1, inplace = True)



dummy_data.shape

(1467832, 310)

In [7]:
raw_data['install_date'][120000]

'2022-01-24'

In [8]:
type(raw_data['install_date'][0])

str

In [9]:
from datetime import datetime
"""
Data needed to be transformed into:
1) Time since 1970 (Unix date) continuous
2) Day of a year [1, 365] categorical
3) Month of a year [1, 12] categorical
4) Day of a week in [1, 7] categorical
5) Is weekend binary
"""


def transform_date(dates_frame: pd.DataFrame): # return pd.DataFrame
    
    py_dates = []
    for el in dates_frame:
        py_dates.append(datetime.strptime(el, '%Y-%m-%d'))

    
    dates_list = [] # Unix time 
    day_of_a_year_list = [] # Day of year
    month_of_a_year = [] # month of a year
    day_of_a_week = [] # day of a week
    is_weekend = [] # is day a weekend 


    for el in py_dates:
        dates_list.append(el.strftime("%s"))
        day_of_a_year_list.append(el.timetuple().tm_yday)
        month_of_a_year.append(el.timetuple().tm_mon)
        day_of_a_week.append(el.weekday())
        is_weekend.append((el.weekday() > 4))
    
    
    a = pd.DataFrame({"_":day_of_a_year_list})
    b = pd.DataFrame({"_":day_of_a_week})
    c = pd.DataFrame({"_": month_of_a_year})
                     
    d_year_day = pd.get_dummies(a["_"], prefix = 'day')
    d_month_of_year = pd.get_dummies(c["_"], prefix = 'month')
    d_weekday = pd.get_dummies(b["_"], prefix = 'weekday')
    d_weekend = pd.DataFrame({"is_weekend" : is_weekend})

    print(d_year_day.shape, d_month_of_year.shape, d_weekday.shape, d_weekend.shape, sep = '\n')
    
    return pd.concat([pd.DataFrame({"unix date": dates_list}),
                        d_year_day, d_month_of_year, d_weekday, d_weekend], axis = 1)



In [10]:
dummy_data = pd.concat([dummy_data,transform_date(raw_data['install_date'])], axis = 1)
dummy_data.drop('install_date', axis = 1, inplace=True)
dummy_data.shape

# Now dataframe is fully prepeared for machine learning

(1467832, 62)
(1467832, 2)
(1467832, 7)
(1467832, 1)


(1467832, 382)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [12]:
# Spliting dataset for training 
train, test = train_test_split(dummy_data, test_size=0.0001)

train_set_y = train.drop(train.columns.difference(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
       'target_full_ltv_day30']), 1) 

train.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
       'target_full_ltv_day30'], axis = 1, inplace = True)


test_set_y = test.drop(train.columns.difference(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
       'target_full_ltv_day30']), 1) 

test.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
       'target_full_ltv_day30'], axis = 1, inplace = True)

# just synonims

train_set_x = train
test_set_x = test

train_set_x.shape
train_set_y.shape
test_set_x.shape
test_set_y.shape


  train_set_y = train.drop(train.columns.difference(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
  test_set_y = test.drop(train.columns.difference(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(['target_sub_ltv_day30', 'target_iap_ltv_day30', 'target_ad_ltv_day30',


(147, 4)

In [13]:
train_set_x.shape

(1467685, 305)

In [14]:
train_set_y.shape

(1467685, 4)

In [15]:
test_set_x.shape

(147, 305)

In [28]:
#reg = LinearRegression().fit(train_set_x, train_set_y)
#reg.score(train_set_x, train_set_y)

# such colums as 'total_sessions_day0', 'total_sessions_day1', 'total_sessions_day3', 'total_sessions_day7' 
# can contain nan values 

train_set_x.fillna(0)
test_set_x.fillna(0)

train_set_y.fillna(0)
test_set_y.fillna(0)

Unnamed: 0,target_sub_ltv_day30,target_iap_ltv_day30,target_ad_ltv_day30,target_full_ltv_day30
1444320,0.0,0.00000,0.000000,0.000000
258693,0.0,0.00000,0.000000,0.000000
1136736,0.0,0.40409,0.050822,0.454912
1092325,0.0,0.00000,0.000000,0.000000
528678,0.0,0.00000,0.000000,0.000000
...,...,...,...,...
345002,0.0,0.00000,0.000000,0.000000
783683,0.0,0.00000,0.000000,0.000000
1145602,0.0,0.00000,0.000000,0.000000
550303,0.0,0.00000,0.000000,0.000000


In [31]:
# Training simple logistoic regression

#unfortunately now it throws error about containing infinity. IDK where they are, so ...

#reg = LinearRegression().fit(train_set_x, train_set_y)
#reg.score(train_set_x, train_set_y)