## Initial Set-Up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
data = pd.read_csv('working_data.csv', parse_dates=[0], infer_datetime_format= True)

In [3]:
len(data)

5000000

In [4]:
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

data['day_name'] = pd.Categorical(data['day_name'], categories=cats, ordered=True)

In [5]:
cats = ['January', 'February', 'March', 'April', 'May', 'June', 
       'July', 'August', 'September', 'October', 'November', 'December']

data['month_name'] = pd.Categorical(data['month_name'], categories=cats, ordered=True)

In [6]:
data.head()

Unnamed: 0,date,device_id,sessions,tts,tps,app_name,android,month,month_name,day,...,sessions_stand,tts_stand,tps_stand,app_number,android_num,sessions_rolling_avg,tts_rolling_avg,tps_rolling_avg,DAU,MAU
0,2021-05-21,yb0YMYO4QAyxb9cSsO4iGBpV9VSbXa,1,392,392.0,instagram,unknown,3,May,4,...,-0.570834,-0.284362,0.740284,1,1,7.0,4147.571429,467.777143,1674,47581
1,2021-09-26,JzG2JaSOoO7wvPmfmnocXKQdoI9r24,1,6,6.0,instagram,unknown,4,September,6,...,-0.570834,-1.872475,-1.909764,1,1,17.428571,1563.571429,82.915714,1709,47540
2,2021-11-23,baPdvtjbrRxztRCNnuaQodr6ZvwwFn,2,266,133.0,whatsapp,android,3,November,1,...,-0.51079,-0.455858,0.029497,6,0,14.285714,2219.428571,111.131429,1664,47176
3,2021-07-01,8zfpLx8jdEwirtXKTPlrGQUB0y58Rb,22,1987,90.32,pinterest,android,1,July,3,...,0.690078,0.496257,-0.220575,2,0,10.857143,1489.428571,144.034286,1687,47080
4,2021-03-31,PRJOvGZtFVtGlS4qcgXZaqKicB8g7Q,88,2561,29.1,facebook,unknown,7,March,2,...,4.652944,0.628054,-0.939398,0,1,25.428571,2276.571429,186.078571,1729,47672


## Some Regressions

In [7]:
import statsmodels.api as sm

In [8]:
def run_regression_descrip(X, y):
    model = sm.OLS(y, X).fit()
    print(model.summary())
    return

In [9]:
X = data[['month', 'day', 'app_number', 'android_num', 'DAU', 'MAU']]
y = data['sessions']

run_regression_descrip(X,y)

                                 OLS Regression Results                                
Dep. Variable:               sessions   R-squared (uncentered):                   0.285
Model:                            OLS   Adj. R-squared (uncentered):              0.285
Method:                 Least Squares   F-statistic:                          3.317e+05
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                        0.00
Time:                        12:26:32   Log-Likelihood:                     -2.1158e+07
No. Observations:             5000000   AIC:                                  4.232e+07
Df Residuals:                 4999994   BIC:                                  4.232e+07
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

In [10]:
X = data[['month', 'day', 'app_number', 'DAU', 'MAU']]
y = data['sessions_stand']

run_regression_descrip(X,y)

                                 OLS Regression Results                                
Dep. Variable:         sessions_stand   R-squared (uncentered):                   0.000
Model:                            OLS   Adj. R-squared (uncentered):             -0.000
Method:                 Least Squares   F-statistic:                             0.1830
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                       0.969
Time:                        12:26:37   Log-Likelihood:                     -7.0947e+06
No. Observations:             5000000   AIC:                                  1.419e+07
Df Residuals:                 4999995   BIC:                                  1.419e+07
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [11]:
X = data[['month', 'day', 'app_number', 'DAU', 'MAU']]
y = data['tts']

run_regression_descrip(X,y)

                                 OLS Regression Results                                
Dep. Variable:                    tts   R-squared (uncentered):                   0.205
Model:                            OLS   Adj. R-squared (uncentered):              0.205
Method:                 Least Squares   F-statistic:                          2.581e+05
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                        0.00
Time:                        12:26:41   Log-Likelihood:                     -5.0695e+07
No. Observations:             5000000   AIC:                                  1.014e+08
Df Residuals:                 4999995   BIC:                                  1.014e+08
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [12]:
X = data[['month', 'day', 'app_number', 'DAU', 'MAU']]
y = data['tts_stand']

run_regression_descrip(X,y)

                                 OLS Regression Results                                
Dep. Variable:              tts_stand   R-squared (uncentered):                   0.000
Model:                            OLS   Adj. R-squared (uncentered):             -0.000
Method:                 Least Squares   F-statistic:                             0.8150
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                       0.539
Time:                        12:26:46   Log-Likelihood:                     -7.0947e+06
No. Observations:             5000000   AIC:                                  1.419e+07
Df Residuals:                 4999995   BIC:                                  1.419e+07
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [13]:
X = data[['month', 'day', 'app_number', 'DAU', 'MAU']]
y = data['tps_stand']

run_regression_descrip(X,y)

                                 OLS Regression Results                                
Dep. Variable:              tps_stand   R-squared (uncentered):                   0.000
Model:                            OLS   Adj. R-squared (uncentered):              0.000
Method:                 Least Squares   F-statistic:                              1.163
Date:                Thu, 23 Mar 2023   Prob (F-statistic):                       0.324
Time:                        12:26:51   Log-Likelihood:                     -7.0947e+06
No. Observations:             5000000   AIC:                                  1.419e+07
Df Residuals:                 4999995   BIC:                                  1.419e+07
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Alternative Regression Ideas

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [15]:
def run_regression_alt(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    
    print('Test Data: \n', X_test[500:501])
    print('\nPrediction Value: \n', model.predict(X_test[500:501]))
    print('\nTrain Data Value: \n', y_train[500:501].values)
    
    y_hat = model.predict(X_test)
    
    print("\nMAE: ", mean_absolute_error(y_test, y_hat))
    print("MSE: ", mean_squared_error(y_test, y_hat))
    print("R2: ", r2_score(y_test, y_hat))
    return


In [16]:
X = data[['DAU', 'MAU']]
y = data['sessions_stand']

run_regression_alt(X,y)

Test Data: 
        DAU    MAU
9328  1688  47115

Prediction Value: 
 [0.0001376]

Train Data Value: 
 [-0.15052974]

MAE:  0.6036868228344251
MSE:  0.9988864967859375
R2:  -5.204961346372272e-07


In [17]:
X = data[['DAU', 'MAU']]
y = data['tts_stand']

run_regression_alt(X,y)

Test Data: 
        DAU    MAU
9328  1688  47115

Prediction Value: 
 [0.0005905]

Train Data Value: 
 [0.94417821]

MAE:  0.834718400194679
MSE:  1.000242666678621
R2:  -1.8415138309801904e-06


In [18]:
X = data[['DAU', 'MAU']]
y = data['tps_stand']

run_regression_alt(X,y)

Test Data: 
        DAU    MAU
9328  1688  47115

Prediction Value: 
 [0.00070363]

Train Data Value: 
 [0.99808244]

MAE:  0.8011874860942807
MSE:  1.000731475195136
R2:  -2.1491134654105792e-06
