In [1]:
import math
import numpy as np
import pandas as pd
from scipy import stats

In [4]:
# Load the A/B test results from the CSV file into a DataFrame.
data = pd.read_csv("background_color_experiment.csv")
# Preview the first 10 rows (last expression, so the notebook renders it).
data.head(10)

Unnamed: 0,user_id,user_type,session_duration
0,BM3C0BJ7CS,variation,15.528769
1,MJWN6XNH6L,variation,32.28759
2,46ZPHHABLS,variation,43.718217
3,OHA298DHUG,variation,49.519702
4,AKJ77X6F4A,control,61.709028
5,BFNWMGU6DX,variation,71.779283
6,UFO2V8ZKFB,variation,23.291835
7,4CEIM3VRS9,control,25.219461
8,90AGF68FF8,control,26.240482
9,R3DQFO6068,variation,20.780244


In [5]:
# Report the total number of rows loaded from the experiment file.
print(f"The dataset size is: {len(data)}")

The dataset size is: 4186


In [7]:
# X_c stores the session time for the control group and X_v for the variation group,
# both converted to NumPy arrays.
# NOTE(review): control_sd_data and variation_sd_data are not defined in any cell
# visible here — presumably the session_duration values split by user_type;
# confirm the defining cell still exists so the notebook survives Restart & Run All.
X_c = control_sd_data.to_numpy()
X_v = variation_sd_data.to_numpy()

In [9]:
def get_stats(X):
    """
    Summarize a sample by its size, mean and standard deviation.

    Parameters
    ----------
    X : object supporting ``len``, ``.mean()`` and ``.std(ddof=...)``
        The observations of one group (e.g. a NumPy array).

    Returns
    -------
    tuple
        ``(n, x, s)``: the number of observations, the sample mean, and the
        sample standard deviation computed with ``ddof=1``.
    """
    sample_size = len(X)
    sample_mean = X.mean()
    # ddof=1 divides by n - 1 rather than n.
    sample_std = X.std(ddof=1)
    return sample_size, sample_mean, sample_std

In [11]:
# Size, mean and standard deviation (ddof=1) of each group's sample.
n_c, x_c, s_c = get_stats(X_c)
n_v, x_v, s_v = get_stats(X_v)

In [14]:
def degrees_of_freedom(n_v, s_v, n_c, s_c):
    """Degrees of freedom for a two-sample t-test with unequal variances.

    Implements the Welch–Satterthwaite approximation

        (s_v^2/n_v + s_c^2/n_c)^2
        / ((s_c^2/n_c)^2 / (n_c - 1) + (s_v^2/n_v)^2 / (n_v - 1))

    Parameters
    ----------
    n_v, n_c : sample sizes of the variation and control groups.
    s_v, s_c : sample standard deviations of the variation and control groups.

    Returns
    -------
    The (generally non-integer) degrees of freedom.
    """
    # Estimated variance of each group's sample mean.
    var_of_mean_v = np.square(s_v) / n_v
    var_of_mean_c = np.square(s_c) / n_c

    combined_sq = np.square(var_of_mean_v + var_of_mean_c)
    weighted_sq = (
        np.square(var_of_mean_c) / (n_c - 1)
        + np.square(var_of_mean_v) / (n_v - 1)
    )
    return combined_sq / weighted_sq

In [16]:
# Degrees of freedom of the t distribution for the observed control/variation samples.
d = degrees_of_freedom(n_v, s_v, n_c, s_c)
print(f"The degrees of freedom for the t-student in this scenario is: {d:.2f}")

The degrees of freedom for the t-student in this scenario is: 4182.97


In [17]:
def t_value(n_v, x_v, s_v, n_c, x_c, s_c):
    """Two-sample t statistic for the difference of means (variation - control).

    Parameters
    ----------
    n_v, x_v, s_v : size, mean and standard deviation of the variation sample.
    n_c, x_c, s_c : size, mean and standard deviation of the control sample.

    Returns
    -------
    ``(x_v - x_c) / sqrt(s_v^2/n_v + s_c^2/n_c)``; positive when the variation
    mean exceeds the control mean.
    """
    # Standard error of the difference between the two sample means.
    std_error = np.sqrt(np.square(s_v) / n_v + np.square(s_c) / n_c)
    return (x_v - x_c) / std_error

In [18]:
# t statistic for the observed difference in mean session duration (variation - control).
t = t_value(n_v, x_v, s_v, n_c, x_c, s_c)
print(f"The t-value for this experiment is: {t:.2f}")

The t-value for this experiment is: 1.64


In [19]:
# Worked example of evaluating a Student's t CDF with scipy.
# The example's parameters are named once and reused in the message so the
# printed text can never drift out of sync with the values actually used.
demo_df = 10
demo_t = 1.21
t_10 = stats.t(df=demo_df)  # frozen t distribution with demo_df degrees of freedom
cdf = t_10.cdf(demo_t)      # P(T < demo_t)
print(f"The CDF for the t-student distribution with {demo_df} degrees of freedom and t-value = {demo_t}, or equivalently P(t_{demo_df} < {demo_t}) is equal to: {cdf:.2f}")

The CDF for the t-student distribution with 10 degrees of freedom and t-value = 1.21, or equivalently P(t_10 < 1.21) is equal to: 0.87


In [25]:
def p_value(d, t_value):
    """Right-tailed p-value of a t statistic.

    Parameters
    ----------
    d : degrees of freedom of the Student's t distribution.
    t_value : observed t statistic.

    Returns
    -------
    ``P(T > t_value)`` for ``T`` following a t distribution with ``d`` degrees
    of freedom.
    """
    t_d = stats.t(df=d)
    # Use the survival function instead of ``1 - cdf``: once the CDF rounds to
    # 1.0 the subtraction yields exactly 0.0, whereas ``sf`` keeps the small
    # upper-tail probability accurate.
    p = t_d.sf(t_value)
    return p

In [32]:
def make_decision(X_v, X_c, alpha = 0.05):
    """Run the one-sided two-sample t-test and report the decision.

    Parameters
    ----------
    X_v : observations of the variation group.
    X_c : observations of the control group.
    alpha : significance level; H_0 is rejected when the p-value is below it.

    Returns
    -------
    ``'Reject H_0'`` if the p-value is strictly less than ``alpha``,
    otherwise ``'Do not reject H_0'``.
    """
    # Each summary is an (n, mean, std) tuple.
    summary_v = get_stats(X_v)
    summary_c = get_stats(X_c)

    size_v, _, std_v = summary_v
    size_c, _, std_c = summary_c
    dof = degrees_of_freedom(size_v, std_v, size_c, std_c)

    # t_value expects (n_v, x_v, s_v, n_c, x_c, s_c), i.e. both tuples in order.
    statistic = t_value(*summary_v, *summary_c)

    return 'Reject H_0' if p_value(dof, statistic) < alpha else 'Do not reject H_0'

In [None]:
# Repeat the test at several significance levels to see how the decision
# depends on the choice of alpha.
alphas = [0.06, 0.05, 0.04, 0.01]
for alpha in alphas:
    print(f"For an alpha of {alpha} the decision is to: {make_decision(X_v, X_c, alpha = alpha)}")