In [1]:
## Statistics

from scipy.stats import ttest_1samp

## Read the data

import pandas as pd
mall = pd.read_csv("Mall_Customers.csv")

## One Sample Test

#### Used to compare the sample mean to the population mean
#### For small amount of data and use t distribution

In [4]:
## One Sample Test

hm = 50
t_stat,p_value = ttest_1samp(mall["Age"],hm)   # p-value
alpha = 0.05

# Interpret the result
print(f"T-Statistics: {t_stat}")
print(f"P-Value: {p_value}")

if p_value < alpha:
    print(f"Reject the null hypothesis at alpha = {alpha}")
else:
    print(f"Fail to reject the null hypothesis at alpha = {alpha}")

T-Statistics: -11.48828951949721
P-Value: 8.234394751496506e-24
Reject the null hypothesis at alpha = 0.05


In [6]:
mm = [14.8,15.1,14.9,15.3,15.0,15.2,14.9,15.0,15.1,14.8]
hm = 15
t_stat,p_value=ttest_1samp(mm,hm)
alpha = 0.05

# Interpret the result
print(f"T-Statistics: {t_stat}")
print(f"P-Value: {p_value}")

if p_value < alpha:
    print(f"Reject the null hypothesis at alpha = {alpha}")
else:
    print(f"Fail to reject the null hypothesis at alpha = {alpha}")

T-Statistics: 0.1901172751573396
P-Value: 0.8534362081257318
Fail to reject the null hypothesis at alpha = 0.05


## 2 Sample Independent t-test

In [7]:
import scipy.stats as stats
import numpy as np

# Creating data groups

# H0 => μ1 = μ2 (population mean of male is equal to female)

# HA => μ1 ≠ μ2 (population mean of male is different from female)

male = mall[mall["Gender"]=="Male"]
female = mall[mall["Gender"]=="Female"]

male_age = male["Age"]
#print(male_age)

female_age = female["Age"]
#print(female_age)

In [9]:
male_group = np.array(male_age)

female_group = np.array(female_age)
#stats.ttest_ind(a=male_group,b=female_group)

# Perform the two sample t-test with equal variances
t_stat,p_value=stats.ttest_ind(a=male_group,b=female_group)  # equal_var = True
t_stat,p_value


(0.6379442227710554, 0.5242397135319609)

## Paired t-test

In [1]:
from scipy import stats

# Sample data: Measurements from the same subjects at two different times

before_diet = [23,21,18,30,25]
after_diet = [20,22,19,29,24]

# Perform paired t-test

t_statistic,p_value = stats.ttest_rel(before_diet,after_diet)

print("t-statistics:",t_statistic)
print("p-value:",p_value)

t-statistics: 0.8017837257372731
p-value: 0.46760475460939743


In [3]:
import numpy as np

diff = [3,-1,-1,1,1]
md=np.mean(diff)
md

0.6

In [5]:
sd=np.std(diff)
sd

1.4966629547095767

## 1-Way ANOVA

In [5]:
from scipy.stats import f_oneway
from scipy import stats
group1 = [10,9,8,7.5,8.5]
group2 = [8,9,10,8,8.5]
group3 = [9,8,7,10,9]
# f statistic is the ratio of the variance between the groups to the variance within the groups
# perform onw-way ANOVA

f_statistic,p_value = stats.f_oneway(group1,group2,group3)

print("F-statistics:",f_statistic)
print("p-value:",p_value)

F-statistics: 0.017094017094017096
p-value: 0.9830751493312739


## 2-Way ANOVA

In [3]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Create a Sample dataset

data = pd.DataFrame({
    'Diet':['A','A','A','B','B','B','B','B'],
    'Exercise':['1','1','1','1','2','2','2','2'],
    'Weightloss':[5,6,7,4,8,6,5,9]
})

# Fit the two-way ANOVA model

model= ols('Weightloss ~ C(Diet) + C(Exercise) + C(Diet):C(Exercise)',data=data).fit()

anova_table=sm.stats.anova_lm(model,type=2)
print(anova_table)

                      df     sum_sq   mean_sq         F    PR(>F)
C(Diet)              1.0   0.300000  0.300000  0.125000  0.738093
C(Exercise)          1.0   7.200000  7.200000  3.000000  0.143811
C(Diet):C(Exercise)  1.0   1.963858  1.963858  0.818274  0.407157
Residual             5.0  12.000000  2.400000       NaN       NaN


## Chi square tset

In [4]:
import numpy as np
from scipy.stats import chi2_contingency

# Observed frequency table

observed = np.array([[20,30],
                   [30,20]])

chi2_contingency(observed,correction=False)

Chi2ContingencyResult(statistic=4.0, pvalue=0.04550026389635857, dof=1, expected_freq=array([[25., 25.],
       [25., 25.]]))

## AB Testing

## Comparing which version is effective.

In [5]:
import numpy as np

v_A = np.random.normal(loc=10, scale=2, size=500)  # loc-means mean value, scale-means standard deviation
v_B = np.random.normal(loc=12, scale=2, size=500)

from scipy.stats import ttest_ind

# Performing the t-test

t,p = ttest_ind(v_A,v_B)

# Printing the result

print(f"t = {t:.3f}")
print(f"p = {p:.3f}")

# From the result v_A is smaller than v_B as it shows negative value and also the number 15 is large so the difference

t = -14.386
p = 0.000
