In [1]:
from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd

#### Chi-Square Test for Independence

In [2]:
# Survey responses, one (gender, purchase type) pair per respondent.
responses = [
    ('Male', 'Organic'),
    ('Male', 'Non-Organic'),
    ('Female', 'Organic'),
    ('Female', 'Non-Organic'),
    ('Male', 'Non-Organic'),
    ('Female', 'Organic'),
    ('Male', 'Organic'),
    ('Female', 'Non-Organic'),
]
data = pd.DataFrame(responses, columns=['Gender', 'Purchase'])

# Tabulate how often each Gender x Purchase combination occurs.
contingency_table = pd.crosstab(index=data['Gender'], columns=data['Purchase'])
print(contingency_table)

# Chi-Square test for independence on the observed counts; the result unpacks
# into the statistic, the p-value, the degrees of freedom and the expected
# frequencies.
independence_result = chi2_contingency(contingency_table)
chi2, p_value, dof, expected = independence_result

print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p_value}")

# Decide at the 5% significance level.
verdict = (
    "Reject the null hypothesis: Gender and purchasing preference are dependent."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant relationship."
)
print(verdict)


Purchase  Non-Organic  Organic
Gender                        
Female              2        2
Male                2        2
Chi-Square Statistic: 0.0
P-Value: 1.0
Fail to reject the null hypothesis: No significant relationship.


#### Notes
1. Expected frequency is calculated as E[i][j]=(row_total*col_total)/grand_total
2. Observed frequency is the count in the contingency table, e.g. O[i][j]=2 for the Female / Non-Organic cell
3. The chi-square statistic is given by $\chi^2 = \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}$, i.e. summation((O[i][j]-E[i][j])^2/E[i][j])
##### Consider the above table
1.  E[1][1]=4*4/8=2 and this is same for every cell 
2.  O[i][j] for i, j is equal to 2 as we can see. 
#### Hence every (O[i][j]-E[i][j]) term is 0, so our chi-square statistic is 0 and the p-value is 1.0.

#### Chi-Square Goodness-of-Fit Test

In [3]:
from scipy.stats import chisquare

# Observed visitor counts (actual data), one count per category.
observed_visitors = [120, 130, 140, 125, 135, 110, 140]

# Expected visitor counts (if traffic were equal): the observed total spread
# evenly over the categories. The number of categories is derived from the
# data rather than hard-coded, so the observed and expected lists always have
# the same length and the same total even if the observations change.
n_categories = len(observed_visitors)
expected_visitors = [sum(observed_visitors) / n_categories] * n_categories

# Perform Chi-Square goodness-of-fit test (observed vs. expected frequencies)
chi2, p_value = chisquare(observed_visitors, expected_visitors)

print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p_value}")

# Decide at the 5% significance level.
if p_value < 0.05:
    print("Reject the null hypothesis: Website traffic is not evenly distributed.")
else:
    print("Fail to reject the null hypothesis: No significant difference in traffic distribution.")


Chi-Square Statistic: 5.722222222222221
P-Value: 0.45501499186838734
Fail to reject the null hypothesis: No significant difference in traffic distribution.


##### One Sample T-Test 
##### Comparing a Sample Mean to a Known Value

In [4]:
from scipy.stats import ttest_1samp

# Sample website load times after optimization
sample_data = np.array([2.8, 2.7, 3.1, 2.9, 2.6, 3.0, 2.7])

# Reference mean load time under the null hypothesis (H0: population mean == 3).
hypothesized_mean = 3

# Perform one-sample t-test. With its default arguments the test is two-sided,
# so the p-value alone does not say whether the sample mean lies below or
# above the reference value.
stat, p_value = ttest_1samp(sample_data, hypothesized_mean)

print(f"T-Statistic: {stat}, P-Value: {p_value}")

# Only claim a reduction when the result is significant at the 5% level AND
# the t-statistic is negative (sample mean below the reference). A significant
# positive statistic would mean load times got worse, not better.
is_significant_reduction = bool(p_value < 0.05 and stat < 0)

if is_significant_reduction:
    print("Reject the null hypothesis: The new optimization significantly reduces load time.")
else:
    print("Fail to reject the null hypothesis: No significant improvement in load time.")


T-Statistic: -2.5205041512504156, P-Value: 0.04525626534780619
Reject the null hypothesis: The new optimization significantly reduces load time.


##### 2-sample Independent T-Test
##### Assumptions
1. Normality
2. Equal variance: set `equal_var=True` when the two samples have the same variance; otherwise set `equal_var=False` (Welch's t-test)
3. Two samples must be independent

In [5]:
from scipy.stats import ttest_ind

# Engagement scores collected for each of the two marketing campaigns.
campaign_A = [50, 55, 53, 57, 54, 51, 56, 52]
campaign_B = [60, 65, 62, 67, 64, 61, 66, 63]

# Independent two-sample t-test, treating both groups as having equal variance.
campaign_result = ttest_ind(campaign_A, campaign_B, equal_var=True)
stat, p_value = campaign_result

print(f"T-Statistic: {stat}, P-Value: {p_value}")

# Decide at the 5% significance level.
verdict = (
    "Reject the null hypothesis: The campaigns have significantly different performances."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: No significant difference in campaign performance."
)
print(verdict)


T-Statistic: -8.16496580927726, P-Value: 1.0789760448977756e-06
Reject the null hypothesis: The campaigns have significantly different performances.


##### Paired (Dependent) T-Test (Before/After)

In [6]:
from scipy.stats import ttest_rel

# Blood pressure readings before and after treatment; the two lists are
# compared position by position by the paired test below.
before_treatment = [140, 138, 145, 142, 150, 147, 139]
after_treatment = [135, 130, 140, 136, 142, 144, 132]

# Paired (dependent-samples) t-test on the before/after readings.
paired_result = ttest_rel(before_treatment, after_treatment)
stat, p_value = paired_result

print(f"T-Statistic: {stat}, P-Value: {p_value}")

# Decide at the 5% significance level.
verdict = (
    "Reject the null hypothesis: The drug has a significant effect on blood pressure."
    if p_value < 0.05
    else "Fail to reject the null hypothesis: The drug has no significant effect."
)
print(verdict)


T-Statistic: 8.694826047713663, P-Value: 0.0001278167558329041
Reject the null hypothesis: The drug has a significant effect on blood pressure.
