<h1 style='font-size: 30px; font-family: Agency FB; font-weight: 700; text-align: center'>Hypothesis Testing</h1>

---

<h3 style='font-family: Agency FB; font-size: 20px;  font-weight: 600''>1.0. Import Required Libraries</h3
                                                                                                          >

In [17]:
from scipy.stats import shapiro, levene
import matplotlib.pyplot as plt
import scipy.stats as stats  
import seaborn as sns  
import pandas as pd  
import numpy as np 
import math
import re

import warnings  
warnings.simplefilter("ignore")  
pd.set_option('display.max_columns', 10) 
pd.set_option('display.float_format', lambda x: '%.2f' % x) 

<h3 style='font-family: Agency FB; font-size: 20px;  font-weight: 600''>2.0. Load and Inspect Dataset</h3>

In [4]:
DATA_PATH = "./Datasets/Marketing Campaign Dataset.csv"
full_data = pd.read_csv(DATA_PATH)
display(full_data)

Unnamed: 0,Store_ID,Date,Month Index,Store Size,Marketing Strategy,Average Order Values (Tsh),Post Campaign Period,Sales Revenue (Tsh)
0,1,2023-01-01,1,Small,Pilot Launch Regions,31398,0,1038101.58
1,1,2023-02-01,2,Small,Pilot Launch Regions,31398,0,859521.34
2,1,2023-03-01,3,Small,Pilot Launch Regions,31398,0,921424.44
3,1,2023-04-01,4,Small,Pilot Launch Regions,31398,0,899762.87
4,1,2023-05-01,5,Small,Pilot Launch Regions,31398,0,1072438.54
...,...,...,...,...,...,...,...,...
2995,100,2025-02-01,26,Medium,Business-as-Usual (BAU) Regions,37190,1,1594221.19
2996,100,2025-03-01,27,Medium,Business-as-Usual (BAU) Regions,37190,1,1330848.99
2997,100,2025-04-01,28,Medium,Business-as-Usual (BAU) Regions,37190,1,1535028.72
2998,100,2025-05-01,29,Medium,Business-as-Usual (BAU) Regions,37190,1,1601262.58


In [18]:
full_data['Date'] = pd.to_datetime(full_data['Date'])
event_date = full_data['Date'].min() + pd.DateOffset(months=18)
df = full_data[full_data['Date'] < event_date]
display(df)

Unnamed: 0,Store_ID,Date,Month Index,Store Size,Marketing Strategy,Average Order Values (Tsh),Post Campaign Period,Sales Revenue (Tsh)
0,1,2023-01-01,1,Small,Pilot Launch Regions,31398,0,1038101.58
1,1,2023-02-01,2,Small,Pilot Launch Regions,31398,0,859521.34
2,1,2023-03-01,3,Small,Pilot Launch Regions,31398,0,921424.44
3,1,2023-04-01,4,Small,Pilot Launch Regions,31398,0,899762.87
4,1,2023-05-01,5,Small,Pilot Launch Regions,31398,0,1072438.54
...,...,...,...,...,...,...,...,...
2983,100,2024-02-01,14,Medium,Business-as-Usual (BAU) Regions,37190,0,1385822.26
2984,100,2024-03-01,15,Medium,Business-as-Usual (BAU) Regions,37190,0,1352717.41
2985,100,2024-04-01,16,Medium,Business-as-Usual (BAU) Regions,37190,0,1372174.06
2986,100,2024-05-01,17,Medium,Business-as-Usual (BAU) Regions,37190,0,1383435.11


<h3 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>3.0: Homogeneity of Variance (Levene's Test)</h3>

In [10]:
def Levene_test(df, variables=None, groups=None): 
    if variables is None:
        variables = df.select_dtypes(include=['number']).columns.tolist()
        for g in groups:
            if g in variables:
                variables.remove(g)
    
    results = []
    for group in groups:
        for variable in variables:
            grouped_data = [g[variable].dropna().values for _, g in df.groupby(group)]
            if all(len(g) > 1 for g in grouped_data):  # Ensure each group has enough data
                levene_stat, levene_p = levene(*grouped_data)
                interpretation = 'Homogeneous' if levene_p > 0.05 else 'Not Homogeneous'
                
            else:
                levene_stat, levene_p, interpretation = None, None, 'Insufficient data'
            
            results.append({
                'Category': group,
                'Variable': variable,
                'Test Statistic': levene_stat,
                'P-Value': levene_p,
                'Interpretation': interpretation
            })
    
    return pd.DataFrame(results)
variables = ["Average Order Values (Tsh)", "Sales Revenue (Tsh)"]
result_df = Levene_test(df, variables, groups=["Store Size", "Marketing Strategy"])
display(result_df)

Unnamed: 0,Category,Variable,Test Statistic,P-Value,Interpretation
0,Store Size,Average Order Values (Tsh),19.74,0.0,Not Homogeneous
1,Store Size,Sales Revenue (Tsh),0.56,0.57,Homogeneous
2,Marketing Strategy,Average Order Values (Tsh),2.34,0.13,Homogeneous
3,Marketing Strategy,Sales Revenue (Tsh),32.72,0.0,Not Homogeneous


<h3 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>4.0: Normality Test (Shapiro Wilk Test)</h3>


In [14]:
def bootstrapping(df, column, num_samples=1000, sample_size=30):
    sample_means = []
    for _ in range(num_samples):
        sample = df[column].dropna().sample(n=sample_size, replace=True)
        sample_means.append(sample.mean())
    return sample_means

def shapiro_wilk_test(df, group_cols, numeric_cols=None, use_bootstrap=True, num_samples=1000, sample_size=30): 
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
  
    results = []
    for group_col in group_cols:
        for group, group_df in df.groupby(group_col):
            for col in numeric_cols:
                if use_bootstrap: data = bootstrapping(group_df, col, num_samples=num_samples, sample_size=sample_size)
                else:
                    data = group_df[col].dropna()
                    
                if len(data) >= 3:
                    stat, p_value = shapiro(data)
                    interpretation = 'Normal' if p_value > 0.05 else 'Not Normal'
                else:
                    stat, p_value, interpretation = None, None, 'Insufficient data'
                
                results.append({
                    'Category': group_col,
                    'Group': group,
                    'Variable': col,
                    'Test Statistic': stat,
                    'P-Value': p_value,
                    'Interpretation': interpretation,
                    'Used Bootstrap': use_bootstrap
                })
        
    results_df = pd.DataFrame(results)
    return results_df

result_df = shapiro_wilk_test(df, group_cols=["Store Size", "Marketing Strategy"], use_bootstrap=True)
display(result_df)

Unnamed: 0,Category,Group,Variable,Test Statistic,P-Value,Interpretation,Used Bootstrap
0,Store Size,Large,Store_ID,1.0,0.4,Normal,True
1,Store Size,Large,Month Index,1.0,0.14,Normal,True
2,Store Size,Large,Average Order Values (Tsh),1.0,0.29,Normal,True
3,Store Size,Large,Post Campaign Period,1.0,1.0,Normal,True
4,Store Size,Large,Sales Revenue (Tsh),1.0,0.7,Normal,True
5,Store Size,Medium,Store_ID,1.0,0.72,Normal,True
6,Store Size,Medium,Month Index,1.0,0.61,Normal,True
7,Store Size,Medium,Average Order Values (Tsh),1.0,0.63,Normal,True
8,Store Size,Medium,Post Campaign Period,1.0,1.0,Normal,True
9,Store Size,Medium,Sales Revenue (Tsh),1.0,0.05,Normal,True


In [15]:
result_df = shapiro_wilk_test(df, group_cols=["Store Size", "Marketing Strategy"], use_bootstrap=False)
display(result_df)

Unnamed: 0,Category,Group,Variable,Test Statistic,P-Value,Interpretation,Used Bootstrap
0,Store Size,Large,Store_ID,0.91,0.0,Not Normal,False
1,Store Size,Large,Month Index,0.95,0.0,Not Normal,False
2,Store Size,Large,Average Order Values (Tsh),0.86,0.0,Not Normal,False
3,Store Size,Large,Post Campaign Period,1.0,1.0,Normal,False
4,Store Size,Large,Sales Revenue (Tsh),1.0,0.91,Normal,False
5,Store Size,Medium,Store_ID,0.93,0.0,Not Normal,False
6,Store Size,Medium,Month Index,0.95,0.0,Not Normal,False
7,Store Size,Medium,Average Order Values (Tsh),0.96,0.0,Not Normal,False
8,Store Size,Medium,Post Campaign Period,1.0,1.0,Normal,False
9,Store Size,Medium,Sales Revenue (Tsh),1.0,0.61,Normal,False


<h3 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>5.0: One Way  Analysis of Varience</h3>

In [6]:
from statsmodels.formula.api import ols as smf_ols 
from statsmodels.stats.anova import anova_lm  
from statsmodels.formula.api import ols
import statsmodels.formula.api as smf 
import statsmodels.api as sm  

def rename(text): return re.sub(r'[^a-zA-Z]', "_", text)

def One_way_anova(data, dependent_ariables, Independents_variables):
    results = []
    Independents_variables = [rename(col) for col in Independents_variables]
    data = data.rename(columns={col: rename(col) for col in data.columns})
    for Independents_variable in Independents_variables:
        for col in dependent_ariables:
            column_name = rename(col)  
            formula = f"{column_name} ~ C({Independents_variable})" 
            model = smf.ols(formula, data=data).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
            for source, row in anova_table.iterrows():
                p_value = row["PR(>F)"]
                interpretation = "Significant" if p_value < 0.05 else "No significant"
                if source == "Residual": interpretation = "-"
        
                results.append({
                    "Variable": col,
                    "Factor": Independents_variable.title(),
                    "Source": source,
                    "Sum Sq": row["sum_sq"],
                    "df": row["df"],
                    "F-Value": row["F"],
                    "p-Value": p_value,
                    "Interpretation": interpretation
                })

    return pd.DataFrame(results)

Independents_variables = ["Store Size", "Marketing Strategy"]
dependent_ariables = ["Average Order Values (Tsh)", "Sales Revenue (Tsh)"]
anova_results = One_way_anova(df, dependent_ariables, Independents_variables)
anova_results

Unnamed: 0,Variable,Factor,Source,Sum Sq,df,F-Value,p-Value,Interpretation
0,Average Order Values (Tsh),Store_Size,C(Store_Size),106487361.2,2.0,2.12,0.12,No significant
1,Average Order Values (Tsh),Store_Size,Residual,45090293944.48,1797.0,,,-
2,Sales Revenue (Tsh),Store_Size,C(Store_Size),86935849314370.72,2.0,4536.89,0.0,Significant
3,Sales Revenue (Tsh),Store_Size,Residual,17217039246938.81,1797.0,,,-
4,Average Order Values (Tsh),Marketing_Strategy,C(Marketing_Strategy),1920908954.88,1.0,79.81,0.0,Significant
5,Average Order Values (Tsh),Marketing_Strategy,Residual,43275872350.8,1798.0,,,-
6,Sales Revenue (Tsh),Marketing_Strategy,C(Marketing_Strategy),408612612824.55,1.0,7.08,0.01,Significant
7,Sales Revenue (Tsh),Marketing_Strategy,Residual,103744275948484.52,1798.0,,,-


<h2 style='font-size: 20px; font-family: Agency FB; font-weight: 600'>6.0: Welch's ANOVA (Welch's F test)</h2>

In [7]:
import pingouin as pg

def welchs_anova(data, dependent_variables, Independent_variables):
    results = []
    
    Independent_variables = [rename(col) for col in Independent_variables]
    data = data.rename(columns={col: rename(col) for col in data.columns})
    for group in Independent_variables:
        for col in dependent_variables:
            column_name = rename(col)
            
            aov = pg.welch_anova(data=data, dv=column_name, between=group)
            for _, row in aov.iterrows():
                p_value = row["p-unc"]
                interpretation = "Significant" if p_value < 0.05 else "Not Significant"
                results.append({
                    "Variable": col,
                    "Grouping Factor": group.title(),
                    "Source": row["Source"],
                    "df": row["ddof1"],
                    "F-Value": row["F"],
                    "p-Value": p_value,
                    "Significance": interpretation
                })

    return pd.DataFrame(results)

Independent_variables = ["Store Size", "Marketing Strategy"]
dependent_variables = ["Average Order Values (Tsh)", "Sales Revenue (Tsh)"]
welch_results = welchs_anova(df, dependent_variables, Independent_variables)
welch_results

Unnamed: 0,Variable,Grouping Factor,Source,df,F-Value,p-Value,Significance
0,Average Order Values (Tsh),Store_Size,Store_Size,2,1.85,0.16,Not Significant
1,Sales Revenue (Tsh),Store_Size,Store_Size,2,4558.85,0.0,Significant
2,Average Order Values (Tsh),Marketing_Strategy,Marketing_Strategy,1,79.81,0.0,Significant
3,Sales Revenue (Tsh),Marketing_Strategy,Marketing_Strategy,1,7.08,0.01,Significant


<h2 style='font-size: 20px; font-family: Agency FB; font-weight: 600'>7.0: Kruskal-Wallis Test</h2>

In [8]:
from scipy.stats import kruskal

def kruskall_wallis(df, independent_variables: str, dependent_variables: list = None):
    if dependent_variables is None:
        dependent_variables = df.select_dtypes(include=[np.number]).columns.tolist()
        for g in independent_variables:
            if g in dependent_variables:
                dependent_variables.remove(g)
    results = []
    for group_column in independent_variables:
        for column in dependent_variables:
            groups = [group[column].dropna().values for name, group in df.groupby(group_column)]
            stats, p_value = kruskal(*groups)
            interpretation = 'Significant' if p_value < 0.05 else 'Not Significant'
            results.append({
                'Group': group_column,
                'Variables': column,
                'Kruskal-Wallis Statistic': stats,
                'P-value': p_value,
                'Significant (α<0.05)': interpretation
            })
    return pd.DataFrame(results)

independent_variables = ["Store Size", "Marketing Strategy"] 
dependent_variables = ["Average Order Values (Tsh)", "Sales Revenue (Tsh)"]
results = kruskall_wallis(df, independent_variables, dependent_variables)
display(results)

Unnamed: 0,Group,Variables,Kruskal-Wallis Statistic,P-value,Significant (α<0.05)
0,Store Size,Average Order Values (Tsh),4.39,0.11,Not Significant
1,Store Size,Sales Revenue (Tsh),1437.47,0.0,Significant
2,Marketing Strategy,Average Order Values (Tsh),60.65,0.0,Significant
3,Marketing Strategy,Sales Revenue (Tsh),1.95,0.16,Not Significant


<h2 style='font-family: Agency FB; font-size: 20px; font-weight: 600'>8.0: Tukey's Honest Significant Difference (THSD)</h2>


In [9]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd  

def Turkey_results(df, variables, group_cols):
    results_data = []
    for group in group_cols:
        for metric in variables:
            turkey_results = pairwise_tukeyhsd(endog=df[metric], groups=df[group], alpha=0.05)
            results_table = turkey_results.summary()
            
            for i in range(1, len(results_table)):
                row = results_table.data[i]
                results_data.append({
                    'group': group,
                    'Metric': metric,
                    'Group1': row[0],
                    'Group2': row[1],
                    'Mean Difference': row[2],
                    'P-Value': row[3],
                    'Lower CI': row[4],
                    'Upper CI': row[5],
                    'Reject Null': row[6]
                })
        
        result_df = pd.DataFrame(results_data)
    return result_df

group_cols = ["Store Size", "Marketing Strategy"] 
variables = ["Average Order Values (Tsh)", "Sales Revenue (Tsh)"]
results = Turkey_results(df, variables, group_cols = group_cols)
results

Unnamed: 0,group,Metric,Group1,Group2,Mean Difference,P-Value,Lower CI,Upper CI,Reject Null
0,Store Size,Average Order Values (Tsh),Large,Medium,742.21,0.11,-127.69,1612.11,False
1,Store Size,Average Order Values (Tsh),Large,Small,695.92,0.15,-185.59,1577.43,False
2,Store Size,Average Order Values (Tsh),Medium,Small,-46.28,0.98,-641.1,548.53,False
3,Store Size,Sales Revenue (Tsh),Large,Medium,-371103.76,0.0,-388102.13,-354105.4,True
4,Store Size,Sales Revenue (Tsh),Large,Small,-666105.63,0.0,-683330.86,-648880.4,True
5,Store Size,Sales Revenue (Tsh),Medium,Small,-295001.87,0.0,-306624.93,-283378.8,True
6,Marketing Strategy,Average Order Values (Tsh),Business-as-Usual (BAU) Regions,Pilot Launch Regions,-2066.08,0.0,-2519.67,-1612.49,True
7,Marketing Strategy,Sales Revenue (Tsh),Business-as-Usual (BAU) Regions,Pilot Launch Regions,30133.5,0.01,7924.89,52342.12,True


---

This analysis was performed by **Jabulente**, a passionate and dedicated data analyst with a strong commitment to using data to drive meaningful insights and solutions. For inquiries, collaborations, or further discussions, please feel free to reach out via.  

----

<div align="center">  
    
[![GitHub](https://img.shields.io/badge/GitHub-Jabulente-black?logo=github)](https://github.com/Jabulente)  [![LinkedIn](https://img.shields.io/badge/LinkedIn-Jabulente-blue?logo=linkedin)](https://linkedin.com/in/jabulente-208019349)  [![Email](https://img.shields.io/badge/Email-jabulente@hotmail.com-red?logo=gmail)](mailto:Jabulente@hotmail.com)  

</div>


<h1 style='font-size: 35px; color: red; font-family: Agency FB; font-weight: 700; text-align: center'>Data to Drive Meaningful Insights and Solutions</h1>