In [1]:
# Imports
import pandas as pd
from scipy.stats import chi2_contingency, chi2

In [7]:
# Load the campaign sheet from the grocery workbook into a DataFrame
workbook_path = 'Data/grocery_database.xlsx'
campaign_sheet = 'campaign_data'
data = pd.read_excel(workbook_path, sheet_name=campaign_sheet)

In [8]:
# Keep only rows whose mailer_type is something other than 'Control'
not_control = data['mailer_type'] != 'Control'
data = data.loc[not_control]

In [12]:
# Cross-tabulate mailer_type (rows) against signup_flag (columns) and keep the
# raw counts as a NumPy array: this is the observed-frequency matrix.
contingency_table = pd.crosstab(index=data['mailer_type'],
                                columns=data['signup_flag'])
observed_values = contingency_table.values
observed_values

array([[252, 123],
       [209, 127]])

In [18]:
# Signup rate per mailer type, derived from the observed contingency table
# instead of re-typing the counts by hand (hard-coded numbers silently go
# stale whenever the data or the filter above changes).
# Assumes observed_values has exactly two rows (one per mailer type) and that
# its second column holds the signup counts — TODO confirm signup_flag coding.
row_totals = observed_values.sum(axis=1)   # recipients per mailer type
signup_counts = observed_values[:, 1]      # second crosstab column
# float() keeps plain Python floats so the printed text is unchanged
mailer1_rates, mailer2_rates = (float(rate) for rate in signup_counts / row_totals)
print(f'mailer 1 rates: {mailer1_rates} \nmailer 2 rates : {mailer2_rates}')

mailer 1 rates: 0.328 
mailer 2 rates : 0.37797619047619047


In [19]:
# Hypotheses and significance threshold for the chi-square test

# Significance threshold: the p-value cell rejects the null hypothesis when
# p_value <= acceptance_criteria
acceptance_criteria = 0.05

# H0: mailer type and signup are independent
null_hypothesis = "There is NO relationship between mailer type and signup rates"

# H1: mailer type and signup are associated
alternate_hypothesis = "There IS a relationship between mailer type and signup rates"

In [21]:
# Chi-square test of independence on the observed contingency table.
# The result unpacks as (statistic, p-value, degrees of freedom, expected frequencies).
# correction=False switches off Yates' continuity correction, which SciPy
# would otherwise apply by default when the table has 1 degree of freedom.
test_output = chi2_contingency(observed_values, correction=False)
chi2_statistic, p_value, dof, expected_values = test_output

print(f'chi2 : {chi2_statistic}\nP Value : {p_value}')

chi2 : 1.9414468614812481
P Value : 0.16351152223398197


In [23]:
# Critical value of the test: the chi-square quantile at
# (1 - acceptance_criteria) for the test's degrees of freedom.
confidence_level = 1 - acceptance_criteria
critical_value = chi2.ppf(confidence_level, dof)

print(critical_value)

3.841458820694124


In [26]:
# Report the outcome using the critical-value decision rule:
# reject the null hypothesis when the statistic reaches the critical value.
reject_null_by_statistic = chi2_statistic >= critical_value

if reject_null_by_statistic:
    chi2_conclusion = f'As our chi2 of {chi2_statistic} is higher than our critical value of {critical_value}, we reject the null-hypothesis and conclude that {alternate_hypothesis}'
else:
    chi2_conclusion = f'As our chi2 of {chi2_statistic} is lower than our critical value of {critical_value}, we retain the null-hypothesis and conclude that {null_hypothesis}'

print(chi2_conclusion)

As our chi2 of 1.9414468614812481 is lower than our critical value of 3.841458820694124, we retain the null-hypothesis and conclude that There is NO relationship between mailer type and signup rates


In [27]:
# Report the outcome using the p-value decision rule:
# reject the null hypothesis when the p-value is at or below the threshold.
reject_null_by_p_value = p_value <= acceptance_criteria

if reject_null_by_p_value:
    p_value_conclusion = f'As our P-Value of {p_value} is lower than our acceptance criteria of {acceptance_criteria}, we reject the null-hypothesis and conclude that {alternate_hypothesis}'
else:
    p_value_conclusion = f'As our P-Value of {p_value} is higher than our acceptance criteria of {acceptance_criteria}, we retain the null-hypothesis and conclude that {null_hypothesis}'

print(p_value_conclusion)

As our P-Value of 0.16351152223398197 is higher than our acceptance criteria of 0.05, we retain the null-hypothesis and conclude that There is NO relationship between mailer type and signup rates
