# Day 12: Hypothesis Testing
## Author: Kush Mehta

In [2]:
# Find working directory on your local computer; yours will be different from mine
# os.getcwd() returns the directory the kernel is currently running in;
# it is stored in `path` and printed below.
import os
path = os.getcwd()
print(path)

/Users/kush/Python


In [4]:
# Import necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

In [6]:
# Import Data
# The CSV location is machine-specific, so it can be overridden without editing
# the notebook: set the BIRTH_DATA_CSV environment variable to your own copy.
# When the variable is not set, the original path is used unchanged.
DATA_PATH = os.environ.get("BIRTH_DATA_CSV", "/Users/kush/Data/BirthdataNC.csv")
birth = pd.read_csv(DATA_PATH)

In [13]:
# Data's information: column names, non-null counts and dtypes
# (useful here for spotting which columns contain missing values)
birth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fage            829 non-null    float64
 1   mage            1000 non-null   int64  
 2   mature          1000 non-null   object 
 3   weeks           998 non-null    float64
 4   premie          998 non-null    object 
 5   visits          991 non-null    float64
 6   marital         999 non-null    object 
 7   gained          973 non-null    float64
 8   weight          1000 non-null   float64
 9   lowbirthweight  1000 non-null   object 
 10  gender          1000 non-null   object 
 11  habit           999 non-null    object 
 12  whitemom        998 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 101.7+ KB


## Hypothesis Testing for Difference in Means 

In [16]:
# Build the two independent samples for the weight comparison:
# one Series of birth weights per gender, selected in a single .loc step
female = birth.loc[birth['gender'] == 'female', 'weight']
male = birth.loc[birth['gender'] == 'male', 'weight']

In [20]:
# Test for difference in means: independent two-sample t-test on weight (female vs male)
# No `alternative` is passed, so this is the default two-sided test
ttest_ind(female,male)

TtestResult(statistic=-4.211995513148301, pvalue=2.760111651635877e-05, df=998.0)

In [24]:
# Same two-sided test, reported to 4 decimal places
# Note: the p-value is about 2.76e-05, so it prints as 0.0 purely because of rounding
test_gen = ttest_ind(female, male)
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))

test-stat:  -4.212
p-value:  0.0


In [26]:
# Statistically significant at the 5% level of significance: reject the null hypothesis
# There is evidence of a difference in mean birth weight between boys and girls

In [30]:
# Lower-tailed test
# alternative='less': the alternative hypothesis is mean(female) < mean(male)
test_gen = ttest_ind(female, male, alternative='less')
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))

test-stat:  -4.212
p-value:  0.0


In [32]:
# Statistically significant at the 5% level: reject null
# There is evidence that girls weigh less than boys on average

In [34]:
# Upper-tailed test
# alternative='greater': the alternative hypothesis is mean(female) > mean(male)
test_gen = ttest_ind(female, male, alternative='greater')
print('test-stat: ', round(test_gen.statistic, 4))
print('p-value: ', round(test_gen.pvalue, 4))

test-stat:  -4.212
p-value:  1.0


In [36]:
# Not statistically significant at 5% level: fail to reject null
# There is no evidence that girls weigh more than boys on average

In [38]:
# Build the two samples for the smoking comparison:
# birth weights of babies born to smokers and to non-smokers
smoker = birth.loc[birth['habit'] == 'smoker', 'weight']
nonsmoker = birth.loc[birth['habit'] == 'nonsmoker', 'weight']

In [40]:
# Two-sided t-test for a difference in mean weight: smoker vs nonsmoker
test_habit = ttest_ind(smoker, nonsmoker)
print('test-stat: ', round(test_habit.statistic, 4))
print('p-value: ', round(test_habit.pvalue, 4))

test-stat:  -2.2034
p-value:  0.0278


In [42]:
# Statistically significant at 5% level: reject null
# There is evidence of a difference in mean weight between babies born to smokers and non-smokers

## Hypothesis Testing for Difference in Proportions

In [45]:
# Distribution of weights by gender: counts of low / not-low birth weight for each gender
# margins=True adds the 'All' row and column totals used as group sizes in the proportion tests
pd.crosstab(birth['gender'],birth['lowbirthweight'],margins=True)

lowbirthweight,low,not low,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,59,444,503
male,52,445,497
All,111,889,1000


In [47]:
# Two-sided z-test for a difference in proportions of low birth weight (female vs male)
# count = low-birth-weight babies (female, male); nobs = group totals (female, male),
# both taken from the gender x lowbirthweight crosstab
test_pro_gen = proportions_ztest(count=[59, 52], nobs=[503, 497])
z_stat, p_value = test_pro_gen
print('test_stat: ', round(z_stat, 4))
print('p-value: ', round(p_value, 4))

test_stat:  0.6376
p-value:  0.5237


In [49]:
# Not statistically significant at 5% level: fail to reject null
# No evidence of a difference in the proportion of babies with low birth weight between the female and male baby populations

In [51]:
# Lower tailed test
# alternative='smaller': the alternative hypothesis is p(low | female) < p(low | male)
# count / nobs are (female, male) values from the gender x lowbirthweight crosstab
test_pro_gen = proportions_ztest(count=[59, 52], nobs=[503, 497], alternative='smaller')
z_stat, p_value = test_pro_gen
print('test_stat: ', round(z_stat, 4))
print('p-value: ', round(p_value, 4))

test_stat:  0.6376
p-value:  0.7381


In [2]:
# Not statistically significant at 5% level: fail to reject null
# No evidence that the proportion of girls with low birth weight is less than that of boys in population

In [55]:
# Upper tailed test
# alternative='larger': the alternative hypothesis is p(low | female) > p(low | male)
# count / nobs are (female, male) values from the gender x lowbirthweight crosstab
test_pro_gen = proportions_ztest(count=[59, 52], nobs=[503, 497], alternative='larger')
z_stat, p_value = test_pro_gen
print('test_stat: ', round(z_stat, 4))
print('p-value: ', round(p_value, 4))

test_stat:  0.6376
p-value:  0.2619


In [57]:
# Not statistically significant at 5% level: fail to reject null
# No evidence that the proportion of girls with low birth weight is greater than that of boys in the population

In [61]:
# Test for difference in proportions of babies with low birth weight by mom's habit: smoker vs nonsmoker
# Step 1: tabulate low / not-low counts per habit; margins=True adds the 'All' totals.
# The grand total is 999 rather than 1000 because one row has a missing `habit` value.
pd.crosstab(birth['habit'],birth['lowbirthweight'],margins=True)

lowbirthweight,low,not low,All
habit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nonsmoker,92,781,873
smoker,18,108,126
All,110,889,999


In [63]:
# Two-sided z-test for a difference in proportions of low birth weight by mother's habit
# count = low-birth-weight babies (nonsmoker, smoker); nobs = group totals (nonsmoker, smoker),
# both taken from the habit x lowbirthweight crosstab
test_pro_habit = proportions_ztest(count=[92, 18], nobs=[873, 126])
z_stat, p_value = test_pro_habit
print('test_stat: ', round(z_stat, 4))
print('p-value: ', round(p_value, 4))

test_stat:  -1.2562
p-value:  0.2091


In [4]:
# Not statistically significant at 5% level: fail to reject null
# No evidence of a difference in the proportion of babies with low birth weight between smoking and non-smoking mothers in the population