In [1]:
import numpy as np
import pandas as pd
import scipy.stats.distributions as dist

# DATASET
data = pd.read_csv('train.csv')
data['Survived'] = data['Survived'].map({1:'Survived',0:'Not Survived'}) #Encode the values
data = data.dropna() #Drop the nan values
contingency_table = pd.crosstab(data.Survived,data.Sex) #Contingency Table
print("contingency_table")
contingency_table

contingency_table


Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
Not Survived,6,54
Survived,82,41


In [2]:
print("Crosstab data")
pd.crosstab(data.Survived,data.Sex).apply(lambda r:r/r.sum(),axis=0)

Crosstab data


Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
Not Survived,0.068182,0.568421
Survived,0.931818,0.431579


In [3]:
total_proportion_survived = (data.Survived == "Survived").mean()
num_female = data[data.Sex=="female"].shape[0]
num_male = data[data.Sex=="male"].shape[0]
assert num_female*total_proportion_survived>10, "Assumptions not met"
assert num_male*total_proportion_survived>10, "Assumptions not met"
assert num_female*(1-total_proportion_survived)>10, "Assumptions not met"
assert num_male*(1-total_proportion_survived)>10, "Assumptions not met"

#This table helps us to calculate the SE.
prop = data.groupby("Sex")["Survived"].agg([lambda z: np.mean(z=="Survived"), "size"])
prop.columns = ['proportions_survived','total_counts']

print("Prop Data")
prop.head()

Prop Data


Unnamed: 0_level_0,proportions_survived,total_counts
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.931818,88
male,0.431579,95


In [4]:
#Calculating standard error
 
variance = total_proportion_survived * (1 - total_proportion_survived)
standard_error = np.sqrt(variance * (1 / prop.total_counts.female + 1 / prop.total_counts.male))
print("Sample Standard Error",standard_error)

Sample Standard Error 0.06945435736849903


In [5]:
# Calculate the test statistic 
best_estimate = (prop.proportions_survived.female - prop.proportions_survived.male)
print("The best estimate is",best_estimate)
hypothesized_estimate = 0
test_stat = (best_estimate-hypothesized_estimate) / standard_error
print("Computed Test Statistic is",test_stat)

The best estimate is 0.5002392344497607
Computed Test Statistic is 7.202416859113346


In [10]:
# Calculate the  p-value
pvalue = 2*dist.norm.cdf(-np.abs(test_stat)) # Multiplied by two indicates a two tailed testing.
print("Computed P-value is", pvalue)


# We can clearly see that p-value is way lesser than the significance level of 0.10. 
# So we can safely reject the null hypothesis in favour of the alternative hypothesis. 
# We infer that the difference in proportions between that of survived females to that of survived males is significant and certainly not equal 
# to zero(females might had had better access to lifeboats).

Computed P-value is 5.915451296155379e-13
