In [85]:
import pandas as pd
import numpy as np

In [86]:
# Load the tab-separated water dataset and preview its first rows.
data = pd.read_csv('water.txt', sep='\t')
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [87]:
# Pearson correlation between mortality and water hardness over all towns.
# Computed column-to-column with Series.corr: DataFrame.corr would also try to
# process the string columns ('location', 'town'), which raises on pandas >= 2.0
# where numeric_only defaults to False.
p_corr = data['mortality'].corr(data['hardness'], method='pearson')
print(f"Pearson's correlation: {p_corr:.4f}")

Pearson's correlation: -0.6548


In [88]:
# Spearman rank correlation between mortality and water hardness over all towns.
# Computed column-to-column with Series.corr: DataFrame.corr would also try to
# process the string columns ('location', 'town'), which raises on pandas >= 2.0
# where numeric_only defaults to False.
s_corr = data['mortality'].corr(data['hardness'], method='spearman')
print(f"Spearman's correlation: {s_corr:.4f}")

Spearman's correlation: -0.6317


In [89]:
# Keep only the towns whose location is 'South' and preview them.
data_south = data.loc[data['location'].eq('South')]
data_south.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
2,South,Birmingham,1466,5
7,South,Bournemouth,1299,78
9,South,Brighton,1359,84
10,South,Bristol,1392,73


In [90]:
# Keep only the towns whose location is 'North' and preview them.
data_north = data.loc[data['location'].eq('North')]
data_north.head()

Unnamed: 0,location,town,mortality,hardness
1,North,Birkenhead,1668,17
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18
5,North,Bolton,1558,10
6,North,Bootle,1807,15


In [91]:
# Pearson correlation between mortality and water hardness within each region.
# Series.corr is used instead of DataFrame.corr so the string columns
# ('location', 'town') are never passed to the correlation routine, which
# raises on pandas >= 2.0 where numeric_only defaults to False.
corr_north = data_north['mortality'].corr(data_north['hardness'])
corr_south = data_south['mortality'].corr(data_south['hardness'])

# Report whichever regional correlation is smaller in absolute value.
print(f"Correlation: {corr_north if abs(corr_north) < abs(corr_south) else corr_south:.4f}")

Correlation: -0.3686


In [92]:
# Cells of a 2x2 table; the trailing letters name the mcc() argument each
# count is later passed as.
# NOTE(review): presumably the m_/w_ prefixes split respondents by sex and
# freq/seldom by how often something occurs - confirm against the task text.
m_freq, m_seldom = 239, 515  # a, b
w_freq, w_seldom = 203, 718  # c, d

In [93]:
def mcc(a, b, c, d):
    """Matthews correlation coefficient of the 2x2 table [[a, b], [c, d]].

    Computed as (a*d - b*c) / sqrt((a+b) * (a+c) * (b+d) * (c+d)).

    Parameters
    ----------
    a, b : number
        Counts in the first row of the table.
    c, d : number
        Counts in the second row of the table.

    Returns
    -------
    float
        The coefficient. For scalar inputs where a row or column total is
        zero, the formula is 0/0; 0.0 is returned in that case instead of
        nan (the usual convention for a degenerate table).
    """
    numerator = a * d - b * c
    denominator = np.sqrt((a + b) * (a + c) * (b + d) * (c + d))

    # Only the scalar case is special-cased; array inputs keep the plain
    # element-wise division.
    if np.ndim(denominator) == 0 and denominator == 0:
        return 0.0

    return numerator / denominator
    
# Report the coefficient for the table built from the four counts.
print(f"MCC: {mcc(a=m_freq, b=m_seldom, c=w_freq, d=w_seldom):.3f}")

MCC: 0.109


In [94]:
from scipy import stats

# 2x2 contingency table: one row per group (m_*, then w_*), columns freq / seldom.
table1 = np.array([
    [m_freq, m_seldom],
    [w_freq, w_seldom],
])

# Chi-square test of independence; only the p-value (second result) is kept.
# Note: scipy applies a continuity correction by default on 2x2 tables.
p_1 = stats.chi2_contingency(table1)[1]
print(f"p-value: {p_1}")

p-value: 1.0558987006638725e-05


In [95]:
# Expand the counts into samples of indicators: one 1.0 per "freq" case
# followed by one 0.0 per "seldom" case.
fem = np.concatenate((np.ones(w_freq), np.zeros(w_seldom)))
mal = np.concatenate((np.ones(m_freq), np.zeros(m_seldom)))

In [96]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 of two independent samples.

    Parameters
    ----------
    sample1, sample2 : sequence of numbers
        Observations of each sample; the proportion of a sample is taken as
        sum(sample) / len(sample), so 0/1 indicators are expected.
    alpha : float, optional
        Significance level; the interval uses the 1 - alpha / 2 quantile of
        the standard normal distribution.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval for p1 - p2.
    """
    # Use the `stats` module bound by `from scipy import stats`; the bare name
    # `scipy` is not imported in this notebook, so `scipy.stats` would raise
    # NameError on a fresh kernel.
    z = stats.norm.ppf(1 - alpha / 2.)

    n1, n2 = len(sample1), len(sample2)
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2

    # Half-width of the interval, computed once and applied to both sides.
    diff = p1 - p2
    margin = z * np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)

    return (diff - margin, diff + margin)

In [97]:
# Interval for the difference of proportions (mal minus fem) at the default
# alpha of 0.05; only its left end is reported.
interval = proportions_diff_confint_ind(mal, fem, alpha=0.05)
lower_boarder = interval[0]
print(f"Lower boarder: {lower_boarder:.4f}")

Lower boarder: 0.0539


In [98]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    Each sample's proportion is sum(sample) / len(sample). The difference of
    the two proportions is divided by a standard error built from the pooled
    proportion of both samples combined.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = float(sum(sample1)) / size1
    share2 = float(sum(sample2)) / size2

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = float(share1 * size1 + share2 * size2) / (size1 + size2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / std_err

In [99]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """p-value of a z statistic under the standard normal distribution.

    Parameters
    ----------
    z_stat : float
        Value of the z statistic.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Which tail(s) of the distribution the p-value covers.

    Returns
    -------
    float
        The p-value for the chosen alternative.

    Raises
    ------
    ValueError
        If `alternative` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # `stats` is the module bound by `from scipy import stats`; the bare name
    # `scipy` is not imported in this notebook. Upper tails use the survival
    # function rather than `1 - cdf`, which rounds to 0.0 for large |z|.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' remains after the validation above.
    return stats.norm.sf(z_stat)

In [100]:
# Two-sided p-value for the difference of proportions between mal and fem.
proportions_diff_z_test(
    proportions_diff_z_stat_ind(mal, fem),
    alternative='two-sided',
)

8.153453089576601e-06

In [101]:
# 3x3 table of observed counts.
# NOTE(review): the meaning of the rows and columns is not stated in this
# notebook - document it next to the table.
table2 = np.array([
    [197, 111, 33],
    [382, 685, 331],
    [110, 342, 333],
])

# Chi-square test of independence; keep the statistic and the p-value
# (the first two results).
chi2, p_2 = stats.chi2_contingency(table2)[:2]

In [102]:
# Chi-square statistic for table2, shown to four decimals.
print("Chi2 statistic: {:.4f}".format(chi2))

Chi2 statistic: 293.6831


In [103]:
# p-value of the chi-square test on table2, at full precision.
print("p-value: {}".format(p_2))

p-value: 2.4964299580093467e-62
