In [61]:
import numpy as np
from scipy import stats

In [62]:
# 0/1 indicator vectors: 34 entries with the first 10 set to 1,
# and 16 entries with the first 4 set to 1.
yawn_train = np.concatenate((np.ones(10), np.zeros(24)))

yawn_test = np.concatenate((np.ones(4), np.zeros(12)))

In [63]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference between the proportions of two samples.

    Each sample is a sized sequence of numbers (the notebook passes 0/1
    indicators); its proportion is ``sum(sample) / len(sample)``.

    Returns ``(p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2))`` where ``P`` is
    the pooled proportion over both samples.
    """
    size1, size2 = len(sample1), len(sample2)

    share1 = sum(sample1) / size1
    share2 = sum(sample2) / size2

    # Pooled proportion: size-weighted average of the two sample proportions.
    pooled = (share1 * size1 + share2 * size2) / (size1 + size2)

    std_err = np.sqrt(pooled * (1 - pooled) * (1 / size1 + 1 / size2))
    return (share1 - share2) / std_err

In [64]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal law.

    Parameters
    ----------
    z_stat : float or array-like
        Value(s) of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}
        'two-sided' returns ``2 * P(Z >= |z_stat|)``,
        'less' returns ``P(Z <= z_stat)``,
        'greater' returns ``P(Z >= z_stat)``.

    Returns
    -------
    float or ndarray
        The p-value(s).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three supported strings.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    if alternative == 'two-sided':
        # The survival function equals 1 - cdf mathematically, but avoids the
        # cancellation that turns far-tail p-values into exactly 0.0.
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    # Only 'greater' is left after the validation above.
    return stats.norm.sf(z_stat)

In [65]:
# Statistic is (proportion in yawn_test) - (proportion in yawn_train);
# alternative='less' asks for the lower-tail probability of that statistic.
yawn_z_stat = proportions_diff_z_stat_ind(yawn_test, yawn_train)
p_value = proportions_diff_z_test(yawn_z_stat, alternative='less')
print(f"p-value: {p_value:.4f}")

p-value: 0.3729


In [66]:
import pandas as pd

In [67]:
# Tab-separated file, resolved relative to the notebook's working directory.
banknotes_path = 'banknotes.txt'
data = pd.read_csv(banknotes_path, delimiter='\t')
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [68]:
# Feature matrix = every column except the target; target = the 'real' column.
X = data.drop('real', axis='columns')
y = data['real']

In [69]:
from sklearn.model_selection import train_test_split

# An integer test_size is an absolute row count, so 50 rows are held out;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=50,
    random_state=1,
)

In [70]:
# Column subsets fed to the two classifiers.
# NOTE(review): 'X3' appears in both subsets — confirm the overlap is intended.
features_1 = ['X1', 'X2', 'X3']
features_2 = ['X3', 'X4', 'X5']

X_train_1, X_test_1 = X_train[features_1], X_test[features_1]
X_train_2, X_test_2 = X_train[features_2], X_test[features_2]

In [71]:
import warnings
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.filterwarnings(action='ignore')

In [72]:
from sklearn.linear_model import LogisticRegression

# Two logistic regressions with default settings, one per feature subset,
# each scored on the matching test-set columns.
clf_1 = LogisticRegression()
clf_1.fit(X_train_1, y_train)
y_pred1 = clf_1.predict(X_test_1)

clf_2 = LogisticRegression()
clf_2.fit(X_train_2, y_train)
y_pred2 = clf_2.predict(X_test_2)

In [73]:
from sklearn.metrics import accuracy_score

# Misclassification rate of each classifier = 1 - accuracy on the test set.
acc1 = accuracy_score(y_test, y_pred1)
acc2 = accuracy_score(y_test, y_pred2)
p1, p2 = 1 - acc1, 1 - acc2

p1, p2

(0.19999999999999996, 0.020000000000000018)

In [74]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of proportions of two paired samples.

    The samples are zipped element by element. ``f`` counts pairs equal to
    (1, 0), ``g`` counts pairs equal to (0, 1), and ``n`` is the number of
    pairs.

    Returns ``(f - g) / sqrt(f + g - (f - g)**2 / n)``.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Only discordant pairs contribute; concordant pairs affect n alone.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    diff = f - g
    return float(diff) / np.sqrt(f + g - float(diff**2) / n)

In [75]:
# Per-observation 0/1 indicators for each classifier on the test set.
# NOTE: despite the `err` prefix, an entry is 1 when the prediction EQUALS the
# true label and 0 when it differs; the names are kept because later cells
# reference them.
# Pairing predictions with labels via zip removes the dependency on a global
# `n`, which no visible cell defines (NameError on a fresh kernel).
y_true = y_test.values
err1 = [1 if pred == true else 0 for pred, true in zip(y_pred1, y_true)]
err2 = [1 if pred == true else 0 for pred, true in zip(y_pred2, y_true)]

In [76]:
# Two-sided p-value (the default alternative) for the paired comparison
# of the two indicator lists.
rel_z_stat = proportions_diff_z_stat_rel(err1, err2)
p_value = proportions_diff_z_test(rel_z_stat)
print(p_value)

0.0032969384555543435


In [77]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation interval for the difference of paired proportions.

    The samples are zipped element by element. ``f`` counts pairs equal to
    (1, 0), ``g`` counts pairs equal to (0, 1), and ``n`` is the number of
    pairs. The interval is centred on ``(f - g) / n`` with half-width
    ``z * sqrt((f + g) / n**2 - (f - g)**2 / n**3)``, where ``z`` is the
    ``1 - alpha / 2`` quantile of the standard normal distribution.

    Returns a ``(left_boundary, right_boundary)`` tuple.
    """
    quantile = stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    center = float(f - g) / n
    # The half-width is shared by both boundaries, so compute it once.
    margin = quantile * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (center - margin, center + margin)

In [78]:
# Interval at the function's default alpha (0.05), printed rounded to 4 decimals.
conf_int = proportions_diff_confint_rel(sample1=err1, sample2=err2)
print(f"({np.round(conf_int[0], 4)}, {np.round(conf_int[1], 4)})")

(-0.3001, -0.0599)


In [79]:
# NOTE(review): the expression has the shape of a one-sample z-statistic
# (observed 541.4 vs. reference 525, spread 100, sample size 100) — confirm
# the meaning of each constant against the problem statement.
std_error = 100 / np.sqrt(100)
z = (541.4 - 525) / std_error
density = stats.norm.cdf(z)  # cumulative probability up to z, despite the name
p_value = 1 - density        # upper-tail probability

print(f"p-value: {p_value:.4f}")

p-value: 0.0505


In [80]:
# Same calculation as the 541.4 case, with the observed value moved to 541.5.
# NOTE(review): constants read as a one-sample z-statistic (reference 525,
# spread 100, sample size 100) — confirm against the problem statement.
std_error = 100 / np.sqrt(100)
z = (541.5 - 525) / std_error
density = stats.norm.cdf(z)  # cumulative probability up to z, despite the name
p_value = 1 - density        # upper-tail probability

print(f"p-value: {p_value:.4f}")

p-value: 0.0495
