## 3 - Z-критерий для разности долей (независимые выборки)

In [7]:
import numpy as np

def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    Each argument is a sized iterable of numeric outcomes; its sum is used as
    the number of successes (the notebook passes lists of 0/1 values).

    Returns (p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2)), where p1 and p2 are
    the per-sample success shares and P is the share pooled over both samples.
    """
    size_a, size_b = len(sample1), len(sample2)

    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b
    # Pooled share: the size-weighted mean of the two per-sample shares.
    pooled = float(share_a * size_a + share_b * size_b) / (size_a + size_b)

    std_err = np.sqrt(pooled * (1 - pooled) * (1. / size_a + 1. / size_b))
    return (share_a - share_b) / std_err

In [8]:
import scipy
import scipy.stats

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z statistic into a p-value under the standard normal distribution.

    Parameters:
        z_stat: the z statistic (a scalar, or anything ``scipy.stats.norm``
            accepts).
        alternative: 'two-sided' gives P(|Z| >= |z_stat|), 'less' gives
            P(Z <= z_stat), 'greater' gives P(Z >= z_stat).

    Raises:
        ValueError: if ``alternative`` is not one of the three values above.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    std_normal = scipy.stats.norm

    if alternative == 'less':
        return std_normal.cdf(z_stat)

    # The survival function sf(x) equals 1 - cdf(x) but stays accurate in the
    # far upper tail, where cdf(x) rounds to 1.0 and the subtraction gives 0.
    if alternative == 'two-sided':
        return 2 * std_normal.sf(np.abs(z_stat))

    # Only 'greater' is left after the validation above.
    return std_normal.sf(z_stat)

In [9]:
# Binary outcome vectors for the two groups compared below:
# `yawn` has 34 observations with 10 ones, `normal` has 16 observations with 4 ones.
# Presumably 1 marks a subject who yawned - confirm against the task statement.
yawn = 10 * [1] + 24 * [0]
normal = 4 * [1] + 12 * [0]

In [10]:
from statsmodels.stats.weightstats import *

# One-sided test: the alternative is that the share of ones in `yawn`
# is greater than the share of ones in `normal`.
Z = proportions_diff_z_stat_ind(yawn, normal)
yawn_p_value = proportions_diff_z_test(Z, 'greater')
# Parenthesised so the line is valid on both Python 2 and Python 3.
print("p-value = %.4f" % yawn_p_value)  # 0.3729

p-value = 0.3729


## 4 - Z-критерий для разности долей (связанные выборки)

In [11]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of proportions of two related (paired) samples.

    The inputs are paired element by element (pairing stops at the shorter
    input). Only discordant pairs contribute:
        f - number of pairs with sample1 == 1 and sample2 == 0,
        g - number of pairs with sample1 == 0 and sample2 == 1.

    Returns (f - g) / sqrt(f + g - (f - g)**2 / n), where n is the number of
    pairs. The denominator is zero when there are no discordant pairs, so the
    statistic is undefined in that case.
    """
    # Materialise the pairs: on Python 3 ``zip`` returns a one-shot iterator
    # that has no len() and would be exhausted after the first pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n)

In [12]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

banknotes = pd.read_csv("banknotes.txt", sep="\t")
# `real` is the target column; every other column is a feature.
# An integer test_size asks for that many test rows (50), not a fraction.
data_train, data_test, labels_train, labels_test = train_test_split(
    banknotes.drop("real", axis=1), banknotes.real, test_size=50, random_state=1)

# Train two classifiers on different feature subsets, then compare their
# errors on the same test rows (paired samples).
# NOTE(review): `[[0, 1, 2]]` / `[[3, 4, 5]]` select columns by integer key.
# If the columns have string names, newer pandas raises KeyError here and
# `.iloc[:, [0, 1, 2]]` would be needed - confirm against the data file.
# NOTE(review): `sklearn.cross_validation` (imported above) was replaced by
# `sklearn.model_selection` in later scikit-learn releases.
log_reg_1 = LogisticRegression()
log_reg_1.fit(data_train[[0, 1, 2]], labels_train)
log_reg_2 = LogisticRegression()
log_reg_2.fit(data_train[[3, 4, 5]], labels_train)

# Absolute difference between predicted and true label per test row.
# Assuming `real` is coded 0/1, this is 1 exactly on misclassified rows
# (so these are error indicators, despite the "score" names) - TODO confirm.
first_score = abs(log_reg_1.predict(data_test[[0, 1, 2]]) - labels_test)
second_score = abs(log_reg_2.predict(data_test[[3, 4, 5]]) - labels_test)

Z = proportions_diff_z_stat_rel(first_score, second_score)
# Parenthesised so the line is valid on both Python 2 and Python 3.
print("p-value: %f" % proportions_diff_z_test(Z, "two-sided"))  # 3

p-value: 0.003297


In [13]:
# 5
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for the difference of proportions of two related samples.

    The inputs are paired element by element (pairing stops at the shorter
    input). With n pairs, f pairs of the form (1, 0) and g pairs of the form
    (0, 1), the interval is

        (f - g) / n  +/-  z * sqrt((f + g) / n**2 - (f - g)**2 / n**3),

    where z is the standard normal quantile of level 1 - alpha / 2.

    Returns:
        A ``(left_boundary, right_boundary)`` tuple.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    # Materialise the pairs: on Python 3 ``zip`` returns a one-shot iterator
    # that has no len() and would be exhausted after the first pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    # The interval is symmetric, so compute the centre and half-width once.
    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (center - half_width, center + half_width)

In [14]:
# The function returns a (left, right) tuple, which fills both %.4f placeholders.
# Parenthesised so the statement is valid on both Python 2 and Python 3.
print("95%% confidence interval for a difference between proportions: [%.4f, %.4f]"
      % proportions_diff_confint_rel(first_score, second_score))  # 0.0599

95% confidence interval for a difference between proportions: [0.0599, 0.3001]


In [15]:
# 6
# z statistic of the form (observed - hypothesised) / (sigma / sqrt(n)).
# Presumably 541.4 is a sample mean, 525 the mean under the null, and
# sigma = n = 100 - confirm against the task statement.
Z = (541.4 - 525) / (100 / np.sqrt(100))
# Upper-tail p-value; parenthesised print is valid on Python 2 and Python 3.
print("%.4f" % proportions_diff_z_test(Z, "greater"))

0.0505


In [10]:
# 7
# z statistic of the form (observed - hypothesised) / (sigma / sqrt(n)).
# Presumably 541.5 is a sample mean, 525 the mean under the null, and
# sigma = n = 100 - confirm against the task statement.
Z = (541.5 - 525) / (100 / np.sqrt(100))
# Upper-tail p-value; parenthesised print is valid on Python 2 and Python 3.
print("%.4f" % proportions_diff_z_test(Z, "greater"))

0.0495
