In [13]:
import numpy as np
import pandas as pd
import random

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint
import statsmodels

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Зевота

In [32]:
# Binary outcome vectors for the two groups (1 = positive outcome, 0 = not):
# 10 of 34 in the first group, 4 of 16 in the second.
zeval = [1] * 10 + [0] * 24
ne_zeval = [1] * 4 + [0] * 12

# Observed difference between the two sample proportions (cell output).
sum(zeval) / len(zeval) - sum(ne_zeval) / len(ne_zeval)

0.04411764705882354

In [3]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values
        Each proportion is taken as ``sum(sample) / len(sample)``.
    alpha : float
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    n1, n2 = len(sample1), len(sample2)
    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2

    # Unpooled standard error of the difference, scaled by the normal quantile.
    half_width = z * np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
    diff = p1 - p2

    return (diff - half_width, diff + half_width)

def proportions_diff_z_stat_ind(sample1, sample2):
    """Z-statistic for the difference of two proportions (independent samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values
        Each proportion is taken as ``sum(sample) / len(sample)``.

    Returns
    -------
    float
        ``(p1 - p2)`` divided by the standard error built from the pooled
        proportion of both samples.
    """
    n1, n2 = len(sample1), len(sample2)

    p1 = float(sum(sample1)) / n1
    p2 = float(sum(sample2)) / n2

    # Pooled proportion over both samples.
    pooled = float(p1 * n1 + p2 * n2) / (n1 + n2)
    std_err = np.sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))

    return (p1 - p2) / std_err

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal law.

    Parameters
    ----------
    z_stat : float
        Observed value of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}
        Form of the alternative hypothesis.

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three supported names.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    std_normal = scipy.stats.norm

    # The survival function is used instead of ``1 - cdf``: for large |z| the
    # cdf rounds to exactly 1.0 and the subtraction would yield a p-value of 0.
    if alternative == 'two-sided':
        return 2 * std_normal.sf(np.abs(z_stat))

    if alternative == 'less':
        return std_normal.cdf(z_stat)

    # alternative == 'greater'
    return std_normal.sf(z_stat)

In [33]:
# One-sided test: the alternative is that the first sample's proportion is greater.
print(
    "p-value: %f"
    % proportions_diff_z_test(
        proportions_diff_z_stat_ind(sample1=zeval, sample2=ne_zeval),
        alternative='greater',
    )
)

p-value: 0.372930


## Банкноты

In [16]:
# Load the tab-separated banknote table from the working directory.
banknotes = pd.read_csv("banknotes.txt", sep='\t')
# Quick sanity check of the loaded shape, then preview the first rows
# (the trailing expression is what the cell displays).
print(banknotes.shape)
banknotes.head()

(200, 7)


Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [20]:
# Feature matrix: the six X1..X6 columns; target: the `real` column.
banknotes_X = banknotes[
    ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
]
banknotes_y = banknotes['real']

# Hold out 50 rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    banknotes_X,
    banknotes_y,
    test_size=50,
    random_state=1,
)

In [51]:
# Two classifiers trained on disjoint feature subsets of the same training rows:
# model 1 sees only X1-X3, model 2 sees only X4-X6.
log_reg1 = LogisticRegression()
log_reg1.fit(X_train[['X1', 'X2', 'X3']], y_train)

log_reg2 = LogisticRegression()
log_reg2.fit(X_train[['X4', 'X5', 'X6']], y_train)

# Predictions of each model on the shared held-out set.
predict1 = log_reg1.predict(X_test[['X1', 'X2', 'X3']])
predict2 = log_reg2.predict(X_test[['X4', 'X5', 'X6']])

acc1 = accuracy_score(y_test, predict1)
acc2 = accuracy_score(y_test, predict2)
# Accuracy gain of the second model over the first.
acc = acc2 - acc1

# Element-wise correctness indicators (True where the prediction matches the
# label); both vectors are aligned on the same test observations.
answers1 = (y_test.values == predict1)
answers2 = (y_test.values == predict2)



In [50]:
# NOTE(review): answers1 and answers2 are both computed against the same y_test
# rows, so the two samples are paired; an independent-samples t-test ignores
# that pairing. Consider a related-samples test for proportions instead.
scipy.stats.ttest_ind(a=answers1, b=answers2)

Ttest_indResult(statistic=-2.9731538225529537, pvalue=0.003709780594742533)

In [55]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (paired samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 (or boolean) values
        Observations are paired position by position via ``zip``.
    alpha : float
        Significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Discordant pairs: f counts (1, 0) pairs, g counts (0, 1) pairs.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)

    return (center - half_width, center + half_width)

def proportions_diff_z_stat_rel(sample1, sample2):
    """Z-statistic for the difference of two proportions (paired samples).

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 (or boolean) values
        Observations are paired position by position via ``zip``.

    Returns
    -------
    float
        ``(f - g) / sqrt(f + g - (f - g)**2 / n)`` where ``f`` and ``g`` are
        the counts of (1, 0) and (0, 1) pairs and ``n`` is the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Only discordant pairs contribute to the statistic.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    denominator = np.sqrt(f + g - float((f - g)**2) / n)
    return float(f - g) / denominator

In [65]:
# Interval for (proportion correct of model 2) - (proportion correct of model 1),
# treating the two correctness vectors as paired samples.
print(
    "95%% confidence interval for a difference between proportions: [%f, %f]"
    % proportions_diff_confint_rel(sample1=answers2, sample2=answers1)
)

95% confidence interval for a difference between proportions: [0.059945, 0.300055]


## Тест

In [29]:
# Seed the stdlib generator so the simulated sample is reproducible.
random.seed(1)

# 100 draws from a normal distribution with mean 525 and standard deviation 100.
# Built with a comprehension rather than preallocating and filling by index;
# the draws happen in the same order, and no loop index leaks into the
# notebook namespace.
normal = [random.normalvariate(525, 100) for _ in range(100)]

In [30]:
# Constant comparison sample: 100 copies of the candidate mean 541.4.
# NOTE(review): this sample has zero variance, so the Welch test below appears
# to act like a one-sample t-test of `normal` against 541.4 - confirm intent.
test = [541.4] * 100
scipy.stats.ttest_ind(a=normal, b=test, equal_var=False)

Ttest_indResult(statistic=-2.649442821702495, pvalue=0.009385859122476808)

In [31]:
# Same comparison as for 541.4, repeated with the candidate mean 541.5.
# NOTE(review): this sample has zero variance, so the Welch test below appears
# to act like a one-sample t-test of `normal` against 541.5 - confirm intent.
test = [541.5] * 100
scipy.stats.ttest_ind(a=normal, b=test, equal_var=False)

Ttest_indResult(statistic=-2.6586298635876116, pvalue=0.009150624782116533)

In [35]:
def new_proportions_diff_z_stat_ind(n1, n2, mu1, mu2, sigma1, sigma2):
    """Z-statistic for the difference of two means from summary statistics.

    Despite the ``proportions`` in its name, the formula works on means and
    standard deviations rather than on 0/1 samples.

    Parameters
    ----------
    n1, n2 : sample sizes
    mu1, mu2 : means of the two samples
    sigma1, sigma2 : standard deviations of the two samples

    Returns
    -------
    float
        ``(mu1 - mu2) / sqrt(sigma1**2 / n1 + sigma2**2 / n2)``.
    """
    variance_of_diff = sigma1 * sigma1 / n1 + sigma2 * sigma2 / n2
    return (mu1 - mu2) / np.sqrt(variance_of_diff)

def new_proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z-statistic into a p-value under the standard normal law.

    Parameters
    ----------
    z_stat : float
        Observed value of the z-statistic.
    alternative : {'two-sided', 'less', 'greater'}
        Form of the alternative hypothesis.

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three supported names.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    std_normal = scipy.stats.norm

    # The survival function is used instead of ``1 - cdf``: for large |z| the
    # cdf rounds to exactly 1.0 and the subtraction would yield a p-value of 0.
    if alternative == 'two-sided':
        return 2 * std_normal.sf(np.abs(z_stat))

    if alternative == 'less':
        return std_normal.cdf(z_stat)

    # alternative == 'greater'
    return std_normal.sf(z_stat)
    
# One-sided ('less') p-value for mean 525 vs 541.4, both with sd 100,
# using sample sizes 200000 and 100.
new_proportions_diff_z_test(
    new_proportions_diff_z_stat_ind(
        n1=200000, n2=100, mu1=525, mu2=541.4, sigma1=100, sigma2=100
    ),
    alternative='less',
)

0.0505452058673069

In [36]:
# One-sided ('less') p-value for mean 525 vs 541.5, both with sd 100,
# using sample sizes 200000 and 100.
new_proportions_diff_z_test(
    new_proportions_diff_z_stat_ind(
        n1=200000, n2=100, mu1=525, mu2=541.5, sigma1=100, sigma2=100
    ),
    alternative='less',
)

0.049513650849649773