#   Доверительные интервалы для оценки среднего

In [1]:
from sklearn import datasets, linear_model, metrics, model_selection
import numpy as np
import matplotlib.pylab as plt

In [2]:
# Synthetic binary-classification data: 300 two-dimensional points drawn
# around 2 centers. The large cluster_std makes the blobs overlap, and the
# fixed random_state keeps the sample identical between runs.
# `blobs` is the (features, labels) pair returned by make_blobs.
blobs = datasets.make_blobs(
    n_samples=300,
    centers=2,
    cluster_std=6,
    random_state=1,
)
# Optional visual check of the generated points (left disabled):
# plt.figure(figsize=(8, 8))
# plt.scatter(blobs[0][:, 0], blobs[0][:, 1], c=blobs[1], cmap='autumn', s=300)

In [3]:
# Hold out a small test set. An integer test_size is an absolute number of
# samples, so 15 of the 300 points go to the test part and the rest are used
# for training. random_state fixes the shuffle.
train_data, test_data, train_label, test_label = model_selection.train_test_split(
    blobs[0],  # feature matrix
    blobs[1],  # class labels
    test_size=15,
    random_state=1,
)

In [4]:
# Fit a ridge classifier (regularisation strength picked by built-in CV)
# on the training part; fit() returns the fitted estimator itself.
ridge_model = linear_model.RidgeClassifierCV().fit(train_data, train_label)

# NOTE(review): the AUC below is computed from hard class predictions
# (.predict), not from continuous decision scores -- confirm this is intended.
ridge_test_predictions = ridge_model.predict(test_data)
metrics.roc_auc_score(test_label, ridge_test_predictions)

0.8888888888888888

In [5]:
# Linear classifier trained with stochastic gradient descent; the seed is
# fixed so the fitted model is the same on every run.
sgd_model = linear_model.SGDClassifier(max_iter=1000, random_state=0)
sgd_model.fit(train_data, train_label)

# NOTE(review): as with the ridge model, the AUC is computed from hard class
# predictions (.predict) rather than decision scores -- confirm this is intended.
sgd_test_predictions = sgd_model.predict(test_data)
metrics.roc_auc_score(y_true=test_label, y_score=sgd_test_predictions)

0.7777777777777778

#   Оценка среднего

In [6]:
# 20-fold cross-validated ROC AUC of the SGD classifier on the full data set.
# The estimator is configured exactly like `sgd_model` above
# (random_state=0, max_iter=1000): an unseeded SGDClassifier shuffles the
# training data differently on every run, so the scores -- and every mean,
# STD and confidence interval derived from them below -- would not be
# reproducible after Restart & Run All.
sqd_auc_scores = model_selection.cross_val_score(
    linear_model.SGDClassifier(random_state=0, max_iter=1000),
    blobs[0], blobs[1],
    scoring="roc_auc",
    cv=20,
)

In [7]:
# 20-fold cross-validated ROC AUC of a plain ridge classifier on the full
# data set; yields one score per fold.
ridge_auc_scores = model_selection.cross_val_score(
    estimator=linear_model.RidgeClassifier(),
    X=blobs[0],
    y=blobs[1],
    scoring="roc_auc",
    cv=20,
)

In [10]:
# Point estimates per model: mean fold AUC and the sample standard deviation
# (ddof=1 gives the unbiased, n-1 denominator).
sgd_summary = f'SQD mean: {sqd_auc_scores.mean():.3f}, STD: {sqd_auc_scores.std(ddof=1):.3f}'
ridge_summary = f'RIDGE mean: {ridge_auc_scores.mean():.3f}, STD: {ridge_auc_scores.std(ddof=1):.3f}'
print(sgd_summary, ridge_summary, sep='\n')

SQD mean: 0.904, STD: 0.096
RIDGE mean: 0.937, STD: 0.071


#   Интервальная оценка среднего

In [18]:
from statsmodels.stats.weightstats import _zconfint_generic, _tconfint_generic

# Point estimates of the mean cross-validated AUC for each model
# (reused by the t-interval cell further down).
sqd_mean = sqd_auc_scores.mean()
ridge_mean = ridge_auc_scores.mean()

# z-interval: the variance of a single score is treated as known and equal
# to 0.25 -- presumably chosen because it is the largest variance a quantity
# bounded in [0, 1] can have (TODO confirm) -- so the standard error of the
# mean is sqrt(0.25 / n). Each model uses the length of its OWN score array;
# the original reused len(sqd_auc_scores) for the ridge interval.
sqd_z_std = np.sqrt(0.25 / len(sqd_auc_scores))
ridge_z_std = np.sqrt(0.25 / len(ridge_auc_scores))

# 95% two-sided intervals (alpha=0.05). With the conservative variance bound
# the upper limit can exceed 1 even though AUC itself cannot.
print('sqd interval: start - {0[0]}, end - {0[1]}'.format(_zconfint_generic(sqd_mean, sqd_z_std, alpha=0.05, alternative='two-sided')))
print('ridge interval: start - {0[0]}, end - {0[1]}'.format(_zconfint_generic(ridge_mean, ridge_z_std, alpha=0.05, alternative='two-sided')))

sqd interval: start - 0.6853336505701402, end - 1.123594920858431
ridge interval: start - 0.7174765077129974, end - 1.155737778001288


#   Выборочная дисперсия

In [23]:
# t-interval: the variance is no longer assumed known but estimated from the
# fold scores, so the standard error of the mean is s / sqrt(n) with the
# sample standard deviation s (ddof=1), and the interval uses Student's t
# distribution with n - 1 degrees of freedom.
sqd_auc_scores_mean_std = sqd_auc_scores.std(ddof=1) / np.sqrt(len(sqd_auc_scores))
print('sqd interval: start - {0[0]}, end - {0[1]}'.format(
    _tconfint_generic(sqd_mean, sqd_auc_scores_mean_std, len(sqd_auc_scores) - 1, 0.05, 'two-sided')))

# Same computation for the ridge scores. The original printed this line with
# the 'sqd interval' label and took the degrees of freedom from the SGD
# array; both now refer to the ridge scores.
ridge_auc_scores_mean_std = ridge_auc_scores.std(ddof=1) / np.sqrt(len(ridge_auc_scores))
print('ridge interval: start - {0[0]}, end - {0[1]}'.format(
    _tconfint_generic(ridge_mean, ridge_auc_scores_mean_std, len(ridge_auc_scores) - 1, 0.05, 'two-sided')))

sqd interval: start - 0.8595584788727291, end - 0.9493700925558421
sqd interval: start - 0.903235324375978, end - 0.9699789613383075
