In [9]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including any convergence or deprecation warnings the cells
# below would otherwise emit. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [10]:
# Load the iris dataset directly as a (features, labels) pair instead of a
# Bunch object; X and y are reused by every cell below.
X, y = datasets.load_iris(return_X_y=True)

**********

In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 30% of the samples as a test set; random_state=0 makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [16]:
# Fit a Linear Discriminant Analysis classifier on the training split.
lda = LinearDiscriminantAnalysis()

# NOTE(review): scikit-learn's fit() returns the estimator itself, so `LDA`
# is presumably just a second name for the same fitted object as `lda`.
LDA = lda.fit(X_train,y_train)
# Score of the fitted model on the held-out test split (cell output).
LDA.score(X_test,y_test)

0.9777777777777777

In [19]:
# Predicted class labels for the held-out test samples (cell output).
LDA.predict(X_test)

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [21]:
# cross_val_score must be imported in this cell: the notebook's dedicated
# import of it sits in a later cell, so on a fresh kernel run top-to-bottom
# this cell would otherwise fail with NameError.
from sklearn.model_selection import cross_val_score

# Fresh, unfitted estimator: cross_val_score clones and fits it per split.
lda_ = LinearDiscriminantAnalysis()

# cv is not passed, so the library's default splitting strategy is used.
scores = cross_val_score(lda_, X, y)
# Mean score across the cross-validation splits (cell output).
scores.mean()

0.9800000000000001

**********

###  计算交叉验证指标

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
lr = LogisticRegression()
# With an integer cv, cross_val_score uses StratifiedKFold when the estimator
# is a classifier and y is binary/multiclass (as here); plain KFold is only
# the default for other estimators.
scores = cross_val_score(lr, X, y, cv=5)  # cv is the number of folds
# One score per fold (cell output).
scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [19]:
# Mean of the per-fold scores, with +/- two standard deviations reported as
# an approximate 95% confidence interval of the score estimate.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.97 (+/- 0.05)


#### 传入一个交叉验证迭代器(cross validation iterator)来使用其他交叉验证策略

In [20]:
from sklearn.model_selection import ShuffleSplit

In [46]:
# NOTE(review): n_samples is not read by any cell visible here; kept in case
# a later cell depends on it.
n_samples = X.shape[0]
# Cross-validation iterator: 5 independent random splits, each holding out
# 30% of the samples; random_state=0 makes the splits reproducible.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
# Passing the iterator as cv replaces the default splitting strategy.
cross_val_score(lr, X, y, cv=cv)

array([0.97777778, 0.91111111, 0.95555556, 0.93333333, 0.95555556])

### 测试集的数据转换

In [26]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 40% of the samples for testing; random_state=0 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.4)

# Learn the scaling statistics from the training split only and apply them
# to that same split in one step.
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)

# Train the classifier on the standardized training features.
lr = LogisticRegression().fit(X_train_transformed, y_train)

# Re-use the training-set statistics on the test split (the scaler is not
# re-fitted), then score the classifier on the result (cell output).
X_test_transformed = scaler.transform(X_test)
lr.score(X_test_transformed, y_test)

0.9333333333333333

### 管道(Pipeline)使组合估计器变得更加容易，并在交叉验证中自动实现上述做法（仅在训练数据上拟合预处理，再将其应用于测试数据）：

In [28]:
from sklearn.pipeline import make_pipeline

In [47]:
# Chain standardization and logistic regression into a single estimator.
# Note that this rebinds `lr` from a bare LogisticRegression to a Pipeline.
lr = make_pipeline(StandardScaler(), LogisticRegression())
# Cross-validate the whole pipeline using the `cv` iterator defined in an
# earlier cell (cell output: one score per split).
cross_val_score(lr, X, y, cv=cv)

array([0.97777778, 0.91111111, 0.95555556, 0.93333333, 0.95555556])

## 交叉验证函数与多指标评价

In [48]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

In [59]:
# Evaluate two metrics per split: macro-averaged precision and recall.
scoring = ['precision_macro', 'recall_macro']
lr = LogisticRegression()
# return_estimator=True keeps the estimator fitted on each training split.
# return_train_score=True also returns the scores evaluated on the training
# splits, at the cost of extra computation time.
scores = cross_validate(lr, X, y, scoring=scoring,
                        return_estimator=True,
                        return_train_score=True)

In [60]:
# Display the dict returned by cross_validate (timings, fitted estimators,
# and per-split test/train values for each requested metric).
scores

{'fit_time': array([0.01994562, 0.0209415 , 0.02095532, 0.01698995, 0.01894307]),
 'score_time': array([0.00199461, 0.00102544, 0.00099897, 0.00195312, 0.00099754]),
 'estimator': (LogisticRegression(),
  LogisticRegression(),
  LogisticRegression(),
  LogisticRegression(),
  LogisticRegression()),
 'test_precision_macro': array([0.96969697, 1.        , 0.94444444, 0.96969697, 1.        ]),
 'train_precision_macro': array([0.96741855, 0.96741855, 0.98333333, 0.98412698, 0.97519283]),
 'test_recall_macro': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_recall_macro': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ])}

## 通过交叉验证获取预测

In [61]:
from sklearn.model_selection import cross_val_predict