### 1.分类
1. from sklearn.model_selection import train_test_split  
2. X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

### 2.KNN算法
1. from sklearn.neighbors import KNeighborsClassifier
2. kNN_classifier = KNeighborsClassifier(n_neighbors=6)
3. kNN_classifier.fit(X_train, y_train)
    * y_predict = kNN_classifier.predict(X_test)   预测值
    * kNN_classifier.score(X_test, y_test)   准确度

### 3.Grid Search(采用了交叉验证)
1.
```
param_grid = [
    {
        'weights': ['uniform'], 
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)], 
        'p': [i for i in range(1, 6)]
    }
]
```
2. knn_clf = KNeighborsClassifier()
3. from sklearn.model_selection import GridSearchCV
4. grid_search = GridSearchCV(knn_clf, param_grid, cv=5)  ## cv是交叉验证份数，可省略
5. grid_search.fit(X_train, y_train)
    * grid_search.best_estimator_
        1. best_knn_clf = grid_search.best_estimator_
        2. best_knn_clf.score(X_test, y_test)
    * grid_search.best_score_
    * grid_search.best_params_

### 4.归一化(特征缩放)
1. from sklearn.preprocessing import StandardScaler
2. standardScalar = StandardScaler() 
3. standardScalar.fit(X_train)
    * standardScalar.mean_   (均值)
    * standardScalar.scale_   (标准差)
4. X_train_standard = standardScalar.transform(X_train)   (对X_train进行归一化)
    * X_test_standard = standardScalar.transform(X_test)   (测试集也要用训练集fit得到的均值和标准差进行归一化)
5. 以KNN举例 knn_clf.fit(X_train_standard, y_train)
6. knn_clf.score(X_test_standard, y_test)

### 5.数据集导入(以鸢尾花数据集为例)
1. from sklearn import datasets
2. iris = datasets.load_iris()
    * iris.keys()   # 输出：dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
    * print(iris.DESCR)
    * iris.feature_names
3. X = iris.data
4. y = iris.target

### 6.MSE(均方误差)和MAE(平均绝对误差)和RMSE(均方根误差)和R²，sklearn中各个关于指标的方法都放在sklearn.metrics包中
from sklearn.metrics import mean_squared_error  
from sklearn.metrics import mean_absolute_error  
from sklearn.metrics import root_mean_squared_error  
from sklearn.metrics import r2_score  
* r2_score(y_test, y_predict)

### 7.线性回归(多元，简单，梯度下降)
1. from sklearn.linear_model import LinearRegression
2. lin_reg = LinearRegression()
3. lin_reg.fit(X_train, y_train)
    * lin_reg.coef_
    * lin_reg.intercept_
    * lin_reg.score(X_test, y_test)   # R²值

### 8.SGD(随机梯度下降)
只能解决线性模型
1. from sklearn.linear_model import SGDRegressor
2. sgd_reg = SGDRegressor(max_iter=50)   # 旧版本的参数名是n_iter，新版本sklearn中已移除，改用max_iter
3. sgd_reg.fit(X_train_standard, y_train)
    * sgd_reg.score(X_test_standard, y_test)

### 9.PCA(梯度上升，特征压缩)
1. from sklearn.decomposition import PCA
2. pca = PCA(n_components=1)   # n_components表示特征维度
    * pca = PCA(0.95)  # 表示不知道要取几个维度，但是取的主成分个数能解释原数据95%的方差
3. pca.fit(X)
    * pca.components_
    * pca.n_components_  # 通过pca = PCA(0.95)得出的主成分个数
4. X_reduction = pca.transform(X)
5. X_restore = pca.inverse_transform(X_reduction)
#### 与其他数据预测算法结合(以KNN算法为例)
    1. X_train_reduction = pca.transform(X_train)
    2. X_test_reduction = pca.transform(X_test)
    3. knn_clf = KNeighborsClassifier()
    4. knn_clf.fit(X_train_reduction, y_train)
        * knn_clf.score(X_test_reduction, y_test)

### 10.多项式回归(PCA和多项式回归都是对现有X数据进行变形)
1. from sklearn.preprocessing import PolynomialFeatures
2. poly = PolynomialFeatures(degree=n)   # 为原本的数据集最多添加n次幂这样的特征,sklearn中会自动添加零次幂
3. poly.fit(X)
4. X2 = poly.transform(X)   # 把X转换成多项式X2
#### 与线性回归算法结合(多项式回归只是预处理过程(sklearn.preprocessing)，真正拟合还是得用线性回归算法)
    1. from sklearn.linear_model import LinearRegression
    2. lin_reg2 = LinearRegression()
    3. lin_reg2.fit(X2, y)
        * y_predict2 = lin_reg2.predict(X2)
        * lin_reg2.score(X2, y)

### 11.PipeLine(管道)
```
1. 多项式的特征(或者其他预处理算法)
2. 数据的归一化(或者其他预处理算法)(对于多项式回归，数据标准化是必要的，因为如果超参数degree很大的话，数据之间的差距会很大，比如1的一次方和100的100次方之间的差距)
3. 线性回归(或者其他拟合算法)
...
PipeLine将三步合在一起
```
1. from sklearn.pipeline import Pipeline
2. from sklearn.preprocessing import StandardScaler
3. from sklearn.linear_model import LinearRegression
4.  
```
poly_reg = Pipeline([
    ("poly", PolynomialFeatures(degree=2)),
    ("std_scaler", StandardScaler()),
    ("lin_reg", LinearRegression())
])
```
5. poly_reg.fit(X, y)
    * y_predict = poly_reg.predict(X)

### 12.使用交叉验证
1. from sklearn.model_selection import cross_val_score
2. knn_clf = KNeighborsClassifier()
3. cross_val_score(knn_clf, X_train, y_train, cv=5) ## cv是交叉验证份数，可省略
    * 返回k个模型，每个模型的准确度组成的数组
    * cross_val_score在旧版本中默认分成三份进行交叉验证(sklearn 0.22起默认改为五份)

### 13.岭回归
1. from sklearn.linear_model import Ridge
2. ridge = Ridge(alpha=1)
#### 使用管道
```
ridge1_reg = Pipeline([
    ("poly", PolynomialFeatures(degree=20)),
    ("std_scaler", StandardScaler()),
    ("ridge_reg", Ridge(alpha=0.0001))
])
```
3. ridge1_reg.fit(X_train, y_train)
4. y1_predict = ridge1_reg.predict(X_test)

### 14.LASSO回归
1. from sklearn.linear_model import Lasso
2. lasso = Lasso(alpha=1)
#### 使用管道
```
lasso1_reg = Pipeline([
    ("poly", PolynomialFeatures(degree=20)),
    ("std_scaler", StandardScaler()),
    ("lasso_reg", Lasso(alpha=0.1))
])
```
3. lasso1_reg.fit(X_train, y_train)
4. y1_predict = lasso1_reg.predict(X_test)

### 15.逻辑回归
1. from sklearn.linear_model import LogisticRegression
2. log_reg = LogisticRegression()
3. log_reg.fit(X_train, y_train)
    * log_reg.score(X_train, y_train)
#### 虽然逻辑回归只能解决二分类问题，但是sklearn中自动添加了支持多分类任务的功能，旧版本默认采用OVR(新版本的默认方式可能不同，请以所用版本的文档为准)

### 16.OVO and OVR

1. from sklearn.multiclass import OneVsRestClassifier
    * from sklearn.multiclass import OneVsOneClassifier
2. log_reg = LogisticRegression()
3. ovr = OneVsRestClassifier(log_reg)
    * ovo = OneVsOneClassifier(log_reg)
4. ovr.fit(X_train, y_train)
5. ovr.score(X_test, y_test)

### 17.逻辑回归中的交叉验证
1. from sklearn.linear_model import LogisticRegressionCV

2. log_reg_cv = LogisticRegressionCV()
3. log_reg_cv.fit(X_train, y_train)
    * log_reg_cv.score(X_test, y_test)
    * log_reg_cv.C_  
    array([ 0.00599484,  0.00599484,  0.04641589,  0.35938137,  0.00599484,
        0.35938137,  0.35938137,  2.7825594 ,  0.00599484,  0.04641589])

### 18.混淆矩阵(默认支持多分类问题)，精准率和召回率和F1-Score
1. from sklearn.metrics import confusion_matrix
    * from sklearn.metrics import precision_score(精准率)
    * from sklearn.metrics import recall_score(召回率)
    * from sklearn.metrics import f1_score
2. log_reg = LogisticRegression()
3. log_reg.fit(X_train, y_train)
4. y_log_predict = log_reg.predict(X_test)
    * confusion_matrix(y_test, y_log_predict)
    * precision_score(y_test, y_log_predict)
    * recall_score(y_test, y_log_predict)
    * f1_score(y_test, y_log_predict)

### 19.精准率和召回率的平衡(sklearn中逻辑回归的方法decision_function)

1. decision_function的返回值：在逻辑回归中，判断样本是属于类别0还是类别1的依据值(默认阈值为0)  
2. 分数小于0则被判断为类别0  
3. 分数大于0则被判断为类别1
```
decision_scores = log_reg.decision_function(X_test)
```
array([-22.05700117, -33.02940957, -16.21334087, -80.3791447 , -48.25125396, -24.54005629, -44.39168773,  -25.04292757, -0.97829292, -19.7174399 ])  
##### 数组里的值表示每个样本点在score数轴上对应的score值  
```
y_predict_1 = log_reg.predict(X_test)[:10]
```
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])  
4. 改变分类依据(阈值由0改为5)，现在是分数大于等于5，则判断为类别1  
5. 分数小于5，则判断为类别0
```
y_predict_2 = np.array(decision_scores >= 5, dtype='int')
```


### 20. scikit-learn中的Precision-Recall曲线
1. from sklearn.metrics import precision_recall_curve
2. precisions, recalls, thresholds = precision_recall_curve(y_test, decision_scores)
    * plt.plot(thresholds, precisions[:-1])
    * plt.plot(thresholds, recalls[:-1])
    * plt.show()

### 21. scikit-learn中的ROC，AUC
1. from sklearn.metrics import roc_curve
    * from sklearn.metrics import roc_auc_score
2. fprs, tprs, thresholds = roc_curve(y_test, decision_scores)
3. roc_auc_score(y_test, decision_scores)


### 22. scikit-learn中的SVM
#### 先进行标准化
1. from sklearn.preprocessing import StandardScaler
2. standardScaler = StandardScaler()
3. standardScaler.fit(X)
4. X_standard = standardScaler.transform(X)
#### 再进行svm分类
5. from sklearn.svm import LinearSVC
6. svc = LinearSVC(C=1e9)
7. svc.fit(X_standard, y)
#### 这时候svc训练出了一个决策边界，可以对输入数据进行分类
8. y_predict = svc.predict(standardScaler.transform(X_test))   # svc是在标准化后的数据上训练的，X_test需先用同一个standardScaler做标准化
##### 这时就可以进行一些操作，比如精准率召回率的判断
    * from sklearn.metrics import f1_score
    * f1_score(y_test, y_predict)

### 23. 使用多项式特征的SVM
1. from sklearn.preprocessing import PolynomialFeatures, StandardScaler
2. from sklearn.svm import LinearSVC
3. from sklearn.pipeline import Pipeline

```
def PolynomialSVC(degree, C=1.0):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("linearSVC", LinearSVC(C=C))
    ])
```

4. poly_svc = PolynomialSVC(degree=3)
5. poly_svc.fit(X, y)

### 25. scikit-learn中的决策树
1. from sklearn.tree import DecisionTreeClassifier
##### criterion="entropy"：决策树划分标准为信息熵方式
2. dt_clf = DecisionTreeClassifier(max_depth=2, criterion="entropy", random_state=42)  ## criterion="gini"
3. dt_clf.fit(X, y)

### 26. 决策树解决回归问题
1. from sklearn.tree import DecisionTreeRegressor
2. dt_reg = DecisionTreeRegressor()  ## 可以进行超参数的调试
3. dt_reg.fit(X_train, y_train)

### 27.集成学习Voting Classifier
1. from sklearn.ensemble import VotingClassifier

2.  
```
# voting='hard'表示'少数服从多数'这个集成学习方式
# 在具体使用时，可以先调整这些分类器的参数，把每个算法调整到最好的情况，再一起使用
# 需要先导入用到的分类器，例如 from sklearn.svm import SVC
voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC()),
    ('dt_clf', DecisionTreeClassifier(random_state=666))],
                             voting='hard')
```

3. voting_clf.fit(X_train, y_train)
    * voting_clf.score(X_test, y_test)

#### Soft Voting Classifier

```
voting_clf2 = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('svm_clf', SVC(probability=True)),   # 对于SVC算法，必须要设置probability=True才能计算概率
    ('dt_clf', DecisionTreeClassifier(random_state=666))],
                             voting='soft')
```

### 28. 使用Bagging进行集成学习
1. from sklearn.tree import DecisionTreeClassifier
2. from sklearn.ensemble import BaggingClassifier

3.  
```
# 这里的DecisionTreeClassifier()只是采用了一个决策树算法，使用决策树算法只是因为这种非参数算法更能产生出差异非常大的子模型
# 这个位置可以替代为通过Voting-Classifier聚合各种算法生成的模型
# n_estimators表示要集成多少个子模型， max_samples表示每个子模型要看多少个样本数据，bootstrap=True表示放回取样的方式
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                           n_estimators=5000, max_samples=100,
                           bootstrap=True)
```

4. bagging_clf.fit(X_train, y_train)
5. bagging_clf.score(X_test, y_test)

#### OOB

```
# oob_score=True:记录在放回取样的过程中都取了哪些样本，哪些样本没有被取到，后续才能调用oob_score_
bagging_clf = BaggingClassifier(DecisionTreeClassifier(),
                           n_estimators=5000, max_samples=100,
                           bootstrap=True, oob_score=True)
# 有了oob则不需要进行train_test_split
bagging_clf.fit(X, y)
# 测试结果
bagging_clf.oob_score_
```

#### bootstrap_features

```
# max_features：最大看几个特征,此时bootstrap_features=True
# 如果样本空间特征少的话使用随机特征采样的方式是不合适的
# 此时n_estimators=600, max_samples=500，相当于不对样本进行随机采样(前提条件是此时一共有500个样本)
# 如果n_estimators=600, max_samples=100，就表示既对样本数进行放回采样，又对特征数放回采样
random_subspaces_clf = BaggingClassifier(DecisionTreeClassifier(),
                               n_estimators=600, max_samples=500,
                               bootstrap=True, oob_score=True,
                               max_features=1, bootstrap_features=True)
random_subspaces_clf.fit(X, y)
random_subspaces_clf.oob_score_
```

### 29. sklearn中的随机森林
#### 随机森林拥有决策树和BaggingClassifier的所有参数
1. from sklearn.ensemble import RandomForestClassifier
2. rf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, oob_score=True, random_state=666, n_jobs=-1)
3. rf_clf.fit(X, y)
    * rf_clf.oob_score_  


### 30. 集成学习解决回归问题
1. from sklearn.ensemble import BaggingRegressor
2. from sklearn.ensemble import RandomForestRegressor
3. from sklearn.ensemble import ExtraTreesRegressor

### 31. AdaBoosting
1. from sklearn.tree import DecisionTreeClassifier
2. from sklearn.ensemble import AdaBoostClassifier
3. ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)
4. ada_clf.fit(X_train, y_train)
5. ada_clf.score(X_test, y_test)

### 32. Gradient Boosting
1. from sklearn.ensemble import GradientBoostingClassifier
2. gb_clf = GradientBoostingClassifier(max_depth=2, n_estimators=30)
3. gb_clf.fit(X_train, y_train)
4. gb_clf.score(X_test, y_test)

### 33. Boosting 解决回归问题
1. from sklearn.ensemble import AdaBoostRegressor
2. from sklearn.ensemble import GradientBoostingRegressor