In [18]:
import pandas as pd
import numpy as np
import math
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [19]:
warnings.filterwarnings("ignore")

In [20]:
pwd = 'D:/gitProject/Modeling/breast-cancer-wisconsin.csv'

In [21]:
#读取数据无列名
df1 = pd.read_csv(pwd,
                  header='infer',
                  names=['Sample_code_number',
                         'Clump_Thickness',
                         'Uniformity_of_Cell_Size',
                         'Uniformity_of_Cell_Shape',
                         'Marginal_Adhesion',
                         'Single_Epithelial_Cell_Size',
                         'Bare_Nuclei',
                         'Bland_Chromatin',
                         'Normal_Nucleoli',
                         'Mitoses',
                         'Class'])

In [22]:
#异常值剔除
df1 = df1[df1['Bare_Nuclei'] != '?']

In [23]:
df1.head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [24]:
len(df1)

683

In [25]:
#划分训练集测试集
X_train, X_test, y_train, y_test = train_test_split(df1[['Clump_Thickness',
        'Uniformity_of_Cell_Size',
        'Uniformity_of_Cell_Shape',
        'Marginal_Adhesion',
        'Single_Epithelial_Cell_Size',
        'Bare_Nuclei',
        'Bland_Chromatin',
        'Normal_Nucleoli',
        'Mitoses']], df1['Class'], test_size=0.2, random_state=2795)

In [27]:
# 对数据进行归一化处理
standarScaler = StandardScaler()
standarScaler.fit(X_train)
X_train_std = standarScaler.transform(X_train)
X_test_std = standarScaler.transform(X_test)

In [47]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': np.arange(0, 1, 0.2), 
    'gamma': np.arange(0, 1, 0.2),
    'degree': range(1, 2, 1),
    'coef0': np.arange(0, 1, 0.2),
    'decision_function_shape': ['ovr', 'ovo']
}

In [48]:
svc = SVC()
grid_search = GridSearchCV(svc, param_grid, cv=10)
grid_search.fit(X_train_std, y_train)

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': array([0. , 0.2, 0.4, 0.6, 0.8]),
                         'coef0': array([0. , 0.2, 0.4, 0.6, 0.8]),
                         'decision_function_shape': ['ovr', 'ovo'],
                         'degree': range(1, 2),
                         'gamma': array([0. , 0.2, 0.4, 0.6, 0.8]),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [49]:
#最佳预估器
best_estimator = grid_search.best_estimator_
best_estimator

SVC(C=0.2, coef0=0.2, degree=1, gamma=0.6000000000000001, kernel='sigmoid')

In [50]:
##训练集最佳结果
best_score = grid_search.best_score_
best_score

0.9761616161616162

In [51]:
#预测
y_predict=grid_search.predict(X_test_std)
y_predict

array([2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2,
       4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 4, 2,
       2, 4, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2,
       4, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
       2, 2, 2, 4, 2, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 2, 2, 2,
       4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 4, 4, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2,
       4, 2, 2, 2, 2], dtype=int64)

In [53]:
#测试集准确率
score = best_estimator.score(X_test_std, y_test)
score

0.9343065693430657