In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import cv2
import os
import re
from sklearn.model_selection import cross_val_score  #交叉验证
from sklearn.svm import SVC  #支持向量机
from sklearn.metrics import accuracy_score  #混淆矩阵准确度
from sklearn.metrics import classification_report  #混淆矩阵分类性能评估报告
from sklearn.metrics import confusion_matrix    #混淆矩阵数组

## 说明：SVM性别分类程序
本程序为SVM分类器进行性别分类任务，使用dlib库128维特征向量数据集

### 导入数据集

In [43]:
trainDistance = np.load("data/trainDistance.npy")
testDistance = np.load("data/testDistance.npy")
DS_sex_map = np.load("data/DS_sex_map.npy")
DR_sex_map = np.load("data/DR_sex_map.npy")
DS_sex = np.load("data/DS_sex.npy",allow_pickle=True)
DR_sex = np.load("data/DR_sex.npy",allow_pickle=True)
print("data/DR_sex_map: ", DR_sex_map)
print("data/DS_sex_map: ", DS_sex_map)
print("data/DR_sex: ", DR_sex)
print("data/DS_sex: ", DS_sex)

data/DR_sex_map:  [1 1 1 ... 0 0 0]
data/DS_sex_map:  [0 0 1 ... 0 0 0]
data/DR_sex:  ['male' 'male' 'male' ... 'female' 'female' 'female']
data/DS_sex:  ['female' 'female' 'male' ... 'female' 'female' 'female']


### 1.使用线性核函数

In [6]:
#采用sklearn.svm.SVC
clf_SVM1 = SVC(kernel='linear',decision_function_shape='ovo')
clf_SVM1.fit(trainDistance, DR_sex_map)

SVC(decision_function_shape='ovo', kernel='linear')

In [7]:
DS_sex_pre = clf_SVM1.predict(testDistance)
accuracy = accuracy_score(DS_sex_map, DS_sex_pre)
print(accuracy)

0.9218436873747495


网格搜索法，调整惩罚因子C

In [8]:
from sklearn.model_selection import GridSearchCV

# Set the parameters by cross-validation
linear_para = [{'kernel': ['linear'], 'C': [0.01, 0.1, 1, 10, 100, 1000]}]

linear_scores = ['accuracy']

for score in linear_scores:
    print("线性SVM超参调节的评估标准为 %s" % score)
    print()

    linearclf = GridSearchCV(
        SVC(), linear_para, scoring='%s' % score
    )
    linearclf.fit(trainDistance, DR_sex)

    print("最佳参数为:", linearclf.best_params_)
    
    print("在搜索范围内，对应参数的分类准确度如下：")
    means = linearclf.cv_results_['mean_test_score']
    stds = linearclf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, linearclf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    linearclf_age_pred = linearclf.predict(testDistance)
    print('对最佳模型做分类结果报告：', '\n', classification_report(DS_sex, linearclf_age_pred))

线性SVM超参调节的评估标准为 accuracy

最佳参数为: {'C': 1, 'kernel': 'linear'}
在搜索范围内，对应参数的分类准确度如下：
0.575 (+/-0.002) for {'C': 0.01, 'kernel': 'linear'}
0.839 (+/-0.281) for {'C': 0.1, 'kernel': 'linear'}
0.853 (+/-0.260) for {'C': 1, 'kernel': 'linear'}
0.842 (+/-0.269) for {'C': 10, 'kernel': 'linear'}
0.840 (+/-0.271) for {'C': 100, 'kernel': 'linear'}
0.840 (+/-0.260) for {'C': 1000, 'kernel': 'linear'}

对最佳模型做分类结果报告： 
               precision    recall  f1-score   support

      female       0.93      0.85      0.89       719
        male       0.92      0.96      0.94      1277

    accuracy                           0.92      1996
   macro avg       0.92      0.91      0.91      1996
weighted avg       0.92      0.92      0.92      1996



对最优参数进行五折交叉验证

In [41]:
from sklearn.model_selection import cross_val_score
X = np.vstack((trainDistance,testDistance))
y = np.hstack((DR_sex, DS_sex))
Linear_best = SVC(kernel='linear', C=1)
scores_best_Linear = cross_val_score(Linear_best, X, y, cv=5)
print('scores = ',scores_best_Linear)
print("Accuracy = %0.2f (+/- %0.2f)" % (scores_best_Linear.mean(), scores_best_Linear.std() * 2))

scores =  [0.75469337 0.93609023 0.9235589  0.9160401  0.90726817]
Accuracy = 0.89 (+/- 0.13)


In [6]:
Linear_sex_pre = linearclf.predict(testDistance)
Linear_sex_accuracy = accuracy_score(DS_sex, Linear_sex_pre)
print('Linear_sex_accuracy = ', Linear_sex_accuracy)

Linear_sex_accuracy =  0.9218436873747495


### 2.使用非线性核函数

非线性核函数有高斯径向基核函数rbf，多项式核函数poly，双曲正切核函数sigmoid

#### （1）不调参

In [17]:
from sklearn.svm import SVC

clf_SVM2 = SVC(kernel='poly')
clf_SVM2.fit(trainDistance, DR_sex_map)

SVC(kernel='poly')

In [18]:
from sklearn.metrics import accuracy_score
DS_sex_pre2 = clf_SVM2.predict(testDistance)
accuracy2 = accuracy_score(DS_sex_map, DS_sex_pre2)
print(accuracy2)

0.9203406813627254


#### （2）网格搜索法

In [None]:
from sklearn.model_selection import GridSearchCV

# Set the parameters by cross-validation
rbf_para = [{'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [1, 100, 1000], 'gamma' : [1e-1, 1e-2, 1e-3, 1e-4]}]

rbf_scores = ['accuracy']

for score in rbf_scores:
    print("高斯核SVM超参调节的评估标准为 %s" % score)
    print()

    rbfclf = GridSearchCV(
        SVC(), rbf_para, scoring='%s' % score
    )
    rbfclf.fit(trainDistance, DR_sex)

    print("最佳参数为:", rbfclf.best_params_)
    print()
    
    print("在搜索范围内，对应参数的分类准确度如下：")
    means = rbfclf.cv_results_['mean_test_score']
    stds = rbfclf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, rbfclf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        
        
    print()
    
    rbfclf_sex_pred = rbfclf.predict(testDistance)
    print('最佳参数分类器的分类结果报告：')
    print(classification_report(DS_sex, rbfclf_sex_pred))

高斯核SVM超参调节的评估标准为 accuracy

最佳参数为: {'C': 100, 'gamma': 0.01, 'kernel': 'sigmoid'}

在搜索范围内，对应参数的分类准确度如下：
0.845 (+/-0.277) for {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.1, 'kernel': 'poly'}
0.838 (+/-0.284) for {'C': 1, 'gamma': 0.1, 'kernel': 'sigmoid'}
0.685 (+/-0.091) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.01, 'kernel': 'poly'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.01, 'kernel': 'sigmoid'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.001, 'kernel': 'poly'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.001, 'kernel': 'sigmoid'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.0001, 'kernel': 'poly'}
0.575 (+/-0.002) for {'C': 1, 'gamma': 0.0001, 'kernel': 'sigmoid'}
0.845 (+/-0.255) for {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.850 (+/-0.261) for {'C': 100, 'gamma': 0.1, 'kernel': 'poly'}


对最优模型参数进行五折交叉验证

In [42]:
from sklearn.model_selection import cross_val_score
X = np.vstack((trainDistance,testDistance))
y = np.hstack((DR_sex, DS_sex))
# clf_best = SVC(kernel='sigmoid', C=100, gamma=1e-2)
clf_best = SVC(kernel='rbf', C=1000, gamma=1e-3)
# clf_best = SVC(kernel='poly', C=100, gamma=1e-1)
scores_best = cross_val_score(clf_best, X, y, cv=5)
print('scores = ',scores_best)
print("Accuracy = %0.2f (+/- %0.2f)" % (scores_best.mean(), scores_best.std() * 2))

scores =  [0.75093867 0.93107769 0.92606516 0.92230576 0.90977444]
Accuracy = 0.89 (+/- 0.14)


In [21]:
rbf_sex_pre = rbfclf.predict(testDistance)
rbf_sex_accuracy = accuracy_score(DS_sex, rbf_sex_pre)
print('rbf_sex_accuracy = ', rbf_sex_accuracy)

rbf_sex_accuracy =  0.9218436873747495
