In [1]:
import pandas as pd

#数据加载
data = pd.read_csv('./voice.csv')
#简单数据探索
pd.set_option('display.max_columns',50)  #将控制台最大打印显示列数调整为50
print(data)
print(data.shape)  #共3168行  21个字段
print(data.isnull().sum())  #共21个字段字段为空的（缺失值）数量为0，不用做数据补全
print('样本个数：{}'.format(data.shape[0]))  #format函数可将结果替换到{}的位置
print('男性个数：{}'.format(data[data.label == 'male'].shape[0]))#查看男性个数和女性个数 均为1584 
print('女性个数：{}'.format(data[data.label == 'female'].shape[0]))


      meanfreq        sd    median       Q25       Q75       IQR       skew  \
0     0.059781  0.064241  0.032027  0.015071  0.090193  0.075122  12.863462   
1     0.066009  0.067310  0.040229  0.019414  0.092666  0.073252  22.423285   
2     0.077316  0.083829  0.036718  0.008701  0.131908  0.123207  30.757155   
3     0.151228  0.072111  0.158011  0.096582  0.207955  0.111374   1.232831   
4     0.135120  0.079146  0.124656  0.078720  0.206045  0.127325   1.101174   
...        ...       ...       ...       ...       ...       ...        ...   
3163  0.131884  0.084734  0.153707  0.049285  0.201144  0.151859   1.762129   
3164  0.116221  0.089221  0.076758  0.042718  0.204911  0.162193   0.693730   
3165  0.142056  0.095798  0.183731  0.033424  0.224360  0.190936   1.876502   
3166  0.143659  0.090628  0.184976  0.043508  0.219943  0.176435   1.591065   
3167  0.165509  0.092884  0.183044  0.070072  0.250827  0.180756   1.705029   

             kurt    sp.ent       sfm      mode  ce

In [2]:
#数据切分，分离特征和label
feature = data.iloc[:,:-1] #iloc（行，列）   此处为iloc[所有行：开始 到 倒数第一列]
label = data.iloc[:,-1] #同上   只取最后一列，所有行

#因为性别列为字符串类型，需要转换为数字类型才可参与运算，所以需要将label进行标签编码
from sklearn.preprocessing import LabelEncoder,StandardScaler
gender_encoder = LabelEncoder()
label = gender_encoder.fit_transform(label)
print(label) #查看转换后结果 [1 1 1 ... 0 0 0]

scaler = StandardScaler() #数据标准化 消除奇异样本数据导致的不良影响
feature = scaler.fit_transform(feature)


[1 1 1 ... 0 0 0]


In [3]:
#进行训练集，测试机切分
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(feature,label,test_size = 0.2)


In [4]:
#训练模型
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#svc
model = SVC()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
#print('SVM 预测结果：{}'.format(y_pred))
print('SVM 预测准确率：{}'.format(accuracy_score(y_pred,y_test)))


SVM 预测准确率：0.9842271293375394


In [5]:
#线性svc
model = SVC(kernel='linear')
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
#print('SVM线性 预测结果：{}'.format(y_pred))
print('SVM线性 预测准确率：{}'.format(accuracy_score(y_pred,y_test)))


SVM线性 预测准确率：0.9700315457413249


In [7]:
#xgboost
import xgboost as xgb
param = {'boosting_type':'gbdt',
                         'objective' : 'binary:logistic', #
                         'eval_metric' : 'auc',
                         'eta' : 0.01,
                         'max_depth' : 15,
                         'colsample_bytree':0.8,
                         'subsample': 0.9,
                         'subsample_freq': 8,
                         'alpha': 0.6,
                         'lambda': 0,
        }
train_data = xgb.DMatrix(x_train, y_train)
test_data = xgb.DMatrix(x_test, y_test)
model = xgb.train(param, train_data, evals=[(train_data, 'train'), (test_data, 'valid')], num_boost_round = 3000, early_stopping_rounds=25, verbose_eval=25)
predict = model.predict(test_data)
predict = [1 if x >=0.5 else 0 for x in predict]
#print('xgboost 预测结果：{}'.format(predict))
print('xgboost 预测准确率：{}'.format(accuracy_score(predict,y_test)))

Parameters: { boosting_type, subsample_freq } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.98953	valid-auc:0.99316
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 25 rounds.
[25]	train-auc:0.99894	valid-auc:0.99840
[50]	train-auc:0.99898	valid-auc:0.99838
Stopping. Best iteration:
[28]	train-auc:0.99895	valid-auc:0.99845

xgboost 预测准确率：0.9794952681388013
