# In [None]:
'''
In Class Competition https://www.kaggle.com/c/bi-attrition-predict/
数据集：3168个录制的声音样本(来自男性和女性演讲者)，采集的频率范围是0hz-280hz，已经对数据进行了预处理
一共有21个属性值，请判断该声音是男还是女？
使用Accuracy作为评价标准

Step1，数据加载
Step2，数据预处理
	分离特征X 和Target y
	使用标签编码，male -> 1, female -> 0
	将特征X矩阵进行规范化
	#标准差标准化，处理后的数据符合标准正态分布
	scaler = StandardScaler()
Step3，数据集切分，train_test_split
Step4，模型训练
	SVM，Linear SVM，XGBoost
Step5，模型预测
'''

import pandas as pd

# Load the voice dataset from the working directory.
df = pd.read_csv('./voice.csv')
# Raise the column display limit so wide frames print without truncation.
pd.set_option('display.max_columns', 1000)

# Quick overview: missing values per column, frame shape, and class counts.
print(df.isnull().sum())
print(df.shape)
print('样本个数：{}'.format(df.shape[0]))
print('男性个数：{}'.format(df[df.label == 'male'].shape[0]))
print('女性个数：{}'.format(df[df.label == 'female'].shape[0]))

# Separate the features from the label: the label is the last column,
# every other column is a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Label-encode the target. LabelEncoder numbers the classes in sorted order,
# so 'female' -> 0 and 'male' -> 1.
from sklearn.preprocessing import LabelEncoder, StandardScaler
gender_encoder = LabelEncoder()
y = gender_encoder.fit_transform(y)
print(y)

# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. train_test_split returns the
# pieces in the order (X_train, X_test, y_train, y_test); the targets must be
# unpacked in that order so that X_test is bound for the prediction steps.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Train two support vector classifiers in turn: first SVC with its default
# settings, then SVC with a linear kernel. For each one, print the predictions
# on the held-out set and its accuracy computed two ways (accuracy_score and
# the estimator's own score method). After the loop, `svc` and `y_pred` refer
# to the linear-kernel model and its predictions.
for _title, _svc_kwargs in (('SVM', {}), ('Linear SVM', {'kernel': 'linear'})):
    svc = SVC(**_svc_kwargs)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    print(_title + '预测结果：', y_pred)
    print(_title + '预测准确率：', accuracy_score(y_test, y_pred))
    print(_title + '预测准确率：', svc.score(X_test, y_test))

import xgboost as xgb

# Booster parameters. The LightGBM-only keys 'boosting_type' and
# 'subsample_freq' were removed: XGBoost does not use them.
param = {
    'objective': 'binary:logistic',  # binary task; predictions are probabilities
    'eval_metric': 'auc',            # evaluation metric
    'eta': 0.01,                     # learning rate
    'max_depth': 15,                 # maximum tree depth
    'colsample_bytree': 0.8,         # fraction of features sampled per tree
    'subsample': 0.9,                # fraction of rows sampled per boosting round
    'alpha': 0.6,                    # L1 regularization
    'lambda': 0,                     # L2 regularization
}

train_data = xgb.DMatrix(X_train, label=y_train)
valid_data = xgb.DMatrix(X_test, label=y_test)

# Early stopping monitors the last entry in `evals` ('valid').
# NOTE(review): the held-out split is used both for early stopping and for the
# final accuracy below, so the reported accuracy is likely optimistic.
model = xgb.train(
    param,
    train_data,
    evals=[(train_data, 'train'), (valid_data, 'valid')],
    num_boost_round=10000,
    early_stopping_rounds=200,
    verbose_eval=25,
)

# Predict on the held-out DMatrix and threshold the probabilities at 0.5 to
# obtain 0/1 class labels.
y_pred = model.predict(valid_data)
y_pred = [1 if x >= 0.5 else 0 for x in y_pred]
print('XGBoost 预测结果', y_pred)
print('XGBoost 预测准确率：', accuracy_score(y_test, y_pred))
