## 训练集、验证集、测试集
在L5数据预处理的基础上继续。

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Split the data into train/validation/test sets and print their sizes.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
    print(len(X_train), len(X_validation), len(X_test))
# print(hr_preprocessing(sl=True,le=True,ld_n=3))
# Preprocess with the default options and print the sizes of the three splits.
fetures,label=hr_preprocessing()
hr_modeling(fetures,label)

8999 3000 3000




<img src='./image/6.3.png' width=500 /> 

## 分类——KNN
<img src='./image/6.3_2.png' width=500 /> 
<img src='./image/6.3_3.png' width=500 /> 
<img src='./image/6.3_4.png' width=500 /> 
此图中红色的点即为上图中的分割线。

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Compare KNN classifiers with 3 and 5 neighbours on the validation set.

    Prints accuracy, recall and F-score for each setting.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # KNN: fit one classifier per neighbour count on the same training split.
    for n_neighbors in (3, 5):
        knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn_clf.fit(X_train, Y_train)
        Y_pred = knn_clf.predict(X_validation)
        print('n=%d ACC:' % n_neighbors, accuracy_score(Y_validation, Y_pred))
        print('n=%d REC:' % n_neighbors, recall_score(Y_validation, Y_pred))
        print('n=%d F-Score:' % n_neighbors, f1_score(Y_validation, Y_pred))
# print(hr_preprocessing(sl=True,le=True,ld_n=3))
# Preprocess with the default options and compare the two KNN settings.
fetures,label=hr_preprocessing()
hr_modeling(fetures,label)



n=3 ACC: 0.943333333333
n=3 REC: 0.920833333333
n=3 F-Score: 0.886363636364
n=5 ACC: 0.933666666667
n=5 REC: 0.894444444444
n=5 F-Score: 0.866173503699


发现n=3的效果比n=5效果好，因此将n定为3。

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Fit a 3-neighbour KNN classifier and report its metrics on every split.

    Prints accuracy, recall and F-score for the validation, test and
    training sets, in that order.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # KNN
    knn_clf = KNeighborsClassifier(n_neighbors=3)
    knn_clf.fit(X_train, Y_train)
    splits = [('validation set:', X_validation, Y_validation),
              ('test set:', X_test, Y_test),
              ('train set:', X_train, Y_train)]
    for title, X_part, Y_part in splits:
        Y_pred = knn_clf.predict(X_part)
        print(title)
        print('ACC:', accuracy_score(Y_part, Y_pred))
        print('REC:', recall_score(Y_part, Y_pred))
        print('F-Score:', f1_score(Y_part, Y_pred))

# Preprocess with the default options (one-hot encoded department) and evaluate KNN.
fetures,label=hr_preprocessing()
hr_modeling(fetures,label)



validation set:
ACC: 0.945333333333
REC: 0.912181303116
F-Score: 0.887052341598
test set:
ACC: 0.938
REC: 0.895977808599
F-Score: 0.874154262517
train set:
ACC: 0.971996888543
REC: 0.958955223881
F-Score: 0.942254812099


验证集和测试集上的效果略低于训练集上的效果，说明模型的泛化能力还不错，只存在轻微的过拟合。
<br />除了尝试变化各种模型参数以达到好的模型效果，也可以尝试改变数据预处理的方式。比如指定dp用LabelEncoder就比OneHot效果好。

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Fit a 3-neighbour KNN classifier and report its metrics on every split.

    Prints accuracy, recall and F-score for the validation, test and
    training sets, in that order.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # KNN
    knn_clf = KNeighborsClassifier(n_neighbors=3)
    knn_clf.fit(X_train, Y_train)
    splits = [('validation set:', X_validation, Y_validation),
              ('test set:', X_test, Y_test),
              ('train set:', X_train, Y_train)]
    for title, X_part, Y_part in splits:
        Y_pred = knn_clf.predict(X_part)
        print(title)
        print('ACC:', accuracy_score(Y_part, Y_pred))
        print('REC:', recall_score(Y_part, Y_pred))
        print('F-Score:', f1_score(Y_part, Y_pred))

# Label-encode the department column instead of one-hot encoding it, then evaluate KNN.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



validation set:
ACC: 0.954
REC: 0.936690647482
F-Score: 0.904166666667
test set:
ACC: 0.951
REC: 0.930651872399
F-Score: 0.901276024177
train set:
ACC: 0.973885987332
REC: 0.960092807425
F-Score: 0.946261147953


然后保存训练出的模型。

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score

from sklearn.externals import joblib
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Fit a 3-neighbour KNN classifier, report its metrics and save it.

    Prints accuracy, recall and F-score for the validation, test and
    training sets, then writes the fitted model to the file 'knn_clf'
    with joblib.dump.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # KNN
    knn_clf = KNeighborsClassifier(n_neighbors=3)
    knn_clf.fit(X_train, Y_train)
    splits = [('validation set:', X_validation, Y_validation),
              ('test set:', X_test, Y_test),
              ('train set:', X_train, Y_train)]
    for title, X_part, Y_part in splits:
        Y_pred = knn_clf.predict(X_part)
        print(title)
        print('ACC:', accuracy_score(Y_part, Y_pred))
        print('REC:', recall_score(Y_part, Y_pred))
        print('F-Score:', f1_score(Y_part, Y_pred))

    # Persist the fitted classifier so it can be reloaded with joblib.load.
    joblib.dump(knn_clf, 'knn_clf')

以上这一段在PyCharm中跑一遍就会在Project中添加一个名为‘knn_clf’的文件。
<br />之后，引用该模型则可以用knn_clf=joblib.load('knn_clf')。注意：较新版本的scikit-learn（0.23及以后）已移除sklearn.externals.joblib，需要改用import joblib。

In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score

from sklearn.externals import joblib
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Load the KNN model saved in the file 'knn_clf' and report its metrics.

    Prints accuracy, recall and F-score for the validation, test and
    training sets, in that order. No fitting happens here.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # KNN
    # NOTE(review): train_test_split is called without a random_state, so these
    # splits differ from the ones the saved model was fitted on. The validation
    # and test scores may therefore include samples the model was trained on.
    knn_clf = joblib.load('knn_clf')
    splits = [('validation set:', X_validation, Y_validation),
              ('test set:', X_test, Y_test),
              ('train set:', X_train, Y_train)]
    for title, X_part, Y_part in splits:
        Y_pred = knn_clf.predict(X_part)
        print(title)
        print('ACC:', accuracy_score(Y_part, Y_pred))
        print('REC:', recall_score(Y_part, Y_pred))
        print('F-Score:', f1_score(Y_part, Y_pred))
    
# Label-encode the department column, then evaluate the model loaded from 'knn_clf'.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



validation set:
ACC: 0.964
REC: 0.944444444444
F-Score: 0.922857142857
test set:
ACC: 0.964333333333
REC: 0.95567867036
F-Score: 0.928043039677
train set:
ACC: 0.966885209468
REC: 0.944110854503
F-Score: 0.932056543548


## 分类——朴素贝叶斯
<img src='./image/6.4.png' width=500 />
<br />例子：
<img src='./image/6.4_2.png' width=500 />
<img src='./image/6.4_3.png' width=500 />
<img src='./image/6.4_4.png' width=500 />
<img src='./image/6.4_5.png' width=500 />
但是如果贝叶斯公式中出现分子分母均为0的情况，如：
<img src='./image/6.4_6.png' width=500 />
<img src='./image/6.4_7.png' width=500 />

在下一步实现朴素贝叶斯之前，先将之前的模型进行统一管理，放到models=[]里面。

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,f1_score
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it and build model-ready features.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years:
            False uses MinMaxScaler, True uses StandardScaler.
        dp, slr: Encoding of 'department' and 'salary': False label-encodes
            the column and min-max scales it, True one-hot encodes it.
        lower_d: If True, reduce the features with PCA before returning.
        ld_n: Number of PCA components (only used when lower_d is True).

    Returns:
        (features, label): label is the 'left' column; features is the
        processed DataFrame, or the PCA.fit_transform output if lower_d.
    """
    df = pd.read_csv('./data/HR.csv')
    # 1. Clean the data.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask. Chaining two [] filters applied
    # a mask built on the unfiltered frame to the already-filtered frame,
    # which pandas only tolerates by reindexing it (with a UserWarning).
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]
    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)
    # 3. Feature selection: all features are kept in this example.
    # 4. Feature processing: scale each numeric column.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip([sl, le, npr, amh, tsc, wa, pl5], numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()
    # Encode the two categorical columns.
    for one_hot, column in zip([dp, slr], ['department', 'salary']):
        if one_hot:
            # pandas.get_dummies performs the one-hot encoding directly.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder would order the levels alphabetically and lose the
            # low < medium < high ranking, so map_salary assigns the ranks.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Normalize the integer codes after label encoding.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()
    if lower_d:
        # PCA ignores the label. LDA was not an option for more than one
        # dimension because 'left' only has the two classes 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Map a salary level name to its ordinal rank.

    'low', 'medium' and 'high' become 0, 1 and 2; any other value falls
    back to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    if s in salary_rank:
        return salary_rank[s]
    return 0

def hr_modeling(features,label):
    """Fit every registered classifier and report its metrics on each split.

    Classifiers are kept as (name, estimator) pairs in a local list so more
    models can be added in one place. For each one, accuracy, recall and
    F-score are printed for the training, validation and test sets.

    Args:
        features: Feature matrix (DataFrame or array) from hr_preprocessing.
        label: Label values aligned with the rows of features.
    """
    # np.asarray accepts both the DataFrame and the plain array that
    # hr_preprocessing returns when lower_d=True (an array has no .values).
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, so train:validation:test is 6:2:2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    # Registry of the models to evaluate.
    models = [('KNN', KNeighborsClassifier(n_neighbors=3))]
    splits = [('Train Set', X_train, Y_train),
              ('Validation Set', X_validation, Y_validation),
              ('Test Set', X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        print('-'*8, clf_name, '-'*8)
        for split_name, X_part, Y_part in splits:
            Y_pred = clf.predict(X_part)
            print(split_name)
            print('-ACC:', accuracy_score(Y_part, Y_pred))
            print('-REC:', recall_score(Y_part, Y_pred))
            print('-F-Score:', f1_score(Y_part, Y_pred))


# Label-encode the department column, then evaluate every registered model.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



-------- KNN --------
Train Set
-ACC: 0.977553061451
-REC: 0.968377635197
-F-Score: 0.954381210479
Validation Set
-ACC: 0.955666666667
-REC: 0.934971098266
-F-Score: 0.906797477225
Test Set
-ACC: 0.953
-REC: 0.932568149211
-F-Score: 0.902151283831


加入朴素贝叶斯模型

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB

from sklearn.metrics import accuracy_score,recall_score,f1_score
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    df=pd.read_csv('./data/HR.csv')
    # 1、清洗数据
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[df['satisfaction_level']<=1][df['salary']!='nme']
    # 2、得到标注
    label=df['left']
    df=df.drop('left',axis=1)
    # 3、特征选择(例子中的特征先全部保留)
    # 4、特征处理
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=['satisfaction_level','last_evaluation','number_project','average_monthly_hours',
                'time_spend_company','Work_accident','promotion_last_5years']
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            df[column_lst[i]]=MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df[column_lst[i]]=StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
    scaler_lst=[dp,slr]
    column_lst=['department','salary']
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            if column_lst[i]=='salary':
                # 由于LabelEncoding会按照字母顺序来确定0,1,2，破坏了low,med,high的顺序，所以需要重新定义一个函数map_salary               
                df[column_lst[i]]=[map_salary(s) for s in df[column_lst[i]].values]
            else:
                df[column_lst[i]]=LabelEncoder().fit_transform(df[column_lst[i]])  
            # LabelEncoding之后，进行一下归一化处理
            df[column_lst[i]]=MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            # OneHot编码，可以直接用pandas里面的get_dummies
            df=pd.get_dummies(df,columns=[column_lst[i]])
    if lower_d:
        # PCA降维与标注Label无关，而LDA降维的n_components不能超过Label的类别（由于left只有0,1，故LDA只能降成1维）
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal rank of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    return 0

def hr_modeling(features,label):
    """Fit KNN and naive Bayes classifiers and print their scores.

    The data is split into train/validation/test parts. Every model is
    fitted on the training part, then accuracy, recall and F1 score are
    printed for each of the three parts.
    """
    X_all=features.values
    Y_all=label.values
    # Hold out the validation set first (20%), then split the remainder
    # 75/25 into train and test, giving a 6:2:2 ratio overall.
    X_rest,X_validation,Y_rest,Y_validation=train_test_split(X_all,Y_all,test_size=0.2)
    X_train,X_test,Y_train,Y_test=train_test_split(X_rest,Y_rest,test_size=0.25)

    models=[
        ('KNN',KNeighborsClassifier(n_neighbors=3)),
        ('GaussianNB',GaussianNB()),
        ('BernoulliNB',BernoulliNB()),
    ]
    # (display name, features, labels) for every part that gets scored.
    partitions=(
        ('Train Set',X_train,Y_train),
        ('Validation Set',X_validation,Y_validation),
        ('Test Set',X_test,Y_test),
    )

    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        print('-'*8,clf_name,'-'*8)
        for part_name,X_part,Y_part in partitions:
            Y_pred=clf.predict(X_part)
            print(part_name)
            print('-ACC:',accuracy_score(Y_part,Y_pred))
            print('-REC:',recall_score(Y_part,Y_pred))
            print('-F-Score:',f1_score(Y_part,Y_pred))


# Build the feature table with the department column label-encoded
# (dp=False) instead of one-hot encoded, then fit and score the models.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



-------- KNN --------
Train Set
-ACC: 0.976552950328
-REC: 0.969245107176
-F-Score: 0.951727293526
Validation Set
-ACC: 0.951333333333
-REC: 0.920765027322
-F-Score: 0.902275769746
Test Set
-ACC: 0.950333333333
-REC: 0.923520923521
-F-Score: 0.895731280616
-------- GaussianNB --------
Train Set
-ACC: 0.79319924436
-REC: 0.745107176142
-F-Score: 0.632140739277
Validation Set
-ACC: 0.798
-REC: 0.743169398907
-F-Score: 0.642266824085
Test Set
-ACC: 0.796666666667
-REC: 0.756132756133
-F-Score: 0.632086851628
-------- BernoulliNB --------
Train Set
-ACC: 0.842538059784
-REC: 0.465051258155
-F-Score: 0.584822736595
Validation Set
-ACC: 0.830333333333
-REC: 0.449453551913
-F-Score: 0.563838903171
Test Set
-ACC: 0.850333333333
-REC: 0.499278499278
-F-Score: 0.606485539001


发现朴素贝叶斯分类器的效果不是很好。如果一份数据中绝大多数都是离散的，可以考虑朴素贝叶斯。

<img src='./image/6.4_8.png' width=500 />

## 分类——决策树
例子：
<img src='./image/6.4_9.png' width=500 />
<img src='./image/6.4_10.png' width=500 />
<img src='./image/6.4_11.png' width=500 />
<img src='./image/6.4_12.png' width=500 />
不纯度最低的切分是很有效的切分，应该先考虑。
<img src='./image/6.4_13.png' width=500 /> 

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score,recall_score,f1_score
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and build the feature table and label.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler selection for satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding selection for department and salary respectively.
        False -> integer encoding followed by min-max scaling,
        True -> one-hot encoding via pd.get_dummies.
    lower_d : bool
        When True, reduce the features with PCA before returning.
    ld_n : int
        Number of PCA components used when lower_d is True.

    Returns
    -------
    tuple
        (features, label). features is a DataFrame, or the array returned
        by PCA when lower_d is True; label is the 'left' column.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Cleaning: drop rows with missing scores, then apply both row filters
    #    as ONE combined mask. Chaining df[a][b] applies a mask that was built
    #    on the unfiltered frame, which pandas has to re-align (and warns about).
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
    # 2. Split off the label.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection: every feature is kept in this example.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns=[
        ('satisfaction_level',sl),('last_evaluation',le),('number_project',npr),
        ('average_monthly_hours',amh),('time_spend_company',tsc),
        ('Work_accident',wa),('promotion_last_5years',pl5),
    ]
    for column,use_standard in numeric_columns:
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # Encode the categorical columns.
    for column,one_hot in (('department',dp),('salary',slr)):
        if one_hot:
            # One-hot encoding is available directly through pandas.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes alphabetically, which would break the
            # low < medium < high ordering, so map_salary is used instead.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes after encoding.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not depend on the label. (LDA's n_components cannot exceed
        # the number of label classes; 'left' is 0/1, so LDA could only give 1.)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal rank of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    return 0

def hr_modeling(features,label):
    """Fit KNN, naive Bayes and decision-tree classifiers and print their scores.

    The data is split into train/validation/test parts. Every model is
    fitted on the training part, then accuracy, recall and F1 score are
    printed for each of the three parts.
    """
    X_all=features.values
    Y_all=label.values
    # Hold out the validation set first (20%), then split the remainder
    # 75/25 into train and test, giving a 6:2:2 ratio overall.
    X_rest,X_validation,Y_rest,Y_validation=train_test_split(X_all,Y_all,test_size=0.2)
    X_train,X_test,Y_train,Y_test=train_test_split(X_rest,Y_rest,test_size=0.25)

    models=[
        # KNN
        ('KNN',KNeighborsClassifier(n_neighbors=3)),
        # Naive Bayes
        ('GaussianNB',GaussianNB()),
        ('BernoulliNB',BernoulliNB()),
        # Decision tree
        ('DecisionTree',DecisionTreeClassifier()),
    ]
    # (display name, features, labels) for every part that gets scored.
    partitions=(
        ('Train Set',X_train,Y_train),
        ('Validation Set',X_validation,Y_validation),
        ('Test Set',X_test,Y_test),
    )

    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        print('-'*8,clf_name,'-'*8)
        for part_name,X_part,Y_part in partitions:
            Y_pred=clf.predict(X_part)
            print(part_name)
            print('-ACC:',accuracy_score(Y_part,Y_pred))
            print('-REC:',recall_score(Y_part,Y_pred))
            print('-F-Score:',f1_score(Y_part,Y_pred))


# Build the feature table with the department column label-encoded
# (dp=False) instead of one-hot encoded, then fit and score the models.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



-------- KNN --------
Train Set
-ACC: 0.974108234248
-REC: 0.957865168539
-F-Score: 0.94612716763
Validation Set
-ACC: 0.952666666667
-REC: 0.923928077455
-F-Score: 0.903924221922
Test Set
-ACC: 0.951666666667
-REC: 0.915730337079
-F-Score: 0.899930986888
-------- GaussianNB --------
Train Set
-ACC: 0.794532725858
-REC: 0.716760299625
-F-Score: 0.623498269192
Validation Set
-ACC: 0.791666666667
-REC: 0.723374827109
-F-Score: 0.625972471574
Test Set
-ACC: 0.797
-REC: 0.717696629213
-F-Score: 0.62660944206
-------- BernoulliNB --------
Train Set
-ACC: 0.843093677075
-REC: 0.47893258427
-F-Score: 0.591671486408
Validation Set
-ACC: 0.836
-REC: 0.457814661134
-F-Score: 0.573656845754
Test Set
-ACC: 0.843
-REC: 0.448033707865
-F-Score: 0.575293056808
-------- DecisionTree --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.974666666667
-REC: 0.96265560166
-F-Score: 0.948228882834
Test Set
-ACC: 0.976333333333
-REC: 0.967696629213
-F-Score: 0.951000690131


接下来希望把决策树画出来。

In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and build the feature table and label.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler selection for satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding selection for department and salary respectively.
        False -> integer encoding followed by min-max scaling,
        True -> one-hot encoding via pd.get_dummies.
    lower_d : bool
        When True, reduce the features with PCA before returning.
    ld_n : int
        Number of PCA components used when lower_d is True.

    Returns
    -------
    tuple
        (features, label). features is a DataFrame, or the array returned
        by PCA when lower_d is True; label is the 'left' column.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Cleaning: drop rows with missing scores, then apply both row filters
    #    as ONE combined mask. Chaining df[a][b] applies a mask that was built
    #    on the unfiltered frame, which pandas has to re-align (and warns about).
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
    # 2. Split off the label.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection: every feature is kept in this example.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns=[
        ('satisfaction_level',sl),('last_evaluation',le),('number_project',npr),
        ('average_monthly_hours',amh),('time_spend_company',tsc),
        ('Work_accident',wa),('promotion_last_5years',pl5),
    ]
    for column,use_standard in numeric_columns:
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # Encode the categorical columns.
    for column,one_hot in (('department',dp),('salary',slr)):
        if one_hot:
            # One-hot encoding is available directly through pandas.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes alphabetically, which would break the
            # low < medium < high ordering, so map_salary is used instead.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes after encoding.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not depend on the label. (LDA's n_components cannot exceed
        # the number of label classes; 'left' is 0/1, so LDA could only give 1.)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal rank of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    return 0

def hr_modeling(features,label):
    """Fit Gini- and entropy-based decision trees and print their scores.

    The data is split into train/validation/test parts. Every model is
    fitted on the training part, then accuracy, recall and F1 score are
    printed for each of the three parts.

    Parameters
    ----------
    features : object exposing .values and .columns (e.g. a DataFrame)
    label : object exposing .values (e.g. a Series)
    """
    f_v=features.values
    # Column names come from the argument, not from a module-level variable,
    # so the function works with whichever frame the caller passes in.
    f_names=features.columns.values
    l_v=label.values
    # Hold out the validation set first (20%), then split the remainder
    # 75/25 into train and test, giving a 6:2:2 ratio overall.
    X_tt,X_validation,Y_tt,Y_validation=train_test_split(f_v,l_v,test_size=0.2)
    X_train,X_test,Y_train,Y_test=train_test_split(X_tt,Y_tt,test_size=0.25)

    models=[]
    # KNN
    #models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # Naive Bayes
    #models.append(('GaussianNB',GaussianNB()))
    #models.append(('BernoulliNB',BernoulliNB()))
    # Decision trees
    models.append(('DecisionTreeGini',DecisionTreeClassifier()))
    models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))

    # (display name, features, labels) for every part that gets scored.
    xy_lst=[('Train Set',X_train,Y_train),
            ('Validation Set',X_validation,Y_validation),
            ('Test Set',X_test,Y_test)]
    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        print('-'*8,clf_name,'-'*8)
        for part_name,X_part,Y_part in xy_lst:
            Y_pred=clf.predict(X_part)
            print(part_name)
            print('-ACC:',accuracy_score(Y_part,Y_pred))
            print('-REC:',recall_score(Y_part,Y_pred))
            print('-F-Score:',f1_score(Y_part,Y_pred))
        # Disabled example: render the fitted tree to dt_tree.pdf via Graphviz.
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent form that writes the dot source into a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            


# Build the feature table with the department column label-encoded
# (dp=False) instead of one-hot encoded, then fit and score the models.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)

-------- DecisionTreeGini --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.975333333333
-REC: 0.965706447188
-F-Score: 0.950067476383
Test Set
-ACC: 0.972333333333
-REC: 0.951790633609
-F-Score: 0.943344709898
-------- DecisionTreeEntropy --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.978
-REC: 0.968449931413
-F-Score: 0.955345060893
Test Set
-ACC: 0.974666666667
-REC: 0.957300275482
-F-Score: 0.948158253752




尝试其他剪枝方法。

In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and build the feature table and label.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler selection for satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding selection for department and salary respectively.
        False -> integer encoding followed by min-max scaling,
        True -> one-hot encoding via pd.get_dummies.
    lower_d : bool
        When True, reduce the features with PCA before returning.
    ld_n : int
        Number of PCA components used when lower_d is True.

    Returns
    -------
    tuple
        (features, label). features is a DataFrame, or the array returned
        by PCA when lower_d is True; label is the 'left' column.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Cleaning: drop rows with missing scores, then apply both row filters
    #    as ONE combined mask. Chaining df[a][b] applies a mask that was built
    #    on the unfiltered frame, which pandas has to re-align (and warns about).
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
    # 2. Split off the label.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection: every feature is kept in this example.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns=[
        ('satisfaction_level',sl),('last_evaluation',le),('number_project',npr),
        ('average_monthly_hours',amh),('time_spend_company',tsc),
        ('Work_accident',wa),('promotion_last_5years',pl5),
    ]
    for column,use_standard in numeric_columns:
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # Encode the categorical columns.
    for column,one_hot in (('department',dp),('salary',slr)):
        if one_hot:
            # One-hot encoding is available directly through pandas.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes alphabetically, which would break the
            # low < medium < high ordering, so map_salary is used instead.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes after encoding.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not depend on the label. (LDA's n_components cannot exceed
        # the number of label classes; 'left' is 0/1, so LDA could only give 1.)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal rank of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    return 0

def hr_modeling(features,label):
    """Compare a pre-pruned Gini tree with an entropy tree and print their scores.

    The data is split into train/validation/test parts. Every model is
    fitted on the training part, then accuracy, recall and F1 score are
    printed for each of the three parts.

    Parameters
    ----------
    features : object exposing .values and .columns (e.g. a DataFrame)
    label : object exposing .values (e.g. a Series)
    """
    f_v=features.values
    # Column names come from the argument, not from a module-level variable,
    # so the function works with whichever frame the caller passes in.
    f_names=features.columns.values
    l_v=label.values
    # Hold out the validation set first (20%), then split the remainder
    # 75/25 into train and test, giving a 6:2:2 ratio overall.
    X_tt,X_validation,Y_tt,Y_validation=train_test_split(f_v,l_v,test_size=0.2)
    X_train,X_test,Y_train,Y_test=train_test_split(X_tt,Y_tt,test_size=0.25)

    models=[]
    # KNN
    #models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # Naive Bayes
    #models.append(('GaussianNB',GaussianNB()))
    #models.append(('BernoulliNB',BernoulliNB()))
    # Decision trees
    # NOTE(review): min_impurity_split is documented as deprecated and later
    # removed from scikit-learn (min_impurity_decrease is the suggested
    # replacement but is not equivalent) -- confirm against the installed version.
    models.append(('DecisionTreeGini',DecisionTreeClassifier(min_impurity_split=0.1)))
    models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))

    # (display name, features, labels) for every part that gets scored.
    xy_lst=[('Train Set',X_train,Y_train),
            ('Validation Set',X_validation,Y_validation),
            ('Test Set',X_test,Y_test)]
    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        print('-'*8,clf_name,'-'*8)
        for part_name,X_part,Y_part in xy_lst:
            Y_pred=clf.predict(X_part)
            print(part_name)
            print('-ACC:',accuracy_score(Y_part,Y_pred))
            print('-REC:',recall_score(Y_part,Y_pred))
            print('-F-Score:',f1_score(Y_part,Y_pred))
        # Disabled example: render the fitted tree to dt_tree.pdf via Graphviz.
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent form that writes the dot source into a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the feature table with the department column label-encoded
# (dp=False) instead of one-hot encoded, then fit and score the models.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)

-------- DecisionTreeGini --------
Train Set
-ACC: 0.977775308368
-REC: 0.930794240595
-F-Score: 0.95247148289
Validation Set
-ACC: 0.972
-REC: 0.916317991632
-F-Score: 0.93991416309
Test Set
-ACC: 0.974
-REC: 0.920114122682
-F-Score: 0.94298245614
-------- DecisionTreeEntropy --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.975
-REC: 0.958158995816
-F-Score: 0.948240165631
Test Set
-ACC: 0.974666666667
-REC: 0.971469329529
-F-Score: 0.947148817803




效果变差了，因此 min_impurity_split=0.1 这一条还是去掉比较好。经过各种尝试，可以调出适宜的模型。

## 分类——支持向量机（SVM）
<img src='./image/6.6_1.png' width=500 />
<img src='./image/6.6_2.png' width=500 />
其中，每个$a_n$>0。
<br />但是很多时候超平面无法直接将不同类的样本分开，这时候可以考虑扩维，如图：
<img src='./image/6.6_3.png' width=500 />
但是这样可能会导致维度灾难（维度过多），所以应该考虑先在低维计算再扩维。
<img src='./image/6.6_4.png' width=500 />
常用核函数：
<img src='./image/6.6_5.png' width=500 />
与决策树相比，SVM的边界更加平滑。
<img src='./image/6.6_6.png' width=500 />
<br />添加松弛变量：为了达到更宽的分界线，有时需要容忍少量的错分点，减少过拟合现象。
<br /><img src='./image/6.6_7.png' width=500 />
——》
<img src='./image/6.6_8.png' width=500 />
<br />根据问题实际场景，对不同的标注赋予不同的权值，得到更加合理的边界。（图中黑色线可能比红色更合理）
<br /><img src='./image/6.6_9.png' width=500 />

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and build the feature table and label.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler selection for satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding selection for department and salary respectively.
        False -> integer encoding followed by min-max scaling,
        True -> one-hot encoding via pd.get_dummies.
    lower_d : bool
        When True, reduce the features with PCA before returning.
    ld_n : int
        Number of PCA components used when lower_d is True.

    Returns
    -------
    tuple
        (features, label). features is a DataFrame, or the array returned
        by PCA when lower_d is True; label is the 'left' column.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Cleaning: drop rows with missing scores, then apply both row filters
    #    as ONE combined mask. Chaining df[a][b] applies a mask that was built
    #    on the unfiltered frame, which pandas has to re-align (and warns about).
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
    # 2. Split off the label.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection: every feature is kept in this example.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns=[
        ('satisfaction_level',sl),('last_evaluation',le),('number_project',npr),
        ('average_monthly_hours',amh),('time_spend_company',tsc),
        ('Work_accident',wa),('promotion_last_5years',pl5),
    ]
    for column,use_standard in numeric_columns:
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # Encode the categorical columns.
    for column,one_hot in (('department',dp),('salary',slr)):
        if one_hot:
            # One-hot encoding is available directly through pandas.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes alphabetically, which would break the
            # low < medium < high ordering, so map_salary is used instead.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes after encoding.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not depend on the label. (LDA's n_components cannot exceed
        # the number of label classes; 'left' is 0/1, so LDA could only give 1.)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal rank of a salary level.

    'low' -> 0, 'medium' -> 1, 'high' -> 2; any other value falls back to 0.
    """
    ranks={'low':0,'medium':1,'high':2}
    if s in ranks:
        return ranks[s]
    return 0

def hr_modeling(features,label):
    """Fit an SVM classifier with default settings and print its scores.

    The data is split into train/validation/test parts. Every model is
    fitted on the training part, then accuracy, recall and F1 score are
    printed for each of the three parts.

    Parameters
    ----------
    features : object exposing .values and .columns (e.g. a DataFrame)
    label : object exposing .values (e.g. a Series)
    """
    f_v=features.values
    # Column names come from the argument, not from a module-level variable,
    # so the function works with whichever frame the caller passes in.
    f_names=features.columns.values
    l_v=label.values
    # Hold out the validation set first (20%), then split the remainder
    # 75/25 into train and test, giving a 6:2:2 ratio overall.
    X_tt,X_validation,Y_tt,Y_validation=train_test_split(f_v,l_v,test_size=0.2)
    X_train,X_test,Y_train,Y_test=train_test_split(X_tt,Y_tt,test_size=0.25)

    models=[]
    # KNN
    #models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # Naive Bayes
    #models.append(('GaussianNB',GaussianNB()))
    #models.append(('BernoulliNB',BernoulliNB()))
    # Decision trees
    #models.append(('DecisionTreeGini',DecisionTreeClassifier()))
    #models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))
    # SVM
    models.append(('SVM Classifier',SVC()))

    # (display name, features, labels) for every part that gets scored.
    xy_lst=[('Train Set',X_train,Y_train),
            ('Validation Set',X_validation,Y_validation),
            ('Test Set',X_test,Y_test)]
    for clf_name,clf in models:
        clf.fit(X_train,Y_train)
        print('-'*8,clf_name,'-'*8)
        for part_name,X_part,Y_part in xy_lst:
            Y_pred=clf.predict(X_part)
            print(part_name)
            print('-ACC:',accuracy_score(Y_part,Y_pred))
            print('-REC:',recall_score(Y_part,Y_pred))
            print('-F-Score:',f1_score(Y_part,Y_pred))
        # Disabled example (intended for the decision-tree models above):
        # render the fitted tree to dt_tree.pdf via Graphviz.
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent form that writes the dot source into a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the feature table with the department column label-encoded
# (dp=False) instead of one-hot encoded, then fit and score the models.
fetures,label=hr_preprocessing(dp=False)
hr_modeling(fetures,label)



-------- SVM Classifier --------
Train Set
-ACC: 0.912101344594
-REC: 0.740723774622
-F-Score: 0.80347826087
Validation Set
-ACC: 0.913
-REC: 0.744588744589
-F-Score: 0.798143851508
Test Set
-ACC: 0.917
-REC: 0.746762589928
-F-Score: 0.806526806527


效果不算很好，需要试着调节一下SVM的参数。比如C=100000，C表示分错类别的惩罚力度，默认为1。如果惩罚加大，错分点就会减少，代价是模型训练会变慢。

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and build the feature table and label.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler selection for satisfaction_level, last_evaluation,
        number_project, average_monthly_hours, time_spend_company,
        Work_accident and promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding selection for department and salary respectively.
        False -> integer encoding followed by min-max scaling,
        True -> one-hot encoding via pd.get_dummies.
    lower_d : bool
        When True, reduce the features with PCA before returning.
    ld_n : int
        Number of PCA components used when lower_d is True.

    Returns
    -------
    tuple
        (features, label). features is a DataFrame, or the array returned
        by PCA when lower_d is True; label is the 'left' column.
    """
    df=pd.read_csv('./data/HR.csv')
    # 1. Cleaning: drop rows with missing scores, then apply both row filters
    #    as ONE combined mask. Chaining df[a][b] applies a mask that was built
    #    on the unfiltered frame, which pandas has to re-align (and warns about).
    df=df.dropna(subset=['satisfaction_level','last_evaluation'])
    df=df[(df['satisfaction_level']<=1)&(df['salary']!='nme')]
    # 2. Split off the label.
    label=df['left']
    df=df.drop('left',axis=1)
    # 3. Feature selection: every feature is kept in this example.
    # 4. Feature processing: scale the numeric columns.
    numeric_columns=[
        ('satisfaction_level',sl),('last_evaluation',le),('number_project',npr),
        ('average_monthly_hours',amh),('time_spend_company',tsc),
        ('Work_accident',wa),('promotion_last_5years',pl5),
    ]
    for column,use_standard in numeric_columns:
        scaler=StandardScaler() if use_standard else MinMaxScaler()
        df[column]=scaler.fit_transform(df[column].values.reshape(-1,1)).ravel()
    # Encode the categorical columns.
    for column,one_hot in (('department',dp),('salary',slr)):
        if one_hot:
            # One-hot encoding is available directly through pandas.
            df=pd.get_dummies(df,columns=[column])
            continue
        if column=='salary':
            # LabelEncoder assigns codes alphabetically, which would break the
            # low < medium < high ordering, so map_salary is used instead.
            df[column]=[map_salary(s) for s in df[column].values]
        else:
            df[column]=LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes after encoding.
        df[column]=MinMaxScaler().fit_transform(df[column].values.reshape(-1,1)).ravel()
    if lower_d:
        # PCA does not depend on the label. (LDA's n_components cannot exceed
        # the number of label classes; 'left' is 0/1, so LDA could only give 1.)
        return PCA(n_components=ld_n).fit_transform(df.values),label
    return df,label

def map_salary(s):
    """Return the ordinal code of a salary level: low=0, medium=1, high=2.

    Any value that is not one of those three levels is mapped to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    return salary_rank.get(s, 0)

def hr_modeling(features,label):
    """Split the data 6:2:2 and print ACC / REC / F-Score of every enabled model.

    Args:
        features: Feature table exposing .values and .columns (the DataFrame
            returned by hr_preprocessing when lower_d is False).
        label: Label column exposing .values.

    Nothing is returned; the metrics are printed for the train, validation
    and test splits of each model in `models`.
    """
    f_v = features.values
    # Take the column names from the argument rather than from a module-level
    # variable, so the function does not depend on how the caller names its data.
    f_names = features.columns.values  # only used by the optional tree export below
    l_v = label.values
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, which gives train : validation : test = 6 : 2 : 2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    models = []
    # No model is enabled in this cell; uncomment a candidate to evaluate it.
    # # KNN
    # models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # # Naive Bayes
    # models.append(('GaussianNB',GaussianNB()))
    # models.append(('BernoulliNB',BernoulliNB()))
    # # Decision tree
    # models.append(('DecisionTreeGini',DecisionTreeClassifier()))
    # models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))
    # # SVM
    # models.append(('SVM Classifier',SVC(C=100)))
    # Classification - ensemble - random forest

    # The splits do not change between models, so build the list once.
    datasets = [('Train Set', X_train, Y_train),
                ('Validation Set', X_validation, Y_validation),
                ('Test Set', X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        print('-'*8, clf_name, '-'*8)
        for set_name, X_part, Y_part in datasets:
            Y_pred = clf.predict(X_part)
            print(set_name)
            print('-ACC:', accuracy_score(Y_part, Y_pred))
            print('-REC:', recall_score(Y_part, Y_pred))
            print('-F-Score:', f1_score(Y_part, Y_pred))
        # Optional: render a fitted decision tree to PDF (needs graphviz and pydotplus).
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent version that collects the dot source in a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the features with department label-encoded (dp=False) rather than
# one-hot encoded, then run the model evaluation on them.
fetures, label = hr_preprocessing(dp=False)
hr_modeling(features=fetures, label=label)



-------- SVM Classifier --------
Train Set
-ACC: 0.970218913213
-REC: 0.925501432665
-F-Score: 0.935328185328
Validation Set
-ACC: 0.961333333333
-REC: 0.909574468085
-F-Score: 0.921832884097
Test Set
-ACC: 0.958
-REC: 0.906206896552
-F-Score: 0.9125


## 分类——集成——随机森林
<img src='./image/6.7_2.png' width=500 />
<img src='./image/6.7_3.png' width=500 />
<img src='./image/6.7.png' width=500 />

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and return scaled/encoded features plus the label.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years
            respectively. False -> MinMaxScaler, True -> StandardScaler.
        dp: Encoding of department. False -> LabelEncoder followed by
            MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        slr: Encoding of salary. False -> ordinal codes from map_salary
            followed by MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        lower_d: When True, reduce the feature matrix with PCA before returning.
        ld_n: Number of PCA components used when lower_d is True.

    Returns:
        A (features, label) pair. features is the processed DataFrame, or the
        result of PCA.fit_transform when lower_d is True; label is the 'left'
        column of the cleaned data.
    """
    df = pd.read_csv('./data/HR.csv')

    # 1. Cleaning: drop rows missing the key scores, then discard abnormal rows
    #    (satisfaction_level above 1, or the placeholder salary value 'nme').
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # A single combined mask is used on purpose: chaining two boolean indexers
    # applies a mask built on the unfiltered frame to the already filtered one,
    # which pandas has to reindex and reports with a UserWarning.
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]

    # 2. Split off the label.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: every feature is kept in this example.

    # 4. Feature processing: scale each numeric column with the chosen scaler.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for column, standardize in zip(numeric_columns, [sl, le, npr, amh, tsc, wa, pl5]):
        scaler = StandardScaler() if standardize else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Encode the two categorical columns.
    for column, one_hot in zip(['department', 'salary'], [dp, slr]):
        if one_hot:
            # One-hot encoding is delegated to pandas.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder orders classes alphabetically, which would break the
            # low < medium < high order, so salary goes through map_salary.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes to the same range as the other features.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # PCA does not use the label. LDA was not chosen because its
        # n_components is bounded by the number of label classes, and 'left'
        # only takes the values 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Return the ordinal code of a salary level: low=0, medium=1, high=2.

    Any value that is not one of those three levels is mapped to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    return salary_rank.get(s, 0)

def hr_modeling(features,label):
    """Split the data 6:2:2 and print ACC / REC / F-Score of every enabled model.

    Args:
        features: Feature table exposing .values and .columns (the DataFrame
            returned by hr_preprocessing when lower_d is False).
        label: Label column exposing .values.

    Nothing is returned; the metrics are printed for the train, validation
    and test splits of each model in `models`.
    """
    f_v = features.values
    # Take the column names from the argument rather than from a module-level
    # variable, so the function does not depend on how the caller names its data.
    f_names = features.columns.values  # only used by the optional tree export below
    l_v = label.values
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, which gives train : validation : test = 6 : 2 : 2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    models = []
    # Disabled candidates; uncomment to include them in the comparison.
    # # KNN
    # models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # # Naive Bayes
    # models.append(('GaussianNB',GaussianNB()))
    # models.append(('BernoulliNB',BernoulliNB()))
    # Decision trees (Gini and entropy criteria)
    models.append(('DecisionTreeGini', DecisionTreeClassifier()))
    models.append(('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy')))
    # # SVM
    # models.append(('SVM Classifier',SVC(C=100)))
    # Classification - ensemble - random forest
    models.append(('RandomForest', RandomForestClassifier()))

    # The splits do not change between models, so build the list once.
    datasets = [('Train Set', X_train, Y_train),
                ('Validation Set', X_validation, Y_validation),
                ('Test Set', X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        print('-'*8, clf_name, '-'*8)
        for set_name, X_part, Y_part in datasets:
            Y_pred = clf.predict(X_part)
            print(set_name)
            print('-ACC:', accuracy_score(Y_part, Y_pred))
            print('-REC:', recall_score(Y_part, Y_pred))
            print('-F-Score:', f1_score(Y_part, Y_pred))
        # Optional: render a fitted decision tree to PDF (needs graphviz and pydotplus).
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent version that collects the dot source in a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the features with department label-encoded (dp=False) rather than
# one-hot encoded, then run the model evaluation on them.
fetures, label = hr_preprocessing(dp=False)
hr_modeling(features=fetures, label=label)

-------- DecisionTreeGini --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.974666666667
-REC: 0.962447844228
-F-Score: 0.947945205479
Test Set
-ACC: 0.973
-REC: 0.960164835165
-F-Score: 0.94523326572
-------- DecisionTreeEntropy --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.972
-REC: 0.956884561892
-F-Score: 0.942465753425
Test Set
-ACC: 0.981
-REC: 0.957417582418
-F-Score: 0.960716747071
-------- RandomForest --------
Train Set
-ACC: 0.997777530837
-REC: 0.991054613936
-F-Score: 0.995271867612
Validation Set
-ACC: 0.987333333333
-REC: 0.95966620306
-F-Score: 0.973201692525
Test Set
-ACC: 0.985333333333
-REC: 0.949175824176
-F-Score: 0.969144460028




发现随机森林的效果比决策树更好。这是集成带来的好处。接下来调整随机森林参数。

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and return scaled/encoded features plus the label.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years
            respectively. False -> MinMaxScaler, True -> StandardScaler.
        dp: Encoding of department. False -> LabelEncoder followed by
            MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        slr: Encoding of salary. False -> ordinal codes from map_salary
            followed by MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        lower_d: When True, reduce the feature matrix with PCA before returning.
        ld_n: Number of PCA components used when lower_d is True.

    Returns:
        A (features, label) pair. features is the processed DataFrame, or the
        result of PCA.fit_transform when lower_d is True; label is the 'left'
        column of the cleaned data.
    """
    df = pd.read_csv('./data/HR.csv')

    # 1. Cleaning: drop rows missing the key scores, then discard abnormal rows
    #    (satisfaction_level above 1, or the placeholder salary value 'nme').
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # A single combined mask is used on purpose: chaining two boolean indexers
    # applies a mask built on the unfiltered frame to the already filtered one,
    # which pandas has to reindex and reports with a UserWarning.
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]

    # 2. Split off the label.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: every feature is kept in this example.

    # 4. Feature processing: scale each numeric column with the chosen scaler.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for column, standardize in zip(numeric_columns, [sl, le, npr, amh, tsc, wa, pl5]):
        scaler = StandardScaler() if standardize else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Encode the two categorical columns.
    for column, one_hot in zip(['department', 'salary'], [dp, slr]):
        if one_hot:
            # One-hot encoding is delegated to pandas.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder orders classes alphabetically, which would break the
            # low < medium < high order, so salary goes through map_salary.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes to the same range as the other features.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # PCA does not use the label. LDA was not chosen because its
        # n_components is bounded by the number of label classes, and 'left'
        # only takes the values 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Return the ordinal code of a salary level: low=0, medium=1, high=2.

    Any value that is not one of those three levels is mapped to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    return salary_rank.get(s, 0)

def hr_modeling(features,label):
    """Split the data 6:2:2 and print ACC / REC / F-Score of every enabled model.

    Args:
        features: Feature table exposing .values and .columns (the DataFrame
            returned by hr_preprocessing when lower_d is False).
        label: Label column exposing .values.

    Nothing is returned; the metrics are printed for the train, validation
    and test splits of each model in `models`.
    """
    f_v = features.values
    # Take the column names from the argument rather than from a module-level
    # variable, so the function does not depend on how the caller names its data.
    f_names = features.columns.values  # only used by the optional tree export below
    l_v = label.values
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, which gives train : validation : test = 6 : 2 : 2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    models = []
    # Disabled candidates; uncomment to include them in the comparison.
    # # KNN
    # models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # # Naive Bayes
    # models.append(('GaussianNB',GaussianNB()))
    # models.append(('BernoulliNB',BernoulliNB()))
    # # Decision tree
    # models.append(('DecisionTreeGini',DecisionTreeClassifier()))
    # models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))
    # # SVM
    # models.append(('SVM Classifier',SVC(C=100)))
    # Classification - ensemble - random forest: default settings versus a
    # configuration with 81 trees, all features per split and no bootstrapping.
    models.append(('OriginalRandomForest', RandomForestClassifier()))
    models.append(('RandomForest', RandomForestClassifier(n_estimators=81, max_features=None, bootstrap=False)))

    # The splits do not change between models, so build the list once.
    datasets = [('Train Set', X_train, Y_train),
                ('Validation Set', X_validation, Y_validation),
                ('Test Set', X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        print('-'*8, clf_name, '-'*8)
        for set_name, X_part, Y_part in datasets:
            Y_pred = clf.predict(X_part)
            print(set_name)
            print('-ACC:', accuracy_score(Y_part, Y_pred))
            print('-REC:', recall_score(Y_part, Y_pred))
            print('-F-Score:', f1_score(Y_part, Y_pred))
        # Optional: render a fitted decision tree to PDF (needs graphviz and pydotplus).
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent version that collects the dot source in a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the features with department label-encoded (dp=False) rather than
# one-hot encoded, then run the model evaluation on them.
fetures, label = hr_preprocessing(dp=False)
hr_modeling(features=fetures, label=label)



-------- OriginalRandomForest --------
Train Set
-ACC: 0.997888654295
-REC: 0.991612301957
-F-Score: 0.995555555556
Validation Set
-ACC: 0.986
-REC: 0.951253481894
-F-Score: 0.970170454545
Test Set
-ACC: 0.989666666667
-REC: 0.958981612447
-F-Score: 0.977649603461
-------- RandomForest --------
Train Set
-ACC: 1.0
-REC: 1.0
-F-Score: 1.0
Validation Set
-ACC: 0.970333333333
-REC: 0.956824512535
-F-Score: 0.939166097061
Test Set
-ACC: 0.976666666667
-REC: 0.968882602546
-F-Score: 0.951388888889


## 分类——集成——Adaboost
<img src='./image/6.8_1.png' width=500 />
<img src='./image/6.8_2.png' width=700 />
<img src='./image/6.8_3.png' width=700 />
<img src='./image/6.8_4.png' width=500 />

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO  # 画决策树可能会用到
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score,recall_score,f1_score

import os
os.environ['PATH']+=os.pathsep+'D:/graphviz/bin/'
import pydotplus
# sl:satisfaction_level——False:MinMaxScaler;True:StandardScaler
# le:last_evaluation——False:MinMaxScaler;True:StandardScaler
# npr:number_project——False:MinMaxScaler;True:StandardScaler
# amh:average_monthly_hours——False:MinMaxScaler;True:StandardScaler
# tsc:time_spend_company——False:MinMaxScaler;True:StandardScaler
# wa:Work_accident——False:MinMaxScaler;True:StandardScaler
# pl5:promotion_last_5years——False:MinMaxScaler;True:StandardScaler
# dp:department——False:LabelEncoding;True:OneHotEncoding
# slr:salary——False:LabelEncoding;True:OneHotEncoding
# lower_d——False:NotlowerDimension
# ld_n——to n dimensions
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=True,slr=False,
                     lower_d=False,ld_n=1):
    """Load ./data/HR.csv, clean it, and return scaled/encoded features plus the label.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for satisfaction_level,
            last_evaluation, number_project, average_monthly_hours,
            time_spend_company, Work_accident and promotion_last_5years
            respectively. False -> MinMaxScaler, True -> StandardScaler.
        dp: Encoding of department. False -> LabelEncoder followed by
            MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        slr: Encoding of salary. False -> ordinal codes from map_salary
            followed by MinMaxScaler; True -> one-hot columns via pd.get_dummies.
        lower_d: When True, reduce the feature matrix with PCA before returning.
        ld_n: Number of PCA components used when lower_d is True.

    Returns:
        A (features, label) pair. features is the processed DataFrame, or the
        result of PCA.fit_transform when lower_d is True; label is the 'left'
        column of the cleaned data.
    """
    df = pd.read_csv('./data/HR.csv')

    # 1. Cleaning: drop rows missing the key scores, then discard abnormal rows
    #    (satisfaction_level above 1, or the placeholder salary value 'nme').
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # A single combined mask is used on purpose: chaining two boolean indexers
    # applies a mask built on the unfiltered frame to the already filtered one,
    # which pandas has to reindex and reports with a UserWarning.
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]

    # 2. Split off the label.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: every feature is kept in this example.

    # 4. Feature processing: scale each numeric column with the chosen scaler.
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours',
                       'time_spend_company', 'Work_accident', 'promotion_last_5years']
    for column, standardize in zip(numeric_columns, [sl, le, npr, amh, tsc, wa, pl5]):
        scaler = StandardScaler() if standardize else MinMaxScaler()
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Encode the two categorical columns.
    for column, one_hot in zip(['department', 'salary'], [dp, slr]):
        if one_hot:
            # One-hot encoding is delegated to pandas.
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # LabelEncoder orders classes alphabetically, which would break the
            # low < medium < high order, so salary goes through map_salary.
            df[column] = [map_salary(s) for s in df[column].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Rescale the integer codes to the same range as the other features.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # PCA does not use the label. LDA was not chosen because its
        # n_components is bounded by the number of label classes, and 'left'
        # only takes the values 0 and 1.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

def map_salary(s):
    """Return the ordinal code of a salary level: low=0, medium=1, high=2.

    Any value that is not one of those three levels is mapped to 0.
    """
    salary_rank = {'low': 0, 'medium': 1, 'high': 2}
    return salary_rank.get(s, 0)

def hr_modeling(features,label):
    """Split the data 6:2:2 and print ACC / REC / F-Score of every enabled model.

    Args:
        features: Feature table exposing .values and .columns (the DataFrame
            returned by hr_preprocessing when lower_d is False).
        label: Label column exposing .values.

    Nothing is returned; the metrics are printed for the train, validation
    and test splits of each model in `models`.
    """
    f_v = features.values
    # Take the column names from the argument rather than from a module-level
    # variable, so the function does not depend on how the caller names its data.
    f_names = features.columns.values  # only used by the optional tree export below
    l_v = label.values
    # Hold out 20% for validation first, then 25% of the remaining 80% for
    # testing, which gives train : validation : test = 6 : 2 : 2.
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)

    models = []
    # Disabled candidates; uncomment to include them in the comparison.
    # # KNN
    # models.append(('KNN',KNeighborsClassifier(n_neighbors=3)))
    # # Naive Bayes
    # models.append(('GaussianNB',GaussianNB()))
    # models.append(('BernoulliNB',BernoulliNB()))
    # # Decision tree
    # models.append(('DecisionTreeGini',DecisionTreeClassifier()))
    # models.append(('DecisionTreeEntropy',DecisionTreeClassifier(criterion='entropy')))
    # # SVM
    # models.append(('SVM Classifier',SVC(C=100)))
    # # Classification - ensemble - random forest
    # models.append(('RandomForest',RandomForestClassifier()))
    # Classification - ensemble - AdaBoost
    models.append(('Adaboost', AdaBoostClassifier(n_estimators=100)))

    # The splits do not change between models, so build the list once.
    datasets = [('Train Set', X_train, Y_train),
                ('Validation Set', X_validation, Y_validation),
                ('Test Set', X_test, Y_test)]
    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        print('-'*8, clf_name, '-'*8)
        for set_name, X_part, Y_part in datasets:
            Y_pred = clf.predict(X_part)
            print(set_name)
            print('-ACC:', accuracy_score(Y_part, Y_pred))
            print('-REC:', recall_score(Y_part, Y_pred))
            print('-F-Score:', f1_score(Y_part, Y_pred))
        # Optional: render a fitted decision tree to PDF (needs graphviz and pydotplus).
        # dot_data=export_graphviz(clf,out_file=None,
        #                          feature_names=f_names,
        #                          class_names=['NL','L'],
        #                          filled=True,
        #                          rounded=True,
        #                          special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data)
        # graph.write_pdf('dt_tree.pdf')
        #
        # Equivalent version that collects the dot source in a StringIO buffer:
        # dot_data=StringIO()
        # export_graphviz(clf,out_file=dot_data,
        #                 feature_names=f_names,
        #                 class_names=['NL','L'],
        #                 filled=True,
        #                 rounded=True,
        #                 special_characters=True)
        # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf('dt_tree.pdf')
            

# Build the features with department label-encoded (dp=False) rather than
# one-hot encoded, then run the model evaluation on them.
fetures, label = hr_preprocessing(dp=False)
hr_modeling(features=fetures, label=label)



-------- Adaboost --------
Train Set
-ACC: 0.960551172352
-REC: 0.911162790698
-F-Score: 0.916920196583
Validation Set
-ACC: 0.966
-REC: 0.916317991632
-F-Score: 0.927966101695
Test Set
-ACC: 0.96
-REC: 0.903409090909
-F-Score: 0.913793103448
