In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import os

## 数据处理

In [48]:
#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours--False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company--False:MinMaxScaler;True:StandardScaler
#wa:Work_accident--False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years--False:MinMaxScaler;True:StandardScaler
#dp:department--False:LabelEncoding;True:OneHotEncoding
#slr:salary--False:LabelEncoding;True:OneHotEncoding
def map_salary(s):
    d=dict([("low",0),("medium",1),("high",2)])
    return d.get(s,0)
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1):
    df=pd.read_csv("./data/HR.csv")
    
    #1、清洗数据
    df=df.dropna(subset=["satisfaction_level","last_evaluation"])
    df=df[df["satisfaction_level"]<=1][df["salary"]!="nme"]
    #2、得到标注
    label = df["left"]
    df = df.drop("left", axis=1)
    #3、特征选择
    #4、特征处理
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=["satisfaction_level","last_evaluation","number_project",\
                "average_monthly_hours","time_spend_company","Work_accident",\
                "promotion_last_5years"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            df[column_lst[i]]=\
                MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df[column_lst[i]]=\
                StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
    scaler_lst=[slr,dp]
    column_lst=["salary","department"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            if column_lst[i]=="salary":
                # low 为0,medium为1，high为2
                df[column_lst[i]]=[map_salary(s) for s in df["salary"].values]
            else:
                df[column_lst[i]]=LabelEncoder().fit_transform(df[column_lst[i]])
            df[column_lst[i]]=MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df=pd.get_dummies(df,columns=[column_lst[i]])
    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values),label  
    return df,label            
                



features,label = hr_preprocessing(sl=True,le=True,npr=True,amh=True,tsc=True,wa=True,pl5=True,dp=True,slr=True)

## 建模函数

In [49]:
from sklearn.model_selection import train_test_split

def hr_modeling_nn(features,label):
    f_v = features.values
    f_names = features.columns.values
    l_v = label.values
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
    print(len(X_train),len(X_train),print(X_validation))
    
    return X_train, X_test, Y_train, Y_test,X_validation, Y_validation
    
X_train, X_test, Y_train, Y_test,X_validation, Y_validation = hr_modeling_nn(features,label)

[[ 1.23547405 -1.3209701   0.97111292 ...  0.          0.
   1.        ]
 [ 0.47126283  1.42494396  0.97111292 ...  0.          1.
   0.        ]
 [-0.25272675 -0.67830936 -0.65153764 ...  0.          0.
   1.        ]
 ...
 [-0.73538647  0.25646989  1.7824382  ...  0.          0.
   0.        ]
 [ 0.02882475  1.60021507  0.15978764 ...  0.          0.
   0.        ]
 [ 1.35613898 -0.03564863  0.15978764 ...  1.          0.
   0.        ]]
8999 8999 None


## 机器学习

In [51]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
models = []
models.append(("KNN",KNeighborsClassifier(n_neighbors=3)))
models.append(("GaussianNB",GaussianNB()))
models.append(("BernoulliNB",BernoulliNB()))
for clf_name,clf in models:
    clf.fit(X_train,Y_train)
    xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
    for i in range(len(xy_lst)):
        X_part=xy_lst[i][0]
        Y_part=xy_lst[i][1]
        Y_pred=clf.predict(X_part)
        print(i)
        print(clf_name,"-ACC:",accuracy_score(Y_part,Y_pred))
        print(clf_name,"-REC:",recall_score(Y_part,Y_pred))
        print(clf_name,"-F1:",f1_score(Y_part,Y_pred))        

0
KNN -ACC: 0.9737748638737638
KNN -REC: 0.9591930307198533
KNN -F1: 0.946606334841629
1
KNN -ACC: 0.9433333333333334
KNN -REC: 0.9049586776859504
KNN -F1: 0.8854447439353099
2
KNN -ACC: 0.9433333333333334
KNN -REC: 0.9126506024096386
KNN -F1: 0.8769898697539797
0
GaussianNB -ACC: 0.6506278475386154
GaussianNB -REC: 0.8541953232462174
GaussianNB -F1: 0.5423580786026202
1
GaussianNB -ACC: 0.6533333333333333
GaussianNB -REC: 0.8347107438016529
GaussianNB -F1: 0.5381882770870338
2
GaussianNB -ACC: 0.6496666666666666
GaussianNB -REC: 0.8192771084337349
GaussianNB -F1: 0.5086489013557738
0
BernoulliNB -ACC: 0.7609734414934993
BernoulliNB -REC: 0.2347546996790463
BernoulliNB -F1: 0.3225196850393701
1
BernoulliNB -ACC: 0.754
BernoulliNB -REC: 0.2066115702479339
BernoulliNB -F1: 0.2890173410404624
2
BernoulliNB -ACC: 0.772
BernoulliNB -REC: 0.21536144578313254
BernoulliNB -F1: 0.2948453608247423
