In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import os
import pydotplus

## 数据处理

In [48]:
#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours--False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company--False:MinMaxScaler;True:StandardScaler
#wa:Work_accident--False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years--False:MinMaxScaler;True:StandardScaler
#dp:department--False:LabelEncoding;True:OneHotEncoding
#slr:salary--False:LabelEncoding;True:OneHotEncoding
def map_salary(s):
    d=dict([("low",0),("medium",1),("high",2)])
    return d.get(s,0)
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1):
    df=pd.read_csv("./data/HR.csv")
    
    #1、清洗数据
    df=df.dropna(subset=["satisfaction_level","last_evaluation"])
    df=df[df["satisfaction_level"]<=1][df["salary"]!="nme"]
    #2、得到标注
    label = df["left"]
    df = df.drop("left", axis=1)
    #3、特征选择
    #4、特征处理
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=["satisfaction_level","last_evaluation","number_project",\
                "average_monthly_hours","time_spend_company","Work_accident",\
                "promotion_last_5years"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            df[column_lst[i]]=\
                MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df[column_lst[i]]=\
                StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
    scaler_lst=[slr,dp]
    column_lst=["salary","department"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            if column_lst[i]=="salary":
                # low 为0,medium为1，high为2
                df[column_lst[i]]=[map_salary(s) for s in df["salary"].values]
            else:
                df[column_lst[i]]=LabelEncoder().fit_transform(df[column_lst[i]])
            df[column_lst[i]]=MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df=pd.get_dummies(df,columns=[column_lst[i]])
    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values),label  
    return df,label            
                



features,label = hr_preprocessing(sl=True,le=True,npr=True,amh=True,tsc=True,wa=True,pl5=True,dp=True,slr=True)

## 建模函数

In [49]:
from sklearn.model_selection import train_test_split

def hr_modeling_nn(features,label):
    f_v = features.values
    f_names = features.columns.values
    l_v = label.values
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
    print(len(X_train),len(X_train),print(X_validation))
    
    return X_train, X_test, Y_train, Y_test,X_validation, Y_validation
    
X_train, X_test, Y_train, Y_test,X_validation, Y_validation = hr_modeling_nn(features,label)

[[ 1.23547405 -1.3209701   0.97111292 ...  0.          0.
   1.        ]
 [ 0.47126283  1.42494396  0.97111292 ...  0.          1.
   0.        ]
 [-0.25272675 -0.67830936 -0.65153764 ...  0.          0.
   1.        ]
 ...
 [-0.73538647  0.25646989  1.7824382  ...  0.          0.
   0.        ]
 [ 0.02882475  1.60021507  0.15978764 ...  0.          0.
   0.        ]
 [ 1.35613898 -0.03564863  0.15978764 ...  1.          0.
   0.        ]]
8999 8999 None


## 机器学习 --分类

In [71]:
from sklearn.externals.six import StringIO
    
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
          
    

f_v=features.values
f_names=features.columns.values
    
models = []
models.append(("KNN",KNeighborsClassifier(n_neighbors=3,n_jobs=-1)))
models.append(("GaussianNB",GaussianNB()))
models.append(("BernoulliNB",BernoulliNB()))
models.append(("DecisionTreeGini",DecisionTreeClassifier()))
models.append(("DecisionTreeEntropy",DecisionTreeClassifier(criterion="entropy")))
models.append(("SVM Classifier",SVC(C=1000)))
models.append(("OriginalRandomForest",RandomForestClassifier()))
models.append(("RandomForest",RandomForestClassifier(n_estimators=11,max_features=None,n_jobs=-1)))
models.append(("Adaboost",AdaBoostClassifier(n_estimators=100)))
# sag 随机梯度下降
models.append(("LogisticRegression",LogisticRegression(C=1000,tol=1e-10,solver="sag",max_iter=10000)))

  
      
for clf_name,clf in models:
    clf.fit(X_train,Y_train)
    xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
    for i in range(len(xy_lst)):
        X_part=xy_lst[i][0]
        Y_part=xy_lst[i][1]
        Y_pred=clf.predict(X_part)
        print(i)
        print(clf_name,"-ACC:",accuracy_score(Y_part,Y_pred))
        print(clf_name,"-REC:",recall_score(Y_part,Y_pred))
        print(clf_name,"-F1:",f1_score(Y_part,Y_pred)) 
#         if clf_name.startswith("DecisionTree"):
#             dot_data=StringIO()
#             export_graphviz(clf,out_file=dot_data,
#                                      feature_names=f_names,
#                                      class_names=["NL","L"],
#                                      filled=True,
#                                      rounded=True,
#                                      special_characters=True)
#             graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
#             graph.write_pdf("dt_tree_%s.pdf"%(clf_name))

0
KNN -ACC: 0.9737748638737638
KNN -REC: 0.9591930307198533
KNN -F1: 0.946606334841629
1
KNN -ACC: 0.9433333333333334
KNN -REC: 0.9049586776859504
KNN -F1: 0.8854447439353099
2
KNN -ACC: 0.9433333333333334
KNN -REC: 0.9126506024096386
KNN -F1: 0.8769898697539797
0
GaussianNB -ACC: 0.6506278475386154
GaussianNB -REC: 0.8541953232462174
GaussianNB -F1: 0.5423580786026202
1
GaussianNB -ACC: 0.6533333333333333
GaussianNB -REC: 0.8347107438016529
GaussianNB -F1: 0.5381882770870338
2
GaussianNB -ACC: 0.6496666666666666
GaussianNB -REC: 0.8192771084337349
GaussianNB -F1: 0.5086489013557738
0
BernoulliNB -ACC: 0.7609734414934993
BernoulliNB -REC: 0.2347546996790463
BernoulliNB -F1: 0.3225196850393701
1
BernoulliNB -ACC: 0.754
BernoulliNB -REC: 0.2066115702479339
BernoulliNB -F1: 0.2890173410404624
2
BernoulliNB -ACC: 0.772
BernoulliNB -REC: 0.21536144578313254
BernoulliNB -F1: 0.2948453608247423
0
DecisionTreeGini -ACC: 1.0
DecisionTreeGini -REC: 1.0
DecisionTreeGini -F1: 1.0
1
DecisionTreeGin



0
SVM Classifier -ACC: 0.9882209134348261
SVM Classifier -REC: 0.9674461256304447
SVM Classifier -F1: 0.9754969949144707
1
SVM Classifier -ACC: 0.9646666666666667
SVM Classifier -REC: 0.928374655647383
SVM Classifier -F1: 0.9270976616231086
2
SVM Classifier -ACC: 0.9676666666666667
SVM Classifier -REC: 0.9352409638554217
SVM Classifier -F1: 0.9275578790141896




0
OriginalRandomForest -ACC: 0.9977775308367597
OriginalRandomForest -REC: 0.9912883998165979
OriginalRandomForest -F1: 0.9953959484346225
1
OriginalRandomForest -ACC: 0.9863333333333333
OriginalRandomForest -REC: 0.9559228650137741
OriginalRandomForest -F1: 0.9713086074177746
2
OriginalRandomForest -ACC: 0.989
OriginalRandomForest -REC: 0.9563253012048193
OriginalRandomForest -F1: 0.9746738296239448
0
RandomForest -ACC: 0.9981109012112457
RandomForest -REC: 0.9931224209078404
RandomForest -F1: 0.9960910554150378
1
RandomForest -ACC: 0.9866666666666667
RandomForest -REC: 0.9559228650137741
RandomForest -F1: 0.9719887955182073
2
RandomForest -ACC: 0.9846666666666667
RandomForest -REC: 0.9518072289156626
RandomForest -F1: 0.9648854961832061
0
Adaboost -ACC: 0.962773641515724
Adaboost -REC: 0.9211370930765704
Adaboost -F1: 0.9230415805191822
1
Adaboost -ACC: 0.9543333333333334
Adaboost -REC: 0.8980716253443526
Adaboost -F1: 0.9049271339347675
2
Adaboost -ACC: 0.9583333333333334
Adaboost -



## 回归测试

In [70]:
# print("X",features)
# print("Y",label)
from sklearn.linear_model import LinearRegression,Ridge,Lasso
#regr=LinearRegression()
regr=Ridge(alpha=1)
regr.fit(features.values,label.values)
Y_pred=regr.predict(features.values)
print("Coef:",regr.coef_)
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
print("MSE:",mean_squared_error(label.values,Y_pred))
print("MAE:",mean_absolute_error(label.values,Y_pred))
print("R2:",r2_score(label.values,Y_pred))

Coef: [-0.16009483  0.01494029 -0.04196149  0.03202485  0.05320615 -0.05465984
 -0.01616189 -0.10639539  0.09246624  0.01392915 -0.01076429 -0.06050522
  0.0146708   0.04922629 -0.04760503  0.0122534  -0.00939767  0.00951584
  0.02061428  0.0219916 ]
MSE: 0.14230903029941022
MAE: 0.3019494008573236
R2: 0.2154927025916843
