In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import os
import pydotplus

## 数据处理

In [2]:
#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours--False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company--False:MinMaxScaler;True:StandardScaler
#wa:Work_accident--False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years--False:MinMaxScaler;True:StandardScaler
#dp:department--False:LabelEncoding;True:OneHotEncoding
#slr:salary--False:LabelEncoding;True:OneHotEncoding
def map_salary(s):
    d=dict([("low",0),("medium",1),("high",2)])
    return d.get(s,0)
def hr_preprocessing(sl=False,le=False,npr=False,amh=False,tsc=False,wa=False,pl5=False,dp=False,slr=False,lower_d=False,ld_n=1):
    df=pd.read_csv("./data/HR.csv")
    
    #1、清洗数据
    df=df.dropna(subset=["satisfaction_level","last_evaluation"])
    df=df[df["satisfaction_level"]<=1][df["salary"]!="nme"]
    #2、得到标注
    label = df["left"]
    df = df.drop("left", axis=1)
    #3、特征选择
    #4、特征处理
    scaler_lst=[sl,le,npr,amh,tsc,wa,pl5]
    column_lst=["satisfaction_level","last_evaluation","number_project",\
                "average_monthly_hours","time_spend_company","Work_accident",\
                "promotion_last_5years"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            df[column_lst[i]]=\
                MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df[column_lst[i]]=\
                StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
    scaler_lst=[slr,dp]
    column_lst=["salary","department"]
    for i in range(len(scaler_lst)):
        if not scaler_lst[i]:
            if column_lst[i]=="salary":
                # low 为0,medium为1，high为2
                df[column_lst[i]]=[map_salary(s) for s in df["salary"].values]
            else:
                df[column_lst[i]]=LabelEncoder().fit_transform(df[column_lst[i]])
            df[column_lst[i]]=MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1,1)).reshape(1,-1)[0]
        else:
            df=pd.get_dummies(df,columns=[column_lst[i]])
    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values),label  
    return df,label            
                



features,label = hr_preprocessing(sl=True,le=True,npr=True,amh=True,tsc=True,wa=True,pl5=True,dp=True,slr=True)

## 建模函数

In [3]:
from sklearn.model_selection import train_test_split

def hr_modeling_nn(features,label):
    f_v = features.values
    f_names = features.columns.values
    l_v = label.values
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)
    print(len(X_train),len(X_train),print(X_validation))
    
    return X_train, X_test, Y_train, Y_test,X_validation, Y_validation
    
X_train, X_test, Y_train, Y_test,X_validation, Y_validation = hr_modeling_nn(features,label)

[[ 1.03436583  1.13282545  0.97111292 ...  0.          0.
   0.        ]
 [ 0.14948968  0.3733173   0.15978764 ...  0.          0.
   0.        ]
 [ 0.83325762  0.49016471  0.97111292 ...  0.          0.
   0.        ]
 ...
 [ 0.3505979   0.66543582  0.15978764 ...  0.          1.
   0.        ]
 [ 0.18971132 -1.3209701   0.15978764 ...  0.          0.
   0.        ]
 [-1.94203577 -0.38619085  0.15978764 ...  0.          1.
   0.        ]]
8999 8999 None


## 机器学习 --分类

In [6]:
from sklearn.externals.six import StringIO
    
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
    
from keras.models import Sequential
from keras.layers.core import Dense,Activation
from keras.optimizers import SGD


f_v=features.values
f_names=features.columns.values
    
# # mdl = Sequential()
# # mdl.add(Dense(50,input_dim=len(f_v[0])))
# # mdl.add(Activation("sigmoid"))
# # mdl.add(Dense(2))
# # mdl.add(Activation("softmax"))
# # sgd=SGD(learning_rate=0.05)
# # mdl.compile(loss="mean_squared_error",optimizer="adam")
# # mdl.fit(X_train,np.array([[0,1] if i==1 else [1,0] for i in Y_train]),nb_epoch=2000,batch_size=2048)
# xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_auc_score,roc_curve
# f = plt.figure()
# for i in range(len(xy_lst)):
#     X_part=xy_lst[i][0]
#     Y_part=xy_lst[i][1]
# #     Y_pred=mdl.predict_classes(X_part)
#     Y_pred=mdl.predict(X_part)
#     Y_pred= np.array(Y_pred[:,1]).reshape((1,-1))[0]
# #     print(i)
# #     print("NN","-ACC:",accuracy_score(Y_part,Y_pred))
# #     print("NN","-REC:",recall_score(Y_part,Y_pred))
# #     print("NN","-F1:",f1_score(Y_part,Y_pred)) 
#     f.add_subplot(1,3,i+1)
#     fpr,tpr,threshold = roc_curve(Y_part,Y_pred)
#     plt.plot(fpr,tpr)
#     print("NN","-AUC:",auc(Y_part,Y_pred)) 
#     print("NN","-AUC_score:",roc_auc_score(Y_part,Y_pred)) 
# plt.show()
# return
models = []
models.append(("KNN",KNeighborsClassifier(n_neighbors=3,n_jobs=-1)))
models.append(("GaussianNB",GaussianNB()))
models.append(("BernoulliNB",BernoulliNB()))
models.append(("DecisionTreeGini",DecisionTreeClassifier()))
models.append(("DecisionTreeEntropy",DecisionTreeClassifier(criterion="entropy")))
models.append(("SVM Classifier",SVC(C=1000)))
models.append(("OriginalRandomForest",RandomForestClassifier()))
models.append(("RandomForest",RandomForestClassifier(n_estimators=11,max_features=None,n_jobs=-1)))
models.append(("Adaboost",AdaBoostClassifier(n_estimators=100)))
# sag 随机梯度下降
models.append(("LogisticRegression",LogisticRegression(C=1000,tol=1e-10,solver="sag",max_iter=1000)))
models.append(("GBDT",GradientBoostingClassifier(max_depth=6,n_estimators=100)))
  
      
for clf_name,clf in models:
    clf.fit(X_train,Y_train)
    xy_lst=[(X_train,Y_train),(X_validation,Y_validation),(X_test,Y_test)]
    for i in range(len(xy_lst)):
        X_part=xy_lst[i][0]
        Y_part=xy_lst[i][1]
        Y_pred=clf.predict(X_part)
        if i ==2:
            print(clf_name,"-ACC:",accuracy_score(Y_part,Y_pred))
            print(clf_name,"-REC:",recall_score(Y_part,Y_pred))
            print(clf_name,"-F1:",f1_score(Y_part,Y_pred)) 
#         if clf_name.startswith("DecisionTree"):
#             dot_data=StringIO()
#             export_graphviz(clf,out_file=dot_data,
#                                      feature_names=f_names,
#                                      class_names=["NL","L"],
#                                      filled=True,
#                                      rounded=True,
#                                      special_characters=True)
#             graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
#             graph.write_pdf("dt_tree_%s.pdf"%(clf_name))

KNN -ACC: 0.9486666666666667
KNN -REC: 0.9286713286713286
KNN -F1: 0.8960863697705802
GaussianNB -ACC: 0.6603333333333333
GaussianNB -REC: 0.8125874125874126
GaussianNB -F1: 0.5327831270059605
BernoulliNB -ACC: 0.7553333333333333
BernoulliNB -REC: 0.17902097902097902
BernoulliNB -F1: 0.2585858585858586
DecisionTreeGini -ACC: 0.9693333333333334
DecisionTreeGini -REC: 0.9594405594405594
DecisionTreeGini -F1: 0.9371584699453551
DecisionTreeEntropy -ACC: 0.9726666666666667
DecisionTreeEntropy -REC: 0.9552447552447553
DecisionTreeEntropy -F1: 0.9433701657458564




SVM Classifier -ACC: 0.9716666666666667
SVM Classifier -REC: 0.9328671328671329
SVM Classifier -F1: 0.9400986610288935
OriginalRandomForest -ACC: 0.9856666666666667
OriginalRandomForest -REC: 0.9524475524475524
OriginalRandomForest -F1: 0.9693950177935944




RandomForest -ACC: 0.985
RandomForest -REC: 0.9566433566433566
RandomForest -F1: 0.9681528662420383
Adaboost -ACC: 0.9516666666666667
Adaboost -REC: 0.8895104895104895
Adaboost -F1: 0.8976711362032463




LogisticRegression -ACC: 0.788
LogisticRegression -REC: 0.34545454545454546
LogisticRegression -F1: 0.43716814159292033
GBDT -ACC: 0.985
GBDT -REC: 0.9538461538461539
GBDT -F1: 0.9680624556422995


## 回归测试

In [5]:
# print("X",features)
# print("Y",label)
from sklearn.linear_model import LinearRegression,Ridge,Lasso
#regr=LinearRegression()
regr=Ridge(alpha=1)
regr.fit(features.values,label.values)
Y_pred=regr.predict(features.values)
print("Coef:",regr.coef_)
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
print("MSE:",mean_squared_error(label.values,Y_pred))
print("MAE:",mean_absolute_error(label.values,Y_pred))
print("R2:",r2_score(label.values,Y_pred))

Coef: [-0.16009483  0.01494029 -0.04196149  0.03202485  0.05320615 -0.05465984
 -0.01616189 -0.10639539  0.09246624  0.01392915 -0.01076429 -0.06050522
  0.0146708   0.04922629 -0.04760503  0.0122534  -0.00939767  0.00951584
  0.02061428  0.0219916 ]
MSE: 0.1423090302994102
MAE: 0.3019494008573236
R2: 0.21549270259168452
