In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
import seaborn as sns
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split#用于拆分训练集和测试集

In [4]:
df = pd.read_csv('d:\HR.csv')
label = df['left']

In [5]:
#1.数据预处理
# split features and labels
x = df.drop('left', axis=1)
y = df['left']
x.drop(['sales','salary'], axis=1, inplace=True)
x['average_montly_hours']=\
            MinMaxScaler().fit_transform(x['average_montly_hours'].values.reshape(-1,1)).reshape(1,-1)[0]
x['time_spend_company']=\
            MinMaxScaler().fit_transform(x['time_spend_company'].values.reshape(-1,1)).reshape(1,-1)[0]
x['number_project']=\
            MinMaxScaler().fit_transform(x['number_project'].values.reshape(-1,1)).reshape(1,-1)[0]
# transform the object type: One-hot encoding
salary_dummy = pd.get_dummies(df['salary'],prefix = 'salary',prefix_sep = '_')
department_dummy = pd.get_dummies(df['sales'],prefix = 'depart',prefix_sep = '_')
x = pd.concat([x, salary_dummy], axis=1)
x = pd.concat([x, department_dummy], axis=1)
x_name = x.columns.values
x.head()



Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_high,salary_low,salary_medium,depart_IT,depart_RandD,depart_accounting,depart_hr,depart_management,depart_marketing,depart_product_mng,depart_sales,depart_support,depart_technical
0,0.38,0.53,0.0,0.285047,0.125,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,0.6,0.775701,0.5,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,1.0,0.82243,0.25,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,0.6,0.593458,0.375,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,0.0,0.294393,0.125,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [6]:
# split train set and test set
x_tt,x_validation,y_tt,y_validation = train_test_split(x,y,test_size = 0.2)
x_train,x_test,y_train,y_test = train_test_split(x_tt,y_tt,test_size = 0.25)
print(len(x_train),len(x_validation),len(x_test))

8999 3000 3000


###### KNN

In [7]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train,y_train)
y_pred = knn_clf.predict(x_validation)
from sklearn.metrics import accuracy_score,recall_score,f1_score #测试他的准确率，召回率，f1
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

ACC: 0.939
REC: 0.8930555555555556
F-Score: 0.875425459496256


In [8]:
y_pred = knn_clf.predict(x_test)
print("ACC:",accuracy_score(y_test,y_pred))
print("REC:",recall_score(y_test,y_pred))
print("F-Score:",f1_score(y_test,y_pred))

ACC: 0.943
REC: 0.8982300884955752
F-Score: 0.876889848812095


In [9]:
y_pred = knn_clf.predict(x_train)
print("ACC:",accuracy_score(y_train,y_pred))
print("REC:",recall_score(y_train,y_pred))
print("F-Score:",f1_score(y_train,y_pred))

ACC: 0.9714412712523613
REC: 0.9558214450069029
F-Score: 0.9417365676717298


In [10]:
#保存模型
from sklearn.externals import joblib
joblib.dump(knn_clf,"knn_clf")

['knn_clf']

In [11]:
joblib.load("knn_clf")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

###### 朴素贝叶斯

In [12]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
#高斯分布 0-1分布  只适合离散的数据分类，对于该数据分析效果极差
gnb_clf=GaussianNB()
gnb_clf.fit(x_train,y_train)
y_pred = gnb_clf.predict(x_validation)
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

ACC: 0.6606666666666666
REC: 0.8319444444444445
F-Score: 0.5406137184115525


###### 决策树 画出决策树

In [13]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.externals.six import StringIO
import os
import pydotplus 
os.environ["PATH"]+=os.pathsep+"D:/release/bin/"
dtc_clf = DecisionTreeClassifier()
dtc_clf.fit(x_train,y_train)
y_pred = dtc_clf.predict(x_validation)
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))
dot_data=export_graphviz(dtc_clf,out_file=None,
               feature_names=x_name,class_names=["NL","L"],
               filled=True,rounded=True,special_characters=True)
graph=pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("dt_tree.pdf")

ACC: 0.9756666666666667
REC: 0.9680555555555556
F-Score: 0.9502385821404227


True

###### SVM

In [14]:
from sklearn.svm import SVC
svc_clf=SVC() #C是惩罚度，也就是最大宽容度
svc_clf.fit(x_train,y_train)
y_pred = svc_clf.predict(x_validation)
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))



ACC: 0.817
REC: 0.29583333333333334
F-Score: 0.43692307692307697


###### 集成方法，组合多个模型有更好效果

###### 随机森林

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc_clf = RandomForestClassifier(n_estimators=11,max_features=None)
rfc_clf.fit(x_train,y_train)
y_pred = rfc_clf.predict(x_validation)
print("random forest:\nACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))
dtc_clf = DecisionTreeClassifier()
dtc_clf.fit(x_train,y_train)
y_pred = dtc_clf.predict(x_validation)
print("decide tree:\nACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

random forest:
ACC: 0.9866666666666667
REC: 0.9652777777777778
F-Score: 0.972027972027972
decide tree:
ACC: 0.9746666666666667
REC: 0.9666666666666667
F-Score: 0.9482288828337875


###### Adaboost

In [35]:
from sklearn.ensemble import AdaBoostClassifier
abc_clf=AdaBoostClassifier(n_estimators=50)#50为默认,SVC毫无作用
abc_clf.fit(x_train,y_train)
y_pred = abc_clf.predict(x_validation)
print("random forest:\nACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

random forest:
ACC: 0.961
REC: 0.9111111111111111
F-Score: 0.9181245626312106


###### 线性回归

In [51]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
features= df[['number_project','average_montly_hours']]
label = df[['last_evaluation']]
regr=LinearRegression()
regr=Ridge(alpha=1)
regr=Lasso(alpha=0.001)
regr.fit(features.values,label.values)
y_pred=regr.predict(features.values)
print("Coef:拟合参数",regr.coef_)
from sklearn.metrics import mean_squared_error#判断好坏
print("MSE:",mean_squared_error(y_pred,label.values))
#'number_project','promotion_last_5years','last_evaluation'

Coef:拟合参数 [0.03411468 0.00081272]
MSE: 0.024387547749192724


###### 逻辑回归

In [57]:
from sklearn.linear_model import LogisticRegression
lgr = LogisticRegression(solver="sag",max_iter=10000)
#可见这个不是单纯的线性可以解决，需要高维解决
lgr.fit(x_train,y_train)
y_pred = lgr.predict(x_validation)
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

ACC: 0.7976666666666666
REC: 0.38333333333333336
F-Score: 0.4762726488352028


###### 人工神经网络

In [60]:
from keras.models import Sequential#人工神经网络的模具
from keras.layers.core import Dense,Activation#层数
from keras.optimizers import SGD
mdl=Sequential()
mdl.add(Dense(50,input_dim=len(df[0])))
mdl.add(Activation('sigmoid'))
mdl.add(Dense(2))
mdl.add(Activation('softmax'))#output
sgd=SGD(lr=0.1)
mdl.compile(loss='mean_squared_error',optimizer="adam")
mdl.fit(x_train,np.array([[0,1] if i==1 else [1,0] for i in y_train]),nb_epoch=10000,\
       batch_size=8999)#随机梯度下降的选择
xy_lst = [(x_train,y_train),(x_validation,y_validation),(x_test,y_test)]
for i in range(len(xy_lst)):
    x_part = xy_lst[i][0]
    y_part = xy_lst[i][1]
    y_pred = mdl.predict_classes(x_part)
print("ACC:",accuracy_score(y_part,y_pred))
print("REC:",recall_score(y_part,y_pred))
print("F-Score:",f1_score(y_part,y_pred))
#pybrain是反向传播

KeyError: 0

###### 回归树与提升树

###### GBDT gradient boosting decision tree

In [61]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=6, n_estimators=100)
gbc.fit(x_train,y_train)
y_pred = gbc.predict(x_validation)
print("ACC:",accuracy_score(y_validation,y_pred))
print("REC:",recall_score(y_validation,y_pred))
print("F-Score:",f1_score(y_validation,y_pred))

ACC: 0.9846666666666667
REC: 0.9527777777777777
F-Score: 0.9675599435825105


### 聚类