# MachineLearning Human Resources Analysis

概要：【分類問題】 ある企業の従業員のデータを基に「退職」か、「非退職」を分類予測する

## 1.データの読み込み

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("final_hr_analysis_train.csv")
test_df = pd.read_csv("final_hr_analysis_test.csv")

In [3]:
df.head()

Unnamed: 0,index,left,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,10438,0,0.53,0.52,2,135,4,0,0,technical,medium
1,9236,0,0.77,0.53,5,256,3,0,0,accounting,medium
2,818,1,0.89,0.79,3,149,2,0,0,support,medium
3,11503,0,0.64,0.63,3,156,6,1,0,support,low
4,11721,0,0.98,0.74,4,151,3,0,0,sales,medium


In [4]:
test_df.head()

Unnamed: 0,index,left,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,1670,,0.44,0.57,2,141,3,0,0,product_mng,medium
1,13378,,0.55,0.96,3,194,3,0,0,product_mng,medium
2,10233,,0.72,0.67,5,210,2,0,0,management,medium
3,4719,,0.96,0.75,4,177,2,0,0,IT,low
4,7003,,0.96,0.54,3,198,3,0,0,support,low


## 2.前処理

正解データの作成

In [5]:
y = np.array(df['left'])

訓練データの作成

In [6]:
x = df.drop('index', axis=1)
x = x.drop('left', axis=1)

質的変数のOne-Hotベクトル化

In [7]:
# 今回のデータではsalesとsalaryが質的変数である
ohe_columns = ['sales', 'salary']
x = pd.get_dummies(x, dummy_na=True, columns=ohe_columns)

訓練データの確認

In [8]:
x.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,sales_nan,salary_high,salary_low,salary_medium,salary_nan
0,0.53,0.52,2,135,4,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.77,0.53,5,256,3,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0.89,0.79,3,149,2,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0.64,0.63,3,156,6,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0.98,0.74,4,151,3,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## 3.訓練データとテストデータに分ける

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, shuffle=True, random_state=1)

## 4.Pipeline構築

In [77]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression # ロジスティック回帰
from sklearn.decomposition import PCA  # 主成分分析
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # サポートベクターマシン分類
from sklearn.neighbors import KNeighborsClassifier # KNN分類

# pipeline構築
pipelines = {
    'Logistic':
        Pipeline([('scl', StandardScaler()),
                  ('lr', LogisticRegression(max_iter=100, random_state=1))]),
    'SVC':
        Pipeline([('scl', StandardScaler()),
                  ('svc', SVC())]),
    'KNN':
        Pipeline([('scl', StandardScaler()),
                  ('knn', KNeighborsClassifier(n_neighbors=6))]),
    'PCA_and_RandomForest':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(n_components=2)),
                  ('rf', RandomForestClassifier())]),
    'PCA_and_SVC':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(n_components=2)),
                  ('svc', SVC())]),
    'PCA_and_KNN':
        Pipeline([('scl', StandardScaler()),
                  ('pca', PCA(n_components=2)),
                  ('knn', KNeighborsClassifier(n_neighbors=6))]),
}

## 5.フィッティング

In [78]:
score_table = {'model':[],'score':[]}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    score_table['model'].append(pipe_name)
    score_table['score'].append(pipeline.score(X_test, y_test))

score = pd.DataFrame(data=score_table)
score

Unnamed: 0,model,score
0,Logistic,0.790159
1,SVC,0.947302
2,KNN,0.936825
3,PCA_and_RandomForest,0.925714
4,PCA_and_SVC,0.909206
5,PCA_and_KNN,0.914603


## 6.グリッドサーチ
PCAとRandom Forestでグリッドサーチを行う。

In [64]:
# PCAとRandom Forestでグリッドサーチ
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Pipeline構築
pipeline = Pipeline([('scl', StandardScaler()),
                     ('pca', PCA()),
                     ('rf', RandomForestClassifier())])

# グリッドサーチパラメータ設定
params = {
    # PCAコンポーネント数：1～max(説明変数の総数)
    'pca__n_components': range(1, X_train.shape[1]),
    # サブサンプルの数
    'rf__n_estimators': [10, 100, 1000],
}

# グリッドサーチ　K分割交差検証 4分割
grid_search_cv = GridSearchCV(pipeline, param_grid=params, cv=4, verbose=2)
grid_search_cv.fit(X_train, y_train)
grid_search_cv.score(X_test, y_test)

Fitting 4 folds for each of 63 candidates, totalling 252 fits
[CV] END ...........pca__n_components=1, rf__n_estimators=10; total time=   0.0s
[CV] END ...........pca__n_components=1, rf__n_estimators=10; total time=   0.0s
[CV] END ...........pca__n_components=1, rf__n_estimators=10; total time=   0.0s
[CV] END ...........pca__n_components=1, rf__n_estimators=10; total time=   0.0s
[CV] END ..........pca__n_components=1, rf__n_estimators=100; total time=   0.6s
[CV] END ..........pca__n_components=1, rf__n_estimators=100; total time=   0.6s
[CV] END ..........pca__n_components=1, rf__n_estimators=100; total time=   0.6s
[CV] END ..........pca__n_components=1, rf__n_estimators=100; total time=   0.7s
[CV] END .........pca__n_components=1, rf__n_estimators=1000; total time=   7.4s
[CV] END .........pca__n_components=1, rf__n_estimators=1000; total time=   6.9s
[CV] END .........pca__n_components=1, rf__n_estimators=1000; total time=   6.6s
[CV] END .........pca__n_components=1, rf__n_es

0.9746031746031746

## 予測する

In [82]:
X_data = test_df.drop('left', axis=1)
X_data = X_data.drop('index', axis=1)

In [84]:
# 今回のデータではsalesとsalaryが質的変数である
ohe_columns = ['sales', 'salary']
X_data = pd.get_dummies(X_data, dummy_na=True, columns=ohe_columns)

In [85]:
X_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_IT,sales_RandD,sales_accounting,...,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical,sales_nan,salary_high,salary_low,salary_medium,salary_nan
0,0.44,0.57,2,141,3,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0.55,0.96,3,194,3,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,0.72,0.67,5,210,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0.96,0.75,4,177,2,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.96,0.54,3,198,3,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [92]:
pred = grid_search_cv.predict(X_data)

In [95]:
test_df['left'] = pred

In [96]:
test_df

Unnamed: 0,index,left,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,1670,1,0.44,0.57,2,141,3,0,0,product_mng,medium
1,13378,0,0.55,0.96,3,194,3,0,0,product_mng,medium
2,10233,0,0.72,0.67,5,210,2,0,0,management,medium
3,4719,0,0.96,0.75,4,177,2,0,0,IT,low
4,7003,0,0.96,0.54,3,198,3,0,0,support,low
...,...,...,...,...,...,...,...,...,...,...,...
4495,11275,0,0.56,0.71,3,211,6,0,1,marketing,low
4496,3828,0,0.58,0.79,5,262,2,0,0,sales,high
4497,4645,0,0.85,0.58,4,273,4,0,0,IT,medium
4498,6069,0,0.54,0.64,6,278,2,0,0,technical,medium


In [100]:
write_data = test_df[['index', 'left']]
write_data

Unnamed: 0,index,left
0,1670,1
1,13378,0
2,10233,0
3,4719,0
4,7003,0
...,...,...
4495,11275,0
4496,3828,0
4497,4645,0
4498,6069,0


In [None]:
write_data.to_csv("tk200151_森本悠真.csv", index=False)

### 参考
1. Python: scikit-learn の Pipeline を使ってみる  
   https://blog.amedama.jp/entry/2018/07/07/223257  
2. scikit-learnのpipelineモジュールで機械学習パイプラインを作る  
   https://dev.classmethod.jp/articles/create_pipeline_scikit-learn_pipeline/
3. 【python】sklearnのPipelineを使うとできること  
   https://www.haya-programming.com/entry/2018/02/22/234011
4. pythonでランダムフォレストとアンサンブル学習まとめ  
   https://qiita.com/mshinoda88/items/8bfe0b540b35437296bd  
5. アンサンブル学習とは？バギング、ブースティング、ブースティングを図で解説  
   https://nisshingeppo.com/ai/whats-ensemble/  
6. sklearn.model_selection.GridSearchCV  
   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html  