# 機械学習プロジェクト 分類問題の解き方の例
### 株式会社データミックス データサイエンティスト育成コース ベーシックステップ

In [None]:
%matplotlib inline

from __future__ import print_function

import copy

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer the modern
# module and fall back to the legacy one only on very old installations.
try:
    from sklearn.model_selection import train_test_split, KFold
except ImportError:
    from sklearn.cross_validation import train_test_split, KFold

In [None]:
# Load the HR dataset from the CSV file under ./dataset.
data = pd.read_csv(filepath_or_buffer="dataset/HR_comma_sep.csv")

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

## salesとsalaryをダミー変数にする

In [None]:
# One-hot encode the 'sales' and 'salary' columns.
data2 = pd.get_dummies(data=data, columns=['sales', 'salary'])

In [None]:
# Label column; it is also the only column kept out of the feature set.
target_col = 'left'
exclude_cols = [target_col]
# Every remaining column of the encoded frame is used as a feature.
feature_cols = [name for name in data2.columns if name not in exclude_cols]

## 学習データとテストデータに分割する

In [None]:
# Convert the feature columns and the label column to NumPy arrays.
X, y = np.array(data2[feature_cols]), np.array(data2[target_col])

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1234
)

## 機械学習モデルを作る - ロジスティック回帰編

評価指標としてConfusion matrixとAUCを計算するので、必要な関数をインポートする

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegressionCV

### ロジスティック回帰モデルを初期化

In [None]:
# L2-regularised logistic regression; C is picked by cross-validation from the
# listed candidates, and class weights are balanced.
lr = LogisticRegressionCV(
    Cs=[0.01, 0.1, 1, 10],
    penalty='l2',
    class_weight='balanced',
)

# Variant that leaves the class weights unchanged:
#lr = LogisticRegressionCV(penalty='l2', Cs=[0.01, 0.1, 1, 10], class_weight=None)

### 学習

In [None]:
# Train the logistic regression on the training split.
lr.fit(X=X_train, y=y_train)

### 予測
確率を予測する（その場合はpredict_probaを使う。対数確率が欲しい場合はpredict_log_proba、対数オッズが欲しい場合はdecision_function、フラグを出したい場合はpredictを使う）

In [None]:
# Per-class probabilities for the test split (one column per class).
y_pred_proba = lr.predict_proba(X=X_test)
# Leave the array as the last expression so the notebook displays it.
y_pred_proba

### モデルの性能評価

AUCを計算

In [None]:
# AUC on the test split, scored with the second probability column.
roc_auc_score(y_true=y_test, y_score=y_pred_proba[:, 1])

Confusion matrix

In [None]:
# Probability threshold above which a sample is flagged as positive.
# Alternative: use the base rate of the target instead of a fixed 0.5.
#cutoff_score = np.mean(data2['left'])
cutoff_score = 0.5
# Threshold the second probability column into 0/1 flags. The builtin `int`
# replaces `np.int`, an alias that was deprecated in NumPy 1.20 and removed
# in 1.24.
ypred_flag = (y_pred_proba[:, 1] > cutoff_score).astype(int)

In [None]:
# Confusion matrix of the true labels against the thresholded predictions.
confusion_matrix(y_test, ypred_flag)

Precision, Recall, F1-scoreの計算

In [None]:
# Text report with precision, recall and F1-score.
print(classification_report(y_test, ypred_flag))

## 機械学習モデルを作る - ランダムフォレスト編

In [None]:
from sklearn.ensemble import RandomForestClassifier
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# legacy sklearn.grid_search module was removed in 0.20. Try the modern
# location first and fall back only on very old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

### ハイパーパラメーターのチューニング

In [None]:
# Random forest hyper-parameter grid: number of trees x maximum tree depth.
params = dict(
    n_estimators=[10, 50, 100],
    max_depth=[3, 5, None],
)

In [None]:
# Base estimator for the grid search. class_weight='auto' was deprecated in
# scikit-learn 0.17 and removed in 0.19; 'balanced' is its replacement and
# matches the setting used for the logistic regression model.
rf = RandomForestClassifier(class_weight='balanced')

In [None]:
# Grid search over `params` for the random forest, scored by ROC AUC and run
# with 4 parallel jobs.
gcv = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=4,
)

In [None]:
# Run the grid search on the training split.
gcv.fit(X=X_train, y=y_train)

In [None]:
# Show the hyper-parameter combination the grid search selected.
gcv.best_params_

In [None]:
# Show the best score of the search (ROC AUC, per scoring='roc_auc').
gcv.best_score_

### チューニングしたランダムフォレストでモデルを作る

In [None]:
# Final random forest. The hyper-parameters are hard-coded (presumably the
# grid-search winner; verify against gcv.best_params_).
# class_weight='auto' was deprecated in scikit-learn 0.17 and removed in 0.19;
# 'balanced' is its replacement.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight='balanced',
)

In [None]:
# Train the random forest on the training split.
rf.fit(X=X_train, y=y_train)

### 予測

In [None]:
# Per-class probabilities from the random forest for the test split.
y_pred_proba = rf.predict_proba(X=X_test)

### モデルの性能評価

AUC

In [None]:
# AUC of the random forest on the test split (second probability column).
roc_auc_score(y_test, y_pred_proba[:, 1])

Confusion matrix

In [None]:
# Probability threshold above which a sample is flagged as positive.
cutoff_score = 0.5
# The builtin `int` replaces `np.int`, an alias that was deprecated in NumPy
# 1.20 and removed in 1.24.
ypred_flag = (y_pred_proba[:, 1] > cutoff_score).astype(int)

In [None]:
# Confusion matrix for the thresholded random forest predictions.
confusion_matrix(y_test, ypred_flag)

Precision, Recall, F1-scoreの計算

In [None]:
# Precision, recall and F1-score for the random forest predictions.
print(classification_report(y_test, ypred_flag))

## 機械学習モデルを作る - AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost hyper-parameter grid: number of estimators x learning rate.
params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 1],
)

In [None]:
# AdaBoost classifier created with the library defaults.
ada = AdaBoostClassifier()

In [None]:
# Grid search over `params` for AdaBoost, scored by ROC AUC and run with
# 4 parallel jobs.
gcv = GridSearchCV(
    estimator=ada,
    param_grid=params,
    scoring='roc_auc',
    n_jobs=4,
)

In [None]:
# Run the AdaBoost grid search on the training split.
gcv.fit(X=X_train, y=y_train)

In [None]:
# Show the hyper-parameter combination the grid search selected.
gcv.best_params_

In [None]:
# Show the best score of the search (ROC AUC, per scoring='roc_auc').
gcv.best_score_

In [None]:
# Final AdaBoost model with hard-coded hyper-parameters (presumably the
# grid-search winner; verify against gcv.best_params_).
ada = AdaBoostClassifier(
    learning_rate=1,
    n_estimators=200,
)

In [None]:
# Train the AdaBoost model on the training split.
ada.fit(X=X_train, y=y_train)

In [None]:
# Per-class probabilities from AdaBoost for the test split.
y_pred_proba = ada.predict_proba(X=X_test)

In [None]:
# AUC of the AdaBoost model on the test split (second probability column).
roc_auc_score(y_test, y_pred_proba[:, 1])

In [None]:
# Probability threshold above which a sample is flagged as positive.
cutoff_score = 0.5
# The builtin `int` replaces `np.int`, an alias that was deprecated in NumPy
# 1.20 and removed in 1.24.
ypred_flag = (y_pred_proba[:, 1] > cutoff_score).astype(int)

In [None]:
# Confusion matrix for the thresholded AdaBoost predictions.
confusion_matrix(y_test, ypred_flag)

In [None]:
# Precision, recall and F1-score for the AdaBoost predictions.
print(classification_report(y_test, ypred_flag))