## ライブラリの読み込み


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import sys
import seaborn as sns
import xgboost as xgb
import imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids, NearMiss
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  confusion_matrix, classification_report, roc_curve, auc, accuracy_score, precision_recall_curve
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import svm
from IPython import embed
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Relax the pandas display limits so wide / long frames are shown with less truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 200)
# Render floats with four significant digits.
pd.options.display.float_format = '{:.4g}'.format
class pycolor:
    """ANSI escape sequences for colouring and styling terminal output.

    Put one of the attributes in front of the text and ``END`` after it,
    e.g. ``pycolor.RED + 'text' + pycolor.END``.
    """
    # Foreground colours.
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    PURPLE = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    # Reset every attribute.
    END = '\033[0m'
    # Text attributes. BOLD previously read '\038[1m', which is not an ESC
    # sequence ('\03' followed by the character '8'); ESC is octal 033.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    INVISIBLE = '\033[08m'
    # Misspelled name kept so existing references keep working.
    REVERCE = '\033[07m'
    # Correctly spelled alias of REVERCE.
    REVERSE = REVERCE

## データの読み込み

In [None]:
# '__file__' is passed as a string literal, so abspath() resolves it against
# the current working directory and `path` ends up being that directory.
path = os.path.dirname(os.path.abspath('__file__'))
csv_path = os.path.join(path, 'grouped_data.csv')
df = pd.read_csv(csv_path)

## データに含まれるカラム

In [None]:
# Display the column labels of the loaded frame.
df.columns

## 同じ内容のカラムの片方をドロップ

In [None]:
# These columns duplicate information carried by other columns; keep only one copy.
duplicated_columns = ['SEATROW', 'SEATLR', 'BMIG', 'PDOF1']
df = df.drop(duplicated_columns, axis = 'columns')

In [None]:
# Remove the DVper10 column from the working frame.
df = df.drop('DVper10', axis = 'columns')

In [None]:
# Remove the TRAVELSP column from the working frame.
# NOTE(review): TRAVELSP is still named by later cells (sentinel replacement,
# stdcol, not_dummy); they have to tolerate its absence once this cell has run.
df = df.drop('TRAVELSP', axis = 'columns')

## NANと同じ意味のデータをNANに置換

In [None]:
# 777 is a code that means "no value" for TRAVELSP; turn it into a real NaN
# so that it is handled by the imputation step instead of being read as a speed.
# An earlier cell may already have dropped TRAVELSP; skip the replacement then
# (the standardisation loop guards the same way) instead of raising KeyError.
if 'TRAVELSP' in df.columns:
    df['TRAVELSP'] = df['TRAVELSP'].replace(777, np.nan)

## 相関係数の高い特徴量をドロップ

In [None]:
# Build five candidate feature sets: variants 0-3 each drop one pair from
# `first_groups` together with one pair from `second_groups`; variant 4 keeps
# every column.
df_dic = {}
first_groups = [['MODELG', 'SEATROW2'], ['BAGAVAIL', 'BAGAVOTH']]
second_groups = [['BODYG', 'obodyg'], ['WGTG', 'owgtg']]
count = 0
for col1 in first_groups:
    for col2 in second_groups:
        dropped = col1 + col2
        df_dic[count] = df.drop(columns = dropped)
        print(count)
        print('remain : ', list(df_dic[count].columns))
        print('drop : ', dropped)
        count += 1
# NOTE(review): this stores `df` itself, not a copy, so the in-place
# standardisation / imputation applied to df_dic[4] later also modifies `df`.
df_dic[count] = df

## 標準化

In [None]:
# Standardise the continuous columns. The scaler is fitted on the non-missing
# training rows (YEAR < 2015) only and then applied to every non-missing row,
# so the 2015 test rows never influence the scaling parameters.
stdcol = ['SPLIMIT', 'TRAVELSP', 'DVTOTAL']
for i in range(5):
    frame = df_dic[i]
    for j in stdcol:
        train_part = frame.query('YEAR < 2015')
        if j not in train_part.columns:
            # This variant does not contain the column.
            continue
        std = StandardScaler()
        train_observed = train_part[j].notnull()
        std.fit(train_part.loc[train_observed, [j]])
        observed = frame[j].notnull()
        frame.loc[observed, [j]] = std.transform(frame.loc[observed, [j]])

## 中央値で欠損値補完

In [None]:
# Fill missing values column by column with the median of the training rows
# (YEAR < 2015); the median is printed for every column that had gaps.
for i in range(5):
    frame = df_dic[i]
    for j in list(frame.columns):
        med = frame.query('YEAR < 2015')[j].median()
        has_missing = frame[j].isnull().any()
        if has_missing:
            print(j, ':', med)
        frame[j] = frame[j].fillna(med)

## 欠損値補完の確認

In [None]:
# Sanity check after imputation. The printed number is the count of columns
# that still contain at least one NaN (not the count of NaN cells).
for i in range(5):
    columns_with_nan = df_dic[i].isnull().any()
    print('欠損値数：', columns_with_nan.sum())

## ダミー変数化

In [None]:
# One-hot encode every column except the ones listed in `not_dummy`
# (year, targets and the continuous measurements), keeping all levels.
dummy_dic = {}
not_dummy = ['YEAR', 'MAIS', 'MAIS3', 'DVTOTAL', 'TRAVELSP']
for i in range(5):
    categorical = [x for x in df_dic[i].columns if x not in not_dummy]
    dummy_dic[i] = pd.get_dummies(df_dic[i], columns = categorical, drop_first = False)

## テストデータと訓練データの分割

In [None]:
# Split by year: rows with YEAR < 2015 are training data, YEAR == 2015 is the
# held-out test set. YEAR and both targets are removed from the feature frames.
target_cols = ['YEAR', 'MAIS', 'MAIS3']

train_rows = df.query('YEAR < 2015')
test_rows = df.query('YEAR == 2015')
y_train, y_train3 = train_rows['MAIS'], train_rows['MAIS3']
y_test, y_test3 = test_rows['MAIS'], test_rows['MAIS3']
objective, objective3 = df['MAIS'], df['MAIS3']

train_dummy, train = {}, {}
test_dummy, test = {}, {}
explanatory = {}
for i in range(5):
    encoded = dummy_dic[i]
    plain = df_dic[i]
    train_dummy[i] = encoded.query('YEAR < 2015').drop(columns = target_cols)
    test_dummy[i] = encoded.query('YEAR == 2015').drop(columns = target_cols)
    train[i] = plain.query('YEAR < 2015').drop(columns = target_cols)
    test[i] = plain.query('YEAR == 2015').drop(columns = target_cols)
    explanatory[i] = plain.drop(columns = target_cols)

## ランダムフォレストで推定

In [None]:
# Fit a class-weighted random forest on each feature-set variant and report
# test / train AUC and accuracy plus the feature importances.
forest_dic = {}
gs_result = {}
predict = {}  # hard class predictions on the test set, per variant
pred = {}  # hard class predictions on the training set, per variant
f_imp_dic = {}
grid_param = {'max_depth': [5], 'min_samples_split': [2], 'min_samples_leaf': [2], 'n_estimators': [500]}
for i in range(5):
    forest_dic[i] = GridSearchCV(RandomForestClassifier(random_state = 0, class_weight = 'balanced'), grid_param, cv = 5, scoring = 'roc_auc')
    forest_dic[i].fit(train[i], y_train3)
    best_forest = forest_dic[i].best_estimator_
    # Column of predict_proba that belongs to the positive class (label 1).
    pos_col = list(best_forest.classes_).index(1)

    predict[i] = forest_dic[i].predict(test[i])
    # The ROC curve needs a continuous score; feeding it hard 0/1 labels gives
    # a single operating point and a number that is not comparable with the
    # 'roc_auc' score used by the grid search.
    test_score = forest_dic[i].predict_proba(test[i])[:, pos_col]
    fpr, tpr, thresholds = roc_curve(y_test3, test_score, pos_label = 1)
    print('best parameter:', forest_dic[i].best_params_)
    print('dic num:', i)
    print('test auc:', auc(fpr, tpr))
    print('test accuracy:', accuracy_score(y_test3, predict[i]) * 100)

    pred[i] = forest_dic[i].predict(train[i])
    train_score = forest_dic[i].predict_proba(train[i])[:, pos_col]
    fpr, tpr, thresholds = roc_curve(y_train3, train_score, pos_label = 1)
    print('train auc:', auc(fpr, tpr))
    print('train accuracy:', accuracy_score(y_train3, pred[i]) * 100)

    gs_result[i] = pd.DataFrame.from_dict(forest_dic[i].cv_results_)
    # Feature importances, most important first.
    f_imp_dic[i] = pd.DataFrame({
        'column': list(train[i].columns),
        'importance': best_forest.feature_importances_,
    }).sort_values('importance', ascending = False)

## 実際のMAIS3+が1で推定結果が0となっているデータの出力

In [None]:
# Work on an explicit copy: assigning a column to the result of query() is a
# write into a possible view and triggers SettingWithCopyWarning.
hoge = df.query('YEAR == 2015').copy()
# Predictions are taken straight from the fitted forest of variant 4 (the
# variant that keeps every column) so the result does not depend on whatever
# the notebook-global name `predict` is currently bound to.
hoge['pre'] = forest_dic[4].predict(test[4])
hoge = hoge.drop(columns = ['YEAR'])
# Number of true MAIS3+ cases, and how many of them were predicted as 0.
len(hoge.query('MAIS3 == 1'))
len(hoge.query('MAIS3 == 1 & pre == 0'))
print('真値1, 推定値0のデータ')
hoge.query('MAIS3 == 1 & pre == 0')
print('真値1, 推定値1のデータ')
hoge.query('MAIS3 == 1 & pre == 1').sort_values('MAIS', ascending = False)

## ロジスティック回帰で推定

In [None]:
# Fit a class-weighted logistic regression on the one-hot encoded variants and
# report test / train AUC plus the coefficients ordered by magnitude.
logi_dic = {}
coef_df = {}
grid_param = {'C': [0.1, 0.5, 1], 'solver': ['liblinear']}
for i in range(5):
    logi_dic[i] = GridSearchCV(LogisticRegression(random_state = 0, class_weight = 'balanced'), grid_param, cv = 5, scoring = 'roc_auc')
    logi_dic[i].fit(train_dummy[i], y_train3)
    best_logi = logi_dic[i].best_estimator_
    # Column of predict_proba that belongs to the positive class (label 1).
    pos_col = list(best_logi.classes_).index(1)

    # Score with probabilities: a ROC curve built from hard 0/1 labels has a
    # single operating point and is not the AUC the grid search optimises.
    # The scores are kept in local names so the `predict` dict filled by the
    # random-forest cell is not overwritten.
    test_score = logi_dic[i].predict_proba(test_dummy[i])[:, pos_col]
    fpr, tpr, thresholds = roc_curve(y_test3, test_score, pos_label = 1)

    coef_df[i] = pd.DataFrame({'Feature' : list(train_dummy[i].columns.values), 'coef' : best_logi.coef_[0]})
    coef_df[i]['abs'] = coef_df[i]['coef'].abs()
    coef_df[i] = coef_df[i].sort_values('abs', ascending = False).drop('abs', axis = 1)

    print('best parameter:', logi_dic[i].best_params_)
    print('dic num:', i)
    print('test auc: ', auc(fpr, tpr))
    train_score = logi_dic[i].predict_proba(train_dummy[i])[:, pos_col]
    fpr, tpr, thresholds = roc_curve(y_train3, train_score, pos_label = 1)
    print('train auc: ', auc(fpr, tpr))

In [None]:
# Show the ten largest-magnitude coefficients of every variant.
# The loop previously always indexed variant 1, and a bare expression inside a
# loop body is never echoed by the notebook, so the table is printed explicitly.
for i in range(5):
    print('dic num:', i)
    print(coef_df[i].head(10))

## リサンプリング(標準化前)

In [None]:
# Resample the training rows of every variant with six strategies and store
# the results as sampling_train['<method>_X_<i>'] / sampling_train['<method>_y_<i>'].
# The dict is created here so the cell runs on a fresh kernel.
sampling_train = {}
for i in range(5):
    trainsample = df_dic[i].query('YEAR < 2015')
    trainy = trainsample['MAIS3']
    trainX = trainsample.drop(columns = ['YEAR', 'MAIS', 'MAIS3'])
    # Synthetic samples are interpolated, so coded columns get snapped back to
    # whole numbers. The columns in `stdcol` are continuous (and standardised
    # when the notebook is run in order); rounding them would collapse them to
    # a handful of integers, so they are left untouched.
    round_cols = [c for c in trainX.columns if c not in stdcol]
    # name -> (sampler, whether it generates synthetic rows).
    # fit_resample / sampling_strategy are the current imbalanced-learn names
    # for the removed fit_sample / ratio; TomekLinks is deterministic and no
    # longer accepts random_state.
    samplers = {
        'ros': (RandomOverSampler(random_state = 0), False),
        'rus': (RandomUnderSampler(random_state = 0), False),
        'tl': (TomekLinks(sampling_strategy = 'majority'), False),
        'cc': (ClusterCentroids(random_state = 0, sampling_strategy = 'majority'), True),
        'smote': (SMOTE(random_state = 0, sampling_strategy = 'minority'), True),
        'smt': (SMOTETomek(random_state = 0, sampling_strategy = 'auto'), True),
    }
    for name, (sampler, synthetic) in samplers.items():
        X_res, y_res = sampler.fit_resample(trainX, trainy)
        # Older imbalanced-learn returns bare arrays; restore the column names
        # so the frame lines up with test[i].
        X_res = pd.DataFrame(X_res, columns = list(trainX.columns))
        if synthetic:
            X_res[round_cols] = X_res[round_cols].round()
        sampling_train['{}_X_{}'.format(name, i)] = X_res
        sampling_train['{}_y_{}'.format(name, i)] = y_res

In [None]:
# Report, for every variant and sampling method, how many more class-0 than
# class-1 rows remain after resampling (0 means perfectly balanced).
# The difference is printed explicitly: a bare expression inside a loop body
# is not echoed by the notebook.
for i in range(5):
    for j in ['ros', 'rus', 'tl', 'cc', 'smote', 'smt']:
        resampled_y = np.asarray(sampling_train['{}_y_{}'.format(j, i)])
        print(i, j)
        print(np.count_nonzero(resampled_y == 0) - np.count_nonzero(resampled_y == 1))

## リサンプリングした訓練データを用いてランダムフォレストで推定

In [None]:
# Train a random forest on every (sampling method, variant) pair and collect
# test / train AUC and accuracy into `resample_forest`, one row per pair.
result_columns = ['サンプリング方法', 'データ番号', 'test auc', 'test accuracy', 'train auc', 'train accuracy', 'auc diff', 'best param']
grid_param = {'max_depth': [5], 'min_samples_split': [2], 'min_samples_leaf': [2], 'n_estimators': [500]}
result_rows = []
for j in ['ros', 'rus', 'tl', 'cc', 'smote', 'smt']:
    for i in range(5):
        # Only the 'tl' data is fitted with balanced class weights; every other
        # method uses the estimator default (None).
        class_weight = 'balanced' if j == 'tl' else None
        forest = GridSearchCV(RandomForestClassifier(random_state = 0, class_weight = class_weight), grid_param, cv = 5, scoring = 'roc_auc')
        X_res = sampling_train['{}_X_{}'.format(j, i)]
        y_res = sampling_train['{}_y_{}'.format(j, i)]
        forest.fit(X_res, y_res)
        best_forest = forest.best_estimator_
        # Column of predict_proba that belongs to the positive class (label 1).
        pos_col = list(best_forest.classes_).index(1)

        # AUC is computed from probabilities; hard 0/1 labels would give a
        # one-point ROC curve. Accuracy still uses the hard labels.
        test_label = best_forest.predict(test[i])
        test_score = best_forest.predict_proba(test[i])[:, pos_col]
        fpr, tpr, thresholds = roc_curve(y_test3, test_score, pos_label = 1)
        test_auc = auc(fpr, tpr)

        train_label = best_forest.predict(X_res)
        train_score = best_forest.predict_proba(X_res)[:, pos_col]
        fpr, tpr, thresholds = roc_curve(y_res, train_score, pos_label = 1)
        train_auc = auc(fpr, tpr)

        # Rows are gathered in a list and turned into a frame once; writing
        # with frame[col][row] = value is chained assignment, which pandas
        # does not guarantee to reach the original frame.
        result_rows.append({
            'サンプリング方法': j,
            'データ番号': i,
            'test auc': test_auc,
            'test accuracy': accuracy_score(y_test3, test_label) * 100,
            'train auc': train_auc,
            'train accuracy': accuracy_score(y_res, train_label) * 100,
            'auc diff': train_auc - test_auc,
            'best param': forest.best_params_,
        })
resample_forest = pd.DataFrame(result_rows, columns = result_columns)

## 結果の出力

In [None]:
# Order the results by sampling method and then by test AUC, both descending,
# and display every row.
resample_forest = resample_forest.sort_values(by = ['サンプリング方法', 'test auc'], ascending = [False, False])
resample_forest.head(resample_forest.shape[0])

## リサンプリング(標準化後)

In [None]:
# Resample the training rows of every variant with six strategies, one-hot
# encode the result and store it as sampling_train['<method>_X_<i>'] /
# sampling_train['<method>_y_<i>'] for the logistic-regression cell.
sampling_train = {}
for i in range(5):
    trainsample = df_dic[i].query('YEAR < 2015')
    trainy = trainsample['MAIS3']
    trainX = trainsample.drop(columns = ['YEAR', 'MAIS', 'MAIS3'])
    # Coded columns that must be snapped back to whole numbers after
    # interpolation. The standardised columns in `stdcol` are kept unrounded;
    # they were previously removed from the frame altogether for cc / smote /
    # smt, which left the model with fewer features than test_dummy[i].
    round_cols = [x for x in trainX.columns if x not in stdcol]
    dummy_cols = [x for x in trainX.columns if x not in not_dummy]
    # Layout of the design matrix the fitted model will be evaluated on.
    reference_cols = test_dummy[i].columns
    # name -> (sampler, whether it generates synthetic rows).
    # fit_resample / sampling_strategy are the current imbalanced-learn names
    # for the removed fit_sample / ratio; TomekLinks is deterministic and no
    # longer accepts random_state.
    samplers = {
        'ros': (RandomOverSampler(random_state = 0), False),
        'rus': (RandomUnderSampler(random_state = 0), False),
        'tl': (TomekLinks(sampling_strategy = 'majority'), False),
        'cc': (ClusterCentroids(random_state = 0, sampling_strategy = 'majority'), True),
        'smote': (SMOTE(random_state = 0, sampling_strategy = 'minority'), True),
        'smt': (SMOTETomek(random_state = 0, sampling_strategy = 'auto'), True),
    }
    for name, (sampler, synthetic) in samplers.items():
        X_res, y_res = sampler.fit_resample(trainX, trainy)
        X_res = pd.DataFrame(X_res, columns = list(trainX.columns))
        if synthetic:
            X_res[round_cols] = X_res[round_cols].round()
        # Restore the original dtypes so get_dummies names the columns exactly
        # as in test_dummy (e.g. 'COL_1', not 'COL_1.0').
        X_res = X_res.astype(trainX.dtypes.to_dict())
        X_res = pd.get_dummies(X_res, drop_first = False, columns = dummy_cols)
        # Align with the test design matrix: levels missing after resampling
        # become all-zero columns, levels unknown to the test matrix are dropped.
        # NOTE(review): SPLIMIT is standardised but also one-hot encoded, so
        # interpolated SPLIMIT values of synthetic rows match no existing
        # level and end up with all-zero SPLIMIT dummies - confirm intent.
        X_res = X_res.reindex(columns = reference_cols, fill_value = 0)
        sampling_train['{}_X_{}'.format(name, i)] = X_res
        sampling_train['{}_y_{}'.format(name, i)] = y_res

## リサンプリングした訓練データを用いてロジスティック回帰で推定

In [None]:
# Train a logistic regression on every (sampling method, variant) pair and
# collect test / train AUC and accuracy into `resample_logi`, one row per pair.
result_columns = ['サンプリング方法', 'データ番号', 'test auc', 'test accuracy', 'train auc', 'train accuracy', 'auc diff', 'best param']
grid_param = {'C': [0.1, 0.5, 1], 'solver': ['liblinear']}
result_rows = []
for j  in ['ros', 'rus', 'tl', 'smt', 'cc', 'smote']:
    print(j)
    for i in range(5):
        # Only the 'tl' data is fitted with balanced class weights; every other
        # method uses the estimator default (None).
        class_weight = 'balanced' if j == 'tl' else None
        logi = GridSearchCV(LogisticRegression(random_state = 0, class_weight = class_weight), grid_param, cv = 5, scoring = 'roc_auc')
        X_res = sampling_train['{}_X_{}'.format(j, i)]
        y_res = sampling_train['{}_y_{}'.format(j, i)]
        logi.fit(X_res, y_res)
        best_logi = logi.best_estimator_
        # Column of predict_proba that belongs to the positive class (label 1).
        pos_col = list(best_logi.classes_).index(1)

        # AUC is computed from probabilities; hard 0/1 labels would give a
        # one-point ROC curve. Accuracy still uses the hard labels.
        test_label = best_logi.predict(test_dummy[i])
        test_score = best_logi.predict_proba(test_dummy[i])[:, pos_col]
        fpr, tpr, thresholds = roc_curve(y_test3, test_score, pos_label = 1)
        test_auc = auc(fpr, tpr)

        train_label = best_logi.predict(X_res)
        train_score = best_logi.predict_proba(X_res)[:, pos_col]
        fpr, tpr, thresholds = roc_curve(y_res, train_score, pos_label = 1)
        train_auc = auc(fpr, tpr)

        # Rows are gathered in a list and turned into a frame once; writing
        # with frame[col][row] = value is chained assignment, which pandas
        # does not guarantee to reach the original frame.
        result_rows.append({
            'サンプリング方法': j,
            'データ番号': i,
            'test auc': test_auc,
            'test accuracy': accuracy_score(y_test3, test_label) * 100,
            'train auc': train_auc,
            'train accuracy': accuracy_score(y_res, train_label) * 100,
            'auc diff': train_auc - test_auc,
            'best param': logi.best_params_,
        })
resample_logi = pd.DataFrame(result_rows, columns = result_columns)

## 結果の出力

In [None]:
# Order the results by sampling method and then by test AUC, both descending,
# and display every row.
resample_logi = resample_logi.sort_values(by = ['サンプリング方法', 'test auc'], ascending = [False, False])
resample_logi.head(resample_logi.shape[0])

## SVMで推定(重い割にそこまで精度は出ない)

In [None]:
# Fit a class-weighted SVM on the one-hot encoded variants and report
# test / train AUC.
svm_dic = {}
grid_param = {'C': [100, 500], 'gamma': [0.001, 0.01]}
for i in range(5):
    svm_dic[i] = GridSearchCV(svm.SVC(random_state = 0, class_weight = 'balanced'), grid_param, cv = 5, scoring = 'roc_auc')
    svm_dic[i].fit(train_dummy[i], y_train3)
    # SVC is built without probability=True, so the signed distance to the
    # separating surface is used as the ROC score; hard 0/1 predictions would
    # give a one-point ROC curve. Assumes the two classes are labelled 0 and 1,
    # so that larger scores mean class 1.
    # The scores are kept in local names so the `predict` dict filled by the
    # random-forest cell is not overwritten.
    test_score = svm_dic[i].decision_function(test_dummy[i])
    fpr, tpr, thresholds = roc_curve(y_test3, test_score, pos_label = 1)
    print('best parameter:', svm_dic[i].best_params_)
    print('dic num:', i)
    print('test auc: ', auc(fpr, tpr))
    train_score = svm_dic[i].decision_function(train_dummy[i])
    fpr, tpr, thresholds = roc_curve(y_train3, train_score, pos_label = 1)
    print('train auc: ', auc(fpr, tpr))