In [1]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost

In [2]:
import pandas as pd
from datetime import datetime as dtm
from src.experiment_based_function import *
from wKit.utility.file_sys import mkdirs_if_not_exist
from wKit.ML.sk_ml import (grid_cv_a_model, grid_cv_default_params, evaluator_scalable_cls, 
                           show_important_features, confusion_matrix_as_df)

In [3]:
from imblearn.over_sampling import SMOTE

In [4]:
def load_data():
    """Read the target series and both feature tables from the ``data/`` folder.

    Every CSV is read with its first column as the index.

    Returns
    -------
    (dict, pandas.Series)
        ``Xs`` maps ``'NO_TOTAL'`` and ``'TOTAL'`` to the corresponding feature
        DataFrame; ``y`` is the ``csl`` column of ``data/y_csl_all.csv``.
    """
    def _read_indexed(path):
        # all inputs share the same layout: first column is the row index
        return pd.read_csv(path, index_col=0)

    target = _read_indexed('data/y_csl_all.csv').csl
    total_features = _read_indexed('data/x_TOTAL_~2014.csv')
    type_features = _read_indexed('data/x_NO_TOTAL_~2014.csv')
    return {'NO_TOTAL': type_features, 'TOTAL': total_features}, target

In [5]:
def init_model_params(name):
    """Build a fresh estimator and look up its default grid-search parameters.

    Parameters
    ----------
    name : str
        One of ``'XGBreg'``, ``'XGBcls'`` or ``'GDBcls'``.

    Returns
    -------
    (estimator, param)
        A newly constructed, unfitted model and the entry of
        ``grid_cv_default_params()`` stored under ``['reg' | 'cls'][name]``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    # model name -> key of the parameter group it lives under
    kind_of = {'XGBreg': 'reg', 'XGBcls': 'cls', 'GDBcls': 'cls'}
    # Validate first: the original `raise('no model')` raised a plain string,
    # which itself fails with an unrelated TypeError and hides the real cause.
    if name not in kind_of:
        raise ValueError('no model: %r (expected one of %s)'
                         % (name, ', '.join(sorted(kind_of))))

    params = grid_cv_default_params()
    if name == 'XGBreg':
        model = xgboost.XGBRegressor()
    elif name == 'XGBcls':
        model = xgboost.XGBClassifier()
    else:  # 'GDBcls'
        model = GradientBoostingClassifier()
    param = params[kind_of[name]][name]

    return model, param

In [6]:

def upsample_one_class(y_one_class, target_num):
    """Replicate the labels of one class so their count approaches ``target_num``.

    The series is repeated a whole number of times, namely
    ``round(target_num / len(y_one_class))``, so the resulting length is a
    multiple of the input length and only approximates ``target_num``.

    Parameters
    ----------
    y_one_class : pandas.Series
        Labels belonging to a single class.
    target_num : int
        Desired number of samples for that class.

    Returns
    -------
    pandas.Series
        The input object itself when the repeat count is 1, otherwise the
        concatenation of that many copies (index values are repeated too).
    """
    repeats = int(round(target_num / len(y_one_class)))
    if repeats != 1:
        # stack `repeats` copies of the class on top of each other
        return pd.concat([y_one_class for _ in range(repeats)])
    # already close enough to the target size: hand back the input untouched
    return y_one_class

def upsample(train_y):
    """Balance a label series by duplicating the samples of smaller classes.

    Classes are defined by the rounded label value. Each class is passed to
    ``upsample_one_class`` with the size of the largest class as the target,
    and the per-class results are concatenated in order of first appearance
    of each rounded label.

    Parameters
    ----------
    train_y : pandas.Series
        Training labels (original, un-rounded values are kept in the output).

    Returns
    -------
    pandas.Series
        The up-sampled labels.
    """
    rounded = train_y.round()
    largest_class_size = rounded.value_counts().max()
    per_class = [
        upsample_one_class(train_y[rounded == label], largest_class_size)
        for label in rounded.unique()
    ]
    return pd.concat(per_class)

In [7]:
def get_total_or_type(total_or_not):
    """Translate a feature-set key into its short label.

    ``'TOTAL'`` maps to ``'total'`` and ``'NO_TOTAL'`` maps to ``'type'``;
    any other key raises ``KeyError``.
    """
    short_label = dict(TOTAL='total', NO_TOTAL='type')
    return short_label[total_or_not]


In [8]:
# Load both feature tables (Xs, keyed by 'NO_TOTAL' / 'TOTAL') and the target series,
# then round the target to zero decimals so every label is a whole number.
Xs, y = load_data()
y = y.round()

In [9]:
# (feature set, model name) pairs to run: the first item is a key of Xs,
# the second is the name handed to init_model_params.
combos = [('TOTAL', 'XGBcls'), ('NO_TOTAL', 'XGBreg'), ('TOTAL', 'GDBcls')]

In [13]:
# Up-sampling experiment. For each seed: split the data, SMOTE-oversample the
# training part, then for every (feature set, model) combo and every feature
# selection type grid-search a model, evaluate it on the non-oversampled test
# split, and write importances, confusion matrices and summary tables under
# data/up_down_experiment/seed_<seed>/upsample_res.
# SEEDS, FSELECT_TYPE, get_idx and scale_and_selection are not defined in this
# notebook; presumably they come from `from src.experiment_based_function import *`.
for seed in SEEDS:
    # set up experiment path
    exp_path = 'data/up_down_experiment/seed_%d' % seed
    upsample_path = '%s/upsample_res' % exp_path
    mkdirs_if_not_exist(exp_path)
    mkdirs_if_not_exist(upsample_path)

    # get train/test index
    # NOTE(review): get_idx presumably creates or reloads the split stored in indices.txt - confirm
    idx_fn = '%s/%s' % (exp_path, 'indices.txt')
    train_idx, test_idx = get_idx(y.index, idx_fn, seed)

    # get train_y and test_y
    train_y, test_y = y.loc[train_idx], y.loc[test_idx]

    # one record per (combo, feature selection) run; converted to DataFrames after the combos loop
    grid_records, eval_records = [], []

    # iterate combos
    for total_or_not, name in combos:
        total_or_type = get_total_or_type(total_or_not)
        # get train x and test_x
        X = Xs[total_or_not]
        train_x, test_x = X.loc[train_idx], X.loc[test_idx]
        feature_names = train_x.columns
        n_all = len(feature_names)

        # oversample train_x and train_y
        # Seed SMOTE with the experiment seed: without random_state the synthetic
        # samples differ on every run, so results stored per seed are not reproducible.
        upsampler = SMOTE(random_state=seed)
        # imbalanced-learn renamed fit_sample to fit_resample and later dropped the
        # old name; prefer the new one and fall back for old installations.
        resample = getattr(upsampler, 'fit_resample', None) or upsampler.fit_sample
        up_train_x, up_train_y = resample(train_x, train_y)
        # NOTE(review): oversampling happens before the grid-search CV below, so synthetic
        # samples may end up in validation folds and make CV scores optimistic - confirm intent.

        # for each combo, do a feature selection experiment
        for fselect_type in FSELECT_TYPE:
            # create folder for each fselect experiment
            cv_path = '%s/%s' % (upsample_path, fselect_type)
            mkdirs_if_not_exist(cv_path)

            # min-max scale and select
            dset = scale_and_selection(up_train_x, up_train_y, test_x, test_y, fselect_type)
            n_keep = dset['selected_ftr'].sum()

            # grid search best fit model; kind is the last three characters of the name ('cls' / 'reg')
            model, param = init_model_params(name)
            grid_res = grid_cv_a_model(dset['train_x'], dset['train_y'], model, param,
                                       kind=name[-3:], name=name, path=cv_path)
            grid_res['total_or_type'] = total_or_type
            grid_res['feature_selection'] = fselect_type
            # keep the fitted estimator out of the record that is written to CSV
            model = grid_res.pop('best_model')
            grid_records.append(grid_res)

            # evaluate on original test set
            eval_res = evaluator_scalable_cls(model, dset['train_x'], dset['train_y'],
                                              dset['test_x'], dset['test_y'])
            eval_res['total_or_type'] = total_or_type
            eval_res['model_name'] = name
            eval_res['feature_selection'] = fselect_type
            eval_res['#all'] = n_all
            eval_res['#keep'] = n_keep
            print('feature selection: %d -> %d' % (n_all, n_keep))
            eval_records.append(eval_res)

            # save feature importances
            # NOTE(review): labels are ALL feature names; if scale_and_selection drops columns,
            # they may not line up with the features the model was trained on - confirm.
            imp = show_important_features(model, labels=feature_names, set_std=False,
                                          show_plt=False).drop('std', axis=1)
            imp.columns = ['label', 'importance_%d' % seed]
            imp.to_csv('%s/imp-%s-%s.csv' % (cv_path, name, total_or_not))

            # save confusion matrix (normalized and raw)
            cfsn_norm = confusion_matrix_as_df(model, dset['test_x'], dset['test_y'],
                                               labels=[1, 2, 3, 4, 5], normalize=True)
            cfsn_norm.to_csv('%s/cfsn_norm-%s-%s.csv' % (cv_path, name, total_or_not))
            cfsn = confusion_matrix_as_df(model, dset['test_x'], dset['test_y'], labels=[1, 2, 3, 4, 5])
            cfsn.to_csv('%s/cfsn-%s-%s.csv' % (cv_path, name, total_or_not))

    # save result
    df_grid_res = pd.DataFrame(grid_records)
    df_grid_res.to_csv('%s/grid_res.csv' % upsample_path)
    df_eval_res = pd.DataFrame(eval_records)
    df_eval_res.to_csv('%s/eval_res.csv' % upsample_path)
    # NOTE: only the first seed is processed; remove this break to run every seed in SEEDS
    break  # run one seed

scale features
min max only to [0,1]
feature selection, choice: None
2017-09-17 21:41:46.918019 CVing: kind = cls, model = XGBcls
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  2.8min finished


scale features
min max only to [0,1]
feature selection, choice: rfecv_linsvc
2017-09-17 21:44:45.234590 CVing: kind = cls, model = XGBcls
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.5s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  2.4min finished
  'precision', 'predicted', average, warn_for)


scale features
min max only to [0,1]
feature selection, choice: mrmr
2017-09-17 21:47:15.330877 CVing: kind = cls, model = XGBcls
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.6s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  2.7min finished


scale features
min max only to [0,1]
feature selection, choice: None
2017-09-17 21:50:01.309221 CVing: kind = reg, model = XGBreg
Fitting 5 folds for each of 64 candidates, totalling 320 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  2.0min finished


scale features
min max only to [0,1]
feature selection, choice: rfecv_linsvc


  'precision', 'predicted', average, warn_for)


2017-09-17 21:52:44.121984 CVing: kind = reg, model = XGBreg
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   47.8s
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  1.5min finished
  'precision', 'predicted', average, warn_for)


scale features
min max only to [0,1]
feature selection, choice: mrmr
2017-09-17 21:55:00.084090 CVing: kind = reg, model = XGBreg
Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.7s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 320 out of 320 | elapsed:  2.0min finished


scale features
min max only to [0,1]
feature selection, choice: None
2017-09-17 21:57:00.295934 CVing: kind = cls, model = GDBcls
Fitting 5 folds for each of 54 candidates, totalling 270 fits


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 119 tasks      | elapsed:   13.4s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:   34.8s finished


scale features
min max only to [0,1]
feature selection, choice: rfecv_linsvc
2017-09-17 21:57:45.580631 CVing: kind = cls, model = GDBcls
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done 116 tasks      | elapsed:   13.0s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:   33.9s finished


scale features
min max only to [0,1]
feature selection, choice: mrmr
2017-09-17 21:58:25.689812 CVing: kind = cls, model = GDBcls
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done 119 tasks      | elapsed:   13.6s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:   34.7s finished
