In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost

In [None]:
import pandas as pd
from datetime import datetime as dtm
from src.experiment_based_function import *
from wKit.utility.file_sys import mkdirs_if_not_exist
from wKit.ML.sk_ml import grid_cv_a_model, grid_cv_default_params, evaluator_scalable_cls, show_important_features, confusion_matrix_as_df

In [None]:
def load_data():
    """Read the target series and the two feature tables from ``data/``.

    Every CSV is read with its first column as the index.

    Returns
    -------
    tuple
        ``(Xs, y)`` where ``Xs`` is a dict mapping ``'NO_TOTAL'`` to the frame
        read from ``data/x_NO_TOTAL_~2014.csv`` and ``'TOTAL'`` to the frame
        read from ``data/x_TOTAL_~2014.csv``, and ``y`` is the ``csl`` column
        of ``data/y_csl_all.csv``.
    """
    target_frame = pd.read_csv('data/y_csl_all.csv', index_col=0)
    target = target_frame.csl

    features_total = pd.read_csv('data/x_TOTAL_~2014.csv', index_col=0)
    features_type = pd.read_csv('data/x_NO_TOTAL_~2014.csv', index_col=0)

    feature_sets = dict(NO_TOTAL=features_type, TOTAL=features_total)
    return feature_sets, target

In [None]:
def init_model_params(name):
    """Build an unfitted estimator and look up its grid-search parameters.

    Parameters
    ----------
    name : str
        One of ``'XGBreg'``, ``'XGBcls'`` or ``'GDBcls'``.

    Returns
    -------
    tuple
        ``(model, param)``: a freshly constructed estimator and the entry
        ``grid_cv_default_params()[kind][name]``, where ``kind`` is ``'reg'``
        for ``'XGBreg'`` and ``'cls'`` for the two classifiers.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    # name -> (section of the default-params dict, estimator factory).
    # Factories are lambdas so nothing is constructed until `name` is validated.
    registry = {
        'XGBreg': ('reg', lambda: xgboost.XGBRegressor()),
        'XGBcls': ('cls', lambda: xgboost.XGBClassifier()),
        'GDBcls': ('cls', lambda: GradientBoostingClassifier()),
    }
    if name not in registry:
        # Previously `raise('no model')`: raising a plain str is itself an
        # error ("exceptions must derive from BaseException"), so the caller
        # never saw which name was rejected.
        raise ValueError('no model: %r (expected one of %s)' % (name, sorted(registry)))

    kind, make_model = registry[name]
    params = grid_cv_default_params()
    return make_model(), params[kind][name]

In [None]:

def upsample_one_class(y_one_class, target_num):
    """Duplicate one class's samples so its size approaches ``target_num``.

    The input is repeated ``round(target_num / len(y_one_class))`` whole
    times, so the result length is a multiple of the original length rather
    than exactly ``target_num``. Index labels are repeated with the values.

    Parameters
    ----------
    y_one_class : pandas object accepted by ``pd.concat`` (e.g. a Series)
        Samples belonging to a single class.
    target_num : int
        Number of samples to aim for.

    Returns
    -------
    Same type as ``y_one_class``
        ``y_one_class`` itself when no duplication is needed (empty input, or
        a replication factor of 1 or less); otherwise ``factor`` copies
        concatenated in order.
    """
    num = len(y_one_class)
    if num == 0:  # nothing to duplicate; also avoids dividing by zero
        return y_one_class

    factor = int(round(target_num / num))
    if factor <= 1:
        # 1: already close to the target. 0 or less: the class is larger than
        # the target; concatenating an empty list would raise, so leave the
        # class as it is (this function only ever upsamples).
        return y_one_class
    return pd.concat([y_one_class] * factor)  # duplicate `factor` times

def upsample(train_y):
    """Balance classes by duplicating the samples of the smaller classes.

    Classes are the distinct values of ``train_y.round()``. Each class is
    handed to ``upsample_one_class`` with the size of the largest class as the
    target, and the per-class results are concatenated in the order in which
    ``unique()`` reports the labels. The original (unrounded) values and index
    labels are kept.
    """
    rounded = train_y.round()
    largest_class_size = rounded.value_counts().max()
    return pd.concat([
        upsample_one_class(train_y[rounded == class_label], largest_class_size)
        for class_label in rounded.unique()
    ])

In [None]:
def get_total_or_type(total_or_not):
    """Translate a feature-set key into its short tag.

    ``'TOTAL'`` maps to ``'total'`` and ``'NO_TOTAL'`` maps to ``'type'``;
    any other key raises ``KeyError``.
    """
    tag_by_key = dict(TOTAL='total', NO_TOTAL='type')
    return tag_by_key[total_or_not]


In [None]:
# Load the feature tables (dict keyed by 'NO_TOTAL' / 'TOTAL') and the target
# series once; the experiment loop below reads both.
Xs, y = load_data()

In [None]:
# Experiments to run, as (key into Xs, model name passed to init_model_params).
combo = [('TOTAL', 'XGBcls'), ('NO_TOTAL', 'XGBreg'), ('TOTAL', 'GDBcls')]

In [None]:
# Upsampling experiment: for each seed, split the data, upsample the training
# targets, then grid-search, evaluate and export results for each entry of `combo`.
#
# NOTE(review): SEEDS, get_idx and scale_ftr are not defined or explicitly
# imported in this notebook; presumably they come from
# `from src.experiment_based_function import *` -- TODO confirm.

# True keeps the notebook's existing behaviour, in which an unconditional
# `break` at the bottom of each loop stopped after the first seed and the
# first entry of `combo`. Set to False to run every seed and every combo.
FIRST_ITERATION_ONLY = True

for seed in SEEDS:
    # set up experiment path
    exp_path = 'data/up_down_experiment/seed_%d' % seed
    upsample_path = '%s/upsample_res' % exp_path
    mkdirs_if_not_exist(exp_path)
    mkdirs_if_not_exist(upsample_path)

    # get train/test index for this seed
    idx_fn = '%s/%s' % (exp_path, 'indices.txt')
    train_idx, test_idx = get_idx(y.index, idx_fn, seed)

    # upsample the training targets only; the test targets stay untouched
    train_y, test_y = y.loc[train_idx], y.loc[test_idx]
    uped_train_y = upsample(train_y)

    # one row per evaluated combo, turned into DataFrames after the inner loop
    grid_rows, eval_rows = [], []

    for total_or_not, name in combo:
        total_or_type = get_total_or_type(total_or_not)

        # Select training rows by the upsampled index so duplicated targets
        # get matching duplicated feature rows; test rows are the originals.
        X = Xs[total_or_not]
        uped_train_x = X.loc[uped_train_y.index]
        test_x = X.loc[test_idx]
        # Column names are captured before scaling and used as feature labels below.
        feature_names = uped_train_x.columns
        uped_train_x, test_x = scale_ftr(uped_train_x, test_x)

        # grid search best fit model; `kind` is the name's suffix ('cls' or 'reg')
        model, param = init_model_params(name)
        grid_res = grid_cv_a_model(uped_train_x, uped_train_y, model, param,
                                   kind=name[-3:], name=name, path=upsample_path)
        grid_res['total_or_type'] = total_or_type
        # take the fitted model out of the result before it becomes a table row
        model = grid_res.pop('best_model')
        grid_rows.append(grid_res)

        # evaluate on original test set
        eval_res = evaluator_scalable_cls(model, uped_train_x, uped_train_y, test_x, test_y)
        eval_res['total_or_type'] = total_or_type
        eval_res['model_name'] = name
        eval_rows.append(eval_res)

        # save feature importances (the 'std' column is dropped, two columns remain)
        imp = show_important_features(model, labels=feature_names, set_std=False, show_plt=False).drop('std', axis=1)
        imp.columns = ['label', 'importance_%d' % seed]
        imp.to_csv('%s/imp-%s-%s.csv' % (upsample_path, name, total_or_not))

        # save confusion matrix, normalized and raw
        cfsn_norm = confusion_matrix_as_df(model, test_x, test_y, labels=[1, 2, 3, 4, 5], normalize=True)
        cfsn_norm.to_csv('%s/cfsn_norm-%s-%s.csv' % (upsample_path, name, total_or_not))
        cfsn = confusion_matrix_as_df(model, test_x, test_y, labels=[1, 2, 3, 4, 5])
        cfsn.to_csv('%s/cfsn-%s-%s.csv' % (upsample_path, name, total_or_not))

        if FIRST_ITERATION_ONLY:
            break

    # save result tables for this seed
    df_grid_res = pd.DataFrame(grid_rows)
    df_grid_res.to_csv('%s/grid_res.csv' % upsample_path)
    df_eval_res = pd.DataFrame(eval_rows)
    df_eval_res.to_csv('%s/eval_res.csv' % upsample_path)

    if FIRST_ITERATION_ONLY:
        break