In [5]:
import pandas as pd
import glob

import os

from wKit.utility.file_sys import mkdirs_if_not_exist

In [17]:
def load_xs():
    """Read every feature matrix used by the experiments from ``data/``.

    Returns:
        dict: feature-combination name -> DataFrame read from
        ``data/x_<name>.csv`` with the first CSV column as the index.
        Keys, in insertion order, are ``'RoadNet'`` followed by
        ``'<type>_<total>_~2014'`` for each type in
        ('Segment', 'RoadNet+Segment') and each total in ('NO_TOTAL', 'TOTAL').
    """
    # Names of the combined feature sets, generated in the same order the
    # files are read (feature type is the outer loop).
    combo_names = [
        '%s_%s_%s' % (ftr_type, total_or_not, '~2014')
        for ftr_type in ['Segment', 'RoadNet+Segment']
        for total_or_not in ['NO_TOTAL', 'TOTAL']
    ]

    feature_sets = {'RoadNet': pd.read_csv('data/x_RoadNet.csv', index_col=0)}
    for combo in combo_names:
        feature_sets[combo] = pd.read_csv('data/x_%s.csv' % combo, index_col=0)
    return feature_sets

In [13]:
def init_model_params(name):
    """Build an untrained classifier and fetch its grid-search parameters.

    Args:
        name: model key; one of 'XGBcls', 'BAGcls' or 'GDBcls'.

    Returns:
        tuple: ``(model, param)`` where ``model`` is a freshly constructed
        classifier and ``param`` is ``grid_cv_default_params()['cls'][name]``.

    Raises:
        ValueError: if ``name`` is not one of the supported keys.

    NOTE(review): ``grid_cv_default_params``, ``xgboost``,
    ``BaggingClassifier`` and ``GradientBoostingClassifier`` are not imported
    in the visible import cell; presumably they come from a cell that is not
    shown -- confirm, otherwise this fails with NameError on a fresh kernel.
    """
    supported = ('XGBcls', 'BAGcls', 'GDBcls')
    if name not in supported:
        # The previous `raise('no model')` tried to raise a plain string, which
        # Python 3 rejects with "TypeError: exceptions must derive from
        # BaseException"; raise a real exception that names the bad key.
        raise ValueError(
            'no model: %r (expected one of %s)' % (name, ', '.join(supported))
        )

    params = grid_cv_default_params()
    # Only the requested estimator is constructed.
    if name == 'XGBcls':
        model = xgboost.XGBClassifier()
    elif name == 'BAGcls':
        model = BaggingClassifier()
    else:  # 'GDBcls' -- guaranteed by the membership check above
        model = GradientBoostingClassifier()
    return model, params['cls'][name]

In [22]:
def exp5_recon(train_y, test_y, Xs, frac, sample_seed, exp_path):
    """Print label statistics, then walk every (feature set, model setup) pair.

    Args:
        train_y: segment level computed by frac% of ratings
        test_y: segment level computed by 100% of ratings
        Xs: dict of feature-combo name -> feature frame; each frame is sliced
            with the index of ``train_y`` and of ``test_y`` via ``.loc``.
        frac, sample_seed, exp_path: accepted but not used by this body.

    Returns:
        None. The body only prints progress lines; the result lists are
        created but never filled.

    Reads the module-level ``MODEL_UP_FSEL_NAMES`` for the model setups.
    """
    # Class statistics are taken on the labels rounded to whole levels.
    train_levels = train_y.round()
    test_levels = test_y.round()
    n_train_levels = train_levels.nunique()
    n_test_levels = test_levels.nunique()
    print('====n classes, train: %d, test: %d' % (n_train_levels, n_test_levels))

    level_counts = train_levels.value_counts().to_dict()
    print('====train_y: distr=%s' % (level_counts,))

    train_idx, test_idx = train_y.index, test_y.index

    # store result
    grid_res_list = []
    eval_res_list = []

    # iterate combos
    for ftr_combo_name, X in Xs.items():
        print('========ftr_combo_name=%s' % ftr_combo_name)
        # .loc raises KeyError when a label is missing from the feature frame.
        train_x = X.loc[train_idx]
        test_x = X.loc[test_idx]
        feature_names = train_x.columns

        for setup in MODEL_UP_FSEL_NAMES:
            model_name, up_name, fselect_type = setup
            print('============%s,%s,%s' % (model_name, up_name, fselect_type))
    return None

In [23]:
def main(redo=False):
    """Run the exp5 reconstruction experiment on sampled-label files.

    Reads the full-data labels (``csl`` column of
    ``data/y_csl_all-2017-10-01.csv``) and all feature sets, then scans
    ``data/exp5_reconstruction/1001/`` for label files whose names end in
    ``#<frac>#<sample_seed>`` followed by an extension. For the first file
    whose output directory ``experiment_1001/exp5_recon/<frac>#<sample_seed>``
    should be processed, it calls ``exp5_recon`` and stops.

    Args:
        redo: when True, re-run experiments whose output directory already
            exists. Defaults to False (skip them), the previous hard-coded
            behaviour.

    Returns:
        None.
    """
    test_y = pd.read_csv('data/y_csl_all-2017-10-01.csv', index_col=0).csl
    Xs = load_xs()

    # glob returns matches in arbitrary order; sorting makes the experiment
    # selected before the `break` below reproducible across runs and machines.
    for fn in sorted(glob.glob('data/exp5_reconstruction/1001/*')):
        # Parse "<prefix>#<frac>#<sample_seed>" from the file name only, so a
        # '#' in a directory name or a non-4-character extension cannot
        # corrupt the fields.
        stem = os.path.splitext(os.path.basename(fn))[0]
        parts = stem.rsplit('#', 2)
        if len(parts) != 3:
            print('skip (unexpected file name):', fn)
            continue
        _, frac, sample_seed = parts

        exp_path = 'experiment_1001/exp5_recon/%s#%s' % (frac, sample_seed)
        if os.path.exists(exp_path) and not redo:
            print('exists:', exp_path)
            continue
        print('exp on:', exp_path)
        # presumably creates exp_path (and parents) when missing -- helper is
        # defined in wKit, not visible here. Because the directory is made
        # before the experiment runs, a crashed run is skipped next time
        # unless redo=True.
        mkdirs_if_not_exist(exp_path)
        train_y = pd.read_csv(fn, index_col=0).csl
        exp5_recon(train_y, test_y, Xs, frac, sample_seed, exp_path)
        # NOTE(review): stops after the first experiment that is actually run;
        # looks like a debugging leftover -- confirm before removing.
        break

In [24]:

# Run configuration for this cell, followed by the entry-point call.

# NOTE(review): presumably the random seed for SMOTE resampling; no visible
# cell reads it -- confirm it is still needed.
SMOTE_SEED = 10

# Setups iterated by exp5_recon, each a (model_name, up_name, fselect_type)
# triple. Only the plain variants are active. Combinations that were disabled
# in the original list:
#   ('XGBcls', 'svm', 'None')
#   ('XGBreg', 'svm', 'rfecv_linsvc')
#   ('GDBcls', 'svm', 'mrmr')
MODEL_UP_FSEL_NAMES = [
    (model_key, 'None', 'None')
    for model_key in ('XGBcls', 'BAGcls', 'GDBcls')
]

main()

exists: experiment_1001/exp5_recon/10#1151
exists: experiment_1001/exp5_recon/10#2097
exists: experiment_1001/exp5_recon/10#4737
exists: experiment_1001/exp5_recon/10#4934
exp on: experiment_1001/exp5_recon/10#5237
====n classes, train: 5, test: 5
====train_y: distr={3.0: 72, 4.0: 61, 2.0: 32, 5.0: 7, 1.0: 4}
