In [26]:
import glob

import os
from wKit.utility.file_sys import mkdirs_if_not_exist
import pandas as pd
from src.experiment_based_function import SEEDS, get_idx

In [9]:
def load_data():
    """Load the feature matrices and the target series used by experiment 4.

    All paths are relative to the current working directory.

    Returns
    -------
    Xs : dict
        Feature-combo name -> DataFrame read from ``data/x_<name>.csv`` with
        the first CSV column as index. Keys are ``'RoadNet'`` followed by
        ``'<ftr_type>_<total_or_not>_~2014'`` for every combination of
        ``ftr_type`` in (Segment, RoadNet+Segment) and ``total_or_not`` in
        (NO_TOTAL, TOTAL).
    ys : dict
        Target name -> the ``csl`` column of each file found under
        ``data/exp4_ys/400ratings/``. The target name is the file's base name
        with its extension and the ``'y_csl_'`` marker removed.
    """
    Xs = {'RoadNet': pd.read_csv('data/x_RoadNet.csv', index_col=0)}

    for ftr_type in ('Segment', 'RoadNet+Segment'):
        for total_or_not in ('NO_TOTAL', 'TOTAL'):
            feature_combo_name = '%s_%s_%s' % (ftr_type, total_or_not, '~2014')
            Xs[feature_combo_name] = pd.read_csv('data/x_%s.csv' % feature_combo_name, index_col=0)

    ys = {}
    # glob.glob returns paths in arbitrary (filesystem) order; sort so the
    # iteration order of `ys` is the same on every machine and every run.
    for fn in sorted(glob.glob('data/exp4_ys/400ratings/*')):
        # splitext drops whatever extension the file has instead of assuming
        # it is exactly four characters long.
        stem = os.path.splitext(os.path.basename(fn))[0]
        qa = stem.replace('y_csl_', '')
        ys[qa] = pd.read_csv(fn, index_col=0).csl
    return Xs, ys

In [10]:
# Load every feature matrix (Xs) and target series (ys) once; the experiment
# driver cell further down reads these two notebook-level globals.
Xs, ys = load_data()

In [35]:

# Configurations evaluated for every feature set, as
# (model_name, up_name, fselect_type) tuples:
#   model_name   - passed to init_model_params; its last three characters
#                  ('cls' / 'reg') are forwarded as `kind` to the grid search
#   up_name      - SMOTE `kind`; the string 'None' means "no upsampling"
#   fselect_type - forwarded to scale_and_selection
# Each model appears once as a plain baseline and once with SMOTE upsampling.
MODEL_UP_FSEL_NAMES = [
    # XGBcls
    ('XGBcls', 'None', 'None'), ('XGBcls', 'svm', 'None'),
    # XGBreg
    ('XGBreg', 'None', 'None'), ('XGBreg', 'svm', 'rfecv_linsvc'),
    # GDBcls
    ('GDBcls', 'None', 'None'), ('GDBcls', 'svm', 'mrmr'),
]

In [48]:
def exp4_one_run(exp_path, Xs, y, train_idx, test_idx, seed):
    """Run one train/test split of experiment 4 over every feature set and model config.

    For each feature matrix in ``Xs`` and each ``(model_name, up_name,
    fselect_type)`` in ``MODEL_UP_FSEL_NAMES`` this:

    1. optionally SMOTE-upsamples the training split (``up_name != 'None'``),
    2. calls ``scale_and_selection`` on the train/test split,
    3. grid-searches the model with ``grid_cv_a_model`` (using
       ``<exp_path>/<ftr_combo_name>#<up_name>#<fselect_type>`` as ``path``),
    4. evaluates the best model with ``evaluator_scalable_cls``.

    Parameters
    ----------
    exp_path : str
        Directory of this run; one sub-directory per configuration is created.
    Xs : dict
        Feature-combo name -> DataFrame indexed by the same labels as ``y``.
    y : pandas.Series
        Target values; they are rounded to obtain class labels.
    train_idx, test_idx :
        Index labels selecting the training and test rows via ``.loc``.
    seed :
        Currently unused inside this function (SMOTE is seeded with
        ``SMOTE_SEED``); kept so existing callers keep working.

    Returns
    -------
    (grid_res_list, eval_res_list) : tuple of lists
        One grid-search record and one evaluation record per configuration
        that ran. Configurations whose upsampling raised ``ValueError`` are
        skipped and contribute to neither list.

    NOTE(review): ``SMOTE``, ``SMOTE_SEED``, ``scale_and_selection``,
    ``init_model_params``, ``grid_cv_a_model`` and ``evaluator_scalable_cls``
    are not imported in the visible notebook cells; they must already be in
    scope for this function to run -- confirm and add them to the import cell.
    """

    def upsampling(train_x, up_name, ftr_combo_name):
        """SMOTE-upsample ``train_x``; return None if SMOTE raises ValueError."""
        try:
            upsampler = SMOTE(kind=up_name, random_state=SMOTE_SEED)
            up_train_x, up_train_y = upsampler.fit_sample(train_x, train_y.round())
        except ValueError as e:
            print('path=%s, ftr_combo_name=%s, smote=%s' % (exp_path, ftr_combo_name, up_name))
            print('catch a ValueError: %s' % e)
            return None
        up_y_dist = pd.Series(up_train_y).value_counts().to_dict()
        return up_train_x, up_train_y, up_y_dist

    # get train_y and test_y
    train_y, test_y = y.loc[train_idx], y.loc[test_idx]
    train_n_classes = train_y.round().nunique()
    test_n_classes = test_y.round().nunique()
    print('\tn classes, train: %d, test: %d' % (train_n_classes, test_n_classes))
    y_dist = train_y.round().value_counts().to_dict()
    print('\ttrain_y: distr=%s' % (y_dist))

    # store result
    grid_res_list, eval_res_list = [], []

    # iterate combos
    for ftr_combo_name, X in Xs.items():
        print('\t\tftr_combo_name=%s' % ftr_combo_name)
        # get train x and test_x
        train_x, test_x = X.loc[train_idx], X.loc[test_idx]
        feature_names = train_x.columns

        for model_name, up_name, fselect_type in MODEL_UP_FSEL_NAMES:
            print('\t\t\t%s,%s,%s' % (model_name, up_name, fselect_type))

            # Work on per-configuration copies of the names so that an
            # upsampled training set never leaks into the next configuration
            # or the next feature set.
            fit_x, fit_y, up_y_dist = train_x, train_y, None
            if up_name != 'None':
                upsampled = upsampling(train_x, up_name, ftr_combo_name)
                if upsampled is None:
                    # SMOTE could not be applied; skip this configuration
                    continue
                fit_x, fit_y, up_y_dist = upsampled

            dset = scale_and_selection(fit_x, fit_y, test_x, test_y, fselect_type)
            n_keep = dset['selected_ftr'].sum()
            print('feature selection: %d -> %d' % (len(feature_names), n_keep))
            cv_path = '%s/%s#%s#%s' % (exp_path, ftr_combo_name, up_name, fselect_type)
            mkdirs_if_not_exist(cv_path)
            print('fitting models for cv_path=%s' % cv_path)

            # init model
            model, param = init_model_params(model_name)

            # grid a model, save grid_res to grid_res_list
            grid_res = grid_cv_a_model(dset['train_x'], dset['train_y'], model, param,
                                       kind=model_name[-3:], name=model_name, path=cv_path)
            grid_res['ftr_combo_name'] = ftr_combo_name
            grid_res['feature_selection'] = fselect_type
            grid_res['upsample'] = up_name
            model = grid_res.pop('best_model')
            grid_res_list.append(grid_res)

            # evaluate on original test set, save eval_res to eval_res_list
            eval_res = evaluator_scalable_cls(model, dset['train_x'], dset['train_y'],
                                              dset['test_x'], dset['test_y'])
            eval_res['ftr_combo_name'] = ftr_combo_name
            eval_res['model_name'] = model_name
            eval_res['feature_selection'] = fselect_type
            eval_res['upsample'] = up_name
            eval_res['train_n_classes'] = train_n_classes
            eval_res['test_n_classes'] = test_n_classes
            eval_res['#all'] = len(feature_names)
            eval_res['#keep'] = n_keep
            eval_res['up_y_dist'] = up_y_dist
            eval_res['y_dist'] = y_dist
            eval_res_list.append(eval_res)

    return grid_res_list, eval_res_list

In [49]:
def exp4(Xs, y, qa):
    """Run experiment 4 for one target, once per seed in ``SEEDS``.

    For every seed a directory ``data/exp4/<qa>/seed_<seed>`` is created, the
    train/test split is obtained from ``get_idx`` (which is handed the path
    ``<that directory>/indices.txt``), and ``exp4_one_run`` is invoked on the
    split.

    Parameters
    ----------
    Xs : dict
        Feature matrices, forwarded unchanged to ``exp4_one_run``.
    y : pandas.Series
        Target series; its index is what gets split.
    qa : str
        Target name, used only to build the output directory.
    """
    for seed in SEEDS:
        # one output directory per (target, seed)
        run_dir = 'data/exp4/%s/seed_%d' % (qa, seed)
        mkdirs_if_not_exist(run_dir)

        # train/test labels for this seed
        split_file = run_dir + '/indices.txt'
        split = get_idx(y.index, split_file, seed)
        train_idx, test_idx = split

        print('\tbegin one run exp, in exp_path=%s' % run_dir)
        exp4_one_run(run_dir, Xs, y, train_idx, test_idx, seed)

In [50]:
# Driver: run experiment 4 for the targets loaded into `ys`.
# NOTE: the trailing `break` exits after the first target, so only one target
# is processed per execution of this cell. Remove it to run every target.
# `qa` and `y` stay bound as notebook-level globals after the loop.
for qa, y in ys.items():
    print(qa)
    exp4(Xs, y, qa)
    break

age#25-34
	begin one run exp, in exp_path=data/exp4/age#25-34/seed_0
	n classes, train: 5, test: 4
	train_y: distr={4.0: 66, 3.0: 60, 2.0: 30, 5.0: 6, 1.0: 5}
		ftr_combo_name=RoadNet
			XGBcls,None,None
			XGBcls,svm,None
			XGBreg,None,None
			XGBreg,svm,rfecv_linsvc
			GDBcls,None,None
			GDBcls,svm,mrmr
		ftr_combo_name=Segment_NO_TOTAL_~2014
			XGBcls,None,None
			XGBcls,svm,None
			XGBreg,None,None
			XGBreg,svm,rfecv_linsvc
			GDBcls,None,None
			GDBcls,svm,mrmr
		ftr_combo_name=Segment_TOTAL_~2014
			XGBcls,None,None
			XGBcls,svm,None
			XGBreg,None,None
			XGBreg,svm,rfecv_linsvc
			GDBcls,None,None
			GDBcls,svm,mrmr
		ftr_combo_name=RoadNet+Segment_NO_TOTAL_~2014
			XGBcls,None,None
			XGBcls,svm,None
			XGBreg,None,None
			XGBreg,svm,rfecv_linsvc
			GDBcls,None,None
			GDBcls,svm,mrmr
		ftr_combo_name=RoadNet+Segment_TOTAL_~2014
			XGBcls,None,None
			XGBcls,svm,None
			XGBreg,None,None
			XGBreg,svm,rfecv_linsvc
			GDBcls,None,None
			GDBcls,svm,mrmr
	begin one run exp, i

In [54]:
# Scratch sanity check: weighted F1 on a 3-class toy example in which class 2
# is never predicted. Classes 1 and 3 each score F1 = 0.8 and class 2 scores 0,
# so the support-weighted mean is 0.5333 (matches the cell output).
# NOTE(review): the truncated warning in the output appears to be sklearn's
# notice about precision for a label with no predicted samples -- confirm.
from sklearn.metrics import f1_score
f1_score([1]*10+[2]*10+[3]*10, [1]*15+[3]*15, average='weighted')

  'precision', 'predicted', average, warn_for)


0.53333333333333333