In [1]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)


In [2]:
# best 3 solutions based on EXP1 in turns of f1 weighted is:
# SVM, RoadNet+Segment NO_TOTAL, 0.553202
# RFcls, RoadNet+Segment TOTAL, 0.547742
# GDBcls, RoadNet+Segment TOTAL, 0.542030
# with ~2014, min-max, no feature selection

In [3]:
from wKit.utility.check_dtype import all_float
from wKit.utility.ipynb_helper import multi_column_df_display
from wKit.ML.sk_ml import confusion_matrix_as_df, show_important_features, evaluator_scalable_cls, grid_cv_a_model

In [4]:
from collections import defaultdict

In [5]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC

In [6]:
from sklearn.metrics import f1_score

In [7]:
def load_data():
    y = pd.read_csv('../data/y_csl_all_0929.csv', index_col=0).csl
    X_total = pd.read_csv('../data/x_RoadNet+Segment_TOTAL_~2014.csv', index_col=0)
    X_type = pd.read_csv('../data/x_RoadNet+Segment_NO_TOTAL_~2014.csv', index_col=0)
    Xs = {'NO_TOTAL': X_type, 'TOTAL': X_total}
    return Xs, y

In [8]:
def train_test_idx(seed):
    idx_fn = 'exp1/seed_%d/indices.txt' % seed
    with open(idx_fn) as f:
        lines = f.readlines()
        train_idx = lines[0].strip().split('\t')[1].split(',')
        train_idx = [int(x) for x in train_idx]
        test_idx = lines[1].strip().split('\t')[1].split(',')
        test_idx = [int(x) for x in test_idx]
    return train_idx, test_idx

In [9]:

from wKit.ML.scaler import minmax
def scale_ftr(train_x, test_x):
    scaler = minmax()
    scaler.fit(train_x)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    return train_x, test_x

In [10]:
def init_model(name):
    if name == 'SVM': model = SVC()
    elif name == 'XGBreg': model = xgboost.XGBRegressor()
    elif name == 'XGBcls': model = xgboost.XGBClassifier()
    elif name == 'RFcls': model = RandomForestClassifier()
    elif name == 'GDBcls': model = GradientBoostingClassifier()
    else: raise('no model')
    return model

In [11]:
def load_params(seed, total_or_not, name):
    cv_res_fn = 'exp1/seed_%d/RoadNet+Segment#%s#~2014#min-max#None/cv_5_model_%s.csv' % (seed, total_or_not, name)
    return eval(pd.read_csv(cv_res_fn,index_col=0)['params'].values[0])
    

In [12]:
seeds = [0, 100, 972, 5258, 7821, 40918, 57852, 168352, 291592, 789729423]
combo = [('NO_TOTAL', 'SVM'), ('TOTAL', 'RFcls'), ('TOTAL', 'GDBcls')]

In [13]:
Xs, y = load_data()


In [14]:
imps = defaultdict(list)
cfsns = defaultdict(list)
f1s = defaultdict(list)

for seed in seeds:
    train_idx, test_idx = train_test_idx(seed)
    train_y, test_y = y.loc[train_idx], y.loc[test_idx]
    print('======', seed, len(test_y))
    for total_or_not, name in combo:
#         print(total_or_not, name)
        X = Xs[total_or_not]
        train_x, test_x = X.loc[train_idx], X.loc[test_idx]
        feature_names = train_x.columns
        train_x, test_x = scale_ftr(train_x, test_x)
        params = load_params(seed, total_or_not, name)
#         print(name, "params:", params)
        model = init_model(name)
        model.set_params(**params)
        if 'reg' in name: 
            model.fit(train_x, train_y)
        else:
            model.fit(train_x, train_y.round())
#         print(model)
        f1 = f1_score(test_y.round(), model.predict(test_x).round(), average='weighted')
        f1s[name].append(f1)
#         print(f1)
#         f1s[name].append(evaluator_scalable_cls(model, train_x, train_y, test_x, test_y))

        try:
            imp = show_important_features(model, labels=feature_names, set_std=False, show_plt=False).drop('std', axis=1) 
            imp.columns = ['label', 'importance_%d' % seed]
            imps[name].append(imp)
        except AttributeError as e:
            print(name, 'no import')
        
        cfsn = confusion_matrix_as_df(model, test_x, test_y, labels=[1, 2, 3, 4, 5])      
        cfsns[name].append(cfsn)

SVM no import
SVM no import
SVM no import
SVM no import
SVM no import
SVM no import
SVM no import
SVM no import
SVM no import
SVM no import


In [15]:
sum(f1s['SVM'])/10, sum(f1s['RFcls'])/10, sum(f1s['GDBcls'])/10, 

(0.52753731833757955, 0.54278403973304357, 0.56056410069076468)

# confusion matrix

In [16]:
for total_or_not, name in combo:
    total_or_type = {'TOTAL': 'total', 'NO_TOTAL': 'type'}[total_or_not]
    cfsn = sum(cfsns[name])/len(seeds)
    cfsn.to_csv('deliverables/EXP2-average_confusion_matrix_%s_%s.csv' % (name, total_or_type))
    cfsn_pcnt_per_row = cfsn.apply(lambda x: x / x.sum(), axis=1)
    cfsn_pcnt_per_row.to_csv('deliverables/EXP2-average_confusion_matrix_as_pcnt_per_row_%s_%s.csv' % (name, total_or_type))
    

In [17]:
def plot_imp(list_imp, name, total_or_type, top=10):
    imp = pd.DataFrame(columns=['label'])
    for df in list_imp:
        imp = imp.merge(df, on='label', how='outer')

    imp = imp.set_index('label')
    means = imp.mean(axis=1)
    stds = imp.std(axis=1)
    means_to_plot = means.sort_values(ascending=False).head(top)
    index_to_plot = means_to_plot.index
    stds_to_plot = stds.loc[index_to_plot]
    ax = means_to_plot.sort_values().plot(kind='barh', xerr=stds_to_plot, figsize=(10,7))
    ax.set_xlabel('importance with std as errorbar')
    ax.set_ylabel('feature')
    fig = ax.get_figure()
    fig.savefig(u'deliverables/EXP2-feature importance - %s_RoadNet+social_%s.png' % (name, total_or_type), format='png', bbox_inches='tight', pad_inches=0)
    

In [18]:
total_or_not, name = combo[0]
total_or_type = {'TOTAL': 'total', 'NO_TOTAL': 'type'}[total_or_not]
plot_imp(imps[name], name, total_or_type, 10)

TypeError: Empty 'DataFrame': no numeric data to plot

In [19]:
total_or_not, name = combo[1]
total_or_type = {'TOTAL': 'total', 'NO_TOTAL': 'type'}[total_or_not]
plot_imp(imps[name], name, total_or_type, 10)

In [20]:
total_or_not, name = combo[2]
total_or_type = {'TOTAL': 'total', 'NO_TOTAL': 'type'}[total_or_not]
plot_imp(imps[name], name, total_or_type, 10)