In [1]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2

import os
import sys
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/' 
sys.path.append(parent_dir) 

import pandas as pd
import numpy as np
from copy import deepcopy

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, auto_save_csv, print_time, auto_save_pickle

from models.fmin2 import fmin2, fmin2_model1
from models.nested_validation import *
from models.features import fixed_makedata, salvador_wrap, pretrained_wrap
from models.display import pickle2df

# models
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

pd.set_option('display.max_colwidth', -1)

import warnings
warnings.filterwarnings("ignore")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.87 µs


In [2]:
dir_save = os.path.normpath('../data/dic_20190819.pickle')
dic = load(dir_save)

amt_GI = load_pickle('../data/amt_hGI_pos.pickle')
for i, v in dic.items():
    if i in amt_GI.keys():
        v['AMT'] = amt_GI[i]
amt_ls = list(amt_GI.keys())
ls = amt_ls

exist
time: 15 s


### 1. word2vec and LR

In [3]:
model1s = [salvador_wrap('../data/fasttext_%.2d.bin'% (number)) for number in range(1,8+1,1)]

def inner_CV_for_model1(corp, model1, method):
    print('inner_CV_for_model1')
    ''' fmin from hyperopt
    Args:
        fn: model to run, in the form of function
        space: dict of potential parameters
    Return:
        best: dict of best parameters
    '''
    p2_lr = {'class_weight':'balanced','solver': 'liblinear', 'penalty':'l2'}
    spaces = {'model1': model1, 'model2': LogisticRegression,'method': method,
              'p2': p2_lr, 'corp': corp}
    # tuning model1
    best_space = fmin2_model1(fn = objective, space=spaces, max_evals = 8)
    print(best_space)
    return best_space['model1']

def inner_CV_for_model2(corp, model1, method):
    print('inner_CV_for_model2')
    from models.fmin2 import p2_lr
    
    spaces = {'model1': model1, 'model2': LogisticRegression,'method': method,
              'p2': p2_lr, 'corp': corp}
    # tuning p2: the params for model2
    best_space = fmin2(fn = objective, space=spaces, max_evals = 10)
    print(best_space)
    model2 = best_space['model2'](**best_space['p2'])
    return model2

'''
0. start the model selection and evaluation
'''
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
knowns = corp0.knowns
model1 = None
state = 2022
for method in [['wv','scale']]:
    print('state', state)
    corp = copy.deepcopy(corp0)
    results = []
    model2, fold_count = '', 0
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        '''
        1. model selection
        ::: model1 is for feature engineering
            e.g. model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
            e.g. model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        ::: model2 is a classifier
            e.g. model2 = copy.deepcopy(classifier['default_Logistic'])
        '''
        model1 = inner_CV_for_model1(corp.replace(train_index), model1s, method)
        model2 = inner_CV_for_model2(corp.replace(train_index), model1, method)
        '''
        2. build X
        '''
        data = inputs_generater(corp.replace(train_index), model1, method)
        '''
        3. run and find the best prob threshold
        '''
        threshold = clf_running_search(data, model1, copy.deepcopy(model2), method)
        '''
        4. run again and evaluate on the real test set
        ''' 
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, copy.deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-08-21 10:57:18.285148
inner_CV_for_model1
{'model1': <models.features.salvador_wrap object at 0x7f330948feb8>, 'model2': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}, 'corp': <models.features.fixed_makedata object at 0x7f340e9842e8>}
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}
2019-08-21 10:57:18.544246
[0.717948717948718, 0.7093023255813952, 0.7204968944099378, 0.735632183908046, 0.7594936708860761]
0.7285747585468345
-0.7285747585468345 -0.7285747585468345 1
{'model1': <models.features.salvador_wrap object at 0x7f331e2c4d30>, 'model2': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}, 'corp': <models.features.fixed_makedata object at 0x7f340e9842e8>}
{'class_weight': 'balanced', 'solver': 'liblinear', 'pena

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.787,0.778,0.8,0.779,0.887,0.894,0.88,0.885,<models.features.salvador_wrap object at 0x7f33198eed30>,"LogisticRegression(C=0.1, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wv, scale]",AMT,../pickle/2019-08-21 11:36:11.698018.pickle


save to ../csv/2019-08-21 11:36:12.417320.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.787,0.778,0.8,0.779,0.887,0.894,0.88,0.885,<models.features.salvador_wrap object at 0x7f33198eed30>,"LogisticRegression(C=0.1, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wv, scale]",AMT,../pickle/2019-08-21 11:36:11.698018.pickle


time: 39min 7s


### 2. BoW-NB and LGBM

In [4]:
model1s = [salvador_wrap('../data/fasttext_%.2d.bin'% (number)) for number in range(1,8+1,1)]
def inner_CV_for_model1(corp, model1, method):
    p2_lgbm = {'class_weight':'balanced', 'boosting':'gbrt'}
    spaces = {'model1': model1, 'model2': LGBMClassifier,'method': method,
              'p2': p2_lgbm, 'corp': corp} 
    best_space = fmin2_model1(fn = objective, space=spaces, max_evals = 8, field = 'model1')
    print(best_space)
    return best_space['model1']

def inner_CV_for_model2(corp, model1, method):
    from models.fmin2 import p2_lgbm
    spaces = {'model1': model1, 'model2': LGBMClassifier,'method': method,
              'p2': p2_lgbm, 'corp': corp}
    best_space = fmin2(fn = objective, space=spaces, max_evals = 20)
    print(best_space)
    model2 = best_space['model2'](**best_space['p2'])
    return model2

'''
0. start the model selection and evaluation
'''
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
knowns = corp0.knowns
model1 = None
state = 2022
for method in [['wv','scale']]:
    print('state', state)
    corp = copy.deepcopy(corp0)
    results = []
    model2, fold_count = '', 0
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        '''
        1. model selection
        ::: model1 is for feature engineering
            e.g. model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
            e.g. model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        ::: model2 is a classifier
            e.g. model2 = copy.deepcopy(classifier['default_Logistic'])
        '''
        model1 = inner_CV_for_model1(corp.replace(train_index), model1s, method)
        model2 = inner_CV_for_model2(corp.replace(train_index), model1, method)
        '''
        2. build X
        '''
        data = inputs_generater(corp.replace(train_index), model1, method)
        '''
        3. run and find the best prob threshold
        '''
        threshold = clf_running_search(data, model1, copy.deepcopy(model2), method)
        '''
        4. run again and evaluate on the real test set
        ''' 
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, copy.deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-08-21 11:36:28.557083
{'model1': <models.features.salvador_wrap object at 0x7f33d7919908>, 'model2': <class 'lightgbm.sklearn.LGBMClassifier'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'boosting': 'gbrt'}, 'corp': <models.features.fixed_makedata object at 0x7f339780b2b0>}
{'class_weight': 'balanced', 'boosting': 'gbrt'}
2019-08-21 11:36:28.810454
[0.6785714285714286, 0.7317073170731708, 0.7250000000000001, 0.7636363636363638, 0.7974683544303798]
0.7392766927422686
-0.7392766927422686 -0.7392766927422686 1
{'model1': <models.features.salvador_wrap object at 0x7f33d76f0780>, 'model2': <class 'lightgbm.sklearn.LGBMClassifier'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'boosting': 'gbrt'}, 'corp': <models.features.fixed_makedata object at 0x7f339780b2b0>}
{'class_weight': 'balanced', 'boosting': 'gbrt'}
2019-08-21 11:38:24.595013
[0.6875000000000001, 0.7388535031847134, 0.7058823529411765, 0.7840909090909091, 0.8407643312

OSError: [Errno 28] No space left on device

time: 23h 28min 48s


### Run additional commands to save the results

In [8]:
df = pickle2df(results, 'cannot save')
dfs.append(df)
auto_save_csv(pd.concat(dfs))

save to ../csv/2019-08-22 11:13:27.076694.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.755,0.76,0.753,0.75,1,1,1,0.999,<models.features.salvador_wrap object at 0x7f331a01a2e8>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=10, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.75, gamma=0.55,\n importance_type='split', lambda_l2=0.1, learning_rate=0.15,\n max_depth=4, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=512,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,cannot save


save to ../csv/2019-08-22 11:13:27.118675.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.755,0.76,0.753,0.75,1,1,1,0.999,<models.features.salvador_wrap object at 0x7f331a01a2e8>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=10, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.75, gamma=0.55,\n importance_type='split', lambda_l2=0.1, learning_rate=0.15,\n max_depth=4, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=512,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,cannot save


time: 130 ms


In [9]:
pickle_path = auto_save_pickle(results)

PicklingError: Can't pickle <class 'models.features.salvador_wrap'>: it's not the same object as models.features.salvador_wrap

time: 223 ms


In [14]:
for result in results:
    result['model1'] = None

time: 29.9 ms


In [15]:
pickle_path = auto_save_pickle(results)
df = pickle2df(results, pickle_path)
dfs.append(df)
auto_save_csv(pd.concat(dfs))

save to ../pickle/2019-08-22 11:20:02.849255.pickle
save to ../csv/2019-08-22 11:20:03.443018.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.755,0.76,0.753,0.75,1,1,1,0.999,,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=10, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.75, gamma=0.55,\n importance_type='split', lambda_l2=0.1, learning_rate=0.15,\n max_depth=4, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=512,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,../pickle/2019-08-22 11:20:02.849255.pickle


save to ../csv/2019-08-22 11:20:03.538590.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.755,0.76,0.753,0.75,1,1,1,0.999,<models.features.salvador_wrap object at 0x7f331a01a2e8>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=10, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.75, gamma=0.55,\n importance_type='split', lambda_l2=0.1, learning_rate=0.15,\n max_depth=4, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=512,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,cannot save
0,0.755,0.76,0.753,0.75,1,1,1,0.999,,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=10, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.75, gamma=0.55,\n importance_type='split', lambda_l2=0.1, learning_rate=0.15,\n max_depth=4, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=512,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,../pickle/2019-08-22 11:20:02.849255.pickle


time: 748 ms
