In [1]:
# NOTE(review): used as a line magic with nothing after it, `%time` has no statement
# to time (the recorded output is 0 ns); `%%time` would time the whole cell instead.
%time
# autotime prints the run time of every cell; autoreload 2 re-imports edited modules
# before each execution.
%load_ext autotime
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the parent of the current working directory on sys.path so that the
# project packages imported below (`utils`, `models`) can be resolved.
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/' 
sys.path.append(parent_dir) 

import pandas as pd
import numpy as np
from copy import deepcopy

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, auto_save_csv, print_time, auto_save_pickle

from models.fmin2 import fmin2, fmin2_model1
# NOTE(review): later cells use StratifiedKFold, objective, inputs_generater,
# clf_running_search, clf_running and the `copy` module without importing them
# explicitly; presumably they all arrive through this star import -- confirm.
from models.nested_validation import *
from models.features import fixed_makedata, salvador_wrap, pretrained_wrap, doc2vec_wrap
from models.display import pickle2df

# models
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Show full cell contents instead of truncating wide columns.
# NOTE(review): -1 is deprecated in newer pandas in favour of None -- check the
# installed version before changing it.
pd.set_option('display.max_colwidth', -1)

import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs


In [2]:
# Load the main dictionary from its pickle (path normalised for the host OS).
dir_save = os.path.normpath('../data/dic_20190819.pickle')
dic = load(dir_save)

# Copy the value stored in amt_GI into the 'AMT' field of every `dic` entry
# whose key is also present in amt_GI; other entries are left untouched.
amt_GI = load_pickle('../data/amt_hGI_pos.pickle')
for key, entry in dic.items():
    if key in amt_GI:
        entry['AMT'] = amt_GI[key]

# Keys that carry an AMT value; `ls` is the alias used by the cells below.
amt_ls = list(amt_GI.keys())
ls = amt_ls

exist
time: 16.8 s


### 1. word2vec and LR

In [3]:
# Candidate feature models: the eight pickles ../data/doc2vec_01 ... ../data/doc2vec_08.
model1s = [
    load_pickle('../data/doc2vec_%.2d' % idx)
    for idx in range(1, 9)
]

def inner_CV_for_model1(corp, model1, method):
    '''Select the feature model (model1) with the inner search.

    The classifier is fixed to LogisticRegression with the constant
    parameters in ``p2_lr``; the search itself is delegated to
    ``fmin2_model1`` with ``objective`` and at most 8 evaluations.

    Args:
        corp: corpus object, passed to the search under the key 'corp'.
        model1: candidate feature model(s), passed under the key 'model1'
            (the caller in this notebook passes the list ``model1s``).
        method: feature-construction method, e.g. ['wv', 'scale'].
    Return:
        The 'model1' entry of the best space returned by ``fmin2_model1``.
    '''
    print('inner_CV_for_model1')
    # Classifier parameters stay constant while model1 is being selected.
    p2_lr = {'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}
    spaces = {'model1': model1, 'model2': LogisticRegression, 'method': method,
              'p2': p2_lr, 'corp': corp}
    # tuning model1
    best_space = fmin2_model1(fn=objective, space=spaces, max_evals=8)
    print(best_space)
    return best_space['model1']

def inner_CV_for_model2(corp, model1, method):
    '''Tune the LogisticRegression parameters for an already chosen model1.

    Runs ``fmin2`` (at most 10 evaluations) over the parameter space
    ``p2_lr`` imported from ``models.fmin2`` and instantiates the classifier
    from the best space found.

    Args:
        corp: corpus object, passed to the search under the key 'corp'.
        model1: the selected feature model, passed under the key 'model1'.
        method: feature-construction method, e.g. ['wv', 'scale'].
    Return:
        best_space['model2'] instantiated with best_space['p2'].
    '''
    print('inner_CV_for_model2')
    # Parameter search space for the classifier lives in models.fmin2.
    from models.fmin2 import p2_lr

    search_space = dict(
        model1=model1,
        model2=LogisticRegression,
        method=method,
        p2=p2_lr,
        corp=corp,
    )
    # tuning p2: the params for model2
    tuned = fmin2(fn=objective, space=search_space, max_evals=10)
    print(tuned)
    classifier_cls = tuned['model2']
    classifier_params = tuned['p2']
    return classifier_cls(**classifier_params)

# 0. Start the model selection and evaluation.
#    Outer loop: 5-fold stratified split. For every outer fold the inner
#    searches pick model1 and model2 on the training part only, then the
#    chosen pair is evaluated once on the held-out test part.
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
knowns = corp0.knowns
model1 = None
state = 2022
for method in [['wv','scale']]:
    print('state', state)
    # Use the explicitly imported `deepcopy` (from copy import deepcopy) rather
    # than `copy.deepcopy`: the `copy` module itself is never imported in this
    # notebook and was only reachable through a star import.
    corp = deepcopy(corp0)
    results = []
    model2, fold_count = '', 0
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        # 1. model selection
        # ::: model1 is for feature engineering
        #     e.g. model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
        #     e.g. model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        # ::: model2 is a classifier
        #     e.g. model2 = copy.deepcopy(classifier['default_Logistic'])
        model1 = inner_CV_for_model1(corp.replace(train_index), model1s, method)
        model2 = inner_CV_for_model2(corp.replace(train_index), model1, method)
        # 2. build X (training part of the outer fold only)
        data = inputs_generater(corp.replace(train_index), model1, method)
        # 3. run and find the best prob threshold
        #    (model2 is deep-copied so the search works on its own instance)
        threshold = clf_running_search(data, model1, deepcopy(model2), method)
        # 4. run again and evaluate on the real test set
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    # Persist the per-fold results of this method and keep its summary frame.
    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-08-21 12:58:54.467966
inner_CV_for_model1
{'model1': <models.features.doc2vec_wrap object at 0x7f14091a6e80>, 'model2': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}, 'corp': <models.features.fixed_makedata object at 0x7f141dc53240>}
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}
2019-08-21 12:58:54.752416
[0.6540880503144653, 0.6265060240963856, 0.6962025316455697, 0.6538461538461539, 0.6582278481012659]
0.657774121600768
-0.657774121600768 -0.657774121600768 1
{'model1': <models.features.doc2vec_wrap object at 0x7f149d303ef0>, 'model2': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2'}, 'corp': <models.features.fixed_makedata object at 0x7f141dc53240>}
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.745,0.761,0.731,0.744,0.803,0.816,0.79,0.802,<models.features.doc2vec_wrap object at 0x7f141d572080>,"LogisticRegression(C=0.01, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wv, scale]",AMT,../pickle/2019-08-21 14:23:39.084825.pickle


save to ../csv/2019-08-21 14:23:43.682872.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.745,0.761,0.731,0.744,0.803,0.816,0.79,0.802,<models.features.doc2vec_wrap object at 0x7f141d572080>,"LogisticRegression(C=0.01, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wv, scale]",AMT,../pickle/2019-08-21 14:23:39.084825.pickle


time: 1h 25min 8s


### 2. doc2vec and LGBM

In [4]:
# Candidate feature models: the eight pickles ../data/doc2vec_01 ... ../data/doc2vec_08.
model1s = [
    load_pickle('../data/doc2vec_%.2d' % idx)
    for idx in range(1, 9)
]
def inner_CV_for_model1(corp, model1, method):
    '''Select the feature model (model1) with LGBMClassifier as classifier.

    The classifier parameters are held constant while ``fmin2_model1`` is
    run with ``objective``, at most 8 evaluations and ``field='model1'``.

    Args:
        corp: corpus object, passed to the search under the key 'corp'.
        model1: candidate feature model(s), passed under the key 'model1'
            (the caller in this notebook passes the list ``model1s``).
        method: feature-construction method, e.g. ['wv', 'scale'].
    Return:
        The 'model1' entry of the best space returned by ``fmin2_model1``.
    '''
    # Constant classifier parameters used during model1 selection.
    fixed_lgbm_params = dict(class_weight='balanced', boosting='gbrt')
    search_space = dict(
        model1=model1,
        model2=LGBMClassifier,
        method=method,
        p2=fixed_lgbm_params,
        corp=corp,
    )
    winner = fmin2_model1(fn=objective, space=search_space, max_evals=8, field='model1')
    print(winner)
    return winner['model1']

def inner_CV_for_model2(corp, model1, method):
    '''Tune the LGBMClassifier parameters for an already chosen model1.

    Runs ``fmin2`` (at most 20 evaluations) over the parameter space
    ``p2_lgbm`` imported from ``models.fmin2`` and instantiates the
    classifier from the best space found.

    Args:
        corp: corpus object, passed to the search under the key 'corp'.
        model1: the selected feature model, passed under the key 'model1'.
        method: feature-construction method, e.g. ['wv', 'scale'].
    Return:
        best_space['model2'] instantiated with best_space['p2'].
    '''
    # Parameter search space for the classifier lives in models.fmin2.
    from models.fmin2 import p2_lgbm

    search_space = dict(
        model1=model1,
        model2=LGBMClassifier,
        method=method,
        p2=p2_lgbm,
        corp=corp,
    )
    tuned = fmin2(fn=objective, space=search_space, max_evals=20)
    print(tuned)
    classifier_cls = tuned['model2']
    classifier_params = tuned['p2']
    return classifier_cls(**classifier_params)

# 0. Start the model selection and evaluation.
#    Outer loop: 5-fold stratified split. For every outer fold the inner
#    searches pick model1 and model2 on the training part only, then the
#    chosen pair is evaluated once on the held-out test part.
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
knowns = corp0.knowns
model1 = None
state = 2022
for method in [['wv','scale']]:
    print('state', state)
    # Use the explicitly imported `deepcopy` (from copy import deepcopy) rather
    # than `copy.deepcopy`: the `copy` module itself is never imported in this
    # notebook and was only reachable through a star import.
    corp = deepcopy(corp0)
    results = []
    model2, fold_count = '', 0
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        # 1. model selection
        # ::: model1 is for feature engineering
        #     e.g. model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
        #     e.g. model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        # ::: model2 is a classifier
        #     e.g. model2 = copy.deepcopy(classifier['default_Logistic'])
        model1 = inner_CV_for_model1(corp.replace(train_index), model1s, method)
        model2 = inner_CV_for_model2(corp.replace(train_index), model1, method)
        # 2. build X (training part of the outer fold only)
        data = inputs_generater(corp.replace(train_index), model1, method)
        # 3. run and find the best prob threshold
        #    (model2 is deep-copied so the search works on its own instance)
        threshold = clf_running_search(data, model1, deepcopy(model2), method)
        # 4. run again and evaluate on the real test set
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    # Persist the per-fold results of this method and keep its summary frame.
    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-08-21 14:24:05.152741
{'model1': <models.features.doc2vec_wrap object at 0x7f141d6c0dd8>, 'model2': <class 'lightgbm.sklearn.LGBMClassifier'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'boosting': 'gbrt'}, 'corp': <models.features.fixed_makedata object at 0x7f141d572278>}
{'class_weight': 'balanced', 'boosting': 'gbrt'}
2019-08-21 14:24:05.428710
[0.670807453416149, 0.7337278106508877, 0.7439024390243902, 0.6982248520710058, 0.7469879518072289]
0.7187301013939325
-0.7187301013939325 -0.7187301013939325 1
{'model1': <models.features.doc2vec_wrap object at 0x7f141d6c0e48>, 'model2': <class 'lightgbm.sklearn.LGBMClassifier'>, 'method': ['wv', 'scale'], 'p2': {'class_weight': 'balanced', 'boosting': 'gbrt'}, 'corp': <models.features.fixed_makedata object at 0x7f141d572278>}
{'class_weight': 'balanced', 'boosting': 'gbrt'}
2019-08-21 14:27:48.253329
[0.6909090909090909, 0.6792452830188679, 0.7402597402597403, 0.7647058823529411, 0.8051948051948

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.763,0.78,0.749,0.762,1,1,1,0.999,<models.features.doc2vec_wrap object at 0x7f141d615048>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=5, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.5, gamma=0.65,\n importance_type='split', lambda_l2=1, learning_rate=0.15,\n max_depth=16, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=256,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,../pickle/2019-08-22 11:09:31.083960.pickle


save to ../csv/2019-08-22 11:09:36.565984.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.763,0.78,0.749,0.762,1,1,1,0.999,<models.features.doc2vec_wrap object at 0x7f141d615048>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=5, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.5, gamma=0.65,\n importance_type='split', lambda_l2=1, learning_rate=0.15,\n max_depth=16, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=256,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, scale]",AMT,../pickle/2019-08-22 11:09:31.083960.pickle


time: 20h 45min 52s
