In [1]:
%time
%load_ext autotime
%load_ext autoreload
%autoreload 2

import os
import sys
# Append the parent of the current working directory to sys.path.
# Presumably this lets the project-local `utils` and `models` imports below
# resolve when the notebook is run from its own sub-folder -- verify layout.
parent_dir = os.path.abspath(os.getcwd()+'/..')+'/' 
sys.path.append(parent_dir) 

import pandas as pd
import numpy as np
from copy import deepcopy

from utils.path import dir_HugeFiles
from utils.preprocessing import load
from utils.save import make_dir, save_pickle, load_pickle, auto_save_csv, print_time, auto_save_pickle

from models.fmin2 import fmin2, p2_lr, p2_lgbm
from models.nested_validation import *
from models.features import fixed_makedata, salvador_wrap, pretrained_wrap
from models.display import pickle2df

# models
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

# Ask pandas not to truncate long cell contents when rendering frames.
# NOTE(review): -1 is the legacy spelling; pandas >= 1.0 deprecates it in
# favour of None -- confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

import warnings
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.06 µs


### Loading the dataset

In [2]:
dic = load_pickle('../data/dic_20191203.pickle')
ls = list(dic.keys())

time: 177 ms


In [3]:
def inner_CV(corp, medel1, method, model2_name):
    '''
    Specify the parameters to tune, run the search and build the classifier.

    A search-space dict is assembled for the requested classifier family,
    passed to ``fmin2`` together with ``objective``, and the classifier class
    found in the returned best space is instantiated with its best parameters.

    Args:
        corp: corpus object, stored under the 'corp' key of the search space.
        medel1: feature-engineering model, stored under the 'model1' key.
            (The name is a historical misspelling of ``model1``; it is kept
            so the signature stays unchanged for existing callers.)
        method: list of feature tags, stored under the 'method' key.
        model2_name: 'LR' selects LogisticRegression with the ``p2_lr``
            parameters, 'LGBM' selects LGBMClassifier with ``p2_lgbm``.
    Return:
        A new instance of the selected classifier class built from the best
        parameters, or None (after printing a message) when ``model2_name``
        is not recognised.
    '''
    # Use the argument itself; the previous body read a notebook-level
    # global called `model1` and silently ignored what the caller passed in.
    model1 = medel1

    if model2_name == 'LR':
        spaces = {'model1': model1, 'model2': LogisticRegression, 'method': method,
                  'p2': p2_lr, 'corp': corp}
    elif model2_name == 'LGBM':
        spaces = {'model1': model1, 'model2': LGBMClassifier, 'method': method,
                  'p2': p2_lgbm, 'corp': corp}
    else:
        print('cannot detect this model2')
        return

    # How fmin2 is used here:
    #   fn:    model to run, in the form of a function
    #   space: dict of potential parameters
    #   returns the dict of best parameters
    best_space = fmin2(fn = objective, space=spaces, max_evals = 10)
    print(best_space)
    model2 = best_space['model2'](**best_space['p2'])
    return model2

time: 28.5 ms


### Example of training the best model in our paper
Word embeddings (GloVe) and nutritional properties: ['wv','nu','scale']

In [4]:
# 0. Start the model selection and evaluation.
#    Outer loop: 5-fold stratified split; for each fold a classifier is chosen
#    by inner_CV on the training part and then scored on the held-out part.
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
knowns = corp0.knowns
state = 2022
model1 = pretrained_wrap(knowns, 'glove-wiki-gigaword-300')

for method in [['wv','nu','scale']]:
    # Use the `deepcopy` name imported explicitly at the top of the notebook
    # (`from copy import deepcopy`); the `copy` module itself is not imported
    # there, so `copy.deepcopy` only worked if a star import leaked it.
    corp = deepcopy(corp0)
    results = []
    print('state', state)
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        # 1. Model selection
        #    model1 is for feature engineering, e.g.
        #        model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
        #        model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        #    model2 is a classifier, e.g.
        #        model2 = deepcopy(classifier['default_Logistic'])
        model2 = inner_CV(corp.replace(train_index), model1, method, 'LGBM')
        # 2. Build X
        data = inputs_generater(corp.replace(train_index), model1, method)
        # 3. Run and find the best probability threshold
        #    (a fresh copy of model2 is passed so the selected model is not mutated)
        threshold = clf_running_search(data, model1, deepcopy(model2), method)
        # 4. Run again and evaluate on the real test set
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    # Persist the per-fold results for this feature set and keep its summary frame.
    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-12-03 15:48:44.590906
{'class_weight': 'balanced', 'boosting': 'gbrt', 'num_leaves': 256, 'max_depth': 16, 'learning_rate': 0.15, 'gamma': 0.65, 'n_estimators': 2000, 'lambda_l2': 1, 'feature_fraction': 0.5, 'bagging_fraction': 0.75, 'bagging_freq': 5, 'subsample': 0.7}
2019-12-03 15:48:44.780922
[0.8343558282208587, 0.8322981366459627, 0.802547770700637, 0.8674698795180723, 0.8917197452229298]
0.8456782720616921
-0.8456782720616921 -0.8456782720616921
{'class_weight': 'balanced', 'boosting': 'gbrt', 'num_leaves': 512, 'max_depth': 8, 'learning_rate': 0.15, 'gamma': 0.55, 'n_estimators': 2000, 'lambda_l2': 1, 'feature_fraction': 0.75, 'bagging_fraction': 0.5, 'bagging_freq': 5, 'subsample': 0.7}
2019-12-03 15:49:59.298873
[0.8263473053892216, 0.8383233532934131, 0.7792207792207793, 0.8520710059171598, 0.8875]
0.8366924887641147
-0.8366924887641147 -0.8456782720616921
{'class_weight': 'balanced', 'boosting': 'gbrt', 'num_leaves': 64, 'max_depth': 8, 'learning_

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.853,0.855,0.852,0.849,1,1,1,1,<models.features.pretrained_wrap object at 0x7f3db38a20b8>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=5, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.5, gamma=0.65,\n importance_type='split', lambda_l2=1, learning_rate=0.15,\n max_depth=16, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=256,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, nu, scale]",AMT,../pickle/2019-12-03 16:53:30.072536.pickle


save to ../csv/2019-12-03 16:53:30.586502.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.853,0.855,0.852,0.849,1,1,1,1,<models.features.pretrained_wrap object at 0x7f3db38a20b8>,"LGBMClassifier(bagging_fraction=0.75, bagging_freq=5, boosting='gbrt',\n boosting_type='gbdt', class_weight='balanced',\n colsample_bytree=1.0, feature_fraction=0.5, gamma=0.65,\n importance_type='split', lambda_l2=1, learning_rate=0.15,\n max_depth=16, min_child_samples=20, min_child_weight=0.001,\n min_split_gain=0.0, n_estimators=2000, n_jobs=-1, num_leaves=256,\n objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,\n silent=True, subsample=0.7, subsample_for_bin=200000,\n subsample_freq=0)","[wv, nu, scale]",AMT,../pickle/2019-12-03 16:53:30.072536.pickle


time: 1h 7min 11s


### Examples of training these models with logistic regression
Naive Bayes BoW: ['wc','nb'] <br>
Nutrition only: ['nu','scale'] <br>
Nutrition + Naive Bayes BoW: ['wc','nb','nu','scale']

In [5]:
# 0. Start the model selection and evaluation.
#    Same outer 5-fold stratified loop as the LGBM example, repeated for each
#    feature set below with logistic regression as the classifier family.
dfs = []
corp0 = fixed_makedata(dic, ls, tag='AMT')
# `knowns` and `p2` are not read inside this cell; they are kept because they
# are notebook-level names that other cells may rely on -- TODO confirm.
knowns = corp0.knowns
model1 = None   # these feature sets use no pretrained embedding model
state = 2022
p2 = p2_lr

for method in [['wc','nb'], ['nu','scale'], ['wc','nb','nu','scale']]:
    print('state', state)
    # Use the `deepcopy` name imported explicitly at the top of the notebook
    # (`from copy import deepcopy`); the `copy` module itself is not imported
    # there, so `copy.deepcopy` only worked if a star import leaked it.
    corp = deepcopy(corp0)
    results = []
    ss = StratifiedKFold(n_splits = 5, shuffle = True, random_state = state)
    for train_index, test_index in ss.split(corp.ls, corp.y):
        print('outer fold')
        print_time()
        # 1. Model selection
        #    model1 is for feature engineering, e.g.
        #        model1 = pretrained_wrap(corp.knowns, 'glove-wiki-gigaword-300')
        #        model1 = gensim_wrap(corp.knowns, Word2Vec, params = {})
        #    model2 is a classifier, e.g.
        #        model2 = deepcopy(classifier['default_Logistic'])
        model2 = inner_CV(corp.replace(train_index), model1, method, 'LR')
        # 2. Build X
        data = inputs_generater(corp.replace(train_index), model1, method)
        # 3. Run and find the best probability threshold
        #    (a fresh copy of model2 is passed so the selected model is not mutated)
        threshold = clf_running_search(data, model1, deepcopy(model2), method)
        # 4. Run again and evaluate on the real test set
        data = inputs_generater(corp.add_train_test(train_index, test_index), model1, method)
        result = clf_running(data, model1, deepcopy(model2), method, threshold).result
        result.update({'model1': model1,'tag': corp.tag, 'method':method})
        results.append(result)
        print(result['test_f1'])

    # Persist the per-fold results for this feature set and keep its summary frame.
    pickle_path = auto_save_pickle(results)
    df = pickle2df(results, pickle_path)
    dfs.append(df)
auto_save_csv(pd.concat(dfs))

state 2022
outer fold
2019-12-03 16:58:36.822864
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5}
2019-12-03 16:58:36.921181
[0.7547169811320754, 0.8024691358024691, 0.830188679245283, 0.7901234567901234, 0.7712418300653595]
0.7897480166070621
-0.7897480166070621 -0.7897480166070621
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1.0}
2019-12-03 16:58:45.094085
[0.7515923566878981, 0.7901234567901234, 0.8220858895705522, 0.7757575757575756, 0.7564102564102564]
0.7791939070432811
-0.7791939070432811 -0.7897480166070621
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 3.0}
2019-12-03 16:58:52.793119
[0.7468354430379747, 0.75, 0.8242424242424242, 0.7682926829268293, 0.7564102564102564]
0.7691561613234968
-0.7691561613234968 -0.7897480166070621
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1000}
2019-12-03 16:59:00.681515
[0.7341772151898733, 0.7560975609756098, 0.8170731707317075, 

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.817,0.844,0.794,0.818,0.938,0.948,0.929,0.938,,"LogisticRegression(C=0.05, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wc, nb]",AMT,../pickle/2019-12-03 17:05:14.787145.pickle


state 2022
outer fold
2019-12-03 17:05:15.002723
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5}
2019-12-03 17:05:15.108281
[0.7857142857142857, 0.8098159509202455, 0.8152866242038217, 0.8481012658227849, 0.8627450980392157]
0.8243326449400709
-0.8243326449400709 -0.8243326449400709
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1.0}
2019-12-03 17:05:17.296444
[0.7857142857142857, 0.8024691358024691, 0.8152866242038217, 0.8481012658227849, 0.8684210526315789]
0.823998472834988
-0.823998472834988 -0.8243326449400709
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 3.0}
2019-12-03 17:05:19.405596
[0.7810650887573963, 0.8024691358024691, 0.8152866242038217, 0.8481012658227849, 0.8684210526315789]
0.8230686334436103
-0.8230686334436103 -0.8243326449400709
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1000}
2019-12-03 17:05:21.485558
[0.7810650887573963, 0.8074534161490683, 0.81528

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.825,0.83,0.822,0.822,0.839,0.841,0.837,0.836,,"LogisticRegression(C=0.1, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[nu, scale]",AMT,../pickle/2019-12-03 17:07:05.648392.pickle


state 2022
outer fold
2019-12-03 17:07:05.779588
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 0.5}
2019-12-03 17:07:05.887161
[0.7901234567901234, 0.8242424242424242, 0.8461538461538461, 0.8048780487804877, 0.8079470198675496]
0.8146689591668862
-0.8146689591668862 -0.8146689591668862
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1.0}
2019-12-03 17:07:13.837242
[0.7852760736196319, 0.8148148148148148, 0.830188679245283, 0.7975460122699387, 0.8133333333333334]
0.8082317826566003
-0.8082317826566003 -0.8146689591668862
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 3.0}
2019-12-03 17:07:21.561494
[0.7804878048780488, 0.8098159509202455, 0.830188679245283, 0.7926829268292682, 0.8211920529801324]
0.8068734829705955
-0.8068734829705955 -0.8146689591668862
{'class_weight': 'balanced', 'solver': 'liblinear', 'penalty': 'l2', 'C': 1000}
2019-12-03 17:07:29.196390
[0.7484662576687118, 0.8023952095808382, 0.80745

Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.85,0.859,0.842,0.847,0.933,0.937,0.929,0.932,,"LogisticRegression(C=0.05, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wc, nb, nu, scale]",AMT,../pickle/2019-12-03 17:13:45.459874.pickle


save to ../csv/2019-12-03 17:13:45.499644.csv


Unnamed: 0,test_f1,test_precision,test_recall,test_accuracy,train_f1,train_precision,train_recall,train_accuracy,model1,model2,method,tag,pickle_path
0,0.817,0.844,0.794,0.818,0.938,0.948,0.929,0.938,,"LogisticRegression(C=0.05, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wc, nb]",AMT,../pickle/2019-12-03 17:05:14.787145.pickle
0,0.825,0.83,0.822,0.822,0.839,0.841,0.837,0.836,,"LogisticRegression(C=0.1, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[nu, scale]",AMT,../pickle/2019-12-03 17:07:05.648392.pickle
0,0.85,0.859,0.842,0.847,0.933,0.937,0.929,0.932,,"LogisticRegression(C=0.05, class_weight='balanced', dual=False,\n fit_intercept=True, intercept_scaling=1, max_iter=100,\n multi_class='warn', n_jobs=None, penalty='l2', random_state=None,\n solver='liblinear', tol=0.0001, verbose=0, warm_start=False)","[wc, nb, nu, scale]",AMT,../pickle/2019-12-03 17:13:45.459874.pickle


time: 15min 10s
