forked from chu-data-lab/CleanML
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
164 lines (143 loc) · 6.51 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Train the model"""
import numpy as np
import pandas as pd
import argparse
import config
import utils
import sys
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pickle
from preprocess import preprocess
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score
def parse_searcher(searcher):
    """Pull the winning configuration out of a fitted search object.

    Args:
        searcher: fitted GridSearchCV object. Its ``cv_results_`` must contain
            ``mean_train_score`` and ``mean_test_score``.

    Returns:
        tuple: (best_model, best_params, train_acc, val_acc), where the two
        scores are the mean train / mean validation scores found at
        ``searcher.best_index_``.
    """
    cv_results = searcher.cv_results_
    mean_train_scores = cv_results['mean_train_score']
    mean_val_scores = cv_results['mean_test_score']
    winner = searcher.best_index_
    return (
        searcher.best_estimator_,
        searcher.best_params_,
        mean_train_scores[winner],
        mean_val_scores[winner],
    )
def train(X_train, y_train, estimator, param_grid, seed=1, n_jobs=1, skip=False):
    """Fit the model, optionally tuning it with 5-fold cross validation.

    Args:
        X_train (pd.DataFrame): features (train)
        y_train (pd.DataFrame): label (train)
        estimator (sklearn.model): model; it is fitted in place when no grid
            search is run
        param_grid (dict): hyper-parameters to tune, or None to train directly
        seed (int): value passed to np.random.seed before training
        n_jobs (int): forwarded to GridSearchCV as n_jobs
        skip (bool): when True, only fit the estimator and report no scores

    Returns:
        tuple: (best_model, result). result is an empty dict when skip is
        True; otherwise it has the keys "best_params", "train_acc" and
        "val_acc".
    """
    np.random.seed(seed)

    # skip requested: fit once, no cross validation and no scores
    if skip:
        estimator.fit(X_train, y_train)
        return estimator, {}

    if param_grid is None:
        # no hyper parameter is given: cross-validate first, then fit on all
        # of the training data and score on that same data
        cv_score = cross_val_score(estimator, X_train, y_train, cv=5).mean()
        estimator.fit(X_train, y_train)
        fit_score = estimator.score(X_train, y_train)
        fitted, chosen_params = estimator, {}
    else:
        # train and tune hyper parameter with 5-fold cross validation
        grid = GridSearchCV(estimator, param_grid, cv=5, n_jobs=n_jobs, return_train_score=True)
        grid.fit(X_train, y_train)
        fitted, chosen_params, fit_score, cv_score = parse_searcher(grid)

    summary = {"best_params": chosen_params, "train_acc": fit_score, "val_acc": cv_score}
    return fitted, summary
def evaluate(best_model, X_test_list, y_test_list, test_files):
    """Score a fitted model on each test set.

    Args:
        best_model: fitted model exposing ``predict``
        X_test_list (list): list of features (test)
        y_test_list (list): list of label (test)
        test_files (list): list of filenames of test set, used as key prefixes

    Returns:
        dict: for every file name, the entries "<file>_test_acc" (fraction of
        predictions equal to the label) and "<file>_test_f1".
    """
    scores = {}
    for features, labels, name in zip(X_test_list, y_test_list, test_files):
        predicted = best_model.predict(features)
        scores[name + "_test_acc"] = np.mean(predicted == labels)

        # more than two distinct labels: macro-averaged F1;
        # otherwise f1_score is called with its default averaging
        f1_options = {"average": "macro"} if len(set(labels)) > 2 else {}
        scores[name + "_test_f1"] = f1_score(labels, predicted, **f1_options)
    return scores
def get_coarse_grid(model, seed, n_jobs, N):
    """Get hyper parameters (coarse random search).

    Draws 20 random candidate values for the model's tunable hyper-parameter.

    Args:
        model (dict): ml model dict; reads "hyperparams" (parameter name),
            "hyperparams_type" ("real" or "int"), "hyperparams_range"
            (low, high) and, for the "int" type, "name"
        seed (int): value passed to np.random.seed before sampling
        n_jobs (int): not used by this function; kept so callers need no change
        N (int): number of training samples

    Returns:
        dict: {hyper-parameter name: array of 20 sampled candidates}

    Raises:
        ValueError: if model["hyperparams_type"] is neither "real" nor "int"
    """
    np.random.seed(seed)
    low, high = model["hyperparams_range"]
    kind = model["hyperparams_type"]

    if kind == "real":
        # the range bounds are exponents: sample uniformly, then raise 10 to them
        candidates = 10 ** np.random.uniform(low, high, 20)
    elif kind == "int":
        if model["name"] == "knn_classification":
            # cap the upper bound at 4/5 of N -- presumably because each
            # training split of 5-fold CV holds about 4/5 of the samples
            high = min(high, int(N/5*4))
        candidates = np.random.randint(low, high, 20)
    else:
        # previously this fell through to an UnboundLocalError at return
        raise ValueError(
            "unsupported hyperparams_type {!r}; expected 'real' or 'int'".format(kind)
        )
    return {model['hyperparams']: candidates}
def get_fine_grid(model, best_param_coarse, n_jobs, N):
    """Get hyper parameters (fine grid search, around the best parameter in coarse search).

    Args:
        model (dict): ml model dict; reads "hyperparams" (parameter name),
            "hyperparams_type" ("real" or "int") and, for the "int" type, "name"
        best_param_coarse: best value found by the coarse search
        n_jobs (int): not used by this function; kept so callers need no change
        N (int): number of training samples

    Returns:
        dict: {hyper-parameter name: array of candidates}. For "real" these are
        20 linearly spaced values between best / 10**0.5 and best * 10**0.5;
        for "int" they are consecutive integers starting at
        max(best - 10, 1), at most 20 of them.

    Raises:
        ValueError: if model["hyperparams_type"] is neither "real" nor "int"
    """
    kind = model["hyperparams_type"]

    if kind == "real":
        # half a decade on either side of the coarse optimum
        base = np.log10(best_param_coarse)
        candidates = np.linspace(10**(base-0.5), 10**(base+0.5), 20)
    elif kind == "int":
        low = max(best_param_coarse - 10, 1)
        high = low + 20
        if model["name"] == "knn_classification":
            # cap the upper bound at 4/5 of N -- presumably because each
            # training split of 5-fold CV holds about 4/5 of the samples
            high = min(high, int(N/5*4))
        candidates = np.arange(low, high)
    else:
        # previously this fell through to an UnboundLocalError at return
        raise ValueError(
            "unsupported hyperparams_type {!r}; expected 'real' or 'int'".format(kind)
        )
    return {model['hyperparams']: candidates}
def hyperparam_search(X_train, y_train, model, n_jobs=1, seed=1, hyperparams=None):
    """Build the estimator described by ``model`` and train it.

    Three cases:
      * ``hyperparams`` is given: the values are applied on top of the fixed
        parameters and the estimator is only fitted (no search, no scores).
      * ``model`` has no "hyperparams" key: train directly, no search.
      * otherwise: coarse random search followed by a fine grid search around
        the coarse optimum; the stage with the higher validation score wins
        (the coarse stage wins ties).

    Args:
        X_train (pd.DataFrame): features (train)
        y_train (pd.DataFrame): label (train)
        model (dict): ml model dict; reads "fn" (estimator constructor),
            "fixed_params" and optionally "parallelable", "hyperparams",
            "hyperparams_type"
        n_jobs (int): num of threads
        seed (int): seed from which the per-stage seeds are derived
        hyperparams (dict): preset hyper-parameter values, or None to search

    Returns:
        tuple: (best_model, result) as returned by ``train``
    """
    np.random.seed(seed)
    coarse_param_seed, coarse_train_seed, fine_train_seed = np.random.randint(1000, size=3)

    # Work on a copy: model["fixed_params"] belongs to the caller's model dict,
    # and writing n_jobs / preset hyper-parameters into it would leak them
    # into every later call that reuses the same model dict.
    fixed_params = dict(model["fixed_params"])
    if model.get("parallelable"):
        fixed_params["n_jobs"] = n_jobs

    preset = hyperparams is not None
    if preset:
        # copy for the same reason: do not rewrite the caller's dict
        overrides = dict(hyperparams)
        if model.get("hyperparams_type") == "int":
            param_name = model["hyperparams"]
            overrides[param_name] = int(overrides[param_name])
        fixed_params.update(overrides)

    estimator = model["fn"](**fixed_params)

    if "hyperparams" not in model or preset:
        # nothing to search: train directly (fit only when values were preset)
        return train(X_train, y_train, estimator, None, n_jobs=n_jobs, seed=coarse_train_seed, skip=preset)

    n_samples = len(y_train)

    # coarse random search
    param_grid = get_coarse_grid(model, coarse_param_seed, n_jobs, n_samples)
    best_model_coarse, result_coarse = train(X_train, y_train, estimator, param_grid, n_jobs=n_jobs, seed=coarse_train_seed)

    # fine grid search around the coarse optimum
    best_param_coarse = result_coarse['best_params'][model['hyperparams']]
    param_grid = get_fine_grid(model, best_param_coarse, n_jobs, n_samples)
    best_model_fine, result_fine = train(X_train, y_train, estimator, param_grid, n_jobs=n_jobs, seed=fine_train_seed)

    # keep the fine result only when it is strictly better
    if result_fine['val_acc'] > result_coarse['val_acc']:
        best_model, result = best_model_fine, result_fine
    else:
        best_model, result = best_model_coarse, result_coarse

    # convert int to float to avoid json error
    if model["hyperparams_type"] == "int":
        result['best_params'][model["hyperparams"]] *= 1.0

    return best_model, result
def train_and_evaluate(X_train, y_train, X_test_list, y_test_list, test_files, model, n_jobs=1, seed=1, hyperparams=None):
    """Search hyperparameters, then evaluate the chosen model on the test sets.

    Args:
        X_train (pd.DataFrame): features (train)
        y_train (pd.DataFrame): label (train)
        X_test_list (list): list of features (test)
        y_test_list (list): list of label (test)
        test_files (list): list of filenames of test set
        model (dict): ml model dict in model.py
        n_jobs (int): num of threads
        seed (int): seed for training
        hyperparams (dict): passed through to hyperparam_search; defaults to None

    Returns:
        dict: the training result merged with the test result; on a key clash
        the test entry wins.
    """
    fitted_model, training_summary = hyperparam_search(X_train, y_train, model, n_jobs, seed, hyperparams)
    merged = dict(training_summary)
    merged.update(evaluate(fitted_model, X_test_list, y_test_list, test_files))
    return merged