-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_utils.py
136 lines (102 loc) · 4.25 KB
/
train_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib
# Directory (relative to the current working directory) where save_model
# writes fitted models and load_model reads them back.
MODELS_DIR = 'models'
# Directory (relative to the current working directory) where
# save_cv_results writes its CSV files.
TRAIN_LOGS_DIR = 'train_logs'
# Sub-directory of TRAIN_LOGS_DIR under which get_fit_logdir builds
# per-fit log paths.
TENSORBOARD_LOGS_DIR = os.path.join(TRAIN_LOGS_DIR, 'tensorboard')
def cross_val_rmse(model, X, y, cv=5, random_state=None, model_name=None, verbose=False):
    """
    Using K-fold cross validation, this function evaluates root mean squared
    error on training folds and validation folds.

    For every fold the model is re-fit on the training rows, predictions are
    made for all rows, and the RMSE is computed separately on the training
    rows and on the held-out validation rows.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fit in
        place once per fold, so it is left fit on the last fold's training
        rows when this function returns.
    X, y : array-like
        Features and targets. Both are copied into numpy arrays so they can
        be indexed with the fold index arrays.
    cv : int
        Number of folds.
    random_state : int, RandomState instance or None
        When None (the default) the folds are contiguous and unshuffled.
        When given, the rows are shuffled with this seed before splitting.
    model_name : str or None
        Optional prefix for the row labels of the returned frame.
    verbose : bool
        When True, print a progress line per fold.

    Returns
    -------
    pandas.DataFrame
        Two rows (``"train"`` and ``"val"``, prefixed with ``model_name`` when
        given) and columns ``"fold 0"`` ... ``"fold {cv-1}"`` followed by
        ``"mean"`` and ``"std"`` computed across the fold columns.
    """
    # make sure X and y are numpy arrays for slicing later
    X = np.array(X)
    y = np.array(y)

    # split data into folds
    # A seed is only meaningful when shuffling, and scikit-learn raises a
    # ValueError if random_state is set while shuffle is False. Shuffle
    # exactly when the caller supplied a seed.
    shuffle = random_state is not None
    kf = KFold(n_splits=cv, shuffle=shuffle, random_state=random_state)

    if verbose:
        print(f"Starting {cv}-fold cross validation")

    rmse_list = []
    for fold, (train_indices, val_indices) in enumerate(kf.split(X)):
        # train the model on the training folds
        timer_start = time.perf_counter_ns()
        model.fit(X[train_indices], y[train_indices])
        train_time = time.perf_counter_ns() - timer_start

        # evaluate the model on all folds, then score each part separately
        y_pred = model.predict(X)
        train_rmse = np.sqrt(mean_squared_error(y[train_indices], y_pred[train_indices]))
        val_rmse = np.sqrt(mean_squared_error(y[val_indices], y_pred[val_indices]))

        if verbose:
            print(f"fit {fold}\ttrain RMSE: {train_rmse:.3f}\t val RMSE: {val_rmse:.3f}\t train time: {train_time * 1e-9:.2f} s")

        rmse_list.append([train_rmse, val_rmse])

    # create a data frame: one row per data part, one column per fold
    index = ['train', 'val']
    if model_name is not None:
        index = [f"{model_name} {label}" for label in index]
    df = pd.DataFrame(
        np.array(rmse_list).T,
        index=index,
        columns=["fold " + str(i) for i in range(cv)],
    )

    # compute mean and standard deviation
    # Both are computed before either column is added so that "std" is not
    # influenced by the "mean" column.
    df_mean = df.mean(axis=1)
    df_std = df.std(axis=1)
    df['mean'] = df_mean
    df['std'] = df_std
    return df
def save_model(model, filename):
    """
    Saves a scikit-learn model

    The model is serialized with joblib to ``MODELS_DIR/filename``.
    ``MODELS_DIR`` is created first if it does not exist yet.
    """
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the directory in between.
    os.makedirs(MODELS_DIR, exist_ok=True)
    joblib.dump(model, os.path.join(MODELS_DIR, filename))
def load_model(filename):
    """
    Read back a model stored under ``MODELS_DIR``.

    ``filename`` is resolved relative to ``MODELS_DIR`` and the file is
    deserialized with joblib; the loaded object is returned.
    """
    # NOTE(review): joblib files are pickle-based, so only load files from a
    # trusted source.
    model_path = os.path.join(MODELS_DIR, filename)
    loaded = joblib.load(model_path)
    return loaded
def save_cv_results(cv_results, filename):
    """
    Writes cross-validation results to a CSV file

    ``cv_results`` is converted with ``pd.DataFrame(cv_results)`` and written
    to ``TRAIN_LOGS_DIR/filename``. ``TRAIN_LOGS_DIR`` is created first if it
    does not exist yet.
    """
    # presumably cv_results is a search object's ``cv_results_`` dict;
    # anything the DataFrame constructor accepts works here.
    df = pd.DataFrame(cv_results)
    # exist_ok=True replaces the separate isdir() check, which could race
    # with another process creating the directory in between.
    os.makedirs(TRAIN_LOGS_DIR, exist_ok=True)
    df.to_csv(os.path.join(TRAIN_LOGS_DIR, filename))
def summarize_cv_results(cv_results):
    """
    Condense per-split scores into one row of error statistics per candidate.

    ``cv_results`` must provide a ``'params'`` entry plus
    ``split{i}_train_score`` / ``split{i}_test_score`` entries for
    ``i = 0, 1, ...``. Each score is turned into an error with
    ``sqrt(-score)``, so the scores are assumed to be negated squared errors
    (e.g. negative MSE) -- confirm against the caller's scoring.

    Returns a DataFrame holding the parameter columns followed by the mean
    and standard deviation of the train and validation errors across splits,
    and the difference between the validation and train means.
    """
    # the parameter columns go on the left of the result
    param_frame = pd.DataFrame(cv_results['params'])

    # number of splits = how many consecutive split{i}_train_score keys exist
    split_count = 0
    while f"split{split_count}_train_score" in cv_results.keys():
        split_count += 1

    def _error_matrix(part):
        # rows: parameter candidates, columns: splits
        per_split = [
            np.sqrt(-cv_results[f"split{k}_{part}_score"])
            for k in range(split_count)
        ]
        return np.array(per_split).T

    # errors rather than scores
    train_matrix = _error_matrix("train")
    val_matrix = _error_matrix("test")

    train_mean = train_matrix.mean(axis=1)
    val_mean = val_matrix.mean(axis=1)

    stats_frame = pd.DataFrame({
        "train_error_mean": train_mean,
        "val_error_mean": val_mean,
        # how much worse validation is than training
        "error_diff_mean": val_mean - train_mean,
        "train_error_std": train_matrix.std(axis=1),
        "val_error_std": val_matrix.std(axis=1),
    })
    return pd.concat([param_frame, stats_frame], axis=1)
def get_fit_logdir():
    """
    Build a timestamped log directory path for a new fit.

    The path is ``TENSORBOARD_LOGS_DIR`` joined with a name of the form
    ``fit_YYYY_MM_DD-HH_MM_SS`` taken from the current local time. Only the
    path string is returned; no directory is created here.
    """
    run_name = time.strftime("fit_%Y_%m_%d-%H_%M_%S")
    logdir = os.path.join(TENSORBOARD_LOGS_DIR, run_name)
    return logdir
def evaluate_keras_model(model, train_X, train_y, val_X, val_y):
    """
    Score a fitted model on training and validation data.

    ``model.predict`` is called on ``train_X`` and then on ``val_X``; the
    root mean squared error of each prediction against ``train_y`` and
    ``val_y`` is returned as a one-row DataFrame with columns
    ``train_rmse`` and ``val_rmse``.
    """
    # predictions first (train, then validation), scoring afterwards
    predicted = [model.predict(features) for features in (train_X, val_X)]

    # root mean squared error for each part
    train_rmse, val_rmse = (
        np.sqrt(mean_squared_error(actual, guess))
        for actual, guess in zip((train_y, val_y), predicted)
    )
    return pd.DataFrame({'train_rmse': [train_rmse], 'val_rmse': [val_rmse]})