-
Notifications
You must be signed in to change notification settings - Fork 0
/
ModelTraining.py
521 lines (408 loc) · 21.4 KB
/
ModelTraining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
# Environment/version report: prints the runtime versions so results produced
# with this snippet library can be reproduced later.
import sys
import os
import time
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
print(time.strftime('%Y/%m/%d %H:%M'))
print('OS:', sys.platform)
print('CPU Cores:', os.cpu_count())
print('Python:', sys.version)
print('NumPy:', np.__version__)
print('Pandas:', pd.__version__)
print('Scikit-Learn:', sklearn.__version__)
# Formatting for seaborn plots
sns.set_context('notebook', font_scale=1.1)
sns.set_style('ticks')
# Displays all dataframe columns
pd.set_option('display.max_columns', None)
#################################################################################################################
##### Cross Validation
# NOTE(review): X, y, and model are assumed to be defined earlier in the session —
# this file is a snippet library, not a runnable script.
# Holdout method
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=46)
# 80/10/10 train/eval/test split
# First split off 10% for test, then 1/9 of the remaining 90% (= 10% of the total) for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=46)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=(0.1 / (1 - 0.1)), random_state=46)
# K-fold cross validation
from sklearn.model_selection import KFold, cross_val_score
# Returns one score per fold; n_jobs=-1 uses all CPU cores
k_fold = KFold(n_splits=10, shuffle=True, random_state=46)
cross_val_score(model, X, y, cv=k_fold, n_jobs=-1)
#################################################################################################################
##### Hyperparameter tuning
# Random Search
def hyperparameter_random_search(X: np.ndarray, y: np.ndarray, model, parameters: dict, num_iterations: int = 50, num_folds: int = 5):
    """
    Perform random search for hyperparameter tuning using RandomizedSearchCV.
    Args:
        X (np.ndarray): The features used to train the model.
        y (np.ndarray): Target values.
        model (Estimator object): The model to be tuned.
        parameters (dict):
            Dictionary with parameter names (string) as keys and distributions
            or lists of parameters to try as values.
        num_iterations (int, optional): Number of parameter settings that are sampled.
        num_folds (int, optional): Number of cross-validation folds.
    Returns:
        BaseEstimator: Fitted estimator with the best hyperparameters found.
    """
    # Imported locally so the snippet is self-contained
    from sklearn.model_selection import RandomizedSearchCV
    # Randomized search over the supplied parameter distributions
    # (the original body referenced an undefined name `param_distributions`)
    randomized_search = RandomizedSearchCV(model, param_distributions=parameters,
                                           n_iter=num_iterations, cv=num_folds, n_jobs=-1, verbose=2)
    randomized_search.fit(X, y)
    # Reporting the results
    print('Best Estimator:', randomized_search.best_estimator_)
    print('Best Parameters:', randomized_search.best_params_)
    print('Best Score:', randomized_search.best_score_)
    return randomized_search.best_estimator_
# Grid search
# `svm` was used below but never imported — importing it here makes the snippet runnable
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# Specifying the model and parameters to use
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = svm.SVC()
# Performing grid search (refits the best estimator on the full data by default)
model = GridSearchCV(svc, parameters)
model.fit(X, y)
print('Best Estimator:', model.best_estimator_, '\n',
      'Best Parameters:', model.best_params_, '\n',
      'Best Score:', model.best_score_)
# Iteratively training ensemble models
def iteratively_train_ensemble_model(model, num_trees_to_try, X_train, y_train, X_test, y_test) -> list:
    """
    Iteratively trains an ensemble model with different numbers of trees, collects the
    log-loss on the test set for each size, and returns the testing errors.
    Uses warm starting so each fit only adds trees instead of retraining from scratch.
    Args:
        model (Ensemble model object): The ensemble classifier to be trained
            (must support warm_start, n_estimators, and predict_proba).
        num_trees_to_try (iterable of int): The different numbers of trees to try,
            in increasing order so warm starting adds trees incrementally.
        X_train (np.ndarray): The training features.
        y_train (np.ndarray): The training labels.
        X_test (np.ndarray): The test data.
        y_test (np.ndarray): The test labels.
    Returns:
        list: A list of testing errors (log loss) for each number of trees.
    TODO: Allow different metrics, adjust for regression vs. classification
    """
    from sklearn import metrics
    # Enforcing the model has a warm start for iterative training
    if not model.warm_start:
        model.set_params(warm_start=True)
    # Adding a seed if it does not exist, for reproducibility
    if model.random_state is None:
        model.set_params(random_state=46)
    # Collecting the error for plotting
    testing_errors = []
    # Iteratively training the model; with warm_start each fit grows the ensemble
    for num_trees in num_trees_to_try:
        model.set_params(n_estimators=num_trees)
        print('Fitting with {0} trees'.format(num_trees))
        model.fit(X_train, y_train)
        testing_errors.append(metrics.log_loss(y_test, model.predict_proba(X_test)))
    # The original returned the undefined name `testing_errors` while accumulating
    # into `testing_error` — unified on one name
    return testing_errors
#################################################################################################################
##### Class Probability Cutoffs
# Probability Threshold Search - xgboost
# Modernized: sklearn.cross_validation was removed (use model_selection), KFold must be
# iterated via .split(), and .ix / DataFrame.append were removed from pandas.
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=46)
# Making a dataframe to store the best threshold/F1 pair from each fold
xgbResults = pd.DataFrame(columns=['probabilityThreshold', 'f1'])
# Parameters for the model
num_rounds = 8000
params = {'booster': 'gbtree', 'max_depth': 4, 'eta': 0.001, 'objective': 'binary:logistic'}
for traincv, testcv in cv.split(X):
    # Converting the data frames/series to DMatrix objects for xgboost
    Dtrain = xgb.DMatrix(X.iloc[traincv], label=y[traincv])
    Dtest = xgb.DMatrix(X.iloc[testcv])
    # Building the model and outputting class probability estimations
    model = xgb.train(params, Dtrain, num_rounds)
    predictions = model.predict(Dtest)
    # Looping through probability thresholds to gather the f1 score at each threshold
    fold_rows = []
    for probabilityThreshold in np.linspace(0, 0.1, 100):
        predBin = pd.Series(predictions).apply(lambda x: 1 if x > probabilityThreshold else 0)
        fold_rows.append({'probabilityThreshold': probabilityThreshold,
                          'f1': f1_score(y[testcv], predBin)})
    temporaryResults = pd.DataFrame(fold_rows)
    # Retrieving the row with the highest f1 score for this fold
    bestIndex = temporaryResults['f1'].idxmax()
    xgbResults = pd.concat([xgbResults, temporaryResults.iloc[[bestIndex]]], ignore_index=True)
print('The Model performance is:')
print(xgbResults.mean())
# Probability Threshold Search - scikit-learn
def optimal_probability_cutoff(model, test_dataset: np.ndarray, test_labels: np.ndarray, max_thresh: float = 0.3, step_size: float = 0.01):
    '''
    Finds the optimal probability cutoff to maximize the F1 score.
    Args:
        model: The trained model used for prediction (must support predict_proba).
        test_dataset (np.ndarray): The test dataset used for prediction.
        test_labels (np.ndarray): The true labels of the test dataset.
        max_thresh (float, optional): The maximum probability threshold to consider. Defaults to 0.3.
        step_size (float, optional): The step size between probability thresholds. Defaults to 0.01.
    Returns:
        pd.Series: A pandas Series containing the optimal probability cutoff and F1 score.
        The Series index represents the threshold and score.
    '''
    from sklearn import metrics
    # Prediction probabilities of the positive class for the test dataset
    predicted = model.predict_proba(test_dataset)[:, 1]
    # Setting f1 score average metric based on binary or multi-class classification
    if len(np.unique(test_labels)) == 2:
        avg = 'binary'
    else:
        avg = 'micro'
    # Gathering the F1 score at each probability threshold
    # (rows collected in a list — DataFrame.append was removed in pandas 2.0)
    rows = []
    for thresh in np.arange(0, (max_thresh + step_size), step_size):
        pred_bin = pd.Series(predicted).apply(lambda x: 1 if x > thresh else 0)
        f1 = metrics.f1_score(test_labels, pred_bin, average=avg)
        rows.append({'Threshold': thresh, 'F1 Score': f1})
    results = pd.DataFrame(rows)
    # Plotting the F1 score throughout different probability thresholds
    results.plot(x='Threshold', y='F1 Score')
    plt.title('F1 Score by Probability Cutoff Threshold')
    # idxmax returns the first occurrence of the maximum, matching the original behavior
    best_index = results['F1 Score'].idxmax()
    print('Threshold for Optimal F1 Score:')
    return results.iloc[best_index]
#################################################################################################################
##### Prediction Intervals
# Prediction Intervals - Ensemble Scikit-Learn Models
def ensemble_prediction_intervals(model, X: np.ndarray, X_train=None, y_train=None, percentile: float = 0.95) -> pd.DataFrame:
    """
    Calculates the specified prediction intervals for each prediction
    from an ensemble scikit-learn model.
    Args:
        model:
            The scikit-learn model to create prediction intervals for. This must be
            either a RandomForestRegressor or GradientBoostingRegressor
        X (np.ndarray): The input array to create predictions & prediction intervals for
        X_train (np.ndarray, optional): The training features for the gradient boosted trees
        y_train (np.ndarray, optional): The training label for the gradient boosted trees
        percentile (float): The prediction interval percentile. Default of 0.95 is 0.025 - 0.975
    Note: Use X_train and y_train when using a gradient boosted regressor because a copy of
          the model will be re-trained with quantile loss.
          These are not needed for a random forest regressor
    Returns:
        pd.DataFrame: The predictions and prediction intervals for X
    TODO:
        - Try to optimize by removing loops where possible
        - Make work with xgboost and lightgbm
    """
    # Checking if the model has the estimators_ attribute (i.e. is a fitted ensemble)
    if 'estimators_' not in dir(model):
        print('Not an ensemble model - exiting function')
        return
    # Accumulating lower and upper prediction intervals
    lower_PI = []
    upper_PI = []
    # Generating predictions to be returned with prediction intervals
    print('Generating predictions with the model')
    predictions = model.predict(X)
    # Comparing by class name so this works regardless of sklearn's internal module
    # layout (the original compared against stale paths like sklearn.ensemble.forest.*,
    # which never match modern sklearn)
    model_name = type(model).__name__
    # Half of the probability mass outside the interval, on np.percentile's 0-100 scale
    # (the original passed fractions like 0.025 where 2.5 was intended)
    lower_q = ((1 - percentile) / 2.) * 100
    upper_q = 100 - lower_q
    # Prediction intervals for a random forest regressor: the spread of the
    # individual trees' predictions around each record
    # Adapted from https://blog.datadive.net/prediction-intervals-for-random-forests/
    if model_name == 'RandomForestRegressor':
        print('Generating upper and lower prediction intervals')
        # Looping through individual records for predictions
        for record in range(len(X)):
            # Gathering each estimator's prediction for this record
            estimator_predictions = [estimator.predict(X[record].reshape(1, -1))[0]
                                     for estimator in model.estimators_]
            # Adding prediction intervals
            lower_PI.append(np.percentile(estimator_predictions, lower_q))
            upper_PI.append(np.percentile(estimator_predictions, upper_q))
    # Prediction intervals for gradient boosted trees: re-fit clones with quantile loss
    # Adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
    if model_name == 'GradientBoostingRegressor':
        # Cloning the model so the original version isn't overwritten
        from sklearn.base import clone
        quantile_model = clone(model)
        # alpha is the upper quantile, e.g. 0.975 for a 95% interval
        # (the original computed the buffer from an undefined name `x`)
        alpha = percentile + ((1 - percentile) / 2)
        # Setting the loss function to quantile before re-fitting
        quantile_model.set_params(loss='quantile')
        # Upper prediction interval
        print('Generating upper prediction intervals')
        quantile_model.set_params(alpha=alpha)
        quantile_model.fit(X_train, y_train)
        upper_PI = quantile_model.predict(X)
        # Lower prediction interval
        print('Generating lower prediction intervals')
        quantile_model.set_params(alpha=(1 - alpha))
        quantile_model.fit(X_train, y_train)
        lower_PI = quantile_model.predict(X)
    # Compiling results of prediction intervals and the actual predictions
    results = pd.DataFrame({'lower_PI': lower_PI,
                            'prediction': predictions,
                            'upper_PI': upper_PI})
    return results
#################################################################################################################
##### Ensemble Predictions
# Blending predictions - xgboost
def blend_xgboost_predictions(train_features: np.ndarray, train_labels: np.ndarray, prediction_features: np.ndarray, num_models: int = 3) -> np.ndarray:
    """
    Trains the number of specified xgboost models and averages the predictions.
    Args:
        train_features (np.ndarray): The features for the training dataset
        train_labels (np.ndarray): The labels for the training dataset
        prediction_features (np.ndarray): The features to create predictions for
        num_models (int): The number of models to train
    Returns:
        np.ndarray: Point or class probability predictions
    NOTE(review): xgb.train is deterministic with these fixed params, so the blended
    models may be identical unless a varying seed is added — confirm intent.
    """
    # Auto-detecting if it's a classification problem and setting the objective for the model
    # Adjust the num_classes cutoff if dealing with a high number of classes
    num_classes = len(np.unique(train_labels))
    if num_classes < 50:
        is_classification = 1
        if num_classes == 2:
            objective = 'binary:logistic'
        else:
            objective = 'multi:softprob'
    else:
        is_classification = 0
        # 'reg:linear' was renamed and later removed from xgboost;
        # 'reg:squarederror' is the supported equivalent
        objective = 'reg:squarederror'
    # Creating the prediction object to append results to
    predictions = []
    # Parameters for the model - http://xgboost.readthedocs.io/en/latest/parameter.html
    num_rounds = 100
    params = {'booster': 'gbtree',
              'max_depth': 6,  # Default is 6
              'eta': 0.3,  # Step size shrinkage. Default is 0.3
              'alpha': 0,  # L1 regularization. Default is 0.
              'lambda': 1,  # L2 regularization. Default is 1.
              # Use reg:squarederror for regression
              # Use binary:logistic, or multi:softprob for classification
              'objective': objective
              }
    # Adding the required parameter for num_classes if performing multiclass classification
    if is_classification == 1 and num_classes != 2:
        params['num_class'] = num_classes
    # Creating DMatrix objects from X/y
    D_train = xgb.DMatrix(train_features, label=train_labels)
    D_test = xgb.DMatrix(prediction_features)
    # Progress printing for roughly every 10% of completion; the original used a
    # float modulus (round(num_models) / 10) that misfires for small counts
    progress_interval = max(1, num_models // 10)
    # Training each model and gathering the predictions
    for num_model in range(num_models):
        if (num_model + 1) % progress_interval == 0:
            print('Training model number', num_model + 1)
        # Training the model and gathering predictions
        model = xgb.train(params, D_train, num_rounds)
        model_prediction = model.predict(D_test)
        predictions.append(model_prediction)
    # Averaging the predictions for output
    predictions = np.asarray(predictions).mean(axis=0)
    return predictions
# Blending predictions - Scikit-Learn & LightGBM
def blend_predictions(model, train_features: np.ndarray, train_labels: np.ndarray, prediction_features: np.ndarray,
                      num_models: int = 3, average_results: bool = False) -> np.ndarray:
    """
    Trains the number of specified scikit-learn or LightGBM models and averages the predictions.
    Args:
        model: The estimator to clone and train (must support fit and predict/predict_proba).
        train_features (np.ndarray): The features for the training dataset
        train_labels (np.ndarray): The labels for the training dataset
        prediction_features (np.ndarray): The features to create predictions for
        num_models (int): The number of models to train
        average_results (bool): Whether or not to return the raw results or the averaged results
    Returns:
        np.ndarray: A numpy array of point or class probability predictions
    NOTE(review): clone() copies the model's parameters, including any fixed
    random_state, so blended models may be identical — confirm intent.
    """
    from sklearn.base import clone
    # Auto-detecting if it's a classification problem
    # Adjust the num_classes cutoff if dealing with a high number of classes
    num_classes = len(np.unique(train_labels))
    if num_classes < 50:
        is_classification = 1
    else:
        is_classification = 0
    # Creating the prediction object to append results to
    predictions = []
    # Progress printing for roughly every 10% of completion; the original used a
    # float modulus (round(num_models) / 10) that misfires for small counts
    progress_interval = max(1, num_models // 10)
    # Training each model and gathering the predictions
    for num_model in range(num_models):
        if (num_model + 1) % progress_interval == 0:
            print('Training model number', num_model + 1)
        # Cloning the original model so each iteration trains a fresh copy
        model_iteration = clone(model)
        # Training the model
        model_iteration.fit(train_features, train_labels)
        # Gathering class probabilities for classification, point predictions otherwise
        if is_classification == 1:
            model_prediction = model_iteration.predict_proba(prediction_features)
        else:
            model_prediction = model_iteration.predict(prediction_features)
        predictions.append(model_prediction)
    # Averaging the predictions for output
    if average_results:
        predictions = np.asarray(predictions).mean(axis=0)
    return predictions
#################################################################################################################
##### Evaluating Clusters
def evaluate_k_means(data: np.ndarray, max_num_clusters: int = 10, is_data_scaled: bool = True) -> list:
    """
    Evaluates the K-means clustering algorithm by computing the inertia for different numbers of clusters.
    Plots the elbow curve (inertia vs. number of clusters) for visual inspection.
    Args:
        data (np.ndarray): The input data to be clustered.
        max_num_clusters (int, optional): The maximum number of clusters to consider. Defaults to 10.
        is_data_scaled (bool, optional): Specifies whether the input data is already scaled. Defaults to True.
    Returns:
        list: A list containing the inertia values for each number of clusters (2..max_num_clusters).
    """
    from sklearn.cluster import KMeans
    # Min max scaling the data if it isn't already scaled
    if not is_data_scaled:
        from sklearn.preprocessing import MinMaxScaler
        data = MinMaxScaler().fit_transform(data)
    # For gathering the results and plotting
    inertia = []
    clusters_to_try = np.arange(2, max_num_clusters + 1)
    # Iterating through the cluster counts and gathering the inertia
    for num_clusters in clusters_to_try:
        print('Fitting with {0} clusters'.format(num_clusters))
        # n_jobs was removed from KMeans in scikit-learn 1.0 and would raise a TypeError
        model = KMeans(n_clusters=num_clusters)
        model.fit(data)
        inertia.append(model.inertia_)
    # Plotting the results
    plt.figure(figsize=(10, 7))
    plt.plot(clusters_to_try, inertia, marker='o')
    plt.xticks(clusters_to_try)
    plt.xlabel('# Clusters')
    plt.ylabel('Inertia')
    plt.title('Inertia by Number of Clusters')
    plt.show()
    return inertia
#################################################################################################################
##### Saving & Loading Models
def save_model(model, filepath: str, add_timestamp: bool = True) -> None:
    """
    Saves a machine learning model to a file with pickle.
    Args:
        model: The trained model object to be saved.
        filepath (str): The file path (including the file name and extension) where the model will be saved.
        add_timestamp (bool, optional): Specifies whether to add a timestamp to the file name. Defaults to True.
    Returns:
        None
    """
    import os
    import pickle  # the original called pickle.dump without ever importing it
    # Creating the sub directory if it does not exist
    # (os.path handles bare filenames and platform separators, unlike splitting on '/')
    directory = os.path.dirname(filepath)
    if directory and not os.path.exists(directory):
        print('Creating the directory')
        os.makedirs(directory)
    # Adding the date to the end of the file name
    # E.g. instead of model.pkl, model_yyyymmdd.pkl
    if add_timestamp:
        import datetime
        today = datetime.datetime.today().strftime('%Y%m%d')
        root, extension = os.path.splitext(filepath)
        filepath = root + '_' + today + extension
    print('Saving model')
    # Context manager guarantees the file handle is closed (the original leaked it)
    with open(filepath, 'wb') as model_file:
        pickle.dump(model, model_file)
    print('Model saved')