In [0]:
import pickle
import h5py
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [3]:
# mount Google Drive at /content/gdrive
# note that an authorization code might be required
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# set the paths used by the rest of the notebook (both live on the mounted Drive):
#   databasePath - HDF5 features database that is read for training / testing
#   modelPath    - file the trained model is pickled to
databasePath = r"/content/gdrive/My Drive/KrakN/database/features_s_2.hdf5"
modelPath = r"/content/gdrive/My Drive/KrakN/KrakN_model.cpickle"

In [0]:
# check if the features database is present
# Raise instead of calling quit(): inside a notebook quit() shuts the kernel
# down (losing all state) rather than simply failing this cell, whereas an
# exception stops "Run all" at this point with a readable message.
# isfile() is used so that a directory at this path is rejected as well.
if not os.path.isfile(databasePath):
    raise FileNotFoundError(
        "Features file at {}\nDoes not exist!".format(databasePath)
    )

In [0]:
# check if the model file exists
# An old file is NOT deleted here: removing it before training would leave no
# model at all if tuning fails or is interrupted. The save step opens the file
# in "wb" mode, which overwrites the old model once a new one is available.
if os.path.exists(modelPath):
    print("Existing model at {}\nwill be overwritten when the new model is saved".format(modelPath))

In [0]:
# specify how many jobs to run in parallel during model selection
# (passed as n_jobs to GridSearchCV below)
# -1 means all processors
jobs = 1

In [0]:
# open the features database in read-only mode
db = h5py.File(databasePath, mode="r")
# the first 75% of the samples are used for training and the remainder for
# testing; i is the index at which the split happens
sampleCount = db["labels"].shape[0]
i = int(sampleCount * 0.75)

In [0]:
# tune the regularization strength C of a Logistic Regression classifier
# with a 3-fold cross-validated grid search
print("Tuning hyperparameters...")
params = dict(C=[0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0])
classifier = LogisticRegression(max_iter=700)
model = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    cv=3,
    n_jobs=jobs,
    verbose=20,
)
# fit on the training part only (samples before the split index i)
trainFeatures = db["features"][:i]
trainLabels = db["labels"][:i]
model.fit(trainFeatures, trainLabels)
print("Best hyperparameter: {}".format(model.best_params_))

Tuning hyperparameters...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=0.1, score=0.976, total= 1.4min
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV] ............................... C=0.1, score=0.968, total= 1.3min
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.8min remaining:    0.0s


[CV] ............................... C=0.1, score=0.969, total= 1.2min
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.9min remaining:    0.0s


[CV] ............................... C=1.0, score=0.975, total= 1.7min
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.6min remaining:    0.0s


[CV] ............................... C=1.0, score=0.968, total= 1.4min
[CV] C=1.0 ...........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.0min remaining:    0.0s


[CV] ............................... C=1.0, score=0.968, total= 1.5min
[CV] C=10.0 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.4min remaining:    0.0s


[CV] .............................. C=10.0, score=0.974, total= 1.7min
[CV] C=10.0 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 10.2min remaining:    0.0s


[CV] .............................. C=10.0, score=0.968, total= 1.2min
[CV] C=10.0 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 11.4min remaining:    0.0s


[CV] .............................. C=10.0, score=0.966, total= 1.5min
[CV] C=100.0 .........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 12.8min remaining:    0.0s


[CV] ............................. C=100.0, score=0.973, total= 1.3min
[CV] C=100.0 .........................................................


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 14.2min remaining:    0.0s


[CV] ............................. C=100.0, score=0.967, total= 1.3min
[CV] C=100.0 .........................................................


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed: 15.5min remaining:    0.0s


In [0]:
# evaluate model on the held-out samples (everything from index i onwards)
print("Evaluating...")
preds = model.predict(db["features"][i:])
# h5py >= 3 returns variable-length strings as bytes, which
# classification_report may fail to format as row labels; read the names
# into memory and decode them to str before building the report
labelNames = [
    name.decode("utf-8") if isinstance(name, bytes) else str(name)
    for name in db["label_names"][:]
]
print(classification_report(db["labels"][i:], preds, target_names=labelNames))

In [0]:
# save model to disk
print("Saving model...")
# serialize first, so a pickling error cannot truncate an existing model file
serializedModel = pickle.dumps(model.best_estimator_)
# the context manager closes the file even if the write fails
with open(modelPath, "wb") as f:
    f.write(serializedModel)

In [0]:
# close the features database opened earlier;
# db must not be read from after this cell has run
db.close()