In [0]:
import pickle
import h5py
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [0]:
# mount Google Drive at /content/gdrive
# note that an authorization code might be requested
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# locations under the Drive mount point: the input features database
# and the output file for the trained model
kraknDir = "/content/gdrive/My Drive/KrakN"
databasePath = kraknDir + "/database/features_s_2.hdf5"
modelPath = kraknDir + "/KrakN_model.cpickle"

In [0]:
# check if the features database is present
# raise instead of calling quit(): quit() is meant for the interactive
# interpreter and, in a notebook, acts on the kernel rather than simply
# stopping this run; an exception halts execution at this cell and leaves
# the runtime (and the Drive mount) intact
if not os.path.exists(databasePath):
    raise FileNotFoundError(
        "Features file at {}\nDoes not exist!".format(databasePath)
    )

In [0]:
# start from a clean slate: delete any model file left over from a
# previous run before a new one is trained
staleModelPresent = os.path.exists(modelPath)
if staleModelPresent:
    os.remove(modelPath)

In [0]:
# specify how many jobs to run in parallel during model selection
# (passed to GridSearchCV as n_jobs); -1 means all processors
jobs = 1

In [0]:
# open the features database read-only
db = h5py.File(databasePath, mode="r")
# index separating the first 75% of the rows (used for training) from
# the remainder (used for testing)
# NOTE(review): the split is purely positional - this assumes the rows were
# shuffled when the database was written; confirm against the extractor
i = int(0.75 * db["labels"].shape[0])

In [0]:
# tune the regularisation strength C of a Logistic Regression classifier
# with a 3-fold cross-validated grid search over the training rows
print("Tuning hyperparameters...")
params = dict(C=[0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0])
model = GridSearchCV(
    estimator=LogisticRegression(max_iter=700),
    param_grid=params,
    cv=3,
    n_jobs=jobs,
    verbose=20,
)
model.fit(db["features"][:i], db["labels"][:i])
print("Best hyperparameter: {}".format(model.best_params_))

In [0]:
# evaluate the tuned model on the held-out rows (index i onward)
print("Evaluating...")
preds = model.predict(db["features"][i:])
# h5py may hand string datasets back as bytes (h5py >= 3 does so for
# variable-length strings), which classification_report cannot format as
# row headings; normalise the class names to str before passing them on
labelNames = [
    name.decode("utf-8") if isinstance(name, bytes) else str(name)
    for name in db["label_names"]
]
print(classification_report(db["labels"][i:], preds, target_names=labelNames))

In [0]:
# save the best estimator found by the grid search to disk
print("Saving model...")
# the with-block guarantees the file is closed even if pickling or
# writing raises part-way through
with open(modelPath, "wb") as f:
    pickle.dump(model.best_estimator_, f)

In [0]:
# close the features database
# (releases the HDF5 file handle held in db)
db.close()