In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
from tqdm import tqdm_notebook

In [2]:
# https://archive.ics.uci.edu/ml/datasets/iris
path = "./data/iris.data"#

In [3]:
df = pd.read_csv(path, header=0)
dc = {"Iris-setosa": 0, "Iris-versicolor":1, "Iris-virginica":2}
df["class"].replace(dc, inplace=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
features = df.drop("class", axis=1)
labels = df["class"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.7, random_state=42)

In [18]:
def train_CV(X, Y, par, mod):
    start = time.time()
    model = mod
    clf = GridSearchCV(model, par, n_jobs=-1, cv=3, verbose=1)
    clf = clf.fit(X, Y)
    print("best train accuracy: ", clf.best_score_)
    print(clf.best_params_)
    end = time.time()
    elapsed = end - start
    print("training time: ", elapsed)
    return clf

def test_scr(model):
    start = time.time()
    for i in tqdm_notebook(range(5000)):
        pred = model.predict(X_test)
    f1 = f1_score(pred, y_test, average="weighted")
    recall = recall_score(pred, y_test, average="weighted")
    prec = precision_score(pred, y_test, average="weighted")
    print("F1: ", f1, "; Recall: ", recall, "; Precision: ", prec, "; Acc: ", np.mean(pred==y_test) )
    end = time.time()
    elapsed = end - start
    print("predicting 5.000 times: ", elapsed)
    return pred

In [19]:
par = {
    "class_weight" : [None, "balanced"],
    "solver" : ["newton-cg", "sag", "saga", "lbfgs"],
    "multi_class" : ["auto", "ovr", "multinomial"],
    "C" : np.arange(0.01,1,0.01)
}
mod = LogisticRegression(n_jobs = -1,random_state=123, max_iter=20000)
log_reg = train_CV(X_train, y_train, par, mod)
pred_LR = test_scr(log_reg)

Fitting 3 folds for each of 2376 candidates, totalling 7128 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   27.2s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 3456 tasks      | elapsed:   49.7s
[Parallel(n_jobs=-1)]: Done 5156 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 7056 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 7113 out of 7128 | elapsed:  1.6min remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 7128 out of 7128 | elapsed:  1.6min finished


best train accuracy:  0.9333333333333333
{'C': 0.05, 'class_weight': 'balanced', 'multi_class': 'auto', 'solver': 'newton-cg'}
training time:  101.83012533187866


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


F1:  0.8679166666666666 ; Recall:  0.8666666666666667 ; Precision:  0.8768155911013055 ; Acc:  0.8666666666666667
predicting 5.000 times:  2.36970853805542


In [20]:
par = {
    "class_weight" : [None, "balanced", "balanced_subsample"],
    "criterion" : ["gini", "entropy"],
    "min_samples_split" : [2,3,4,5,6,7, 8],
    "min_samples_leaf" : np.arange(1,20,1),
    "max_features" : ["auto", "sqrt", "log2"]
}
mod = RandomForestClassifier(n_jobs = -1,random_state=123)
RF = train_CV(X_train, y_train, par, mod)

Fitting 3 folds for each of 2394 candidates, totalling 7182 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   51.6s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  2.8min


best train accuracy:  0.9333333333333333
{'class_weight': 'balanced', 'criterion': 'gini', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2}
training time:  200.7949938774109


[Parallel(n_jobs=-1)]: Done 7182 out of 7182 | elapsed:  3.3min finished


In [21]:
pred_RF = test_scr(RF)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


F1:  0.9428571428571428 ; Recall:  0.9428571428571428 ; Precision:  0.9438279095421952 ; Acc:  0.9428571428571428
predicting 5.000 times:  534.1122047901154


In [22]:
par = {
    "class_weight" : [None, "balanced"],
    "penalty" : ["l2"],
    "loss" : ["hinge", "squared_hinge"],
    "C" : np.arange(0.1, 1, 0.003),
    "multi_class" : ["ovr", "crammer_singer"],
    "class_weight" : ["balanced", None]
}
mod = LinearSVC(random_state=123, max_iter=80000000)
SVM = train_CV(X_train, y_train, par, mod)
pred_SVM = test_scr(SVM)

Fitting 3 folds for each of 2400 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:    6.8s


best train accuracy:  0.9555555555555556
{'C': 0.9010000000000007, 'class_weight': 'balanced', 'loss': 'hinge', 'multi_class': 'crammer_singer', 'penalty': 'l2'}
training time:  15.149875402450562


[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed:   15.0s finished
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=5000.0), HTML(value='')))


F1:  0.9809818664307829 ; Recall:  0.9809523809523809 ; Precision:  0.9820408163265306 ; Acc:  0.9809523809523809
predicting 5.000 times:  2.5108327865600586
