In [1]:
# Single seed reused by the train/test split, the CV splitter and the tuner below.
random_seed = 666

import os, warnings, sys, six

from datetime import datetime

# Set before the remaining imports run.
# NOTE(review): presumably silences TensorFlow's native (C++) logging - confirm
# that a TensorFlow-backed dependency is actually loaded by this notebook.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Register the standalone `six` package under the module path
# 'sklearn.externals.six', so a later `import sklearn.externals.six` resolves to it.
# NOTE(review): presumably needed by a dependency that still imports that
# legacy path - confirm which one.
sys.modules['sklearn.externals.six'] = six

import numpy as np
import pandas as pd

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
)

from dask_ml.model_selection import RandomizedSearchCV

from project.utils import (
    custome_read_data, 
    make_metrics_for_tuning, 
    evalute_test_data,
    save_results
)

from project.models import (
    get_encoders,
    get_ml_algo,
    get_oversampler,
    get_imputer,
)

from project.pipelines import make_main_pipeline

# Use fully-qualified option names: the abbreviated "max_rows"/"max_columns"
# patterns are ambiguous on recent pandas (they also match the
# "styler.render.max_*" options) and raise OptionError there.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

#### Reading data

In [2]:
# Load the dataset bundle through the project helper; only the "data" entry
# is used in this notebook.
data_dict = custome_read_data()

# NOTE(review): later cells call `.compute()` on this object, so it is
# presumably a lazy (dask) DataFrame - confirm against custome_read_data.
h1n1_data = data_dict["data"]

#### Examine class label imbalance 

In [3]:
# Materialise the label column once and reuse it for both class shares;
# previously the column was computed twice and the lazy frame's length
# was evaluated twice as well.
labels = h1n1_data["label"].compute()
label_counts = labels.value_counts()
n_rows = len(labels)

# (label value, description, trailing text) for each printed line.
for class_value, description, suffix in (
    (0, "Dont receive vaccine", "% of the dataset\n"),
    (1, "Receive vaccine", "% of the dataset"),
):
    print(description, round(label_counts[class_value] / n_rows * 100, 2), suffix)

Dont receive vaccine 78.75 % of the dataset

Receive vaccine 21.25 % of the dataset


#### Splitting the data into train and test sets

In [4]:
# Pull the whole table into memory and work on a private copy of it.
h1n1_copy = h1n1_data.compute().copy()

# Seeded, shuffled 70/30 hold-out split, stratified on the label column so
# both partitions keep the same class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    h1n1_copy.drop(columns="label"),
    h1n1_copy["label"],
    stratify=h1n1_copy["label"],
    test_size=0.30,
    shuffle=True,
    random_state=random_seed,
)

# Independent copies of the training partition, used by the tuning run.
x_train_copy, y_train_copy = x_train.copy(), y_train.copy()

# Per-class counts, used below to report the label distribution.
train_unique_label, train_counts_label = np.unique(y_train, return_counts=True)
test_unique_label, test_counts_label = np.unique(y_test, return_counts=True)

print("TRAIN DATA SHAPE: ", x_train.shape, "\n")
print("TEST DATA SHAPE: ", x_test.shape, "\n")

print("Label Distributions: \n")
print("Train :", train_counts_label / len(y_train), "\n")
print("Test :", test_counts_label / len(y_test))

TRAIN DATA SHAPE:  (18694, 35) 

TEST DATA SHAPE:  (8013, 35) 

Label Distributions: 

Train : [0.78752541 0.21247459] 

Test : [0.78759516 0.21240484]


In [5]:
def _prefixed(step_prefix, hyperparameters):
    """Return a copy of ``hyperparameters`` with ``step_prefix`` prepended to every key."""
    return {step_prefix + str(name): values for name, values in hyperparameters.items()}


def run(
    scores,
    ml_models_dict,
    imputer_dict,
    oversampler_dict,
    encoders_dict,
    x_train_run,
    y_train_run,
    x_test_run,
    y_test_run,
    n_folds,
    randomized_iteration,
):
    """Tune, evaluate and persist every model/imputer/oversampler/encoder combination.

    For each combination a pipeline is built with ``make_main_pipeline``, tuned
    with ``RandomizedSearchCV`` (refit on the ``"f1"`` metric), used to predict
    the test data, and the outcome is handed to ``save_results``.

    Parameters
    ----------
    scores
        Passed unchanged as ``scoring`` to ``RandomizedSearchCV``. Because the
        tuner is refit on ``"f1"``, it is expected to define that metric.
    ml_models_dict, imputer_dict, oversampler_dict, encoders_dict
        Mappings ``name -> {"model": <component>, "hyperparameters": {param: values}}``.
        Each ``values`` must support ``len()`` because it is used to compute
        the search-space size.
    x_train_run, y_train_run
        Training features/labels the tuner is fitted on.
    x_test_run, y_test_run
        Hold-out features/labels used for the final evaluation.
    n_folds
        Number of stratified CV folds.
    randomized_iteration
        Number of candidates sampled per configuration (``n_iter``).

    Returns
    -------
    The fitted tuner with the highest ``best_score_`` across all
    configurations, or ``None`` when no configuration was run (any of the
    dictionaries is empty).

    Notes
    -----
    Relies on the module-level ``random_seed`` for the CV splitter and tuner.
    The hyperparameter prefixes assume the step names used by
    ``make_main_pipeline`` ("classifying", "oversampling", "imputing",
    "preprocessing" -> "categorical" -> "encoding") - TODO confirm.
    """
    # The splitter only depends on n_folds and the seed, so build it once
    # instead of once per configuration.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_seed)

    best_tuner = None

    for model_name, model_hps in ml_models_dict.items():

        for imputer_name, imputer_hps in imputer_dict.items():

            for over_name, oversampler_hps in oversampler_dict.items():

                for encoder_name, encoder_hps in encoders_dict.items():

                    pipeline = make_main_pipeline(
                        model_hps["model"],
                        imputer_hps["model"],
                        encoder_hps["model"],
                        oversampler_hps["model"],
                    )

                    # Creating search space: every hyperparameter is addressed
                    # through the name of the pipeline step it belongs to.
                    search_space = {
                        **_prefixed("classifying__", model_hps["hyperparameters"]),
                        **_prefixed("oversampling__", oversampler_hps["hyperparameters"]),
                        **_prefixed("imputing__", imputer_hps["hyperparameters"]),
                        **_prefixed(
                            "preprocessing__categorical__encoding__",
                            encoder_hps["hyperparameters"],
                        ),
                    }

                    # Calculating search space size (product of candidate counts)
                    search_space_size = 1
                    for candidates in search_space.values():
                        search_space_size *= len(candidates)

                    # Defining RandomizedSearch on search space
                    randomized_tuner = RandomizedSearchCV(
                        pipeline,
                        search_space,
                        n_iter=randomized_iteration,
                        n_jobs=-1,
                        random_state=random_seed,
                        cv=skf,
                        refit="f1",
                        scoring=scores,
                        return_train_score=True,
                    )

                    # Adjacent literals instead of a backslash continuation:
                    # the continuation embedded the next line's indentation
                    # in the printed message.
                    print(
                        "Fitting following config with RandomizedSearch : \n"
                        "model : {0} \noversampller : {1} \n"
                        "Imputer : {2} \nencoder : {3}\n".format(
                            model_name, over_name, imputer_name, encoder_name
                        ),
                    )

                    print("Training....\n")
                    start_time = datetime.now()

                    randomized_tuner.fit(x_train_run, y_train_run)

                    # Stop the clock right after fitting so console I/O is
                    # not included in the reported duration.
                    end_time = datetime.now()
                    print("Training is done.\n")

                    total_time = end_time - start_time
                    print("Execution time: {}".format(total_time))

                    # Making prediction on test data
                    y_pred_run = randomized_tuner.predict(x_test_run)
                    test_results = evalute_test_data(y_test_run, y_pred_run)

                    save_results(
                        tuner=randomized_tuner,
                        total_exe_time=total_time,
                        search_spcace_size=search_space_size,
                        model_name=model_name,
                        over_name=over_name,
                        imputer_name=imputer_name,
                        encoder_name=encoder_name,
                        test_results=test_results,
                    )

                    # Keep the tuner with the best CV score on the refit metric.
                    if (
                        best_tuner is None
                        or randomized_tuner.best_score_ > best_tuner.best_score_
                    ):
                        best_tuner = randomized_tuner

                print(50 * "-")

    print("All configs have run successfully")

    return best_tuner


In [6]:
# Select the components for this experiment. Each project factory takes a list
# of names; the results are passed to `run` below as the scoring definition and
# the encoder / imputer / oversampler / model dictionaries.
# NOTE(review): `run` iterates these as name -> {"model", "hyperparameters"}
# mappings - confirm the factories return that shape.
metrics = make_metrics_for_tuning()
encoders = get_encoders(["BackwardDifferenceEncoder"])
imputers = get_imputer(["KnnImputer"])
oversamplers = get_oversampler(["SMOTE"])
models = get_ml_algo(["XGB"])

In [7]:
# Launch the search with a minimal budget: 2 CV folds and a single sampled
# candidate per configuration.
best = run(
    scores=metrics,
    ml_models_dict=models,
    imputer_dict=imputers,
    oversampler_dict=oversamplers,
    encoders_dict=encoders,
    x_train_run=x_train_copy,
    y_train_run=y_train_copy,
    x_test_run=x_test,
    y_test_run=y_test,
    n_folds=2,
    randomized_iteration=1,
)

Fitting following config with RandomizedSearch : 
model : XGB                             
oversampller : SMOTE 
Imputer : KnnImputer 
encoder : BackwardDifferenceEncoder

Training....

