In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Process data

In [None]:
def get_data():
    """Read the train/test CSV files and split each into features and label.

    For both files, every column except the last is returned as the feature
    frame, and the last column is returned as a single-column DataFrame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_df, Y_train_df, X_test_df, Y_test_df)``.
    """

    def _features_and_label(frame):
        # Everything but the final column is a feature; the final column is
        # kept as a one-column DataFrame (not a Series).
        return frame.iloc[:, :-1], frame.iloc[:, -1:]

    test_frame = pd.read_csv(r'test_df_after_fs_100.csv')
    train_frame = pd.read_csv(r'train_df_after_fs_100.csv')

    train_features, train_label = _features_and_label(train_frame)
    test_features, test_label = _features_and_label(test_frame)

    return train_features, train_label, test_features, test_label


# Load the splits once at module level; the objective functions below read
# these four names as globals.
X_train_df, Y_train_df, X_test_df, Y_test_df = get_data()

In [None]:
pip install optuna

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

# KNN Tuning

In [None]:
import optuna
from sklearn.neighbors import KNeighborsClassifier




def knn_objective(trial):
    """Optuna objective: fit a KNN classifier with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``algorithm``, ``k_n_neighbors``, ``weights``,
        ``p`` and ``leaf_size``.

    Returns
    -------
    float
        Accuracy of the fitted classifier on ``X_test_df`` / ``Y_test_df``.

    Notes
    -----
    Reads the module-level ``X_train_df``, ``Y_train_df``, ``X_test_df`` and
    ``Y_test_df``.
    NOTE(review): the score is computed on the same test split that drives the
    search, so the best value is likely optimistic; consider a separate
    validation split or cross-validation.
    """
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    n_neighbors = trial.suggest_int("k_n_neighbors", 2, 10, log=True)
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    p = trial.suggest_categorical("p", [1, 2])
    # No trailing comma here: the value must stay an int, not a 1-tuple.
    leaf_size = trial.suggest_int("leaf_size", 2, 16, step=4)

    clf = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        algorithm=algorithm,
        weights=weights,
        p=p,
        leaf_size=leaf_size,  # previously sampled but never passed to the model
        n_jobs=-1,
    )
    # The labels are stored as a one-column DataFrame; flatten to 1-D for fit().
    clf.fit(X_train_df, np.ravel(Y_train_df))

    # predict() already returns class labels, so no rounding is needed.
    preds = clf.predict(X_test_df)
    return metrics.accuracy_score(np.ravel(Y_test_df), preds)

if __name__ == "__main__":
    # Seed the sampler so that re-running the cell proposes the same sequence
    # of hyperparameters (TPE is Optuna's default sampler, so only the seed is new).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(knn_objective, n_trials=128, timeout=360000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# XGBoost Tuning

In [None]:
"""
Optuna example that optimizes an XGBoost classifier configuration for the
train/test splits loaded by get_data().
In this example, we optimize the accuracy of a 10-class classifier
(num_class=10) on the test split. We optimize both the choice of booster
model and its hyperparameters.
"""

import numpy as np
import optuna

import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


def objective(trial):
    """Optuna objective: train an XGBoost booster with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the booster type and its hyperparameters.

    Returns
    -------
    float
        Accuracy of the trained booster on ``X_test_df`` / ``Y_test_df``.

    Notes
    -----
    Reads the module-level ``X_train_df``, ``Y_train_df``, ``X_test_df`` and
    ``Y_test_df``.
    NOTE(review): the test split is used both for early stopping and for the
    reported accuracy, so the best value is likely optimistic.
    """
    dtrain = xgb.DMatrix(X_train_df, label=Y_train_df)
    dvalid = xgb.DMatrix(X_test_df, label=Y_test_df)

    param = {
        "verbosity": 0,
        "objective": "multi:softmax",
        "num_class": 10,
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # These two control the training loop rather than the booster, so they are
    # passed to xgb.train() as keyword arguments instead of living in `param`,
    # where the native API does not use them to set the number of rounds.
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 8, 32)
    n_estimators = trial.suggest_categorical("n_estimators", [16, 32, 64, 96])

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(
        param,
        dtrain,
        num_boost_round=n_estimators,
        # Early stopping needs an evaluation set to monitor.
        evals=[(dvalid, "validation")],
        early_stopping_rounds=early_stopping_rounds,
        # Do not print the evaluation metric for every boosting round.
        verbose_eval=False,
    )
    preds = bst.predict(dvalid)
    # multi:softmax predictions come back as floats holding class indices.
    pred_labels = np.rint(preds)
    return sklearn.metrics.accuracy_score(Y_test_df, pred_labels)


if __name__ == "__main__":
    # Seed the sampler so that re-running the cell proposes the same sequence
    # of hyperparameters (TPE is Optuna's default sampler, so only the seed is new).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=128, timeout=360000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def rf_objective(trial: "optuna.trial.Trial"):
    """Optuna objective: fit a random forest with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``,
        ``min_samples_split`` and ``min_samples_leaf``.

    Returns
    -------
    float
        Accuracy of the fitted forest on ``X_test_df`` / ``Y_test_df``.

    Notes
    -----
    Reads the module-level ``X_train_df``, ``Y_train_df``, ``X_test_df`` and
    ``Y_test_df``.
    NOTE(review): the score is computed on the same test split that drives the
    search, so the best value is likely optimistic.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 8, 128),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_samples_split': trial.suggest_int('min_samples_split', 4, 32),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 16),
    }

    clf = RandomForestClassifier(random_state=42, **params)
    # The labels are stored as a one-column DataFrame; flatten to 1-D for fit().
    clf.fit(X_train_df, np.ravel(Y_train_df))

    # predict() already returns class labels, so no rounding is needed.
    preds = clf.predict(X_test_df)
    return metrics.accuracy_score(np.ravel(Y_test_df), preds)

if __name__ == "__main__":
    # Seed the sampler so that re-running the cell proposes the same sequence
    # of hyperparameters (TPE is Optuna's default sampler, so only the seed is new).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(rf_objective, n_trials=128, timeout=360000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# Decision Tree

In [None]:
# Show the installed scikit-learn version.
import sklearn
print(sklearn.__version__)

In [None]:
from sklearn.tree import DecisionTreeClassifier


def dt_objective(trial: "optuna.trial.Trial"):
    """Optuna objective: fit a decision tree with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the splitter, criterion and the tree-size and
        regularisation limits listed in ``params`` below.

    Returns
    -------
    float
        Accuracy of the fitted tree on ``X_test_df`` / ``Y_test_df``.

    Notes
    -----
    Reads the module-level ``X_train_df``, ``Y_train_df``, ``X_test_df`` and
    ``Y_test_df``.
    NOTE(review): the score is computed on the same test split that drives the
    search, so the best value is likely optimistic.
    """
    params = {
        "splitter": trial.suggest_categorical("splitter", ["best", "random"]),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int('max_depth', 4, 16),
        "min_samples_split": trial.suggest_int('min_samples_split', 4, 32),
        "min_samples_leaf": trial.suggest_int('min_samples_leaf', 2, 16),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 1e-8, 0.5, log=True),
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 1e-8, 1.0, log=True),
        "max_leaf_nodes": trial.suggest_int('max_leaf_nodes', 4, 32),
    }

    # Fix the seed (same value as the random-forest objective) so that a given
    # set of hyperparameters always yields the same tree and the same score.
    clf = DecisionTreeClassifier(random_state=42, **params)

    # The labels are stored as a one-column DataFrame; flatten to 1-D for fit().
    clf.fit(X_train_df, np.ravel(Y_train_df))

    # predict() already returns class labels, so no rounding is needed.
    preds = clf.predict(X_test_df)
    return metrics.accuracy_score(np.ravel(Y_test_df), preds)

if __name__ == "__main__":
    # Seed the sampler so that re-running the cell proposes the same sequence
    # of hyperparameters (TPE is Optuna's default sampler, so only the seed is new).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(dt_objective, n_trials=128, timeout=360000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# SVM

In [None]:
from sklearn.svm import SVC


def svm_objective(trial):
    """Optuna objective: fit an SVC with sampled ``C`` and kernel and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``C`` (log scale between 1 and 100) and ``kernel``.

    Returns
    -------
    float
        Accuracy of the fitted classifier on ``X_test_df`` / ``Y_test_df``.

    Notes
    -----
    Reads the module-level ``X_train_df``, ``Y_train_df``, ``X_test_df`` and
    ``Y_test_df``.
    NOTE(review): the score is computed on the same test split that drives the
    search, so the best value is likely optimistic.
    """
    # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
    # the same log-scaled range and matches the other objectives in this file.
    svc_c = trial.suggest_float('C', 1e0, 1e2, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf'])

    clf = SVC(C=svc_c, kernel=kernel)
    # The labels are stored as a one-column DataFrame; flatten to 1-D for fit().
    clf.fit(X_train_df, np.ravel(Y_train_df))

    # predict() already returns class labels, so no rounding is needed.
    preds = clf.predict(X_test_df)
    return metrics.accuracy_score(np.ravel(Y_test_df), preds)

if __name__ == "__main__":
    # Seed the sampler so that re-running the cell proposes the same sequence
    # of hyperparameters (TPE is Optuna's default sampler, so only the seed is new).
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(svm_objective, n_trials=128, timeout=36000000)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))