In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas import read_csv
from sklearn.model_selection import train_test_split

# Read forest_balanced.csv from the home directory, then display (rows, columns).
data = pd.read_csv(r"~/forest_balanced.csv")
data.shape

In [None]:
# Features are the first 11 columns; the target is the column at position 11.
X = data.iloc[:, 0:11]
Y = data.iloc[:, 11]

# Hold-out split read by the final-model cells further down
# (xtrain / xtest / ytrain / ytest). The 70/30 ratio mirrors the one used inside
# the tuning objectives, and the fixed seed keeps the split identical across
# kernel restarts.
# NOTE(review): no visible cell defined these names; confirm the ratio and seed
# against the split that was originally used.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

Y

In [None]:
import optuna
import catboost as cb
from sklearn.metrics import accuracy_score
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and return its hold-out accuracy.

    Args:
        trial: the ``optuna`` trial object used to sample hyperparameters.

    Returns:
        Accuracy of the fitted model on a 30% hold-out of the notebook-level
        ``X`` / ``Y``.
    """
    # Fixed seed: every trial is scored on the same split, so differences in
    # accuracy come from the hyperparameters and not from split luck.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, Y, test_size=0.3, random_state=42
    )

    param = {
        "model_size_reg": trial.suggest_float("model_size_reg", 2, 6),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2, 7),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "loss_function": "MultiClass",
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }

    # These two knobs are only sampled for the bootstrap type they belong to.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    model = cb.CatBoostClassifier(**param)

    # NOTE(review): the validation split is also handed to fit() as eval_set, so
    # the score below is measured on data the training call has seen — confirm
    # this is intended.
    model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0)

    # predict() is used as class labels directly: no rounding (which also fails
    # on non-numeric labels); ravel() flattens a possible (n, 1) column to 1-D.
    pred_labels = np.asarray(model.predict(valid_x)).ravel()
    return accuracy_score(valid_y, pred_labels)


if __name__ == "__main__":
    # Run a 300-trial search that maximises the value returned by `objective`.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=300)

    # Report how many trials were recorded, then the best one found.
    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")

    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print("    {}: {}".format(param_name, param_value))

In [None]:
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate

# Plot the optimization history of `study`; when the notebook is run top to
# bottom this is the study created by the tuning cell directly above.
plot_optimization_history(study)

In [None]:
# Final CatBoost model: fit on the training split, then print train/test
# accuracy and the classification report, and draw a row-normalised confusion matrix.
from catboost import CatBoostClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

CB = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.1,
    depth=7,
    l2_leaf_reg=3,
    model_size_reg=2,
    loss_function='MultiClassOneVsAll',
    verbose=250,
    eval_metric='Accuracy',
    task_type='GPU',
)

# NOTE(review): xtrain / xtest / ytrain / ytest are not created in this cell and
# must already exist when it runs.
# NOTE(review): the test split is also passed as eval_set, so the "Test
# Accuracy" below is not measured on data unseen by fit() — confirm intended.
# NOTE: binding the result to `cb` shadows the `catboost as cb` module alias
# imported by the tuning cell; re-running that cell restores the alias.
cb = CB.fit(xtrain, ytrain, eval_set=(xtest, ytest))
cb_trscore = cb.score(xtrain, ytrain)
ypred_cb = cb.predict(xtest)

print("cb Train Accuracy:", cb_trscore)
print("cb Test Accuracy:", accuracy_score(ytest, ypred_cb))
print('----------------------------------------')
print('* cb Classification Report')
print(classification_report(ytest, ypred_cb))
print('----------------------------------------')
print('* cb Confusion Matrix')

# Raw counts, plus a copy in which every row (true class) sums to 1.
# normalize="true" lets scikit-learn do the row division instead of the
# hand-written cm / cm.sum(axis=1).
cm = confusion_matrix(ytest, ypred_cb)
cmn = confusion_matrix(ytest, ypred_cb, normalize="true")

# `sns` is never imported in the visible cells, so the heat-map is drawn with
# the ConfusionMatrixDisplay this cell already imports.
class_labels = ["C1", "C2", "C3", "C4", "C5", "C6", "C7"]
ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=class_labels).plot(
    cmap='RdPu', values_format='.2f'
);


In [None]:
#XGBoost optimization

import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score


def objective(trial):
    """Optuna objective: train one native XGBoost booster and return hold-out accuracy.

    Note: this reuses the name ``objective`` and therefore replaces the CatBoost
    objective defined in an earlier cell.

    Args:
        trial: the ``optuna`` trial object used to sample hyperparameters.

    Returns:
        Accuracy of the trained booster on a 30% hold-out of the notebook-level
        ``X`` / ``Y``.
    """
    # multi:softmax expects labels 0..num_class-1. Encoding here makes that hold
    # for any label set and lets the class count come from the data instead of
    # a hard-coded 7. Accuracy is unaffected by the relabelling.
    classes, encoded_y = np.unique(Y, return_inverse=True)

    # Fixed seed: every trial is scored on the same split, so trials are comparable.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, encoded_y, test_size=0.3, random_state=42
    )
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(valid_x, label=valid_y)

    param = {
        # Set explicitly: without it xgb.train does not use a multiclass
        # objective, and predict() would not return class ids.
        "objective": "multi:softmax",
        "num_class": len(classes),
        "verbosity": 0,
        # use exact for small dataset.
        "tree_method": trial.suggest_categorical("tree_method", ["exact", "approx"]),
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "max_depth": trial.suggest_int("max_depth", 1, 120),
    }

    # The native xgb.train API takes the number of boosting rounds as its
    # num_boost_round argument; an "n_estimators" entry inside params is not
    # used. The trial parameter keeps its original name.
    num_boost_round = trial.suggest_int("n_estimators", 1, 2000)

    if param["booster"] in ["gbtree", "dart"]:
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    bst = xgb.train(param, dtrain, num_boost_round=num_boost_round)

    # With multi:softmax the booster predicts class ids stored as floats; cast
    # them to int rather than rounding regression-style outputs.
    pred_labels = np.asarray(bst.predict(dvalid)).astype(int)
    return sklearn.metrics.accuracy_score(valid_y, pred_labels)


if __name__ == "__main__":
    # Maximise the value returned by `objective`; stop after 240 trials or
    # 600 seconds, whichever limit is hit first.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=240, timeout=600)

    # Report how many trials were recorded, then the best one found.
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")

    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for param_name, param_value in trial.params.items():
        print("    {}: {}".format(param_name, param_value))

In [None]:
# Final XGBoost model: fit on the training split, then print train/test
# accuracy and the classification report, and draw a row-normalised confusion matrix.
import xgboost
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

XGB = xgboost.XGBClassifier(
    objective="multi:softprob",
    max_depth=9,
    n_estimators=250,
    booster='gbtree',
    min_child_weight=0.9,
    subsample=0.5,
)

# NOTE(review): xtrain / xtest / ytrain / ytest are not created in this cell and
# must already exist when it runs.
# NOTE(review): recent XGBClassifier releases expect class labels 0..n-1 —
# confirm the encoding of ytrain.
# NOTE: binding the result to `xgb` shadows the `xgboost as xgb` module alias
# imported by the tuning cell; re-running that cell restores the alias.
xgb = XGB.fit(xtrain, ytrain)
xgb_trscore = xgb.score(xtrain, ytrain)
ypred_xgb = xgb.predict(xtest)

print("xgb Train Accuracy:", xgb_trscore)
print("xgb Test Accuracy:", accuracy_score(ytest, ypred_xgb))
print('----------------------------------------')
print('* xgb Classification Report')
print(classification_report(ytest, ypred_xgb))
print('----------------------------------------')
print('* xgb Confusion Matrix')

# Raw counts, plus a copy in which every row (true class) sums to 1.
# normalize="true" lets scikit-learn do the row division instead of the
# hand-written cm / cm.sum(axis=1).
cm = confusion_matrix(ytest, ypred_xgb)
cmn = confusion_matrix(ytest, ypred_xgb, normalize="true")

# `sns` is never imported in the visible cells, so the heat-map is drawn with
# the ConfusionMatrixDisplay this cell already imports.
class_labels = ["C1", "C2", "C3", "C4", "C5", "C6", "C7"]
ConfusionMatrixDisplay(confusion_matrix=cmn, display_labels=class_labels).plot(
    cmap='RdPu', values_format='.2f'
);

