In [None]:
import sys
from pathlib import Path
PROJECT_ROOT = Path().resolve().parents[0]
sys.path.insert(0, str(PROJECT_ROOT))
from utils import show_classification_report, random_over_sample, random_under_sample, smote_over_sample, tomek_under_sample, SampledDataset

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import pandas as pd
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real


# Load Dataset 

In [4]:
# Load the merged dataset and discard the raw coordinates (silently skipped if absent).
dataset = pd.read_parquet(r'../local_dataset/dataset/merged_full2.parquet')
dataset = dataset.drop(columns=['lon', 'lat'], errors='ignore')

# A row without a label cannot be used for supervised learning. Mean-imputing the
# target (as a blanket numeric fill would do) would invent fractional class values,
# so such rows are removed instead. `.copy()` gives an independent frame to assign into.
dataset = dataset.dropna(subset=['fire']).copy()

dataset_numeric_columns = dataset.select_dtypes(include=['number']).columns
dataset_string_columns = dataset.select_dtypes(include=['object']).columns

# Impute features only: numeric columns with the column mean, string columns with the
# most frequent value. Both steps are skipped when there is nothing to impute, because
# `mode().iloc[0]` raises IndexError on a frame without columns.
# NOTE(review): these statistics are computed on the full dataset, i.e. before any
# train/test split, so test rows influence the imputed values — consider fitting the
# imputation on the training split only.
feature_numeric_columns = dataset_numeric_columns.drop('fire', errors='ignore')
if len(feature_numeric_columns) > 0:
    dataset[feature_numeric_columns] = dataset[feature_numeric_columns].fillna(
        dataset[feature_numeric_columns].mean()
    )
if len(dataset_string_columns) > 0:
    dataset[dataset_string_columns] = dataset[dataset_string_columns].fillna(
        dataset[dataset_string_columns].mode().iloc[0]
    )

# Sanity check: remaining missing values per column.
print(dataset.isnull().sum())

# Feature matrix: everything except the target; string features become ordinal codes.
X_df = dataset.drop(columns='fire')
X_string_columns = X_df.select_dtypes(include=['object']).columns
if len(X_string_columns) > 0:  # the encoder rejects an input with zero columns
    X_df[X_string_columns] = OrdinalEncoder().fit_transform(X_df[X_string_columns])

# Target vector.
Y_df = dataset['fire']

print(X_df.head())
print(Y_df.head())


fire                      0
log_precip_s1             0
log_precip_s2             0
log_precip_s3             0
log_precip_s4             0
tmax_s1                   0
tmax_s2                   0
tmax_s3                   0
tmax_s4                   0
amplitude_thermique_s1    0
amplitude_thermique_s2    0
amplitude_thermique_s3    0
amplitude_thermique_s4    0
GRIDCODE                  0
log_area_sqm              0
lcc_code_encoded          0
elevation                 0
COARSE                    0
SAND                      0
SILT                      0
CLAY                      0
TEXTURE_USDA              0
TEXTURE_SOTER             0
BULK                      0
REF_BULK                  0
ORG_CARBON                0
PH_WATER                  0
TOTAL_N                   0
CN_RATIO                  0
CEC_SOIL                  0
CEC_CLAY                  0
CEC_EFF                   0
TEB                       0
BSAT                      0
ALUM_SAT                  0
ESP                 

# Util Functions

In [None]:
# Tracker for the sampling strategies compared later in the notebook, one label each.
# NOTE(review): presumably `go_next_method` walks through these labels in order, so the
# list has to mirror the order in which the strategies are evaluated — confirm in utils.
sampled_dataset = SampledDataset(
    [
        "Original",
        "Random Over-Sampling",
        "Random Under-Sampling",
        "SMOTE Over-Sampling",
        "Tomek Links Under-Sampling",
    ]
)

def main(X_df, Y_df):
    _X_np_train, _X_np_test, _Y_np_train, _Y_np_test = train_test_split(
        X_df.to_numpy(), Y_df.to_numpy(),
        test_size=0.2,      # 20% test
        random_state=42,    # for reproducibility
        shuffle=True        # default True
    )
    model = DecisionTreeClassifier(random_state=42)
    model.fit(_X_np_train, _Y_np_train)
    Y_pred = model.predict(_X_np_test)
    metrics = show_classification_report(_Y_np_test, Y_pred)

    sampled_dataset.go_next_method(_X_np_train, _Y_np_train, _X_np_test, _Y_np_test, metrics.roc_auc)

# Finding Best Sampling Method

In [None]:
print((Y_df == 1).sum(), (Y_df == 0).sum())
print("Original dataset:")
main(X_df, Y_df)
print("Random Over-Sampling:")
main(*random_over_sample(X_df, Y_df))
print("Random Under-Sampling:")
main(*random_under_sample(X_df, Y_df))
print("SMOTE Over-Sampling:")
main(*smote_over_sample(X_df, Y_df))
print("Tomek Links Under-Sampling:")
main(*tomek_under_sample(X_df, Y_df))

sampled_dataset.print_report("ROC AUC")
X_np_train = sampled_dataset.best_X_train
Y_np_train = sampled_dataset.best_y_train
X_np_test = sampled_dataset.best_X_test
Y_np_test = sampled_dataset.best_y_test

14477 42387
Original dataset:


NameError: name 'main' is not defined

# Parameter Tuning

In [5]:

# Hyper-parameter ranges explored by the Bayesian search.
search_space = dict(
    max_depth=Integer(1, 50),
    min_samples_split=Integer(2, 50),
    min_samples_leaf=Integer(1, 50),
    max_features=Categorical([None, "sqrt", "log2"]),
    criterion=Categorical(["gini", "entropy"]),
    # Pruning strength, sampled on a log scale between 1e-6 and 1e-1.
    ccp_alpha=Real(1e-6, 1e-1, prior="log-uniform"),
)

# Base estimator; the fixed seed keeps tree construction reproducible.
dt = DecisionTreeClassifier(random_state=42)

# 50 candidate settings, each scored by 5-fold cross-validated macro F1, using all cores.
# NOTE(review): the folds are cut from `X_np_train`; if that array was resampled,
# validation folds may contain resampled rows — verify the scores on the hold-out set.
bayes = BayesSearchCV(
    estimator=dt,
    search_spaces=search_space,
    scoring="f1_macro",
    cv=5,
    n_iter=50,
    random_state=42,
    n_jobs=-1,
)
bayes.fit(X_np_train, Y_np_train)

# Best estimator found by the search, and the parameters it was built with.
best_model = bayes.best_estimator_
print(bayes.best_params_)


OrderedDict([('ccp_alpha', 1e-06), ('criterion', 'gini'), ('max_depth', 39), ('max_features', None), ('min_samples_leaf', 1), ('min_samples_split', 2)])
