In [7]:
import sys

# Fail fast when the interpreter is older than Python 3.7.
assert sys.version_info >= (3, 7)
import pandas as pd

from packaging import version
import sklearn

# Require scikit-learn 1.0.1 or newer. Both sides are parsed with
# packaging.version so the comparison is done on version objects rather than
# on raw strings.
assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

import matplotlib.pyplot as plt

# Notebook-wide Matplotlib defaults: size 14 for general text, axis
# labels/titles and legend entries; size 10 for the tick labels on both axes.
for rc_group, rc_options in (
    ('font', {'size': 14}),
    ('axes', {'labelsize': 14, 'titlesize': 14}),
    ('legend', {'fontsize': 14}),
    ('xtick', {'labelsize': 10}),
    ('ytick', {'labelsize': 10}),
):
    plt.rc(rc_group, **rc_options)

In [8]:
from pathlib import Path

# Relative directory (resolved against the current working directory) where
# figures are saved. Created up front, including missing parents; re-running
# the cell is harmless because an existing directory is accepted.
IMAGES_PATH = Path("images", "decision_trees")
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) for the saved figure.
        tight_layout: If true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension; also passed to ``savefig`` as the
            output format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    target = IMAGES_PATH.joinpath(f"{fig_id}.{fig_extension}")
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [9]:
import openml
import pandas as pd

# OpenML dataset id and the name of its class column. Each is named once so
# the download call and the feature/target split below cannot drift apart.
DATASET_ID = 23
TARGET_COLUMN = 'Contraceptive_method_used'

# Get the dataset from OpenML.
dataset = openml.datasets.get_dataset(DATASET_ID)

# get_data() returns several values; only the first (the data table) is used.
# Indexing instead of unpacking into `*_` avoids rebinding `_`, which IPython
# otherwise keeps pointing at the most recent cell output.
df = dataset.get_data()[0]

# Features (X): every column except the target.
X = df.drop(columns=TARGET_COLUMN)

# Class labels (y): the target column.
y = df[TARGET_COLUMN]


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
# NOTE(review): this split is not stratified on y, whereas the cross-validation
# used for the grid search is (StratifiedKFold) -- consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Work on copies so the raw train/test splits stay untouched.
X_train_c2 = X_train.copy()
X_test_c2 = X_test.copy()

# Columns to one-hot encode.
categorical_features = ["Wifes_education",'Husbands_occupation',"Husbands_education","Standard-of-living_index" ]

# Pin the set of levels for every encoded column from the TRAINING data.
# get_dummies is called separately on train and test; without a shared
# categorical dtype, a level that happens to be missing from the test split
# would make drop_first=True drop a different level there, and the reindex
# below would then silently encode real rows as the reference level.
# Columns that already carry a categorical dtype keep it unchanged.
category_dtypes = {
    col: (
        X_train_c2[col].dtype
        if isinstance(X_train_c2[col].dtype, pd.CategoricalDtype)
        else pd.CategoricalDtype(sorted(X_train_c2[col].dropna().unique()))
    )
    for col in categorical_features
}

# astype() returns new frames, so X_train_c2 / X_test_c2 keep their original
# dtypes. Test values never seen in training become missing and therefore get
# all-zero dummies.
X_train_enc2 = pd.get_dummies(
    X_train_c2.astype(category_dtypes),
    columns=categorical_features,
    drop_first=True,
)
X_test_enc2 = pd.get_dummies(
    X_test_c2.astype(category_dtypes),
    columns=categorical_features,
    drop_first=True,
)

# With shared dtypes the columns should already match; the reindex is kept as
# a guard so the test matrix always has exactly the training columns, in the
# training order.
X_test_enc2 = X_test_enc2.reindex(columns=X_train_enc2.columns, fill_value=0)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified 8-fold splitter with a fixed seed, used for model
# selection.
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)

# Hyperparameter values tried for the decision tree.
param_grid = dict(
    max_depth=[None, 5, 10, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Exhaustive search over the grid, scored by weighted F1 on the CV folds.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='f1_weighted',
    cv=cv,
)
grid_search.fit(X_train_enc2, y_train)

# Estimator refit with the best parameter combination found by the search.
best_tree = grid_search.best_estimator_

# Evaluate the selected tree on the held-out test set.
y_pred_default = best_tree.predict(X_test_enc2)
test_report = classification_report(y_test, y_pred_default)
print(test_report)

# best_score_ is the cross-validated score of the selected parameters, not a
# test-set score.
print("Best parameters:", grid_search.best_params_)
print("Best weighted F1 score:", grid_search.best_score_)


              precision    recall  f1-score   support

           1       0.72      0.66      0.69       130
           2       0.54      0.39      0.46        71
           3       0.45      0.59      0.51        94

    accuracy                           0.57       295
   macro avg       0.57      0.55      0.55       295
weighted avg       0.59      0.57      0.57       295

Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best weighted F1 score: 0.5462761504551412
