In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV

from google.colab import drive
from joblib import dump

In [4]:
# Mount Google Drive so the dataset and model paths under /content/drive resolve.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


Loading pre-split datasets:

In [5]:
# Read the four pre-split CSV files in a fixed order (train features, train
# labels, test features, test labels). index_col=0 makes the first CSV column
# the DataFrame index for every file.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/content/drive/MyDrive/road_accidents/X_train_new.csv",
        "/content/drive/MyDrive/road_accidents/y_train_new.csv",
        "/content/drive/MyDrive/road_accidents/X_test_new.csv",
        "/content/drive/MyDrive/road_accidents/y_test_new.csv",
    )
)

In [6]:
# Names of the columns that are cast to strings before model fitting.
# One name per line so additions and removals show up as single-line diffs.
categorical_features = [
    "day_of_week",
    "first_road_class",
    "road_type",
    "junction_detail",
    "second_road_class",
    "pedestrian_crossing_human_control",
    "pedestrian_crossing_physical_facilities",
    "light_conditions",
    "weather_conditions",
    "road_surface_conditions",
    "special_conditions_at_site",
    "carriageway_hazards",
    "did_police_officer_attend_scene_of_accident",
    "vehicle_type",
    "towing_and_articulation",
    "vehicle_manoeuvre",
    "vehicle_location_restricted_lane",
    "junction_location",
    "skidding_and_overturning",
    "hit_object_in_carriageway",
    "hit_object_off_carriageway",
    "vehicle_leaving_carriageway",
    "first_point_of_impact",
    "sex_of_driver",
    "vehicle_left_hand_drive",
    "journey_purpose_of_driver",
    "casualty_class",
    "sex_of_casualty",
    "pedestrian_location",
    "pedestrian_movement",
    "car_passenger",
    "bus_or_coach_passenger",
    "pedestrian_road_maintenance_worker",
    "vehicle_direction_from",
    "vehicle_direction_to",
    "age_band_of_casualty",
    "day_of_month",
    "month",
    "time_rounded",
]

In [7]:
# Convert every listed categorical column to string values, in place, on both
# the training and the test feature frames.
for frame in (X_train, X_test):
    for column in categorical_features:
        frame[column] = frame[column].astype(str)

Defining our parameter grid:

In [9]:
# Hyperparameter candidates sampled by the randomized search.
# 'log_loss' is intentionally left out of 'criterion': scikit-learn documents
# it as equivalent to 'entropy' (both use the Shannon information gain), so
# listing both would spend search iterations on duplicate models.
param_grid = {
    'criterion': ['gini', 'entropy'],             # Splitting criteria
    'max_depth': [3, 5, 10, None],                # Maximum depth of the tree (None = unlimited)
    'min_samples_split': [2, 5, 10],              # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],                # Minimum samples in a leaf node
    'max_features': [None, 'sqrt', 'log2'],       # Number of features to consider for a split
    'class_weight': [None, 'balanced']            # Handle class imbalance
}

In [13]:
# Base estimator passed to the randomized search; random_state is fixed at 42
# so repeated runs build the same trees for the same hyperparameters.
dt_model = DecisionTreeClassifier(random_state=42)

Performing randomized search:

In [14]:
# Randomized hyperparameter search over param_grid for the base decision tree.
random_search = RandomizedSearchCV(
    dt_model,
    param_grid,
    n_iter=50,              # evaluate 50 sampled parameter combinations
    cv=3,                   # 3-fold cross-validation per combination
    scoring='f1_weighted',  # rank candidates by weighted F1
    random_state=42,        # fixed seed for the parameter sampling
    n_jobs=-1,              # run fits in parallel on all available cores
    verbose=3,              # print progress while fitting
)

In [15]:
# y_train was loaded as a DataFrame; flatten it to a 1-D array before fitting.
random_search.fit(X_train, y_train.to_numpy().ravel())

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Saving the best estimator found by the search to a separate file so it can be explored in the main notebook:

In [18]:
# Persist only the refitted best estimator from the search, not the search object itself.
dump(
    value=random_search.best_estimator_,
    filename='/content/drive/MyDrive/road_accidents/best_decision_tree_model.joblib',
)

['/content/drive/MyDrive/road_accidents/best_decision_tree_model.joblib']