<a href="https://colab.research.google.com/github/Jencinias/Ensemble-Based-Traffic-Sign-Recognition/blob/main/decision_tree_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Drive

In [1]:
# Mount Google Drive into the Colab filesystem; the dataset paths used by the
# later cells all live under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Import Libraries

In [2]:
import os

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold, ParameterGrid
from sklearn.tree import DecisionTreeClassifier

# Read Data

In [3]:
# Load transformed_downsampled_df.csv into a pandas DataFrame
df = pd.read_csv("/content/drive/MyDrive/transformed_downsampled_folder/transformed_downsampled_df.csv")

# Show the first 5 rows of the DataFrame
print(df.head())

      filename  width  height       class  xmin  ymin  xmax  ymax
0  road862.png    300     400  speedlimit   285    29   382   101
1  road442.png    300     400  speedlimit   121   138   158   165
2  road437.png    300     400  speedlimit   220   245   268   280
3  road411.png    300     400  speedlimit   158    77   216   119
4  road232.png    300     400  speedlimit   169   221   189   234


# Read Images

In [4]:
# Function to read images and corresponding data from CSV
def read_images_and_labels(folder_path, csv_path):
    """Load grayscale images listed in a CSV together with their class labels.

    Parameters
    ----------
    folder_path : str
        Directory that contains the image files.
    csv_path : str
        CSV file with at least a ``filename`` and a ``class`` column.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The flattened pixel vector of every image that could be read, and the
        matching ``class`` values in the same order. Rows whose image cannot
        be read are reported on stdout and left out of both arrays.
    """
    table = pd.read_csv(csv_path)

    flat_images, kept_labels = [], []

    for _, record in table.iterrows():
        image_path = os.path.join(folder_path, record["filename"])

        # cv2.imread signals an unreadable/missing file by returning None
        pixels = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        if pixels is not None:
            # Keep image and label in lock-step so row i of both arrays match
            flat_images.append(pixels.flatten())
            kept_labels.append(record['class'])
        else:
            print(f"Could not read image: {image_path}")

    return np.array(flat_images), np.array(kept_labels)

# Load the dataset: one flattened grayscale image per readable CSV row
folder_path = "/content/drive/MyDrive/transformed_downsampled_folder/transformed_downsampled_images/"
csv_path = "/content/drive/MyDrive/transformed_downsampled_folder/transformed_downsampled_df.csv"

images, labels = read_images_and_labels(folder_path, csv_path)

# Sanity check: both arrays should have the same number of rows
print("Shape of images array:", images.shape)
print("Shape of labels array:", labels.shape)

Shape of images array: (244, 160000)
Shape of labels array: (244,)


## First Iteration without Parallelization

In [None]:
# Hyper-parameter search space for DecisionTreeClassifier.
# The single dict expands to 2 * 2 * 3 * 3 * 3 * 3 = 324 parameter combinations.
param_grid = [
    {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2']
    }
]

def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    """Run nested cross-validation and return one score per outer fold.

    For every outer split, each parameter setting is scored with the inner
    cross-validation on the outer-training samples only; the best setting is
    then refit on the whole outer-training set and scored on the outer-test
    set.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and labels; both must support integer-array indexing.
    inner_cv, outer_cv : objects with a ``split(X, y)`` method
        Splitters yielding ``(train_indices, test_indices)`` pairs.
    Classifier : callable
        Called as ``Classifier(**parameters)``; the result must provide
        ``fit(X, y)`` and ``score(X, y)``.
    parameter_grid : iterable of dict
        Parameter settings to evaluate.

    Returns
    -------
    numpy.ndarray
        The outer-test score of the selected model, one entry per outer fold.
    """
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (split method returns indices)
    for training_samples, test_samples in outer_cv.split(X, y):
        X_outer_train = X[training_samples]
        y_outer_train = y[training_samples]

        # inner_cv.split() yields positions relative to the arrays it is given,
        # so they must index the outer-training subset, never the full X / y
        # (indexing the full arrays would pick the wrong rows and can leak
        # outer-test samples into model selection).
        # Materialize once so every parameter setting sees the same inner folds.
        inner_splits = list(inner_cv.split(X_outer_train, y_outer_train))

        # find best parameters using inner cross-validation
        best_params = {}
        best_score = -np.inf
        for parameters in parameter_grid:
            # accumulate score over inner splits
            cv_scores = []
            for inner_train, inner_test in inner_splits:
                # build classifier given parameters and inner training data
                clf = Classifier(**parameters)
                clf.fit(X_outer_train[inner_train], y_outer_train[inner_train])
                # evaluate on inner test set
                cv_scores.append(
                    clf.score(X_outer_train[inner_test], y_outer_train[inner_test])
                )
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember parameters
                best_score = mean_score
                best_params = parameters

        # build classifier on best parameters using the full outer training set
        clf = Classifier(**best_params)
        clf.fit(X_outer_train, y_outer_train)
        # evaluate on the held-out outer test set
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return np.array(outer_scores)

# Sequential nested cross-validation: 5 outer folds and 5 inner folds
# (KFold(5) with default settings) over every DecisionTreeClassifier
# setting produced by ParameterGrid(param_grid).
scores = nested_cv(
    images,
    labels,
    KFold(5),
    KFold(5),
    DecisionTreeClassifier,
    ParameterGrid(param_grid)
)

print("Cross-validation scores: {}".format(scores))

## Second Iteration with Parallelization


In [1]:
# Same search space as the first iteration, as a plain dict so it can be passed
# to GridSearchCV: 2 * 2 * 3 * 3 * 3 * 3 = 324 parameter combinations.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# NOTE(review): both splitters use KFold's default settings. If the CSV rows are
# grouped by class (the df.head() output shows only 'speedlimit' rows), folds may
# have skewed class distributions; consider a shuffled or stratified splitter.
# TODO confirm against the full data.
outer_cv = KFold(5)
inner_cv = KFold(5)

def nested_cv(X, y, inner_cv, outer_cv, param_grid, random_state=None):
    """Nested cross-validation with a parallelized inner grid search.

    For every outer split a ``GridSearchCV`` over ``DecisionTreeClassifier``
    is fit on the outer-training samples (using ``inner_cv`` for model
    selection) and then scored on the outer-test samples.

    Note: this redefines the earlier ``nested_cv`` from the first iteration
    with a different signature; after this cell runs, the earlier version is
    no longer reachable under that name.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and labels; both must support integer-array indexing.
    inner_cv : cross-validation splitter
        Passed to ``GridSearchCV`` as ``cv``.
    outer_cv : object with a ``split(X, y)`` method
        Yields ``(train_indices, test_indices)`` pairs for the outer loop.
    param_grid : dict or list of dict
        Search space handed to ``GridSearchCV``.
    random_state : int or None, default None
        Forwarded to ``DecisionTreeClassifier``. The default keeps the
        original unseeded behaviour; pass an int to make runs repeatable
        (the search space includes ``splitter='random'`` and feature
        sub-sampling, which are otherwise non-deterministic).

    Returns
    -------
    list of float
        Score of the selected model on each outer-test fold.
    """
    outer_scores = []
    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # The search only ever sees the outer-training data, so the outer-test
        # fold stays untouched until the final score below.
        clf = GridSearchCV(
            estimator=DecisionTreeClassifier(random_state=random_state),
            param_grid=param_grid,
            cv=inner_cv,
            n_jobs=-1  # Parallelize grid search across all available cores
        )
        clf.fit(X_train, y_train)
        outer_scores.append(clf.score(X_test, y_test))
    return outer_scores

# Run the GridSearchCV-based nested CV and report the per-outer-fold scores and their mean
scores = nested_cv(images, labels, inner_cv, outer_cv, param_grid)
print("Cross-Validation Scores: {}".format(scores))
print("Average Cross-Validation Score: {}".format(np.mean(scores)))