In [19]:
# -----------------------------------------------------------
# Activity 4.02: Random Forest and Extreme Random Forest
# Car Evaluation Classification
# -----------------------------------------------------------

# Importing required libraries
import pandas as pd
from sklearn import preprocessing, model_selection
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
# -----------------------------------------------------------
# Loading and Preparing the Dataset
# -----------------------------------------------------------

# Loading the Car Evaluation dataset from GitHub.
# This cell needs network access. read_csv is called with its default options,
# so the first row of the file is taken as the column names.
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Applied-Artificial-Intelligence-Workshop/master/Datasets/car.csv'
df = pd.read_csv(file_url)

In [21]:
# Function to label-encode each categorical column
def encode(data_frame, column):
    """Label-encode a single column of a DataFrame.

    A fresh ``LabelEncoder`` is fitted on the column and used to convert
    each distinct value into an integer code.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that contains the column to encode. It is only read, never
        modified; the caller decides where to store the result.
    column : hashable
        Label of the column to encode.

    Returns
    -------
    numpy.ndarray
        One integer code per row, in the same row order as
        ``data_frame[column]``.
    """
    # fit_transform learns the distinct values and encodes the column in a
    # single call, so a separate .unique() / fit() / transform() sequence
    # is unnecessary.
    return preprocessing.LabelEncoder().fit_transform(data_frame[column])

In [22]:
# Replace every column of df with its integer label encoding.
# The 'class' target column is still part of df at this point, so it is
# encoded here together with the feature columns.
for column in df.columns.tolist():
    df[column] = encode(data_frame=df, column=column)

In [23]:
# Separating the target label (class) from feature columns.
# DataFrame.pop removes 'class' from df in place and returns it, so after this
# cell df holds only the feature columns. Re-running this cell on its own
# raises KeyError because the column has already been removed.
label = df.pop('class')

In [24]:
# Splitting the dataset into training and testing sets
# Using 90% for training and 10% for testing (test_size=0.1).
# random_state is fixed so the same rows land in each split on every run.
# NOTE(review): no stratify argument is passed, so the class proportions of the
# two splits are not forced to match -- consider stratify=label if the rare
# classes must be represented proportionally in the test set.
features_train, features_test, label_train, label_test = model_selection.train_test_split(
    df, label, test_size=0.1, random_state=88
)

In [25]:
# -----------------------------------------------------------
# Random Forest Classifier
# -----------------------------------------------------------

# Creating a Random Forest model with 100 trees.
# max_depth=6 caps how deep each tree may grow, which limits (but does not
# eliminate) overfitting and keeps the individual trees small.
# random_state fixes the seed so repeated runs build the same forest.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=168
)

In [26]:
# Training the Random Forest model.
# Only the training split is used here; the test split is kept for evaluation.
random_forest_classifier.fit(features_train, label_train)

In [27]:
# Predicting on the test set.
# The result holds one predicted class per test row, expressed as the integer
# code produced by the label-encoding step.
rf_preds_test = random_forest_classifier.predict(features_test)

In [28]:
# Classification report (precision, recall, F1 and support per class) for the
# Random Forest predictions on the test set.
#
# Why zero_division=0:
# If the model never predicts one of the classes present in the test set, the
# precision for that class is undefined. Without this argument scikit-learn
# warns: "Precision is ill-defined and being set to 0.0 in labels with no
# predicted samples." This is likely with an imbalanced dataset such as
# Car Evaluation.
# Passing zero_division=0 reports those undefined scores as 0 without raising
# the warning. It only affects how the report is computed, not the model.
print("----- Random Forest Classification Report -----")
print(
    classification_report(
        y_true=label_test,
        y_pred=rf_preds_test,
        zero_division=0,
    )
)

----- Random Forest Classification Report -----
              precision    recall  f1-score   support

           0       0.67      0.76      0.71        42
           1       0.00      0.00      0.00         9
           2       0.92      0.96      0.94       114
           3       0.83      0.62      0.71         8

    accuracy                           0.84       173
   macro avg       0.60      0.59      0.59       173
weighted avg       0.80      0.84      0.82       173



In [29]:
# Confusion matrix for the Random Forest: rows are the true classes and
# columns are the predicted classes, so off-diagonal cells are the errors.
print("Random Forest Confusion Matrix:")
print(
    confusion_matrix(
        y_true=label_test,
        y_pred=rf_preds_test,
    )
)

Random Forest Confusion Matrix:
[[ 32   0  10   0]
 [  8   0   0   1]
 [  5   0 109   0]
 [  3   0   0   5]]


In [30]:
# Extracting feature importance values: one score per feature column, in the
# same order as the columns of features_train.
# The bare name on the last line makes the notebook display the array
# (you may print it or visualize it in a bar chart if desired).
rf_varimp = random_forest_classifier.feature_importances_
rf_varimp

array([0.12676384, 0.10366314, 0.02119621, 0.35266673, 0.05915769,
       0.33655239])

In [31]:
# -----------------------------------------------------------
# Extreme Random Forest (Extra Trees Classifier)
# -----------------------------------------------------------

# Creating an Extra Trees model.
# Extra Trees introduces more randomness into how each tree is built and tends
# to train faster than a Random Forest.
# n_estimators, max_depth and random_state are set to the same values as the
# Random Forest model so the two classifiers can be compared like for like.
extra_trees_classifier = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=168
)

In [32]:
# Training the Extra Trees model.
# Only the training split is used here; the test split is kept for evaluation.
extra_trees_classifier.fit(features_train, label_train)

In [33]:
# Predicting on the test set.
# The result holds one predicted class per test row, expressed as the integer
# code produced by the label-encoding step.
et_preds_test = extra_trees_classifier.predict(features_test)

In [34]:
# Classification report (precision, recall, F1 and support per class) for the
# Extra Trees predictions on the test set, for comparison with the
# Random Forest.
#
# Why zero_division=0:
# If the model never predicts one of the classes present in the test set, the
# precision for that class is undefined. Without this argument scikit-learn
# warns: "Precision is ill-defined and being set to 0.0 in labels with no
# predicted samples." This is likely with an imbalanced dataset such as
# Car Evaluation.
# Passing zero_division=0 reports those undefined scores as 0 without raising
# the warning. It only affects how the report is computed, not the model.
print("----- Extra Trees Classification Report -----")
print(
    classification_report(
        y_true=label_test,
        y_pred=et_preds_test,
        zero_division=0,
    )
)

----- Extra Trees Classification Report -----
              precision    recall  f1-score   support

           0       0.61      0.67      0.64        42
           1       0.00      0.00      0.00         9
           2       0.89      0.98      0.93       114
           3       1.00      0.12      0.22         8

    accuracy                           0.82       173
   macro avg       0.62      0.44      0.45       173
weighted avg       0.78      0.82      0.78       173



In [35]:
# Confusion matrix for Extra Trees model
print("Extra Trees Confusion Matrix:")
print(confusion_matrix(label_test, et_preds_test))

Extra Trees Confusion Matrix:
[[ 28   0  14   0]
 [  9   0   0   0]
 [  2   0 112   0]
 [  7   0   0   1]]


In [36]:
# Extracting feature importance scores: one score per feature column, in the
# same order as the columns of features_train.
# The bare name on the last line makes the notebook display the array.
et_varimp = extra_trees_classifier.feature_importances_
et_varimp

array([0.08844544, 0.0702334 , 0.01440408, 0.37662014, 0.05965896,
       0.39063797])