# Project 1 – Decision Trees and Random Forests

In [12]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from decision_tree import DecisionTree
from random_forest import RandomForest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [13]:
# Load the CSV as a structured array: `names=True` takes the field names from
# the header row and `dtype=float` parses every field as a float.
data = np.genfromtxt("letters.csv", delimiter=",", dtype=float, names=True)

# The last column is the target; every column before it is a feature.
feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

# Stack the per-feature columns and transpose so that each row is one sample.
X = np.array([data[feature] for feature in feature_names]).T

# genfromtxt stores missing or unparsable float fields as NaN, and casting NaN
# to int produces an arbitrary value. Fail loudly rather than train on
# silently corrupted labels.
if np.isnan(data[target_name]).any():
    raise ValueError(
        f"Column '{target_name}' of letters.csv contains missing or non-numeric labels"
    )
y = data[target_name].astype(int)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

Feature columns names: ['xbox', 'ybox', 'width', 'high', 'onpix', 'xbar', 'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'xege', 'xegvy', 'yege', 'yegvx']
Target column name: label
X shape: (2000, 16)
y shape: (2000,)


In [14]:
# Hold out 30% of the samples for testing. The fixed seed keeps the shuffled
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.3,
    random_state=42,
)

In [15]:
# Fit the project's own DecisionTree on the training split, then score its
# predictions on the held-out test split.
dt = DecisionTree(criterion="entropy", max_depth=10)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print("DecisionTree Test Accuracy:", accuracy_dt)

DecisionTree Test Accuracy: 0.8916666666666667


In [16]:
# Fit the project's own RandomForest on the training split, then score its
# predictions on the held-out test split.
# NOTE(review): no seed is passed here, so this score may differ between runs;
# confirm whether RandomForest exposes a seed parameter.
rf = RandomForest(
    n_estimators=20,
    max_depth=10,
    criterion="entropy",
    max_features="sqrt",
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print("RandomForest Test Accuracy:", accuracy_rf)


RandomForest Test Accuracy: 0.9466666666666667


In [17]:
# Report whether the forest beat the single tree on the test split; a tie is
# reported in the single tree's favour.
rf_is_better = accuracy_rf > accuracy_dt
verdict = (
    "✅ RandomForest outperformed DecisionTree"
    if rf_is_better
    else "⚠️ DecisionTree performed as good or better"
)
print(verdict)

✅ RandomForest outperformed DecisionTree


In [18]:
# Reference baselines: scikit-learn's tree and forest with the same depth,
# criterion, estimator count and feature-subset settings as the homemade
# models, each seeded with random_state=42.
sk_dt = DecisionTreeClassifier(criterion="entropy", max_depth=10, random_state=42)
sk_rf = RandomForestClassifier(
    n_estimators=20,
    criterion="entropy",
    max_depth=10,
    max_features="sqrt",
    random_state=42,
)

# scikit-learn's fit() returns the estimator itself, so fit and predict chain.
y_pred_sk_dt = sk_dt.fit(X_train, y_train).predict(X_test)
y_pred_sk_rf = sk_rf.fit(X_train, y_train).predict(X_test)

accuracy_sk_dt = accuracy_score(y_test, y_pred_sk_dt)
accuracy_sk_rf = accuracy_score(y_test, y_pred_sk_rf)

print("Sklearn DecisionTree Accuracy:", accuracy_sk_dt)
print("Sklearn RandomForest Accuracy:", accuracy_sk_rf)

Sklearn DecisionTree Accuracy: 0.9
Sklearn RandomForest Accuracy: 0.9483333333333334


In [19]:
# Collect every test accuracy in one table. Each label is written next to its
# value so the two cannot drift out of alignment, as they could with two
# parallel lists.
accuracy_by_model = {
    "Homemade DecisionTree": accuracy_dt,
    "Homemade RandomForest": accuracy_rf,
    "Sklearn DecisionTree": accuracy_sk_dt,
    "Sklearn RandomForest": accuracy_sk_rf,
}
results = pd.DataFrame(
    list(accuracy_by_model.items()), columns=["Model", "Accuracy"]
)

# A bare last expression lets Jupyter render the frame as a formatted table
# instead of the plain-text dump that print() produces.
results

                   Model  Accuracy
0  Homemade DecisionTree  0.891667
1  Homemade RandomForest  0.946667
2   Sklearn DecisionTree  0.900000
3   Sklearn RandomForest  0.948333
