# Project 1 – Decision Trees and Random Forests

In [None]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from random_forest import RandomForest
from decision_tree import DecisionTree


In [None]:
# One seed for the whole notebook: it fixes NumPy's legacy global RNG here and
# is reused as `random_state` for the train/test split in the data-loading cell.
seed = 42
np.random.seed(seed=seed)

## Dataset

Do data loading, exploration and preprocessing as you see fit.

Here is some code to load the dataset to get you started.

In [None]:
# Parse the CSV into a structured array: `names=True` takes the field names
# from the header row and `dtype=float` parses every column as float.
data = np.genfromtxt("letters.csv", delimiter=",", dtype=float, names=True)

# The last column is treated as the target; every other column is a feature.
feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

# Stack the per-column vectors and transpose -> shape (n_samples, n_features).
X = np.array([data[feature] for feature in feature_names]).T

# genfromtxt does not raise on missing or unparsable fields (loose=True by
# default); it fills them with NaN. Fail loudly here instead of letting NaN
# features reach the models or letting the int cast below turn NaN labels
# into arbitrary class ids.
if np.isnan(X).any() or np.isnan(data[target_name]).any():
    raise ValueError(
        "letters.csv contains missing or non-numeric values; "
        "clean the file before building X and y"
    )

y = data[target_name].astype(int)

# Hold out 30% of the samples as the final test set. `stratify=y` keeps the
# class proportions the same in both parts; `seed` makes the split repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, stratify=y
)

print(f"Feature columns names: {feature_names}")
print(f"Target column name: {target_name}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Search space for the decision tree: both impurity criteria crossed with a
# set of depth limits (None presumably means "no depth limit" -- TODO confirm
# against the DecisionTree implementation).
param_grid_dt = dict(
    criterion=["gini", "entropy"],
    max_depth=[None, *range(5, 16), 20, 50, 100],
)

# Project-local tree under evaluation. For a reference run, sklearn's
# DecisionTreeClassifier(random_state=0) can be substituted here.
dt = DecisionTree()

# Exhaustive 5-fold cross-validated search scored by accuracy; n_jobs=-1 lets
# the search use every available core.
grid_dt = GridSearchCV(dt, param_grid_dt, scoring="accuracy", cv=5, n_jobs=-1)
grid_dt.fit(X_trainval, y_trainval)

print("Best DT hyperparameters:", grid_dt.best_params_)
print("Best mean CV accuracy:", grid_dt.best_score_)

# Score the held-out test set. With the default refit=True, GridSearchCV.predict
# delegates to the best estimator refitted on the full train/validation data.
test_preds = grid_dt.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, test_preds))


In [None]:
# Search space for the random forest: 6 * 2 * 6 * 3 = 216 candidates, so with
# cv=5 the search performs 1080 fits plus one final refit -- expect this cell
# to be slow.
param_grid_rf = dict(
    n_estimators=[2, 5, 10, 15, 20, 50],
    criterion=["gini", "entropy"],
    max_depth=[2, 5, 10, 20, 50, None],
    max_features=["sqrt", "log2", None],
)

# Project-local forest under evaluation.
# NOTE(review): no seed is passed to RandomForest. If it draws from NumPy's
# global RNG, the np.random.seed call in this notebook does not reach the
# worker processes started by n_jobs=-1, so results may vary between runs --
# TODO confirm against the RandomForest implementation.
rf_wrapper = RandomForest()

# Exhaustive 5-fold cross-validated search scored by accuracy on all cores.
grid_rf = GridSearchCV(rf_wrapper, param_grid_rf, scoring="accuracy", cv=5, n_jobs=-1)
grid_rf.fit(X_trainval, y_trainval)

print("Best RF hyperparameters:", grid_rf.best_params_)
print("Best mean CV accuracy:", grid_rf.best_score_)

# Score the held-out test set. With the default refit=True, GridSearchCV.predict
# delegates to the best estimator refitted on the full train/validation data.
test_preds = grid_rf.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, test_preds))

