# Project 1 – Decision Trees and Random Forests

#### Imports

In [1]:
# Reload all modules without having to restart the kernel
# Useful for development if you have edited any of the external code files.
%load_ext autoreload
%autoreload 2

# Imports
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from itertools import product
from decision_tree import DecisionTree

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# ... add more imports as needed

# My implementations
from decision_tree import DecisionTree
from random_forest import RandomForest


### Load dataset

We load the `letters.csv` dataset, separate features and labels, and split into train/test sets.


In [2]:
data = np.genfromtxt("letters.csv", delimiter=",", dtype=float, names=True)

feature_names = list(data.dtype.names[:-1])
target_name = data.dtype.names[-1]

X = np.array([data[f] for f in feature_names]).T
y = data[target_name].astype(int)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, shuffle=True
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (1400, 16), y_train shape: (1400,)
X_test shape: (600, 16), y_test shape: (600,)


Hyperparametere

Forklar hva som skjer her.

In [3]:
dt_params = {
    "criterion": ["entropy", "gini"],
    "max_depth": [None, 5, 10, 20],
    "max_features": [None, "sqrt", "log2"]
}

rf_params = {
    "n_estimators": [10, 20, 50],
    "max_depth": [5, 10, None],
    "criterion": ["entropy", "gini"],
    "max_features": ["sqrt", "log2"]
}


### Cross-validation helper

We implement a helper function to evaluate models with k-fold cross-validation.


In [4]:
def cross_val_score_custom(model_class, params, X, y, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=0)
    scores = []
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = model_class(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, y_pred))
    return np.mean(scores)


### DecisionTree model selection

We loop over hyperparameter combinations for our custom DecisionTree, evaluate with cross-validation, and select the best.


In [5]:
best_dt_params = None
best_dt_score = -1

for criterion, max_depth, max_features in product(
    dt_params["criterion"], dt_params["max_depth"], dt_params["max_features"]
):
    params = {
        "criterion": criterion,
        "max_depth": max_depth,
        "max_features": max_features
    }
    score = cross_val_score_custom(DecisionTree, params, X_train, y_train, k=5)
    if score > best_dt_score:
        best_dt_score = score
        best_dt_params = params

print("Best DecisionTree params:", best_dt_params)
print("Best DecisionTree CV accuracy:", best_dt_score)


TypeError: DecisionTree.__init__() got an unexpected keyword argument 'max_features'

### Final evaluation

We retrain our models with the best hyperparameters on the full training set and evaluate on the test set.


In [None]:
# DecisionTree
dt_best = DecisionTree(**best_dt_params)
dt_best.fit(X_train, y_train)
dt_test_acc = accuracy_score(y_test, dt_best.predict(X_test))

# RandomForest
rf_best = RandomForest(**best_rf_params)
rf_best.fit(X_train, y_train)
rf_test_acc = accuracy_score(y_test, rf_best.predict(X_test))

print("Custom DecisionTree test accuracy:", dt_test_acc)
print("Custom RandomForest test accuracy:", rf_test_acc)


### Comparison with sklearn

We compare our implementations against sklearn’s DecisionTreeClassifier and RandomForestClassifier.


In [None]:
# Sklearn DecisionTree
sk_dt = DecisionTreeClassifier(
    criterion=best_dt_params["criterion"],
    max_depth=best_dt_params["max_depth"],
    max_features=best_dt_params["max_features"],
    random_state=0
)
sk_dt.fit(X_train, y_train)
sk_dt_acc = accuracy_score(y_test, sk_dt.predict(X_test))

# Sklearn RandomForest
sk_rf = RandomForestClassifier(
    n_estimators=best_rf_params["n_estimators"],
    max_depth=best_rf_params["max_depth"],
    criterion=best_rf_params["criterion"],
    max_features=best_rf_params["max_features"],
    random_state=0
)
sk_rf.fit(X_train, y_train)
sk_rf_acc = accuracy_score(y_test, sk_rf.predict(X_test))

print("Sklearn DecisionTree test accuracy:", sk_dt_acc)
print("Sklearn RandomForest test accuracy:", sk_rf_acc)
