# Project 1 – Decision Trees and Random Forests

#### Imports
Imports og random seed

In [None]:


from itertools import product

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from decision_tree import DecisionTree
from random_forest import RandomForest

# Seed NumPy's global random generator (the value 21 is arbitrary).
np.random.seed(21)


### Load dataset

Laster inn letters.csv, deler opp i features (X) og labels (y), og deler deretter inn i trenings- og testsett (80/20).

In [None]:
# Parse the CSV into a structured array; names=True takes the field names
# from the header row.
data = np.genfromtxt("letters.csv", delimiter=",", dtype=float, names=True)

# The last field is the label; every field before it is a feature.
*feature_names, target_name = data.dtype.names

# Stack the feature fields and transpose so each field becomes one column.
X = np.transpose([data[name] for name in feature_names])
y = data[target_name].astype(int)

# 80/20 train/test split, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    shuffle=True,
    stratify=y,
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
# Show the first ten raw records as a quick sanity check.
print(data[:10])


### Hyperparametere

Setter opp verdier som skal testes i grid search for DecisionTree og RandomForest.

In [None]:
# Candidate values tried by the DecisionTree grid search.
dt_params = dict(
    criterion=["entropy", "gini"],
    max_depth=[None, 5, 10, 20],
    max_features=[None, "sqrt", "log2"],
)

# Candidate values tried by the RandomForest grid search.
rf_params = dict(
    n_estimators=[10, 20, 40],
    max_depth=[5, 10, None],
    criterion=["entropy", "gini"],
    max_features=["sqrt", "log2"],
)


### Cross-validation

Funksjon for k-fold cross validation som regner ut accuracy.

In [None]:
def cross_val_score_custom(model_class, params, X, y, k=5, seed=21):
    """Return the mean accuracy of a model over stratified k-fold cross-validation.

    A fresh ``model_class(**params)`` is created for every fold, fitted on
    the training rows of that fold and scored on the held-out rows.

    Args:
        model_class: Callable that builds a model exposing ``fit(X, y)`` and
            ``predict(X)``.
        params: Keyword arguments passed to ``model_class``.
        X: Feature data; must support indexing with an array of row indices.
        y: Labels; must support indexing with an array of row indices.
        k: Number of folds.
        seed: ``random_state`` used to shuffle the samples before splitting.

    Returns:
        The mean of the per-fold accuracies as a Python ``float``.
    """
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    def _fold_accuracy(fit_rows, held_out_rows):
        # Train on one part of the data, score on the remaining part.
        estimator = model_class(**params)
        estimator.fit(X[fit_rows], y[fit_rows])
        predictions = estimator.predict(X[held_out_rows])
        return accuracy_score(y[held_out_rows], predictions)

    fold_accuracies = [
        _fold_accuracy(fit_rows, held_out_rows)
        for fit_rows, held_out_rows in splitter.split(X, y)
    ]
    return float(np.mean(fold_accuracies))


### DecisionTree grid search

Tester kombinasjoner av hyperparametere og finner beste for DecisionTree.

In [None]:
# Exhaustive grid search for DecisionTree: every combination is scored with
# 5-fold cross-validation on the training data. The strict ">" comparison
# keeps the first combination when several reach the same score.
dt_grid_keys = ("criterion", "max_depth", "max_features")
best_dt_params, best_dt_score = None, -1.0

for values in product(*(dt_params[key] for key in dt_grid_keys)):
    params = dict(zip(dt_grid_keys, values))
    score = cross_val_score_custom(DecisionTree, params, X_train, y_train, k=5, seed=21)
    if score > best_dt_score:
        best_dt_score, best_dt_params = score, params

print("Best DecisionTree params:", best_dt_params)
print("Best DecisionTree 5-fold CV accuracy:", round(best_dt_score, 4))


### RandomForest grid search
Tester kombinasjoner av hyperparametere og finner beste for RandomForest.

In [None]:
# Exhaustive grid search for RandomForest: every combination is scored with
# 5-fold cross-validation on the training data. The strict ">" comparison
# keeps the first combination when several reach the same score.
rf_grid_keys = ("n_estimators", "max_depth", "criterion", "max_features")
best_rf_params, best_rf_score = None, -1.0

for values in product(*(rf_params[key] for key in rf_grid_keys)):
    params = dict(zip(rf_grid_keys, values))
    score = cross_val_score_custom(RandomForest, params, X_train, y_train, k=5, seed=21)
    if score > best_rf_score:
        best_rf_score, best_rf_params = score, params

print("Best RandomForest params:", best_rf_params)
print("Best RandomForest 5-fold CV accuracy:", round(best_rf_score, 4))


## DecisionTree accuracy vs max_depth


In [None]:
# Fit one DecisionTree per max_depth value and record its accuracy on the
# training set and on the test set.
# NOTE(review): None is presumably "no depth limit" in DecisionTree - confirm.
depths = [1, 2, 5, 10, 20, None]
train_scores = []
test_scores = []

for d in depths:
    model = DecisionTree(max_depth=d, criterion="gini", max_features=None)
    model.fit(X_train, y_train)
    train_scores.append(accuracy_score(y_train, model.predict(X_train)))
    test_scores.append(accuracy_score(y_test, model.predict(X_test)))

# Depths are plotted as string labels so that None also gets a tick on the
# x-axis; the labels are built once and shared by both curves.
depth_labels = [str(d) for d in depths]

plt.plot(depth_labels, train_scores, marker="o", label="Train")
plt.plot(depth_labels, test_scores, marker="o", label="Test")
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.title("DecisionTree accuracy vs max_depth")
plt.legend()
plt.show()

## RandomForest accuracy vs n_estimators

In [None]:
# Fit one RandomForest per ensemble size (all other settings fixed) and record
# its accuracy on the training set and on the test set.
estimators = [1, 5, 10, 20, 50, 100]
train_scores = []
test_scores = []

for n in estimators:
    model = RandomForest(n_estimators=n, max_depth=10, criterion="gini", max_features="sqrt")
    model.fit(X_train, y_train)
    train_scores.append(accuracy_score(y_train, model.predict(X_train)))
    test_scores.append(accuracy_score(y_test, model.predict(X_test)))

# Plot both accuracy curves against the number of trees.
for curve_label, curve_scores in (("Train", train_scores), ("Test", test_scores)):
    plt.plot(estimators, curve_scores, marker="o", label=curve_label)
plt.xlabel("n_estimators")
plt.ylabel("Accuracy")
plt.title("RandomForest accuracy vs n_estimators")
plt.legend()
plt.show()

### Final evaluation

We retrain our models with the best hyperparameters on the full training set and evaluate on the test set.


In [None]:
# Refit the custom DecisionTree with the best grid-search parameters on the
# whole training set and predict the test set.
dt_best = DecisionTree(**best_dt_params)
dt_best.fit(X_train, y_train)
dt_pred = dt_best.predict(X_test)

# Same procedure for the custom RandomForest.
rf_best = RandomForest(**best_rf_params)
rf_best.fit(X_train, y_train)
rf_pred = rf_best.predict(X_test)

# Score both sets of predictions against the true test labels.
dt_acc, rf_acc = (
    accuracy_score(y_test, predictions) for predictions in (dt_pred, rf_pred)
)

# Report the test accuracy of both models.
print(f"Custom DecisionTree test accuracy: {dt_acc:.4f}")
print(f"Custom RandomForest test accuracy: {rf_acc:.4f}")


### SKLearn modeller

Trener modeller med sklearn med de samme parametrene

In [None]:
# Sklearn DecisionTree, configured with the parameters that won the custom
# DecisionTree grid search.
sk_dt = DecisionTreeClassifier(
    criterion=best_dt_params["criterion"],
    max_depth=best_dt_params["max_depth"],
    max_features=best_dt_params["max_features"],
    random_state=0,
)
sk_dt.fit(X_train, y_train)
sk_dt_acc = accuracy_score(y_test, sk_dt.predict(X_test))

# Sklearn RandomForest, configured with the parameters that won the custom
# RandomForest grid search.
sk_rf = RandomForestClassifier(
    n_estimators=best_rf_params["n_estimators"],
    max_depth=best_rf_params["max_depth"],
    criterion=best_rf_params["criterion"],
    max_features=best_rf_params["max_features"],
    random_state=0,
)
sk_rf.fit(X_train, y_train)
sk_rf_acc = accuracy_score(y_test, sk_rf.predict(X_test))

# Report the test accuracy of both sklearn models.
print("Sklearn DecisionTree test accuracy:", round(sk_dt_acc, 4))
print("Sklearn RandomForest test accuracy:", round(sk_rf_acc, 4))


#### Task 4 - Permutation Importance

In [None]:
def permutation_importance(model, X, y, metric=accuracy_score, n_repeats=30, seed=21):
    """Measure how much the metric drops when each feature column is shuffled.

    For every column of ``X`` the column is shuffled ``n_repeats`` times, each
    time in a fresh copy of ``X``, and the model is scored on the shuffled
    data. The importance of the column is the mean of ``baseline - score``
    over those repeats, where ``baseline`` is the score on the untouched ``X``.

    Args:
        model: Fitted object exposing ``predict(X)``.
        X: 2-D NumPy array of features (``X.shape[1]`` columns are evaluated).
        y: True labels, passed as the first argument to ``metric``.
        metric: Callable ``metric(y_true, y_pred)`` returning a score.
        n_repeats: Number of shuffles per column.
        seed: Seed for the NumPy random generator that performs the shuffles.

    Returns:
        NumPy array with one mean score drop per column of ``X``.
    """
    rng = np.random.default_rng(seed)
    reference_score = metric(y, model.predict(X))

    def _score_drop(column):
        # Shuffle a single column in a private copy so X itself is untouched.
        shuffled = X.copy()
        rng.shuffle(shuffled[:, column])
        return reference_score - metric(y, model.predict(shuffled))

    return np.array(
        [
            np.mean([_score_drop(column) for _ in range(n_repeats)])
            for column in range(X.shape[1])
        ]
    )

import matplotlib.pyplot as plt

# Refit the forest with the best grid-search parameters and measure the
# permutation importance of every feature on the test set.
rf_best = RandomForest(**best_rf_params)
rf_best.fit(X_train, y_train)

importances = permutation_importance(rf_best, X_test, y_test, n_repeats=30, seed=21)

# One bar per feature, labelled with the feature names along the x-axis.
bar_positions = range(len(feature_names))
plt.bar(bar_positions, importances)
plt.xticks(bar_positions, feature_names, rotation=90)
plt.ylabel("Permutation importance")
plt.title("Feature importance RandomForest")
plt.show()
