Custom Decision Tree with Information Gain

In [1]:
import numpy as np

In [2]:
class CustomDecisionTree:

  def __init__(self, max_depth=None):
    self.max_depth = max_depth
    self.tree = None

  def fit(self, X, y):
    self.tree=self._build_tree(X,y)

  def _build_tree(self, X, y, depth=0):
    num_samples, num_features = X.shape
    unique_classes = np.unique(y)

    if len(unique_classes) == 1:
      return{'class':unique_classes[0]}
    if num_samples == 0 or (self.max_depth and depth >= self.max_depth):
      return{'class':np.bincount(y).argmax()}

    best_info_gain = -float('inf')
    best_split = None

    for feature_idx in range(num_features):
      thresholds = np.unique(X[:,feature_idx])
      for threshold in thresholds:
        left_mask = X[:,feature_idx] <= threshold
        right_mask = ~left_mask

        left_y = y[left_mask]
        right_y = y[right_mask]

        info_gain = self._information_gain(y, left_y, right_y)

        if info_gain > best_info_gain:
          best_info_gain = info_gain
          best_split={
          'feature_idx': feature_idx,
          'threshold': threshold,
          'left_y': left_y,
          'right_y': right_y,
          }
    if best_split is None:
      return{'class':np.bincount(y).argmax()}

    left_tree = self._build_tree(X[best_split['left_y']], y[best_split['left_y']], depth + 1)
    right_tree = self._build_tree(X[best_split['right_y']], y[best_split['right_y']], depth + 1)
    return{
        'feature_idx': best_split['feature_idx'],
        'threshold': best_split['threshold'],
        'left_tree': left_tree,
        'right_tree': right_tree
    }
  def _information_gain(self, parent, left, right):
    parent_entropy = self._entropy(parent)
    left_entropy = self._entropy(left)
    right_entropy = self._entropy(right)

    weighted_avg_entropy = (len(left)/len(parent)) * left_entropy + (len(right)/len(parent)) * right_entropy

    return parent_entropy - weighted_avg_entropy

  def _entropy(self,y):
    class_probs = np.bincount(y) / len(y)
    return -np.sum(class_probs * np.log2(class_probs + 1e-9))

  def predict(self, X):
    return [self._predict_single(x, self.tree) for x in X]

  def _predict_single(self, x, tree):
    if 'class' in tree:
      return tree['class']

    feature_val = x[tree['feature_idx']]
    if feature_val <= tree['threshold']:
      return self._predict_single(x, tree['left_tree'])
    else:
      return self._predict_single(x, tree['right_tree'])

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Load the Iris dataset
data = load_iris()
X = data.data
y = data.target
# Split into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
custom_tree = CustomDecisionTree(max_depth=3)
custom_tree.fit(X_train, y_train)
# Predict on the test set
y_pred_custom = custom_tree.predict(X_test)
# Calculate accuracy
accuracy_custom = accuracy_score(y_test, y_pred_custom)
print(f"Custom Decision Tree Accuracy: {accuracy_custom:.4f}")

Custom Decision Tree Accuracy: 0.5333


In [5]:
sklearn_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
sklearn_tree.fit(X_train, y_train)
# Predict on the test set
y_pred_sklearn = sklearn_tree.predict(X_test)
# Calculate accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-learn Decision Tree Accuracy: {accuracy_sklearn:.4f}")

Scikit-learn Decision Tree Accuracy: 1.0000


In [6]:
print(f"Accuracy Comparison:")
print(f"Custom Decision Tree: {accuracy_custom:.4f}")
print(f"Scikit-learn Decision Tree: {accuracy_sklearn:.4f}")

Accuracy Comparison:
Custom Decision Tree: 0.5333
Scikit-learn Decision Tree: 1.0000


##Exercise 3

In [7]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

1. Implement Classification Models

In [8]:
wine = load_wine()

In [9]:
X = wine.data
y = wine.target

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42 )

In [11]:
dt_clf = DecisionTreeClassifier(random_state = 42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

In [12]:
rf_clf = RandomForestClassifier(random_state = 42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

In [13]:
f1_dt = f1_score(y_test, y_pred_dt, average = 'macro')
f1_rf = f1_score(y_test, y_pred_rf, average = 'macro')

In [14]:
print("F1 Score Comparision:")
print(f"Decision Tree: {f1_dt:.4f}")
print(f"Random Forest: {f1_rf:.4f}")

F1 Score Comparision:
Decision Tree: 0.9425
Random Forest: 1.0000


2. Hyperparameter Tuning:

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10,],
    'min_samples_split': [2, 5, 10]
}

In [17]:
grid_search = GridSearchCV(
    estimator = RandomForestClassifier(random_state = 42),
    param_grid = param_grid,
    cv = 5,
    scoring = 'f1_macro',
    n_jobs = -1
)

In [18]:
grid_search.fit(X_train, y_train)

In [19]:
print("Best Parameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

Best Parameters:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best F1 Score:  0.9788068209740036


3. Implement Regression Model:

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

In [21]:
dt_reg = DecisionTreeRegressor(random_state = 42)
dt_reg.fit(X_train, y_train)
y_pred_dt = dt_reg.predict(X_test)

In [22]:
rf_reg = RandomForestRegressor(random_state = 42)
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

In [23]:
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4]   # valid parameter
}

In [24]:
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1
)

In [25]:
random_search.fit(X_train, y_train)

In [26]:
print("Best Parameters: ", random_search.best_params_)
print("Best Score: ", -random_search.best_score_)

Best Parameters:  {'n_estimators': 300, 'min_samples_leaf': 1, 'max_depth': 10}
Best Score:  0.04472559113300492
