<a href="https://colab.research.google.com/github/IP-Michael/Project-FOML/blob/main/cs24mtech12010_assign1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ucimlrepo


Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
# Mount Google Drive at /content/drive; `google.colab` is imported here, so
# this cell only runs inside a Colab runtime.
# NOTE(review): no visible cell reads from the mount (the dataset is fetched
# with ucimlrepo) - confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo

# Define the Decision Tree Classifier
class DecisionTree:
    """Binary decision tree classifier that splits on information gain.

    An internal node is the tuple ``(feature, threshold, left, right)``: rows
    with ``row[feature] <= threshold`` descend into ``left``, rows with a
    larger value descend into ``right``.  A leaf is the bare class label.
    Labels must be non-negative integers because entropy is computed with
    ``np.bincount``.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which growth stops (the root is depth 0, so
                ``max_depth=0`` yields a single leaf).  ``None`` keeps
                splitting until no split has positive information gain.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from feature DataFrame ``X`` and label Series ``y``.

        ``X`` and ``y`` must share the same index: boolean masks computed on
        columns of ``X`` are used to select from ``y``.
        """
        self.features = X.columns
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Return the (sub)tree for the rows in ``X``/``y`` at ``depth``."""
        # Pure node: every row carries the same label.
        if len(np.unique(y)) == 1:
            return y.iloc[0]

        # Depth limit reached: fall back to the majority label.
        if self.max_depth is not None and depth >= self.max_depth:
            return y.mode()[0]

        best_feature, best_threshold = self._best_split(X, y)

        # No split has positive gain: majority-label leaf.
        if best_feature is None:
            return y.mode()[0]

        column = X[best_feature]
        goes_left = column <= best_threshold
        goes_right = column > best_threshold

        left_tree = self._build_tree(X[goes_left], y[goes_left], depth + 1)
        right_tree = self._build_tree(X[goes_right], y[goes_right], depth + 1)

        return (best_feature, best_threshold, left_tree, right_tree)

    def _best_split(self, X, y):
        """Return the ``(feature, threshold)`` with the highest positive gain.

        Returns ``(None, None)`` when no candidate split has gain above zero.
        Ties keep the first candidate found (column order, then ascending
        threshold).
        """
        best_gain = 0
        best_feature = None
        best_threshold = None

        for feature in X.columns:
            # np.unique returns the values sorted.  The last one can never
            # separate the rows (nothing is greater than it, so the right
            # side is empty and the gain is 0), hence it is skipped.
            for threshold in np.unique(X[feature])[:-1]:
                gain = self._information_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction from splitting at ``threshold``.

        Returns 0 when either side of the split would be empty.
        """
        column = X[feature]
        # Slice each side once and reuse it for both size and entropy.
        y_left = y[column <= threshold]
        y_right = y[column > threshold]

        n_left, n_right = len(y_left), len(y_right)
        if n_left == 0 or n_right == 0:
            return 0

        n_total = len(y)
        child_entropy = (
            (n_left / n_total) * self._entropy(y_left)
            + (n_right / n_total) * self._entropy(y_right)
        )

        return self._entropy(y) - child_entropy

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the integer labels ``y``.

        Classes with zero count are dropped before taking the logarithm, so
        the result is exact (0 for a pure node, 1 for a 50/50 binary node)
        instead of being biased by an additive epsilon inside the log.
        """
        if len(y) == 0:
            return 0.0
        counts = np.bincount(y)
        proportions = counts[counts > 0] / len(y)
        return -np.sum(proportions * np.log2(proportions))

    def predict(self, X):
        """Return a Series with one predicted label per row of ``X``."""
        return X.apply(lambda row: self._predict_node(row, self.tree), axis=1)

    def _predict_node(self, row, node):
        """Walk ``row`` down from ``node`` and return the leaf label reached."""
        # Anything that is not a tuple is a leaf label.
        if not isinstance(node, tuple):
            return node

        feature, threshold, left_tree, right_tree = node
        if row[feature] <= threshold:
            return self._predict_node(row, left_tree)
        else:
            return self._predict_node(row, right_tree)

# Define the cross-validation function
def cross_validate(X, y, k=10, max_depth=6):
    """Estimate DecisionTree accuracy with stratified k-fold cross-validation.

    Each fold's accuracy is printed as it is computed.

    Args:
        X: Feature DataFrame.
        y: Label Series aligned with ``X``.
        k: Number of folds.
        max_depth: Depth limit passed to each ``DecisionTree``.

    Returns:
        The mean accuracy over the ``k`` folds.
    """
    # Stratified folds keep the class ratio of `y` in every split; the fixed
    # seed makes the shuffled fold assignment reproducible.
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh tree per fold so nothing carries over between folds.
        tree = DecisionTree(max_depth=max_depth)
        tree.fit(X_train, y_train)

        y_pred = tree.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(accuracy)
        accuracies.append(accuracy)

    return np.mean(accuracies)

# Fetch the data: dataset id 186 from the UCI repository via ucimlrepo,
# then take its feature table and its target table.
wine_quality = fetch_ucirepo(id=186)
X = wine_quality.data.features
y = wine_quality.data.targets

# Convert 'quality' to binary classification: 0 for < 7, 1 for >= 7.
# Selecting the 'quality' column also replaces the target table with a single
# label column, which is what cross_validate indexes with `y.iloc[...]`.
y = (y['quality'] >= 7).astype(int)

# Perform cross-validation: cross_validate prints each fold's accuracy and
# returns their mean, which is reported to three decimals.
average_accuracy = cross_validate(X, y, k=10)
print(f"Average Accuracy (10-fold CV): {average_accuracy:.3f}")

0.8061538461538461
0.8123076923076923
0.8246153846153846
0.82
0.8353846153846154
0.8430769230769231
0.8246153846153846
0.8397534668721109
0.8489984591679507
0.8412942989214176
Average Accuracy (10-fold CV): 0.830


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo

# Define the Decision Tree Classifier with multi-way splits
class DecisionTree:
    """Decision tree classifier with four-way splits chosen by Gini gain.

    For every feature the candidate cut points are the 25th, 50th and 75th
    percentiles of that feature's distinct values, giving four bins.  An
    internal node is the tuple ``(feature, thresholds, sub_trees)`` where
    ``sub_trees`` maps the bin number (0..len(thresholds)) to a child; a leaf
    is the bare class label.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree; ``max_depth=None`` means no depth limit."""
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from feature DataFrame ``X`` and label Series ``y``."""
        self.features = X.columns
        self.tree = self._build_tree(X, y, depth=0)

    def _majority_label(self, y):
        """Return the first mode of ``y``, or ``None`` when ``y`` has no mode."""
        modes = y.mode()
        return None if modes.empty else modes.iloc[0]

    def _bin_masks(self, column, thresholds):
        """Yield ``(bin_number, boolean_mask)`` for each bin of ``column``.

        Bin 0 holds values ``<= thresholds[0]``, the last bin holds values
        ``> thresholds[-1]``, and every bin in between is the half-open
        interval ``(thresholds[i - 1], thresholds[i]]``.
        """
        last = len(thresholds)
        for pos in range(last + 1):
            if pos == 0:
                mask = column <= thresholds[pos]
            elif pos == last:
                mask = column > thresholds[pos - 1]
            else:
                mask = (column > thresholds[pos - 1]) & (column <= thresholds[pos])
            yield pos, mask

    def _build_tree(self, X, y, depth):
        """Return the (sub)tree for the rows in ``X``/``y`` at ``depth``."""
        # Pure node: all rows share one label.
        if len(np.unique(y)) == 1:
            return y.iloc[0]

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted:
            return self._majority_label(y)

        best_feature, best_thresholds = self._best_split(X, y)
        if best_feature is None:
            # No feature gives a positive Gini gain.
            return self._majority_label(y)

        children = {}
        for pos, mask in self._bin_masks(X[best_feature], best_thresholds):
            labels_in_bin = y[mask]
            if len(labels_in_bin) == 0:
                # An empty bin inherits the majority label of this node.
                children[pos] = self._majority_label(y)
            else:
                children[pos] = self._build_tree(X[mask], labels_in_bin, depth + 1)

        return (best_feature, best_thresholds, children)

    def _best_split(self, X, y):
        """Return ``(feature, thresholds)`` with the largest positive Gini gain.

        Returns ``(None, None)`` when no feature improves on a gain of zero.
        """
        winner = None
        winner_cuts = None
        top_gain = 0

        for feature in X.columns:
            distinct = np.unique(X[feature])
            if len(distinct) == 0:
                continue
            # Cut points are quartiles of the distinct values of the feature.
            cuts = np.percentile(distinct, [25, 50, 75])
            gain = self._multiway_gini_gain(X, y, feature, cuts)

            if gain > top_gain:
                top_gain = gain
                winner = feature
                winner_cuts = cuts

        return winner, winner_cuts

    def _multiway_gini_gain(self, X, y, feature, thresholds):
        """Return parent Gini minus the size-weighted Gini of the bins."""
        n_rows = len(y)
        weighted_gini = 0

        for _, mask in self._bin_masks(X[feature], thresholds):
            labels_in_bin = y[mask]
            if len(labels_in_bin) == 0:
                continue
            bin_gini = self._gini(labels_in_bin)
            weight = len(labels_in_bin) / n_rows
            weighted_gini += weight * bin_gini

        return self._gini(y) - weighted_gini

    def _gini(self, y):
        """Return the Gini impurity of the labels in ``y``."""
        shares = y.value_counts(normalize=True)
        return 1 - np.sum(shares**2)

    def predict(self, X):
        """Return a Series with one predicted label per row of ``X``."""

        def classify(row):
            return self._predict_node(row, self.tree)

        return X.apply(classify, axis=1)

    def _predict_node(self, row, node):
        """Walk ``row`` down from ``node`` and return the leaf label reached."""
        # Descend until something that is not a tuple (a leaf label) is hit.
        while isinstance(node, tuple):
            feature, thresholds, children = node
            value = row[feature]
            # Default to the last bin; take the first bin whose upper bound
            # is not exceeded.
            branch = len(thresholds)
            for pos, cut in enumerate(thresholds):
                if value <= cut:
                    branch = pos
                    break
            node = children[branch]
        return node

# Define the cross-validation function
def cross_validate(X, y, k=10, max_depth=8):
    """Estimate DecisionTree accuracy with stratified k-fold cross-validation.

    Each fold's accuracy is printed as it is computed.

    Args:
        X: Feature DataFrame.
        y: Label Series aligned with ``X``.
        k: Number of folds.
        max_depth: Depth limit passed to each ``DecisionTree``.

    Returns:
        The mean accuracy over the ``k`` folds.
    """
    # Stratified folds keep the class ratio of `y` in every split; the fixed
    # seed makes the shuffled fold assignment reproducible.
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []

    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A fresh tree per fold so nothing carries over between folds.
        tree = DecisionTree(max_depth=max_depth)
        tree.fit(X_train, y_train)

        y_pred = tree.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(accuracy)
        accuracies.append(accuracy)

    return np.mean(accuracies)

# Fetch the data: dataset id 186 from the UCI repository via ucimlrepo,
# then take its feature table and its target table.
wine_quality = fetch_ucirepo(id=186)
X = wine_quality.data.features
y = wine_quality.data.targets

# Convert 'quality' to binary classification: 0 for < 7, 1 for >= 7.
# Selecting the 'quality' column also replaces the target table with a single
# label column, which is what cross_validate indexes with `y.iloc[...]`.
y = (y['quality'] >= 7).astype(int)

# Perform cross-validation: cross_validate prints each fold's accuracy and
# returns their mean, which is reported to three decimals.
average_accuracy = cross_validate(X, y, k=10)
print(f"Average Accuracy (10-fold CV): {average_accuracy:.3f}")

0.8307692307692308
0.8430769230769231
0.8523076923076923
0.8184615384615385
0.8476923076923077
0.8353846153846154
0.8184615384615385
0.8459167950693375
0.8551617873651772
0.8536209553158706
Average Accuracy (10-fold CV): 0.840
