<a href="https://colab.research.google.com/github/HarshitaKalani3/Machine-Learning/blob/main/Computational_Efficiency.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install cupy-cuda12x --upgrade
import cupy as np
import numpy as np_cpu
import pandas as pd
from collections import deque



In [3]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf_node(self):
        return self.value is not None

class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None

    def fit(self, x, y, feature_names=None):
        self.feature_names = feature_names
        self.n_features = x.shape[1] if not self.n_features else min(x.shape[1], self.n_features)
        self.root = self._grow_tree(x, y)

    def _grow_tree(self, x, y, depth=0):
        n_samples, n_feats = x.shape
        n_labels = len(np.unique(y))
        if (depth >= self.max_depth) or (n_labels == 1) or (n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)
        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)
        best_feature, best_thresh = self._best_split(x, y, feat_idxs)
        left_idxs, right_idxs = self._split(x[:, best_feature], best_thresh)
        left = self._grow_tree(x[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(x[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def print_splits_level_order(self):
        if self.root is None:
            print("Tree is not built yet.")
            return
        queue = deque()
        queue.append((self.root, 0))
        while queue:
            node, depth = queue.popleft()
            if node.is_leaf_node():
                continue
            feat_name = self.feature_names[node.feature] if self.feature_names else f"feature_{node.feature}"
            print(f"Depth {depth}: Split on '{feat_name}' at threshold {node.threshold}")
            queue.append((node.left, depth + 1))
            queue.append((node.right, depth + 1))

    def _best_split(self, x, y, feat_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None
        for feat_idx in feat_idxs:
            x_column = x[:, feat_idx]
            thresholds = np.unique(x_column)
            for thr in thresholds:
                gain = self._information_gain(y, x_column, thr)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr
        return split_idx, split_threshold

    def _information_gain(self, y, x_column, threshold):
        parent_entropy = self._entropy(y)
        left_idxs, right_idxs = self._split(x_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r
        information_gain = parent_entropy - child_entropy
        return information_gain

    def _split(self, x_column, split_thresh):
        left_idxs = np.argwhere(x_column <= split_thresh).flatten()
        right_idxs = np.argwhere(x_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        ps = ps[ps > 0]
        return -np.sum(ps * np.log(ps))

    def _most_common_label(self, y):
        vals, counts = np.unique(y, return_counts=True)
        return vals[np.argmax(counts)]

    def predict(self, x):
        return np.array([self._traverse_tree(a, self.root) for a in x])

    def _traverse_tree(self, a, node):
        if node.is_leaf_node():
            return node.value
        if a[node.feature] <= node.threshold:
            return self._traverse_tree(a, node.left)
        return self._traverse_tree(a, node.right)

In [4]:
from google.colab import files
uploaded = files.upload()
import pandas as pd
import numpy as np
df = pd.read_csv('adult.csv')
df.rename(columns={"income":"label"}, inplace=True)
df = df.replace('?', np.nan).dropna()

X = df.drop("label", axis=1)
y = df["label"]
X = pd.get_dummies(X)
classes = {label : i for i, label in enumerate(np.unique(y))}
y = np.array([classes[label] for label in y])
feature_names = X.columns.tolist()
X = X.values

def train_test_split(X, y, test_size=0.3, random_state=1234):
    np.random.seed(random_state)
    idxs = np.arange(len(X))
    np.random.shuffle(idxs)
    split = int(len(X) * (1 - test_size))
    train_idx, test_idx = idxs[:split], idxs[split:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Saving adult.csv to adult (1).csv
Train shape: (31655, 104)
Test shape: (13567, 104)


In [5]:
clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train, feature_names=feature_names)
clf.print_splits_level_order()

Depth 0: Split on 'marital-status_Married-civ-spouse' at threshold False
Depth 1: Split on 'capital-gain' at threshold 6849
Depth 1: Split on 'educational-num' at threshold 12
Depth 2: Split on 'educational-num' at threshold 12
Depth 2: Split on 'capital-gain' at threshold 7978
Depth 2: Split on 'capital-gain' at threshold 5013
Depth 2: Split on 'capital-gain' at threshold 5013
Depth 3: Split on 'age' at threshold 26
Depth 3: Split on 'age' at threshold 27
Depth 3: Split on 'capital-gain' at threshold 7430
Depth 3: Split on 'age' at threshold 19
Depth 3: Split on 'educational-num' at threshold 8
Depth 3: Split on 'age' at threshold 63
Depth 3: Split on 'capital-loss' at threshold 1740
Depth 3: Split on 'age' at threshold 62
Depth 4: Split on 'age' at threshold 21
Depth 4: Split on 'capital-loss' at threshold 2206
Depth 4: Split on 'hours-per-week' at threshold 52
Depth 4: Split on 'hours-per-week' at threshold 43
Depth 4: Split on 'hours-per-week' at threshold 45
Depth 4: Split on 'edu

In [6]:
predictions = clf.predict(X_test)
def to_cpu(array):
    try:
        return array.get()
    except AttributeError:
        return array
def accuracy(y_true, y_pred):
    return np_cpu.sum(y_true == y_pred) / len(y_true)
acc = accuracy(to_cpu(y_test), to_cpu(predictions))
print(f"\n Accuracy:{acc:.2f}")


 Accuracy:0.85


In [7]:
def confusion_matrix(y_true, y_pred):
    tp = fp = tn = fn = 0
    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            tp += 1
        elif true == 0 and pred == 0:
            tn += 1
        elif true == 0 and pred == 1:
            fp += 1
        elif true == 1 and pred == 0:
            fn += 1
    return [[tn, fp], [fn, tp]]
c = confusion_matrix(to_cpu(y_test), to_cpu(predictions))
print("\nConfusion Matrix:")
for row in c:
    print(row)


Confusion Matrix:
[9707, 525]
[1454, 1881]


In [8]:
def k_fold_cross_validation(X, y, k=10, random_seed=1234):
    np.random.seed(random_seed)
    all_indexes = np.arange(len(X))
    np.random.shuffle(all_indexes)
    fold_size = len(X) // k
    accuracies = []
    for i in range(k):
        start = i * fold_size
        if i == k - 1:
            end = len(X)
        else:
            end = (i + 1) * fold_size
        test_indexes = all_indexes[start:end]
        train_indexes = np.concatenate([all_indexes[:start], all_indexes[end:]])

        X_train, y_train = X[train_indexes], y[train_indexes]
        X_test, y_test = X[test_indexes], y[test_indexes]

        tree = DecisionTree(max_depth=10)
        tree.fit(X_train, y_train, feature_names=feature_names)
        predictions = tree.predict(X_test)
        correct = np.sum(predictions == y_test)
        acc = correct / len(y_test)
        print(f"Fold {i + 1} acc: {acc: }")
        accuracies.append(acc)
    avg_acc = sum(accuracies) / k
    print(f"\nAvg acc on {k} folds: {avg_acc:}")
k_fold_cross_validation(X, y, k=10)

Fold 1 acc:  0.8586908447589562
Fold 2 acc:  0.8560371517027864
Fold 3 acc:  0.8520566121185317
Fold 4 acc:  0.8536045997346307
Fold 5 acc:  0.8520566121185317
Fold 6 acc:  0.8558160106147722
Fold 7 acc:  0.8624502432551968
Fold 8 acc:  0.8564794338788146
Fold 9 acc:  0.8547103051747015
Fold 10 acc:  0.843501326259947

Avg acc on 10 folds: 0.854540313961687


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import numpy as np
def to_cpu(x):
    try:
        return x.get()
    except:
        return np.array(x)
class MyNet(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 32)
        self.layer2 = nn.Linear(32, 16)
        self.layer3 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

In [11]:
def k_fold_hybrid(X, y, tree_clf, k=10, random_seed=1234):
    np.random.seed(random_seed)
    all_indexes = np.arange(len(X))
    np.random.shuffle(all_indexes)
    fold_size = len(X) // k
    accuracies = []

    for i in range(k):
        start = i * fold_size
        end = len(X) if i == k-1 else (i+1)*fold_size
        test_idx = all_indexes[start:end]
        train_idx = np.concatenate([all_indexes[:start], all_indexes[end:]])

        X_train_fold, y_train_fold = X[train_idx], y[train_idx]
        X_test_fold, y_test_fold = X[test_idx], y[test_idx]

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_test_scaled = scaler.transform(X_test_fold)

        tree_train_pred = to_cpu(tree_clf.predict(X_train_fold)).reshape(-1,1)
        tree_test_pred = to_cpu(tree_clf.predict(X_test_fold)).reshape(-1,1)

        X_train_hybrid = np.hstack((X_train_scaled, tree_train_pred))
        X_test_hybrid = np.hstack((X_test_scaled, tree_test_pred))
        X_train_tensor = torch.tensor(X_train_hybrid, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train_fold, dtype=torch.float32).view(-1,1)
        X_test_tensor = torch.tensor(X_test_hybrid, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test_fold, dtype=torch.float32).view(-1,1)

        net = MyNet(X_train_tensor.shape[1])
        loss_function = nn.BCELoss()
        optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
        batch_size = 64
        epochs = 30

        for epoch in range(epochs):
            idxs = np.arange(len(X_train_tensor))
            np.random.shuffle(idxs)
            for start_b in range(0, len(X_train_tensor), batch_size):
                end_b = start_b + batch_size
                batch_idx = idxs[start_b:end_b]
                batch_X = X_train_tensor[batch_idx]
                batch_y = y_train_tensor[batch_idx]

                outputs = net(batch_X)
                loss = loss_function(outputs, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        with torch.no_grad():
            pred = net(X_test_tensor)
            pred_label = (pred > 0.5).float()
            acc_fold = (pred_label == y_test_tensor).sum().item() / len(y_test_tensor)
            print(f"Fold {i+1} Hybrid Accuracy: {acc_fold:.4f}")
            accuracies.append(acc_fold)
        avg_acc = sum(accuracies)/k
    print(f"\nAverage Hybrid Accuracy: {avg_acc:.4f}")
    print(f"Decision Tree Accuracy: {acc:.4f}")
    print("Hence, we can see that the hybrid model performs better than the decision tree model.")
k_fold_hybrid(X_train, y_train, clf, k=10)

Fold 1 Hybrid Accuracy: 0.8591
Fold 2 Hybrid Accuracy: 0.8657
Fold 3 Hybrid Accuracy: 0.8597
Fold 4 Hybrid Accuracy: 0.8629
Fold 5 Hybrid Accuracy: 0.8695
Fold 6 Hybrid Accuracy: 0.8667
Fold 7 Hybrid Accuracy: 0.8581
Fold 8 Hybrid Accuracy: 0.8613
Fold 9 Hybrid Accuracy: 0.8638
Fold 10 Hybrid Accuracy: 0.8666

Average Hybrid Accuracy: 0.8633
Decision Tree Accuracy: 0.8541
Hence, we can see that the hybrid model performs better than the decision tree model.
