<a href="https://colab.research.google.com/github/ManjotSran/Binary-Classification-ML-Project/blob/main/Copy_of_HACKANONS_COLAB_25GB_RAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CART Decision-Tree Classifier on the UCI Adult Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the dataset.
# The raw file separates fields with ", " (comma + space). Without
# skipinitialspace=True every string cell keeps a leading blank, so the
# missing-value marker is read as ' ?' and the replace below never matches.
filename = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(filename, header=None, skipinitialspace=True)

# Replace '?' with NaN and drop rows with missing values
df = df.replace('?', np.nan).dropna()

# Encode every non-numeric column as integer codes.
# is_numeric_dtype is used instead of `dtype == object` so that columns
# stored with a dedicated string dtype are encoded as well.
label_encoders = {}
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        label_encoders[column] = LabelEncoder()
        df[column] = label_encoders[column].fit_transform(df[column])

# Split the data into features and target label (column 14 is the target)
X = df.drop(14, axis=1).to_numpy()
y = df[14].to_numpy()

# Split into training and testing sets BEFORE scaling so that the scaler's
# mean/variance are learned from the training rows only (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Normalize the features using statistics of the training split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X = scaler.transform(X)  # keep the full matrix available in scaled form

# Combine training features and target for the model (label is the last column)
training_data = np.column_stack((X_train, y_train))

# Custom CART (Classification and Regression Tree) implementation:
# split-quality measure, best-split search, tree node, and the classifier class.
def calculate_measure_of_goodness(y, left_y, right_y, num_classes):
    """Score a candidate split by how differently it distributes the classes.

    Computes ``2 * P_L * P_R * sum_j |P(j | left) - P(j | right)|`` where
    ``P_L`` / ``P_R`` are the fractions of samples sent to each side and the
    sum runs over class ids ``0 .. num_classes - 1``.

    Args:
        y: Labels of all samples at the node being split.
        left_y: Labels of the samples sent to the left child.
        right_y: Labels of the samples sent to the right child.
        num_classes: Number of class ids to sum over.

    Returns:
        The goodness value; larger means a better split.
    """

    def class_share(labels, class_id):
        # Fraction of `labels` equal to `class_id`; an empty side contributes 0.
        return np.sum(labels == class_id) / len(labels) if len(labels) > 0 else 0

    left_fraction = len(left_y) / len(y)
    right_fraction = 1 - left_fraction

    distribution_gap = 0
    for class_id in range(num_classes):
        share_left = class_share(left_y, class_id)
        share_right = class_share(right_y, class_id)
        distribution_gap += abs(share_left - share_right)

    return 2 * left_fraction * right_fraction * distribution_gap

def calculate_best_split(dataset, num_features, num_classes):
    """Search every feature/threshold pair for the highest-goodness split.

    Each distinct value of each of the first ``num_features`` columns is tried
    as a threshold. Candidates that leave one side empty are ignored.

    Args:
        dataset: 2-D array whose last column holds the class label.
        num_features: Number of leading columns to consider as features.
        num_classes: Number of classes, forwarded to the goodness measure.

    Returns:
        A dict with keys ``feature_index``, ``threshold``, ``left``, ``right``
        and ``gain`` describing the best split, or an empty dict when no
        candidate produced two non-empty sides.
    """
    winner = {}
    top_score = float("-inf")

    for column in range(num_features):
        for candidate in np.unique(dataset[:, column]):
            left_rows, right_rows = split_dataset(dataset, column, candidate)

            # A split that sends everything to one side is not a split.
            if len(left_rows) == 0 or len(right_rows) == 0:
                continue

            score = calculate_measure_of_goodness(
                dataset[:, -1], left_rows[:, -1], right_rows[:, -1], num_classes
            )
            # Strict '>' keeps the first of several equally good candidates.
            if score > top_score:
                top_score = score
                winner = {
                    "feature_index": column,
                    "threshold": candidate,
                    "left": left_rows,
                    "right": right_rows,
                    "gain": score,
                }

    return winner

def split_dataset(dataset, feature_index, threshold):
    """Partition the rows of ``dataset`` on a single feature.

    Args:
        dataset: 2-D array-like whose rows are samples.
        feature_index: Index of the column compared against ``threshold``.
        threshold: Rows whose value is ``<= threshold`` go left, rows whose
            value is ``> threshold`` go right.

    Returns:
        A ``(left, right)`` tuple of arrays. Both are always 2-D with the
        same number of columns as ``dataset`` -- including when a side is
        empty -- so callers can safely slice columns from either result.
    """
    dataset = np.asarray(dataset)
    column = dataset[:, feature_index]
    # Two explicit masks (instead of `~mask`) so that a NaN feature value,
    # for which both comparisons are False, lands in neither side.
    left = dataset[column <= threshold]
    right = dataset[column > threshold]
    return left, right

class TreeNode:
    """One node of the decision tree.

    A node built with ``value`` set acts as a leaf holding a predicted label;
    otherwise it describes a split (``feature_index`` / ``threshold``) with
    ``left`` and ``right`` children and the ``gain`` achieved by that split.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, gain=None):
        # Split rule: compare sample[feature_index] against threshold.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes reached when the comparison is <= (left) or > (right).
        self.left, self.right = left, right
        # Leaf prediction and the split-quality score of this node.
        self.value, self.gain = value, gain

class CART:
    """Binary decision-tree classifier grown with a custom split measure.

    Training data is passed as one 2-D array whose last column holds the
    class label and whose remaining columns are the features.
    """

    def __init__(self, max_depth=10, min_size=2):
        """Create an unfitted tree.

        Args:
            max_depth: Depth at which a node is always turned into a leaf.
            min_size: Minimum number of rows each child of a split must have.
        """
        self.root = None
        self.max_depth = max_depth
        self.min_size = min_size

    def build_tree(self, dataset, current_depth=0, num_classes=None):
        """Recursively grow the subtree for ``dataset``.

        Args:
            dataset: 2-D array; last column is the label.
            current_depth: Depth of the node being built (root is 0).
            num_classes: Number of classes; derived from ``dataset`` if None.

        Returns:
            The ``TreeNode`` at the root of the grown subtree.
        """
        X, y = dataset[:, :-1], dataset[:, -1]
        num_features = X.shape[1]
        if num_classes is None:
            num_classes = len(np.unique(y))

        # Stopping conditions: pure node, depth limit reached, or too few
        # rows for any split to give both children at least min_size rows
        # (such a node would end up a leaf anyway, so skip the costly search).
        if (
            np.unique(y).size == 1
            or current_depth >= self.max_depth
            or len(y) < 2 * self.min_size
        ):
            return TreeNode(value=self.most_common_label(y))

        # Calculate the best split
        best_split = calculate_best_split(dataset, num_features, num_classes)

        # calculate_best_split returns an empty dict when no threshold puts
        # rows on both sides (e.g. identical feature rows with different
        # labels); treat that as "no useful split" instead of a KeyError.
        if (
            not best_split
            or best_split["gain"] == 0
            or len(best_split["left"]) < self.min_size
            or len(best_split["right"]) < self.min_size
        ):
            return TreeNode(value=self.most_common_label(y))

        # Build left and right subtrees
        left_subtree = self.build_tree(best_split["left"], current_depth + 1, num_classes)
        right_subtree = self.build_tree(best_split["right"], current_depth + 1, num_classes)

        # Create a tree node
        return TreeNode(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                        left=left_subtree, right=right_subtree, gain=best_split["gain"])

    def most_common_label(self, y):
        """Return the most frequent label in ``y`` (labels are cast to int)."""
        return np.bincount(np.asarray(y).astype(int)).argmax()

    def fit(self, dataset):
        """Grow the tree from ``dataset`` (2-D, label in the last column)."""
        dataset = np.asarray(dataset)
        num_classes = len(np.unique(dataset[:, -1]))
        self.root = self.build_tree(dataset, num_classes=num_classes)

    def predict(self, x, node=None):
        """Predict the label of a single sample.

        Args:
            x: Indexable feature vector of one sample.
            node: Node to start from; defaults to the tree root.

        Returns:
            The value stored in the leaf reached by ``x``.
        """
        if node is None:
            node = self.root

        # Walk down iteratively until a leaf (a node carrying a value).
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def predict_dataset(self, X):
        """Return a list with the predicted label of every row of ``X``."""
        return [self.predict(x) for x in X]

# Percentage of positions where prediction and ground truth agree
def accuracy_metric(actual, predicted):
    """Return classification accuracy as a percentage in the range 0-100.

    Args:
        actual: Indexable sequence of true labels.
        predicted: Indexable sequence of predicted labels; it is indexed at
            every position of ``actual``.

    Returns:
        ``100 * (number of matching positions) / len(actual)`` as a float.
    """
    total = len(actual)
    hits = sum(1 for position in range(total) if actual[position] == predicted[position])
    return hits / float(total) * 100.0

# Train the custom CART on the training split, then predict the held-out test split
model = CART(min_size=10, max_depth=5)
model.fit(training_data)
predictions = model.predict_dataset(X_test)

# Report test-set accuracy as a percentage with three decimals
accuracy = accuracy_metric(y_test, predictions)
print('Accuracy: %.3f%%' % (accuracy,))
