In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature] <= threshold``; samples that
    satisfy the test go to the ``'left'`` child, all others to ``'right'``.
    The fitted tree is stored in ``self.tree`` as nested dicts:

    * leaf: ``{'type': 'leaf', 'class': label}``
    * node: ``{'type': 'node', 'feature': int, 'threshold': value,
      'left': subtree, 'right': subtree}``

    Args:
        max_depth: Maximum depth of the tree; ``None`` means unlimited.
        min_samples_split: A node holding fewer samples than this is
            turned into a leaf instead of being split.
    """

    # Gains at or below this value are treated as "no improvement".
    # Information gain is a difference of floating-point sums, so a split
    # that separates nothing can still report a tiny non-zero value.
    _MIN_GAIN = 1e-12

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` has entropy ``0.0``.
        """
        _, counts = np.unique(y, return_counts=True)
        if counts.size == 0:
            return 0.0
        probabilities = counts / counts.sum()
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and the logarithm needs no smoothing term.
        # Using log2(1/p) keeps each summand non-negative, so a pure node
        # yields exactly 0.0.
        return float(np.sum(probabilities * np.log2(1.0 / probabilities)))

    def info_gain(self, y, splits):
        """Return the entropy reduction from partitioning ``y`` into ``splits``.

        Args:
            y: Labels of the parent node.
            splits: Iterable of label arrays, one per child node.

        Returns:
            Parent entropy minus the size-weighted mean of the child
            entropies; ``0.0`` when ``y`` is empty.
        """
        total_len = len(y)
        if total_len == 0:
            return 0.0
        weighted_entropy = sum(
            (len(subset) / total_len) * self.entropy(subset) for subset in splits
        )
        return self.entropy(y) - weighted_entropy

    def best_split(self, X, y):
        """Search every feature and observed value for the best threshold.

        Args:
            X: 2-D array of shape ``(num_samples, num_features)``.
            y: 1-D label array aligned with the rows of ``X``.

        Returns:
            ``(feature, threshold, info_gain)`` for the split with the highest
            gain (first one found wins ties), or ``(None, None, -1)`` when no
            threshold separates the samples into two non-empty groups.
        """
        best_feature, best_threshold, best_info_gain = None, None, -1
        num_samples, num_features = X.shape

        for feature in range(num_features):
            values = X[:, feature]

            for threshold in np.unique(values):
                left_mask = values <= threshold
                num_left = np.count_nonzero(left_mask)
                # A split that leaves one side empty carries no information.
                if num_left == 0 or num_left == num_samples:
                    continue

                # The right side is the complement of the left so that rows
                # failing the `<=` test (including NaN) are routed exactly
                # the way predict_single routes them.
                gain = self.info_gain(y, [y[left_mask], y[~left_mask]])

                if gain > best_info_gain:
                    best_info_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold, best_info_gain

    def _make_leaf(self, y):
        """Return a leaf node predicting the most frequent label in ``y``."""
        return {'type': 'leaf', 'class': Counter(y).most_common(1)[0][0]}

    def build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Args:
            X: 2-D NumPy array of shape ``(num_samples, num_features)``.
            y: 1-D NumPy label array aligned with the rows of ``X``.
            depth: Depth of the node being built (the root is 0).

        Returns:
            A nested dict in the format described on the class.

        Raises:
            ValueError: If there are no samples to build from.
        """
        num_samples = X.shape[0]
        if num_samples == 0:
            raise ValueError("cannot build a decision tree from zero samples")

        is_pure = len(np.unique(y)) == 1
        too_small = num_samples < self.min_samples_split
        too_deep = self.max_depth is not None and depth >= self.max_depth
        if is_pure or too_small or too_deep:
            return self._make_leaf(y)

        feature, threshold, info_gain = self.best_split(X, y)

        # Stop when no split exists or the best one does not reduce entropy.
        if feature is None or info_gain <= self._MIN_GAIN:
            return self._make_leaf(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        return {
            'type': 'node',
            'feature': feature,
            'threshold': threshold,
            'left': self.build_tree(X[left_mask], y[left_mask], depth + 1),
            'right': self.build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def fit(self, X, y):
        """Fit the tree to training data and store it in ``self.tree``.

        Args:
            X: Array-like of shape ``(num_samples, num_features)``.
            y: Array-like of shape ``(num_samples,)``.

        Returns:
            This estimator, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have different lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on zero samples")

        self.tree = self.build_tree(X, y)
        return self

    def predict_single(self, x, tree):
        """Return the class predicted for one sample ``x`` by ``tree``.

        Args:
            x: Indexable sequence of feature values for one sample.
            tree: Root of a (sub)tree in the nested-dict format.
        """
        node = tree
        while node['type'] != 'leaf':
            if x[node['feature']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']

    def predict(self, X):
        """Return an array with the predicted class of every row in ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self.predict_single(x, self.tree) for x in np.asarray(X)])

if __name__ == "__main__":
    # Read the salary table and separate the target column from the features.
    df = pd.read_csv('Salaries.csv')
    y = df['salary_more_than_100k']
    X = df.drop(columns='salary_more_than_100k')

    # One encoder per categorical column; each is fitted on the full column
    # and its integer codes are appended as a new `<name>_n` column, after
    # which the original text columns are removed.
    le_company, le_job, le_degree = LabelEncoder(), LabelEncoder(), LabelEncoder()
    X = X.assign(
        company_n=le_company.fit_transform(X['company']),
        job_n=le_job.fit_transform(X['job']),
        degree_n=le_degree.fit_transform(X['degree']),
    ).drop(columns=['company', 'job', 'degree'])

    # Hold out 20% of the rows for evaluation; the seed is fixed.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # DecisionTree is given plain NumPy arrays rather than pandas objects.
    x_train_np, y_train_np = x_train.to_numpy(), y_train.to_numpy()
    x_test_np, y_test_np = x_test.to_numpy(), y_test.to_numpy()

    # Fit a depth-limited tree on the training split.
    dt = DecisionTree(max_depth=5)
    dt.fit(x_train_np, y_train_np)

    # Score the held-out rows: accuracy is the fraction of exact matches.
    predictions = dt.predict(x_test_np)
    accuracy = (predictions == y_test_np).mean()
    print("Predictions:", predictions)
    print(f"Accuracy: {accuracy:.4f}")

Predictions: [0 0 0 1]
Accuracy: 0.7500
