In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
class DecisionNode:
    """A single node of a binary decision tree.

    An internal node carries a split rule (``feature_index`` and
    ``threshold``) plus its ``left`` and ``right`` children; a leaf carries
    a ``value``. Every attribute defaults to ``None``.
    """

    def __init__(
        self,
        feature_index=None,
        threshold=None,
        left=None,
        right=None,
        value=None,
    ):
        # Split rule: which feature column is tested, and the cut-off used.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes on either side of the split.
        self.left, self.right = left, right
        # Payload of the node; ``predict_tree`` treats a non-None value as a leaf.
        self.value = value

In [1]:
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_c ** 2)``, where ``p_c`` is the fraction of
    entries of ``y`` that belong to class ``c``.

    Parameters
    ----------
    y : array-like
        Class labels. Lists and tuples are accepted and converted to a
        NumPy array before counting.

    Returns
    -------
    float
        ``0.0`` when all labels are identical or when ``y`` is empty;
        larger values mean the labels are spread over more classes.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # An empty set is treated as pure; the bare formula would report 1.0.
        return 0.0
    # Count every class in one pass instead of rescanning y once per class.
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / labels.size
    return 1.0 - np.sum(proportions ** 2)


In [2]:
def split(X, y, feature_index, threshold):
    """Partition samples and labels on one feature column at a threshold.

    Rows whose value in column ``feature_index`` is ``<= threshold`` form the
    left part; rows whose value is ``> threshold`` form the right part.

    Returns
    -------
    tuple
        ``(X_left, y_left, X_right, y_right)``.
    """
    column = X[:, feature_index]
    goes_left = column <= threshold
    # Built independently rather than by negating ``goes_left`` so that a row
    # satisfying neither comparison (e.g. NaN) lands in neither part.
    goes_right = column > threshold
    return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

In [3]:
def best_split(X, y, num_thresholds=10):
    """Find the (feature, threshold) pair with the lowest weighted Gini impurity.

    For every feature column, ``num_thresholds`` evenly spaced candidate
    thresholds between the column's minimum and maximum are evaluated.
    Samples with ``value <= threshold`` form the left group and samples with
    ``value > threshold`` the right group (the same rule ``split`` applies).
    Candidates that leave either group empty are ignored.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, indexed as ``X[:, feature_index]``.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    num_thresholds : int, optional
        Number of candidate thresholds tried per feature.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best candidate, or
        ``(None, None)`` when there are no samples or no candidate puts at
        least one sample on each side.
    """
    num_samples = len(y)
    if num_samples == 0:
        # Nothing to separate; min()/max() below would raise on empty columns.
        return None, None

    best_gini = float('inf')
    split_idx, split_threshold = None, None
    for feature_index in range(X.shape[1]):
        column = X[:, feature_index]
        low, high = column.min(), column.max()
        if low == high:
            # A constant column always leaves the right group empty.
            continue

        for threshold in np.linspace(low, high, num_thresholds):
            # Only the labels are scored, so mask y directly instead of
            # copying both halves of X for every candidate.
            y_left = y[column <= threshold]
            y_right = y[column > threshold]
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            w_left = len(y_left) / num_samples
            w_right = len(y_right) / num_samples
            gini_after = w_left * gini(y_left) + w_right * gini(y_right)

            # Strict '<' keeps the first candidate found among equal scores.
            if gini_after < best_gini:
                best_gini = gini_after
                split_idx = feature_index
                split_threshold = threshold

    return split_idx, split_threshold

In [4]:
def _majority_label(y):
    """Return the most frequent label in ``y``; the smallest label wins ties."""
    labels, counts = np.unique(y, return_counts=True)
    return labels[np.argmax(counts)]


def build_tree(X, y, max_depth=3):
    """Recursively grow a binary decision tree using Gini-based splits.

    Growth along a branch stops, and a leaf holding the majority label is
    created, when the depth budget is used up, when all labels are
    identical, or when ``best_split`` finds no usable split.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``. Any label type that
        ``np.unique`` can sort is accepted (not only non-negative integers).
    max_depth : int, optional
        Maximum number of splits along any root-to-leaf path. Values of zero
        or below produce a single leaf.

    Returns
    -------
    DecisionNode
        Root of the fitted (sub)tree.

    Raises
    ------
    ValueError
        If ``y`` is empty, since no majority label exists.
    """
    if len(y) == 0:
        raise ValueError("build_tree requires at least one sample")

    # '<= 0' rather than '== 0' so a negative budget cannot skip the stop.
    if max_depth <= 0 or len(np.unique(y)) == 1:
        return DecisionNode(value=_majority_label(y))

    feature_idx, threshold = best_split(X, y)
    if feature_idx is None:
        # No candidate separates the samples; fall back to a leaf.
        return DecisionNode(value=_majority_label(y))

    X_left, y_left, X_right, y_right = split(X, y, feature_idx, threshold)
    left_subtree = build_tree(X_left, y_left, max_depth - 1)
    right_subtree = build_tree(X_right, y_right, max_depth - 1)

    return DecisionNode(feature_index=feature_idx, threshold=threshold, left=left_subtree, right=right_subtree)


In [5]:
def predict_tree(sample, tree):
    """Return the prediction for one sample by walking the tree from ``tree``.

    At each node whose ``value`` is ``None`` the sample's entry at
    ``feature_index`` is compared with ``threshold``: ``<=`` descends into
    ``left``, anything else into ``right``. The first node with a
    non-``None`` ``value`` ends the walk and that value is returned.
    """
    node = tree
    while node.value is None:
        goes_left = sample[node.feature_index] <= node.threshold
        node = node.left if goes_left else node.right
    return node.value

In [6]:
def train_test_split(df, target_column_name, test_size=0.2, random_state=None):
    """Randomly split the rows of a DataFrame into a train and a test subset.

    Note: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook; its signature and return value are
    different.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Both returned frames keep every column.
    target_column_name : str
        Not used by the split; accepted so existing calls keep working.
    test_size : float, optional
        Fraction of rows, between 0 and 1, assigned to the test subset. The
        test subset has ``int(len(df) * test_size)`` rows.
    random_state : int or None, optional
        Seed for a reproducible shuffle. With ``None`` the shuffle uses
        NumPy's global random state, as before.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``; a negative fraction would
        otherwise silently slice from the wrong end of the shuffled index.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    num_rows = len(df)
    if random_state is None:
        shuffled_indices = np.random.permutation(num_rows)
    else:
        shuffled_indices = np.random.default_rng(random_state).permutation(num_rows)

    test_set_size = int(num_rows * test_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return df.iloc[train_indices], df.iloc[test_indices]

In [27]:
# Read the preprocessed dataset (path is relative to the working directory)
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer='preprocessed_age.csv')
df.head(n=5)

In [None]:
# Drop the columns that are excluded from the feature set. `df` is overwritten
# here, so on a second run of this cell the columns are already gone;
# errors="ignore" keeps the cell re-runnable instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0", "Id", "Death year"], errors="ignore")

target_column_name = "Age of death"
# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=[target_column_name])

# Feature matrix and target vector as they are at this point, i.e. before
# the label-encoding cell below modifies `df`.
X = df.drop(columns=[target_column_name]).values
y = df[target_column_name].values

In [None]:
# Label-encode every column whose dtype is not int64 (this also covers float
# columns). One fitted encoder is kept per column in `encoders`, so the
# integer codes can be mapped back with
# `encoders[column].inverse_transform(...)`; a single shared encoder only
# remembers the classes of the last column it was fitted on.
encoders = {}
le = LabelEncoder()
for column in df.columns:
    if df[column].dtype != "int64":
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

In [7]:
# Seed NumPy's global random state so the shuffle inside train_test_split,
# and therefore the reported accuracy, is the same on every full re-run.
SEED = 42
np.random.seed(SEED)
X_train_df, X_test_df = train_test_split(df, target_column_name)

# Despite the X_ prefix, both frames still contain the target column;
# separate features and labels here.
X_train = X_train_df.drop(columns=[target_column_name]).to_numpy()
y_train = X_train_df[target_column_name].to_numpy()
X_test = X_test_df.drop(columns=[target_column_name]).to_numpy()
y_test = X_test_df[target_column_name].to_numpy()

# The split must partition the rows: none lost, none duplicated.
assert len(X_train) + len(X_test) == len(df)

NameError: name 'df' is not defined

In [None]:
# Fit a depth-limited tree on the training split.
tree = build_tree(X_train, y_train, max_depth=5)

# Predict every held-out row, then score by the fraction of exact label matches.
predictions = [predict_tree(row, tree) for row in X_test]
accuracy = np.equal(predictions, y_test).mean()
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 3.60%
