In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
class DecisionNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

In [3]:
def gini(y):
    m = len(y)
    return 1.0 - sum([(np.sum(y == c) / m) ** 2 for c in np.unique(y)])


In [4]:
def split(X, y, feature_index, threshold):
    left_idx = np.where(X[:, feature_index] <= threshold)
    right_idx = np.where(X[:, feature_index] > threshold)
    return X[left_idx], y[left_idx], X[right_idx], y[right_idx]

In [5]:
def best_split(X, y, num_thresholds=10):
    best_gini = float('inf')
    split_index, split_threshold = None, None
    for feature_index in range(X.shape[1]):
        thresholds = np.linspace(X[:, feature_index].min(), X[:, feature_index].max(), num_thresholds)
        for threshold in thresholds:
            # Left = feature_index <= threshold
            # Right = feature_index >= threshold
            X_left, y_left, X_right, y_right = split(X, y, feature_index, threshold)
            # If either the left or right split doesn't contain any data points, the loop skips the rest of the current iteration and moves on to the next threshold. 
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            w_left = len(y_left) / len(y)
            w_right = len(y_right) / len(y)
            gini_after = w_left * gini(y_left) + w_right * gini(y_right)
            
            if gini_after < best_gini:
                best_gini = gini_after
                split_index = feature_index
                split_threshold = threshold

    return split_index, split_threshold

In [6]:
def build_tree(X, y, max_depth=3):
    # Number of training classifiers
    num_labels = len(np.unique(y))

    # Check of unusable data
    if max_depth == 0 or num_labels == 1:
        # Max number of occurences of classifiers
        leaf_value = np.argmax(np.bincount(y))
        # Create leaf node
        return DecisionNode(value=leaf_value)
    
    # Find feature with highest gain
    feature_index, threshold = best_split(X, y)
    if feature_index is None:
        leaf_value = np.argmax(np.bincount(y))
        # Create leaf node
        return DecisionNode(value=leaf_value)
    
    X_left, y_left, X_right, y_right = split(X, y, feature_index, threshold)
    left_subtree = build_tree(X_left, y_left, max_depth - 1)
    right_subtree = build_tree(X_right, y_right, max_depth - 1)

    # Create decision node
    return DecisionNode(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)


In [7]:
def predict_tree(sample, tree):
    if tree.value is not None:
        return tree.value
    if sample[tree.feature_index] <= tree.threshold:
        return predict_tree(sample, tree.left)
    return predict_tree(sample, tree.right)

In [9]:
# def train_test_split(df, target_column_name, test_size=0.2):
#     shuffled_indices = np.random.permutation(len(df))
#     test_set_size = int(len(df) * test_size)
#     test_indices = shuffled_indices[:test_set_size]
#     train_indices = shuffled_indices[test_set_size:]
#     return df.iloc[train_indices], df.iloc[test_indices]

In [22]:
# Load data
df = pd.read_csv('preprocessed_age.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Name,Gender,Occupation,Birth year,Death year,Manner of death,Age of death,Associated Countries,Associated Country Life Expectancy
0,1,Q42,douglas adams,male,artist,1952,2001,natural causes,49,united kingdom,[81.3]
1,2,Q91,abraham lincoln,male,politician,1809,1865,homicide,56,united states,[78.5]
2,6,Q272,paul morand,male,artist,1888,1976,,88,france,[82.5]
3,7,Q296,claude monet,male,artist,1840,1926,natural causes,86,france,[82.5]
4,10,Q303,elvis presley,male,artist,1935,1977,natural causes,42,united states,[78.5]


In [23]:
df = df.drop(["Unnamed: 0", "Id", "Death year"], axis=1)

target_column_name = "Gender"
df = df.dropna(subset=[target_column_name])


In [24]:
le = LabelEncoder()
for column in df.columns:
    if (df[column].dtype != "int64"):
        df[column] = le.fit_transform(df[column])

In [25]:
X = df.drop(columns=[target_column_name]).values
y = df[target_column_name].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [26]:
tree = build_tree(X_train, y_train, max_depth=5)

predictions = [predict_tree(sample, tree) for sample in X_test]
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 89.48%


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score
def prf(actual, predictions):
    print("Precision:", precision_score(actual, predictions, average='macro') * 100)
    print("Recall:",recall_score(actual, predictions, average='macro') * 100)
    print("F1:",f1_score(actual, predictions, average='macro') * 100)

In [27]:
prf(y_test, predictions)


Precision: 15.85998798180101
Recall: 10.102661508476418
F1: 9.657295663497282


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
import matplotlib.pyplot as plt 