## KD Tree

In [1468]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode

In [1469]:
data = pd.read_csv('/Users/hanifemamgholizadeh/Desktop/patter_recognition/data/tree_classification_dataset.csv')

In [1470]:
data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,target
0,-3.402839,0.179845,1.432424,0.774566,1
1,-1.902995,-1.241501,1.956614,0.512447,1
2,-1.023094,1.270126,0.782203,-0.785725,0
3,-0.331077,-0.06959,-0.258279,-0.339054,0
4,-3.377452,0.816699,-3.009166,-1.553258,2


In [1471]:
y = data['target']
X = data.drop(columns=['target'])
len(y.unique())

3

In [1472]:
class TreeNode:
    def __init__(self, left_child=None, right_child=None, parent=None, value=None, feature=None, leaf_list=None):
        self.left_child = left_child
        self.right_child = right_child
        self.parent = parent
        self.value = value
        self.feature = feature
        self.leaf_list = leaf_list if leaf_list is not None else []
        self.is_right_child = False
        self.is_left_child = False

    def is_leaf(self):
        return self.left_child is None and self.right_child is None

In [1473]:
def calculate_distance(x1, x2):
    return np.linalg.norm(np.array(x1) - np.array(x2))

In [1474]:
def calculate_distance_to_wall(x, node):
    feature = node.feature
    wall_value = node.value
    return abs(x[feature] - wall_value)

In [1475]:
def create_KD_tree(X, y, parent_node=None, direction=None):
    if len(X) <= 4 or len(np.unique(y)) == 1:
        # Create a leaf node
        return TreeNode(parent=parent_node, leaf_list=list(zip(X.values, y.values)))

    # Choose a random feature and split on its median
    feature = np.random.choice(X.columns)
    median_value = X[feature].median()

    # Split the data
    left_indices = X[X[feature] <= median_value].index
    right_indices = X[X[feature] > median_value].index

    # Create current node
    node = TreeNode(parent=parent_node, value=median_value, feature=feature)

    if direction == 'left':
        node.is_left_child = True
    elif direction == 'right':
        node.is_right_child = True

    # Recursive construction
    node.left_child = create_KD_tree(X.loc[left_indices], y.loc[left_indices], parent_node=node, direction='left')
    node.right_child = create_KD_tree(X.loc[right_indices], y.loc[right_indices], parent_node=node, direction='right')

    return node


In [1476]:
def depth_first_search(tree_node: TreeNode, x, knn_distance, knn=[]):
    if tree_node is None:
        return None
    
    if tree_node.is_leaf():
        leaf_list = tree_node.get_leaf_list()
        for item in leaf_list:
            calculate_distances = [calculate_distance(x, item) for item in leaf_list]
        sorted_indices = np.argsort(calculate_distances)
        knn.append(sorted_indices)
        return tree_node.get_leaf_list()
    
    feature = tree_node.get_feature()
    value = tree_node.get_value()
    
    if calculate_distance_to_wall(x[feature], value) < knn_distance:
        return depth_first_search(tree_node, x, knn_distance, knn)
    else:
        return depth_first_search(tree_node.parent, x)

In [1477]:
def depth_first_search(node, x, knn_distance, knn_list):
    if node is None:
        return

    if node.is_leaf():
        for point, label in node.leaf_list:
            dist = calculate_distance(x, point)
            knn_list.append((dist, label))
        return

    feature = node.feature
    value = node.value

    # Choose which subtree to go first
    if x[feature] <= value:
        depth_first_search(node.left_child, x, knn_distance, knn_list)
        if calculate_distance_to_wall(x, node) < knn_distance:
            depth_first_search(node.right_child, x, knn_distance, knn_list)
    else:
        depth_first_search(node.right_child, x, knn_distance, knn_list)
        if calculate_distance_to_wall(x, node) < knn_distance:
            depth_first_search(node.left_child, x, knn_distance, knn_list)

# Query the KD Tre

In [1478]:
def query_KD_tree(tree, x, k=1):
    knn_list = []

    # Initial DFS to fill neighbors
    depth_first_search(tree, x, knn_distance=np.inf, knn_list=knn_list)

    # Sort and return top-k neighbors
    knn_list.sort(key=lambda tup: tup[0])
    return knn_list[:k]

# Example usage
kd_tree = create_KD_tree(X, y)

# Test on one point
test_point = X.iloc[10]
neighbors = query_KD_tree(kd_tree, test_point, k=3)
print("Nearest Neighbors:", neighbors)
print("Test Point prediction:", mode(np.array([preds[1] for preds in neighbors])).mode[0])
print("True Label:", y.iloc[0])

Nearest Neighbors: [(0.0, 1), (0.7395099856826924, 1), (0.7622639206298912, 1)]
Test Point prediction: 1
True Label: 1


  print("Test Point prediction:", mode(np.array([preds[1] for preds in neighbors])).mode[0])


In [1479]:
from sklearn.metrics import accuracy_score
def evaluate_model(tree, X, y, k=3):
    predictions = []
    for i in range(len(X)):
        neighbors = query_KD_tree(tree, X.iloc[i], k)
        predicted_label = mode(np.array([preds[1] for preds in neighbors])).mode[0]
        predictions.append(predicted_label)
    
    accuracy = accuracy_score(y, predictions)
    print("Accuracy:", accuracy)
    return accuracy

In [1480]:
evaluate_model(kd_tree, X, y, k=3)

  predicted_label = mode(np.array([preds[1] for preds in neighbors])).mode[0]


Accuracy: 0.895


0.895

## Decision Tree

In [1481]:
df_dtree = pd.read_csv('/Users/hanifemamgholizadeh/Desktop/patter_recognition/data/decision_tree_dataset.csv')

In [1482]:
df_dtree.head()

Unnamed: 0,age,income,student,credit_rating,label
0,56,23392,1,1,0
1,46,45535,1,1,0
2,32,93603,0,1,0
3,25,67256,0,1,0
4,38,104135,1,0,1


In [1483]:
from sklearn.model_selection import train_test_split
X_dtree = df_dtree.drop(columns=['label'])
y_dtree = df_dtree['label']
X_train, X_test, y_train, y_test = train_test_split(X_dtree, y_dtree, test_size=0.2, random_state=42)

In [1484]:
class DTreeNode:
    def __init__(self, left_child=None, right_child=None, value=None, feature=None,
                 leaf_list=False, included_sample_size=0, impurity=None, sample_list=[], predicted_class=None):
        self.left_child = left_child
        self.right_child = right_child
        self.value = value
        self.feature = feature
        self.leaf_list = leaf_list
        self.included_sample_size = included_sample_size
        self.impurity = impurity
        self.samples_list = sample_list
        self.predicted_class = predicted_class  


In [1485]:
def columns_type(X=X_train):
    columns_types = dict()

    for column in X.columns:
        if X[column].dtype == 'object':
            columns_types[column] = 'categorical'
        else:
            if len(X[column].unique()) >= 10:
                columns_types[column] = 'numerical'
            else:
                columns_types[column] = 'categorical'
    return columns_types

In [1486]:
columns_types = columns_type()

In [1487]:
def entropy(labels):
    probs = labels.value_counts(normalize=True)
    return -np.sum([p * np.log2(p) for p in probs if p > 0])


In [1488]:

def decision_tree(X, y, parent_node=None, list_of_indices=None, threshold=0.01):

    columns_types = columns_type(X)

    # Use consistent positional indices
    if list_of_indices is None:
        list_of_indices = list(range(len(X)))

    # Subset the labels
    labels = y.iloc[list_of_indices]

    # Stopping conditions: pure or low impurity
    p_vals = labels.value_counts(normalize=True)
    current_impurity = -np.sum([p * np.log2(p) for p in p_vals if p > 0])

    if current_impurity <= threshold or len(set(labels)) == 1 or len(list_of_indices) <= 1:
        return DTreeNode(
            included_sample_size=len(list_of_indices),
            sample_list=list_of_indices,
            impurity=current_impurity,
            predicted_class=labels.mode()[0]
        )

    # Create current node
    current_node = DTreeNode(
        included_sample_size=len(list_of_indices),
        sample_list=list_of_indices,
        impurity=current_impurity
    )

    min_impurity = float('inf')
    feature_to_split = None
    value_to_split = None
    best_left_indices = None
    best_right_indices = None

    for column in X.columns:
        column_type = columns_types[column]
        data = X[column].iloc[list_of_indices]

        if column_type == 'numerical':
            sorted_indices = data.sort_values().index.tolist()

            for i in range(1, len(sorted_indices)):
                left_indices = sorted_indices[:i]
                right_indices = sorted_indices[i:]

                if not left_indices or not right_indices:
                    continue

                y_left = y.iloc[left_indices]
                y_right = y.iloc[right_indices]

                entropy_left = entropy(y_left)
                entropy_right = entropy(y_right)

                weighted_entropy = (
                    len(left_indices) / len(list_of_indices) * entropy_left +
                    len(right_indices) / len(list_of_indices) * entropy_right
                )

                if weighted_entropy < min_impurity:
                    min_impurity = weighted_entropy
                    feature_to_split = column
                    value_to_split = (data.iloc[i - 1] + data.iloc[i]) / 2
                    best_left_indices = left_indices
                    best_right_indices = right_indices

        else:  # categorical
            unique_values = data.unique()
            for val in unique_values:
                left_indices = data[data == val].index.tolist()
                right_indices = data[data != val].index.tolist()

                if not left_indices or not right_indices:
                    continue

                y_left = y.iloc[left_indices]
                y_right = y.iloc[right_indices]

                entropy_left = entropy(y_left)
                entropy_right = entropy(y_right)

                weighted_entropy = (
                    len(left_indices) / len(list_of_indices) * entropy_left +
                    len(right_indices) / len(list_of_indices) * entropy_right
                )

                if weighted_entropy < min_impurity:
                    min_impurity = weighted_entropy
                    feature_to_split = column
                    value_to_split = val
                    best_left_indices = left_indices
                    best_right_indices = right_indices

    # Final safeguard: avoid invalid or redundant splits
    if (
        best_left_indices is None or best_right_indices is None or
        set(best_left_indices) == set(list_of_indices) or
        set(best_right_indices) == set(list_of_indices)
    ):
        current_node.predicted_class = labels.mode()[0]
        return current_node

    # Store best split
    current_node.feature = feature_to_split
    current_node.value = value_to_split

    # Recursively split
    current_node.left_child = decision_tree(X, y, current_node, best_left_indices, threshold)
    current_node.right_child = decision_tree(X, y, current_node, best_right_indices, threshold)

    return current_node


In [1489]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

tree = decision_tree(X_train, y_train)

In [1490]:
def predict_one(sample, node):
    while node.left_child and node.right_child:
        val = sample[node.feature]

        if columns_types[node.feature] == 'numerical':
            if val <= node.value:
                node = node.left_child
            else:
                node = node.right_child
        else:  # categorical
            if val == node.value:
                node = node.left_child
            else:
                node = node.right_child

    return node.predicted_class


In [1491]:
def predict_all(X_test, root_node):
    return X_test.apply(lambda row: predict_one(row, root_node), axis=1)

In [1492]:
# Assume your full tree has been built like this
root = decision_tree(X_train, y_train)

# Predict on test set
y_pred = predict_all(X_test, root)

# Accuracy
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.95


In [1493]:
from graphviz import Digraph

def visualize_tree(node, dot=None, parent_name=None, edge_label=None, node_id=[0]):
    if dot is None:
        dot = Digraph()
        dot.attr('node', shape='box')

    current_id = str(node_id[0])
    node_id[0] += 1

    if node.left_child is None and node.right_child is None:
        # Leaf node
        label = f'Leaf\nSamples: {node.included_sample_size}\nClass: {getattr(node, "predicted_class", "?")}'
    else:
        label = f'{node.feature} ≤ {node.value}' if columns_types[node.feature] == 'numerical' else f'{node.feature} = {node.value}'
        label += f'\nSamples: {node.included_sample_size}\nImpurity: {round(node.impurity, 3)}'

    dot.node(current_id, label)

    if parent_name is not None:
        dot.edge(parent_name, current_id, label=edge_label)

    # Recurse for children
    if node.left_child is not None:
        visualize_tree(node.left_child, dot, current_id, 'True', node_id)
    if node.right_child is not None:
        visualize_tree(node.right_child, dot, current_id, 'False', node_id)

    return dot


In [1494]:
# Build the decision tree (assuming you've already done this)
# root = decision_tree(X_train, y_train)

# # Visualize it
# dot = visualize_tree(root)
# dot.render("my_tree", format="png", cleanup=False)  # Saves as my_tree.png
# dot.view()  # Opens the image in default viewer


## Random Forest

### Bagging

In [1495]:
rf_data = pd.read_csv('/Users/hanifemamgholizadeh/Desktop/patter_recognition/data/random_forest_synthetic_dataset.csv')

In [1496]:
rf_data.head()

Unnamed: 0,age,income,score,owns_house,has_loan,is_married,category_1,category_2,category_3,target
0,56,51905,0.936648,1,1,0,B,X,Low,1
1,69,31258,0.039186,0,0,0,A,X,Low,1
2,46,79176,0.417946,1,1,1,C,Y,High,1
3,32,47699,0.967581,0,0,0,A,X,Low,0
4,60,36395,0.547972,0,1,1,C,Y,Low,1


In [1497]:
len(rf_data)

200

In [1498]:
len(rf_data.columns)

10

In [1499]:
from sklearn.model_selection import train_test_split
X_rf = rf_data.drop(columns=['target'])
y_rf = rf_data['target']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)

In [1500]:
def bagging(X, sample_size=len(X)):
    return X.sample(n=sample_size, replace=True)

In [1501]:
import random
def train_rf(X, n_trees=10):
    trees = []
    for i in range(n_trees):
        
        sample = bagging(X).reset_index(drop=True)
        
        # Exclude 'target' when sampling features
        feature_columns = random.sample(
            list(sample.columns.difference(['target'])), 
            k=int(np.sqrt(len(sample.columns) - 1))
        )
        
        # Call decision tree with selected features
        tree = decision_tree(sample[feature_columns], sample['target'])
        trees.append(tree)
        
    return trees

In [1502]:
def predict_one(sample, node):
    columns_types = columns_type(X_test_rf)
    while node.left_child and node.right_child:
        val = sample[node.feature]

        if columns_types[node.feature] == 'numerical':
            if val <= node.value:
                node = node.left_child
            else:
                node = node.right_child
        else:  # categorical
            if val == node.value:
                node = node.left_child
            else:
                node = node.right_child

    return node.predicted_class


In [1503]:
def test_rf(trees, X_test):
    predictions = []
    for row in X_test.iterrows():
        
        votes = []
        for tree in trees:
            pred = predict_one(row[1], tree)
            votes.append(pred)
        final_prediction = mode(votes, keepdims=False).mode
        predictions.append(final_prediction)
    return predictions

In [1504]:
trees = train_rf(pd.concat([X_train_rf, y_train_rf], axis=1), n_trees=10)

In [1505]:
preds = test_rf(trees, X_test_rf)

In [1506]:
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test_rf, preds))

Accuracy: 0.625


## Regression Tree

In [1507]:
regression_data = pd.read_csv('/Users/hanifemamgholizadeh/Desktop/patter_recognition/data/synthetic_regression_data.csv')

In [1508]:
regression_data.head()

Unnamed: 0,feature_1,feature_2,target
0,3.745401,0.157146,9.561862
1,9.507143,3.182052,22.621832
2,7.319939,1.57178,19.402403
3,5.986585,2.542853,11.445345
4,1.560186,4.537832,-0.663557


In [1509]:
from sklearn.model_selection import train_test_split
X_reg = regression_data.drop(columns=['target'])
y_reg = regression_data['target']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

X_train_reg = X_train_reg.reset_index(drop=True)
y_train_reg = y_train_reg.reset_index(drop=True)



In [1510]:
def impurity_regression(y):
    if len(y) == 0:
        return 0
    mean_y = np.mean(y)
    return np.sum((y - mean_y) ** 2) / len(y)

In [None]:
def regression_decision_tree_GRBT(X, y, parent_node=None, list_of_indices=None, threshold=0.01, gbrt_active=False, current_depth=0):
    if list_of_indices is None:
        list_of_indices = list(range(len(X)))

    labels = y.iloc[list_of_indices]

    # Compute regression impurity (variance)
    current_impurity = impurity_regression(labels)

    if gbrt_active:
        if current_depth >= 5:  # Limit depth for GBRT
            return DTreeNode(
            included_sample_size=len(list_of_indices),
            sample_list=list_of_indices,
            impurity=current_impurity,
            predicted_class=labels.mean()
        )
        current_depth += 1
        
    columns_types = columns_type(X)  # Assume this function returns 'numerical' or 'categorical' per column

    
    # Stop if node is pure or impurity is below threshold
    if current_impurity < threshold or len(set(labels)) == 1:
        current_node = DTreeNode(
            included_sample_size=len(list_of_indices),
            sample_list=list_of_indices,
            impurity=current_impurity,
            predicted_class=labels.mean()
        )
        return current_node

    # Create current node
    current_node = DTreeNode(
        included_sample_size=len(list_of_indices),
        sample_list=list_of_indices,
        impurity=current_impurity
    )

    # Initialize split search
    min_impurity = float('inf')
    feature_to_split = None
    value_to_split = None
    best_left_indices = None
    best_right_indices = None

    for column in X.columns:
        column_type = columns_types[column]
        data = X[column].iloc[list_of_indices]

        if column_type == 'numerical':
            sorted_indices = data.sort_values().index.tolist()

            for i in range(1, len(sorted_indices)):
                left_indices = sorted_indices[:i]
                right_indices = sorted_indices[i:]

                if not left_indices or not right_indices:
                    continue

                y_left = y.iloc[left_indices]
                y_right = y.iloc[right_indices]

                impurity_left = impurity_regression(y_left)
                impurity_right = impurity_regression(y_right)

                weighted_impurity = (
                    len(left_indices) / len(list_of_indices) * impurity_left +
                    len(right_indices) / len(list_of_indices) * impurity_right
                )

                if weighted_impurity < min_impurity:
                    min_impurity = weighted_impurity
                    feature_to_split = column
                    value_to_split = (data.iloc[i - 1] + data.iloc[i]) / 2
                    best_left_indices = left_indices
                    best_right_indices = right_indices

        else:  # categorical
            unique_values = data.unique()
            for val in unique_values:
                left_indices = data[data == val].index.tolist()
                right_indices = data[data != val].index.tolist()

                if not left_indices or not right_indices:
                    continue

                y_left = y.iloc[left_indices]
                y_right = y.iloc[right_indices]

                impurity_left = impurity_regression(y_left)
                impurity_right = impurity_regression(y_right)

                weighted_impurity = (
                    len(left_indices) / len(list_of_indices) * impurity_left +
                    len(right_indices) / len(list_of_indices) * impurity_right
                )

                if weighted_impurity < min_impurity:
                    min_impurity = weighted_impurity
                    feature_to_split = column
                    value_to_split = val
                    best_left_indices = left_indices
                    best_right_indices = right_indices

    # Final safeguard: avoid bad splits
    if (
        best_left_indices is None or best_right_indices is None or
        set(best_left_indices) == set(list_of_indices) or
        set(best_right_indices) == set(list_of_indices)
    ):
        current_node.predicted_class = labels.mean()
        return current_node

    # Store best split
    current_node.feature = feature_to_split
    current_node.value = value_to_split

    # Recursively grow the tree
    if gbrt_active:
        current_node.left_child = regression_decision_tree_GRBT(X, y, current_node, best_left_indices, threshold, gbrt_active, current_depth)
        current_node.right_child = regression_decision_tree_GRBT(X, y, current_node, best_right_indices, threshold, gbrt_active, current_depth)
    else:
        current_node.left_child = regression_decision_tree_GRBT(X, y, current_node, best_left_indices, threshold)
        current_node.right_child = regression_decision_tree_GRBT(X, y, current_node, best_right_indices, threshold)

    return current_node


In [1512]:
tree = regression_decision_tree_GRBT(X_train_reg, y_train_reg, gbrt_active=False)

In [1513]:
columns_types = columns_type(X_train_reg)

In [1514]:
# Build the decision tree (assuming you've already done this)
root = tree
# Visualize it
dot = visualize_tree(root)
dot.render("my_tree", format="png", cleanup=False)  # Saves as my_tree.png
dot.view()  # Opens the image in default viewer

Python(85726) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(85735) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(85736) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


'my_tree.pdf'

In [1522]:
def predict_one(x, node):
        while node.left_child or node.right_child:
            if node.feature is None:
                break
            val = x[node.feature]
            if isinstance(val, str):
                if val == node.value:
                    node = node.left_child
                else:
                    node = node.right_child
            else:
                if val <= node.value:
                    node = node.left_child
                else:
                    node = node.right_child
        return node.predicted_class

In [1523]:
def predict_with_tree(tree, X):
    return X.apply(lambda row: predict_one(row, tree), axis=1)

In [1524]:
from sklearn.metrics import mean_squared_error
y_pred = predict_with_tree(tree, X_test_reg)

# Compute MSE
mse = mean_squared_error(y_test_reg, y_pred)
print(f"Test MSE: {mse:.4f}")

Test MSE: 12.5759


## Gradient Boosted Regression Tree

In [1518]:
def GBRT(X, y, n_estimators=100, learning_rate=0.1, threshold=0.01):
    H = []
    F = pd.Series(np.full(len(y), y.mean()), index=y.index) 
    for _ in range(n_estimators):
        
        residuals = y - F  
        tree = regression_decision_tree_GRBT(X, residuals, threshold=threshold, gbrt_active=True, current_depth=0)
        pred = X.apply(lambda row: predict_one(row, tree), axis=1)
        F += learning_rate * pred
        H.append(tree)

    return H

In [1519]:
def prediction_GBRT(H, X, learning_rate):
    preds = pd.Series(np.full(len(X), 0.0), index=X.index)
    for item in H:
        preds += learning_rate * X.apply(lambda row: predict_one(row, item), axis=1)
    return preds

In [1520]:
H = GBRT(X_train_reg, y_train_reg, n_estimators=50, learning_rate=0.1, threshold=0.01)
preds = prediction_GBRT(H, X_test_reg, learning_rate=0.1)

In [None]:
from sklearn.metrics import mean_squared_error

# Compute MSE
mse = mean_squared_error(y_test_reg, y_pred)
print(f"Test MSE: {mse:.4f}")

Test MSE: 90.4851


## Adaboost

In [1525]:
data_adaboost = pd.read_csv('/Users/hanifemamgholizadeh/Desktop/patter_recognition/data/adaboost_dataset.csv')

In [1527]:
data_adaboost.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,-1.773053,-0.129414,0.186609,0.607897,-0.317578,-1
1,-1.3477,-1.391369,-0.374821,-0.240325,1.674886,-1
2,-2.002575,-0.223762,0.223788,-0.517611,-0.809471,-1
3,-0.895796,0.98131,0.746254,-1.359856,1.293444,-1
4,0.362209,0.954005,-0.247519,-0.727137,1.318564,1


In [None]:
from sklearn.model_selection import train_test_split
X_adaboost = data_adaboost.drop(columns=['target'])
y_adaboost = data_adaboost['target']
X_train_adaboost, X_test_adaboost, y_train_adaboost, y_test_adaboost = train_test_split(X_adaboost, y_adaboost, test_size=0.2, random_state=42)

In [1530]:
def weighted_error(y_true, y_pred, sample_weights):
    return np.sum(sample_weights * (y_true != y_pred)) / np.sum(sample_weights)

def compute_alpha(error):
    return 0.5 * np.log((1 - error) / (error + 1e-10))  # small epsilon to avoid div by zero

In [1531]:
class DecisionStump:
    def __init__(self):
        self.feature_index = None
        self.threshold = None
        self.polarity = 1
    
    def predict(self, X):
        n_samples = X.shape[0]
        predictions = np.ones(n_samples)
        if self.polarity == 1:
            predictions[X[:, self.feature_index] < self.threshold] = -1
        else:
            predictions[X[:, self.feature_index] > self.threshold] = -1
        return predictions

    def fit(self, X, y, sample_weights):
        n_samples, n_features = X.shape
        min_error = float('inf')

        for feature_i in range(n_features):
            thresholds = np.unique(X[:, feature_i])
            for threshold in thresholds:
                for polarity in [1, -1]:
                    predictions = np.ones(n_samples)
                    if polarity == 1:
                        predictions[X[:, feature_i] < threshold] = -1
                    else:
                        predictions[X[:, feature_i] > threshold] = -1

                    error = weighted_error(y, predictions, sample_weights)

                    if error < min_error:
                        self.polarity = polarity
                        self.threshold = threshold
                        self.feature_index = feature_i
                        min_error = error


In [1532]:
class AdaBoost:
    def __init__(self, n_clf=5):
        self.n_clf = n_clf
        self.clfs = []
        self.alphas = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        sample_weights = np.ones(n_samples) / n_samples

        for _ in range(self.n_clf):
            stump = DecisionStump()
            stump.fit(X, y, sample_weights)
            predictions = stump.predict(X)

            error = weighted_error(y, predictions, sample_weights)
            alpha = compute_alpha(error)

            # Update weights
            sample_weights *= np.exp(-alpha * y * predictions)
            sample_weights /= np.sum(sample_weights)

            self.clfs.append(stump)
            self.alphas.append(alpha)

    def predict(self, X):
        clf_preds = [alpha * clf.predict(X) for clf, alpha in zip(self.clfs, self.alphas)]
        return np.sign(np.sum(clf_preds, axis=0))


In [1534]:
preds = AdaBoost(n_clf=10)
preds.fit(data_adaboost.drop(columns=['target']).values, data_adaboost['target'].values)
y_pred = preds.predict(data_adaboost.drop(columns=['target']).values)   