In [1]:
# import sys
# sys.path.append("../Decision Tree")
from DecisionTree import ID3
import pandas as pd
import numpy as np

In [7]:
def predict_tree(tree, df_test):
    """
    tree: The tree that has been built from ID3
    df_test: test dataset, dataframe
    return: a list of all prediction labels
    """
    predictions = []
    for index, row in df_test.iterrows():
        node = tree
        while node.children:
            attribute_name = node.attributes
            attribute_value = row[attribute_name]
            matched_child = None
            for child in node.children:
                if child.attributes == attribute_value:
                    matched_child = child
                    break
            if matched_child:
                node = matched_child
                for subnode in node.children:
                    node = subnode
            else:
                break
        predictions.append(node.label)
    return predictions

class RandomForest:
    def __init__(self, n_trees, feature_subset_size):
        self.n_trees = n_trees
        self.feature_subset_size = feature_subset_size
        self.trees = []

    def fit(self, data, attributes):
        for _ in range(self.n_trees):
            # Sample with replacement from data
            bootstrap_sample = data.sample(n=len(data), replace=True)
            #def ID3(S, Attributes, max_depth, feature_subset_size=None, purity_measurement=None, root=None):

            # Train a decision tree on this sample with feature subsetting
            print(attributes)
            print(type(attributes))
            print(self.feature_subset_size)
            print(type(self.feature_subset_size))
            print([type(attribute) for attribute in attributes])


            tree = ID3(S=bootstrap_sample, Attributes=attributes, max_depth = float('inf'),feature_subset_size= self.feature_subset_size, purity_measurement="entropy")
            self.trees.append(tree)

    def predict_single_tree(self, tree_index, dataset):
        if tree_index >= len(self.trees):
            raise ValueError("Tree index out of range.")
        
        return predict_tree(self.trees[tree_index], dataset)

    def predict(self, dataset):
        all_predictions = []

        for _, instance in dataset.iterrows():
            tree_predictions = [predict_tree(tree, pd.DataFrame([instance]))[0] for tree in self.trees]
            all_predictions.append(max(set(tree_predictions), key=tree_predictions.count))

        return all_predictions


In [8]:
# Test for real data set
def preprocess_data(df):
    # Convert continuous attributes to binary
    for column in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']:
        median = df[column].median()
        df[column] = df[column].apply(lambda x: 1 if x > median else 0)
    
    # Note: For columns with "unknown", we'll leave them as is. Pandas will treat them as a separate category.
    
    return df

# Load the training and test data
test_file_path = "Data/bank-4/test.csv"
train_file_path = "Data/bank-4/train.csv"
column_names = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df_bank_train = pd.read_csv(train_file_path, names=column_names)
df_bank_test = pd.read_csv(test_file_path, names=column_names)
bank_attributes = df_bank_train.columns.tolist()[:-1]

# Apply preprocessing to train and test datasets
train_data = preprocess_data(df_bank_train)
test_data = preprocess_data(df_bank_test)
attributes = bank_attributes

In [9]:
print(attributes)

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']


In [10]:
import sys
sys.path.append("../Decision Tree")
from utils import predict, calculate_error_rate, preprocess_numerical_columns

In [11]:
# Lists to store results
trees_range = list(range(1, 501))
feature_subsets = [2, 4, 6]
results = {}

for feature_subset in feature_subsets:
    error_rates = []
    for n_trees in trees_range:
        print(f"Training Random Forest with {n_trees} trees and feature subset size {feature_subset} ...")
        
        # Initialize and train RandomForest
        rf_model = RandomForest(n_trees, feature_subset)
        rf_model.fit(train_data, attributes)
        
        # Predict and calculate error rate on test data
        predictions = rf_model.predict(test_data)
        true_labels = test_data.iloc[:, -1].tolist()
        error_rate = calculate_error_rate(predictions, true_labels)
        error_rates.append(error_rate)
        
        print(f"Error Rate: {error_rate}\n")
    
    results[feature_subset] = error_rates

# Plotting the results
import matplotlib.pyplot as plt

for feature_subset, error_rates in results.items():
    plt.plot(trees_range, error_rates, label=f"Feature Subset: {feature_subset}")

plt.xlabel("Number of Trees")
plt.ylabel("Error Rate")
plt.title("Random Forest Error Rate vs. Number of Trees")
plt.legend()
plt.show()

Training Random Forest with 1 trees and feature subset size 2 ...
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
<class 'list'>
2
<class 'int'>
[<class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>]
Error Rate: 0.1964

Training Random Forest with 2 trees and feature subset size 2 ...
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
<class 'list'>
2
<class 'int'>
[<class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class

KeyboardInterrupt: 

In [None]:
import random
Attributes = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']
feature_subset_size = 2
selected_features = random.sample(Attributes, min(feature_subset_size, len(Attributes)))
print(selected_features)


['campaign', 'month']
