In [1]:
import sys
sys.path.append("../Decision Tree")
# from TreeNode import TreeNode
from DecisionTree import ID3
from utils import predict, calculate_error_rate, preprocess_numerical_columns

In [22]:
class TreeNode:
    """Node of a decision tree.

    The trees printed in this notebook alternate two kinds of node by depth:

    * a *decision* node, whose ``attributes`` is a column name and whose
      children are *branch* nodes, one per observed value of that column;
    * a *branch* node, whose ``attributes`` is a column value. It is either a
      leaf carrying ``label`` or has a single child: the next decision node.
    """

    def __init__(self, label=None, attributes=None, children=None):
        """
        label: class label stored on the node (None when not set)
        attributes: column name (decision node) or column value (branch node)
        children: optional list of child nodes; a fresh list is made if omitted
        """
        self.label = label
        self.attributes = attributes
        # Never use a shared default list: every node gets its own.
        self.children = [] if children is None else children

    def __str__(self, level=0):
        """Render the subtree rooted here, indenting two spaces per level."""
        prefix = "  " * level
        result = prefix + f"Attribute: {self.attributes}, Label: {self.label}\n"
        for child in self.children:
            result += prefix + "Child:\n"
            result += child.__str__(level + 1)
        return result

    def add_child(self, child_node):
        """Append ``child_node`` to this node's children."""
        self.children.append(child_node)

    def predict(self, instance, default=None):
        """
        instance: a single row from the dataset (label-indexable via ``.loc``)
        default: value returned when the row holds an attribute value for
                 which the tree has no branch
        return: predicted label, or ``default`` if no branch matches
        """
        node = self
        while node.children:
            # `node` is a decision node here: its `attributes` names a column.
            attribute_value = instance.loc[node.attributes]

            matched_child = None
            for child in node.children:
                if child.attributes == attribute_value:
                    matched_child = child
                    break

            if matched_child is None:
                # Value never seen along this path while the tree was built.
                return default

            if not matched_child.children:
                # Branch node that is itself a leaf.
                return matched_child.label

            # Step *through* the branch node to the decision node below it.
            # Looking up the branch value (e.g. 'Sunny') as a column name is
            # what produced the KeyError before.
            node = matched_child.children[0]

        return node.label

In [23]:
import pandas as pd
import numpy as np

class BaggedTrees:
    """Bagging ensemble: several ID3 trees, each grown on a bootstrap sample,
    combined by majority vote."""

    def __init__(self, n_trees=10, max_depth=float('inf'), random_state=None):
        """
        n_trees: number of trees to grow in `fit`
        max_depth: depth limit handed to ID3 for every tree
        random_state: optional seed making the bootstrap samples reproducible
        """
        self.n_trees = n_trees
        self.trees = []
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, data, attributes):
        """
        data: training DataFrame
        attributes: attribute (column) names passed through to ID3
        """
        # Start from scratch so a second fit() does not grow the ensemble
        # beyond n_trees.
        self.trees = []
        # One generator shared by all draws: seeded runs are reproducible,
        # yet every tree still sees a different bootstrap sample.
        rng = np.random.RandomState(self.random_state)
        for _ in range(self.n_trees):
            # Bootstrap sampling
            bootstrap_data = data.sample(n=len(data), replace=True, random_state=rng)
            tree = ID3(bootstrap_data, attributes, self.max_depth)
            self.trees.append(tree)

    def predict(self, instance):
        """
        instance: a single row
        return: the label predicted by most trees; on a tie, the label that
                was voted first (in tree order) wins
        raises: ValueError if no trees have been fitted
        """
        if not self.trees:
            raise ValueError("BaggedTrees has no fitted trees; call fit() first")

        # dicts keep insertion order and max() returns the first maximum, so
        # tie-breaking is deterministic (set iteration order is not).
        votes = {}
        for tree in self.trees:
            label = tree.predict(instance)
            votes[label] = votes.get(label, 0) + 1
        return max(votes, key=votes.get)

    def batch_predict(self, df):
        """Return the list of ensemble predictions, one per row of `df`."""
        return [self.predict(row) for _, row in df.iterrows()]


In [16]:
def preprocess_data(df, columns=('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'), thresholds=None):
    """
    Binarise continuous columns: 1 where the value is strictly above the
    threshold, 0 otherwise.

    df: input DataFrame; it is left unmodified
    columns: names of the continuous columns to binarise
    thresholds: optional mapping (dict or Series) of column -> cut-off.
                Columns missing from it fall back to the median of `df`.
                Pass the training medians here to binarise a test split
                consistently with the training split.
    return: a new DataFrame with the selected columns replaced by 0/1
    """
    if thresholds is None:
        thresholds = {}

    # Work on a copy so re-running the cell on the same frame is idempotent.
    result = df.copy()
    for column in columns:
        cutoff = thresholds[column] if column in thresholds else df[column].median()
        result[column] = (df[column] > cutoff).astype(int)

    # Note: For columns with "unknown", we'll leave them as is. Pandas will treat them as a separate category.

    return result

# Read both bank splits, naming the columns explicitly via `names=`.
train_file_path = "Data/bank-4/train.csv"
test_file_path = "Data/bank-4/test.csv"
column_names = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'y',
]
df_bank_train = pd.read_csv(train_file_path, names=column_names)
df_bank_test = pd.read_csv(test_file_path, names=column_names)

# Every column except the last one (the label `y`) is a candidate attribute.
bank_attributes = list(df_bank_train.columns)[:-1]
attributes = bank_attributes

# Binarise the continuous columns of each split.
# NOTE(review): each call derives its medians from the frame it receives, so
# the test split is thresholded with its own medians, not the training ones.
train_data = preprocess_data(df_bank_train)
test_data = preprocess_data(df_bank_test)

# Quick look at the first preprocessed training rows.
print(train_data.head())

   age          job  marital  education default  balance housing loan  \
0    1     services  married  secondary      no        0     yes   no   
1    1  blue-collar   single  secondary      no        0     yes  yes   
2    1   technician  married  secondary      no        1      no  yes   
3    1       admin.  married   tertiary      no        0     yes   no   
4    0   management   single   tertiary      no        1      no   no   

    contact  day month  duration  campaign  pdays  previous poutcome    y  
0   unknown    0   may         0         0      0         0  unknown   no  
1  cellular    0   feb         1         0      0         0  unknown   no  
2  cellular    1   aug         1         0      1         1  success  yes  
3  cellular    0   jul         1         0      0         0  unknown   no  
4  cellular    0   apr         0         0      0         0  unknown  yes  


In [17]:
def calculate_error_rate(predictions, true_labels):
    """
    predictions: sequence of predicted labels
    true_labels: sequence of real labels, in the same order
    return: error_rate, the fraction of positions where the two disagree
    raises: ValueError if the lengths differ or there are no samples
    """
    total_samples = len(predictions)
    if total_samples != len(true_labels):
        raise ValueError("Number of predictions and true label do not match")
    if total_samples == 0:
        # Previously surfaced as an unexplained ZeroDivisionError.
        raise ValueError("Cannot compute an error rate from zero samples")

    # zip() pairs items by position, so this also works for a pandas Series
    # whose index is not 0..n-1 (where `series[i]` would look up by label).
    incorrect_predictions = sum(
        1 for predicted, actual in zip(predictions, true_labels) if predicted != actual
    )
    return incorrect_predictions / total_samples

In [18]:

# Grow one ID3 tree on the training split with no depth limit.
tree_root = ID3(train_data, attributes, float('inf'))

# Predictions of that tree for every row of the training and test splits.
train_predictions = list(map(tree_root.predict, (row for _, row in train_data.iterrows())))
test_predictions = list(map(tree_root.predict, (row for _, row in test_data.iterrows())))

# Ground-truth labels; the label is assumed to be the last column.
train_true_labels = train_data.iloc[:, -1].tolist()
test_true_labels = test_data.iloc[:, -1].tolist()

# Error rate of the tree on each split.
train_error = calculate_error_rate(train_predictions, train_true_labels)
test_error = calculate_error_rate(test_predictions, test_true_labels)

print("Training Error: {:.2f}".format(train_error))
print("Test Error: {:.2f}".format(test_error))



  attribute_value = instance[attribute_name]


Training Error: 1.00
Test Error: 1.00


In [29]:
import pandas as pd

# Small hand-made data set; the last key is the class label.
data = dict([
    ('Weather', ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Rainy']),
    ('Temperature', ['Hot', 'Hot', 'Hot', 'Cold', 'Cold', 'Cold', 'Mild', 'Mild']),
    ('Humidity', ['High', 'Low', 'High', 'High', 'Low', 'Low', 'High', 'Low']),
    ('Play Outside?', ['No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']),
])

# Turn it into a frame and treat every column but the last as an attribute.
df = pd.DataFrame(data)
attributes = list(df.columns)[:-1]
print(attributes)

# Grow an unrestricted ID3 tree on the toy frame and show it.
tree_root = ID3(df, attributes, float('inf'))
print(tree_root)

['Weather', 'Temperature', 'Humidity']
Attribute: Weather, Label: None
Child:
  Attribute: Sunny, Label: None
  Child:
    Attribute: Temperature, Label: None
    Child:
      Attribute: Hot, Label: None
      Child:
        Attribute: Humidity, Label: None
        Child:
          Attribute: High, Label: No
        Child:
          Attribute: Low, Label: Yes
    Child:
      Attribute: Mild, Label: Yes
Child:
  Attribute: Overcast, Label: Yes
Child:
  Attribute: Rainy, Label: No



In [30]:
import numpy as np

class BaggedTrees:
    """Bagging ensemble: several ID3 trees, each grown on a bootstrap sample,
    combined by majority vote."""

    def __init__(self, n_trees, max_depth=float('inf'), random_state=None):
        """
        n_trees: number of trees to grow in `fit`
        max_depth: depth limit handed to ID3 for every tree (unlimited by default)
        random_state: optional seed making the bootstrap samples reproducible
        """
        self.n_trees = n_trees
        self.trees = []
        self.max_depth = max_depth
        self.random_state = random_state

    def fit(self, data, attributes):
        """
        data: training DataFrame
        attributes: attribute (column) names passed through to ID3
        """
        # Start from scratch so a second fit() does not grow the ensemble
        # beyond n_trees.
        self.trees = []
        # One generator shared by all draws: seeded runs are reproducible,
        # yet every tree still sees a different bootstrap sample.
        rng = np.random.RandomState(self.random_state)
        for _ in range(self.n_trees):
            # 1. Sample with replacement from data
            bootstrap_sample = data.sample(n=len(data), replace=True, random_state=rng)

            # 2. Train a decision tree on this sample
            tree = ID3(bootstrap_sample, attributes, self.max_depth)
            self.trees.append(tree)

    def predict(self, instance):
        """
        instance: a single row
        return: the label predicted by most trees; on a tie, the label that
                was voted first (in tree order) wins
        raises: ValueError if no trees have been fitted
        """
        if not self.trees:
            raise ValueError("BaggedTrees has no fitted trees; call fit() first")

        # Predict with each tree and vote. dicts keep insertion order and
        # max() returns the first maximum, so ties resolve deterministically.
        votes = {}
        for tree in self.trees:
            label = tree.predict(instance)
            votes[label] = votes.get(label, 0) + 1
        return max(votes, key=votes.get)

    def batch_predict(self, df):
        """Return the list of ensemble predictions, one per row of `df`."""
        return [self.predict(row) for _, row in df.iterrows()]

# Using it:
# bagged_model = BaggedTrees(n_trees=50)
# bagged_model.fit(df, attributes)

In [31]:
import pandas as pd

# Create data
train_data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Rainy'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Cold', 'Cold', 'Cold', 'Mild', 'Mild'],
    'Humidity': ['High', 'Low', 'High', 'High', 'Low', 'Low', 'High', 'Low'],
    'Play Outside?': ['No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No']
}

test_data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Rainy'],
    'Temperature': ['Hot', 'Cold', 'Hot', 'Hot', 'Cold', 'Cold', 'Mild', 'Mild'],
    'Humidity': ['High', 'Low', 'High', 'High', 'Low', 'Low', 'High', 'Low'],
    'Play Outside?': ['No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']
}

# Create DataFrame
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

# Create a bagged model with 50 trees
bagged_model = BaggedTrees(n_trees=50)
bagged_model.fit(train_data, attributes)

# Predict on training data
train_predictions = [bagged_model.predict(row) for _, row in train_data.iterrows()]

# Get true labels
train_true_labels = train_data['Play Outside?'].tolist()

# Calculate training error using the calculate_error_rate function you defined earlier
train_error = calculate_error_rate(train_predictions, train_true_labels)
print(f"Training Error: {train_error:.2f}")


KeyError: 'Sunny'

In [21]:
# Grow an unrestricted ID3 tree on the current training frame.
tree_root = ID3(train_data, attributes, float('inf'))

# Draw one random test row (as a Series) and classify it with the tree.
sampled_test_instance = test_data.sample(n=1).iloc[0]
predicted_label = tree_root.predict(sampled_test_instance)

print(predicted_label)


None


In [24]:
# Fit a ten-tree bagged ensemble on the current training frame.
bagged_trees = BaggedTrees(n_trees=10)
bagged_trees.fit(train_data, attributes)

# Ensemble predictions for every row of each split.
train_predictions = bagged_trees.batch_predict(train_data)
test_predictions = bagged_trees.batch_predict(test_data)

# Compare against the last column of each frame, which holds the labels.
train_error = calculate_error_rate(
    train_predictions,
    train_data.iloc[:, -1].tolist(),
)
test_error = calculate_error_rate(
    test_predictions,
    test_data.iloc[:, -1].tolist(),
)

print("Training Error: {:.2f}".format(train_error))
print("Test Error: {:.2f}".format(test_error))


Training Error: 1.00
Test Error: 1.00


In [20]:
# Show the text rendering of the tree currently bound to `tree_root`.
print(tree_root)

Attribute: duration, Label: None
Child:
  Attribute: 0, Label: None
  Child:
    Attribute: month, Label: None
    Child:
      Attribute: may, Label: None
      Child:
        Attribute: job, Label: None
        Child:
          Attribute: services, Label: None
          Child:
            Attribute: marital, Label: None
            Child:
              Attribute: married, Label: no
            Child:
              Attribute: single, Label: None
              Child:
                Attribute: day, Label: None
                Child:
                  Attribute: 0, Label: no
                Child:
                  Attribute: 1, Label: None
                  Child:
                    Attribute: education, Label: None
                    Child:
                      Attribute: tertiary, Label: no
                    Child:
                      Attribute: secondary, Label: None
                      Child:
                        Attribute: contact, Label: None
                        C