In [21]:
import os

import numpy as np
import pandas as pd

# Load the dataset.
# The default below is the original author's local copy of the file. To run
# the notebook on another machine, set the PENGUINS_CSV environment variable
# to the CSV's location instead of editing this cell.
file_path = os.environ.get(
    'PENGUINS_CSV',
    r'C:\Users\tyler\OneDrive\Desktop\GitHub Repositories\DataScience2\Exit_Tickets\January29th\penguins.csv',
)
data = pd.read_csv(file_path)

# Display the first few rows of the dataset for inspection
print(data.head())


  studyName  Sample Number                              Species  Region  \
0   PAL0708              1  Adelie Penguin (Pygoscelis adeliae)  Anvers   
1   PAL0708              2  Adelie Penguin (Pygoscelis adeliae)  Anvers   
2   PAL0708              3  Adelie Penguin (Pygoscelis adeliae)  Anvers   
3   PAL0708              4  Adelie Penguin (Pygoscelis adeliae)  Anvers   
4   PAL0708              5  Adelie Penguin (Pygoscelis adeliae)  Anvers   

      Island               Stage Individual ID Clutch Completion  Date Egg  \
0  Torgersen  Adult, 1 Egg Stage          N1A1               Yes  11/11/07   
1  Torgersen  Adult, 1 Egg Stage          N1A2               Yes  11/11/07   
2  Torgersen  Adult, 1 Egg Stage          N2A1               Yes  11/16/07   
3  Torgersen  Adult, 1 Egg Stage          N2A2               Yes  11/16/07   
4  Torgersen  Adult, 1 Egg Stage          N3A1               Yes  11/16/07   

   Culmen Length (mm)  Culmen Depth (mm)  Flipper Length (mm)  Body Mass (g)  \


In [22]:
# Drop rows with any missing values BEFORE encoding.
# pd.get_dummies() ignores NaN by default (dummy_na=False), so encoding first
# would turn a missing category into an all-zero group of indicator columns
# that a later dropna() can no longer detect.
# A new frame is created (no inplace=True) so re-running this cell always
# starts from the untouched `data`.
# NOTE(review): dropna() without `subset` considers every column of the CSV,
# including any the model never uses; if the file has a sparsely populated
# column this discards most rows -- consider passing subset=[...] with only
# the modelling columns. TODO confirm against the full column list.
data_complete = data.dropna()

# Apply one-hot encoding
categorical_cols = ['Species', 'Region', 'Island', 'Stage', 'Sex', 'Clutch Completion']
data_encoded = pd.get_dummies(data_complete, columns=categorical_cols)


In [23]:
# Define the error measure used to score a group of target values
def calculate_mse(data):
    """Return the variance of ``data`` multiplied by its length.

    ``np.var`` is the mean squared deviation from the mean, so scaling it by
    the number of elements yields the total squared deviation of the group
    (a sum, not a per-sample mean, despite the function's name).

    Parameters
    ----------
    data : sequence or array of numbers

    Returns
    -------
    number
        ``0`` for an empty input, otherwise ``np.var(data) * len(data)``.
    """
    n_items = len(data)
    return np.var(data) * n_items if n_items else 0

# Define the function to find the best split
def best_split(X, y):
    """Search every feature/threshold pair for the split with the lowest error.

    Each distinct value found in each column of ``X`` is tried as a
    threshold: rows with ``X[:, feature] <= value`` go left and rows with
    ``X[:, feature] > value`` go right. A candidate's score is the sum of
    ``calculate_mse`` over the two resulting groups of ``y``.

    Candidates that leave either side without any rows are skipped. This
    excludes the column maximum (nothing is greater than it) and NaN
    thresholds (every comparison with NaN is False, so both sides would be
    empty and score 0, beating every real split).

    Parameters
    ----------
    X : 2-D numpy array, indexed as ``X[:, feature]``
    y : numpy array that can be indexed with a boolean mask over the rows of X

    Returns
    -------
    tuple
        ``(best_split_feature, best_split_value, min_mse)``. When no
        candidate separates the rows into two non-empty groups the result is
        ``(None, None, float('inf'))``.
    """
    min_mse = float('inf')
    best_split_feature = None
    best_split_value = None

    for feature in range(X.shape[1]):
        # Slice the column once instead of on every candidate threshold.
        column = X[:, feature]
        for value in set(column):
            left_mask = column <= value
            right_mask = column > value

            # A split must actually divide the rows; otherwise it is not a
            # split at all and would only produce an empty child node.
            if not left_mask.any() or not right_mask.any():
                continue

            total_mse = calculate_mse(y[left_mask]) + calculate_mse(y[right_mask])

            # Strict '<' keeps the first candidate encountered on ties.
            if total_mse < min_mse:
                min_mse = total_mse
                best_split_feature = feature
                best_split_value = value

    return best_split_feature, best_split_value, min_mse

# Define the TreeNode class
class TreeNode:
    """A single node of the decision tree.

    The constructor simply stores its arguments as attributes of the same
    name; every one of them defaults to ``None``. ``output`` can only be
    supplied by keyword.
    """

    def __init__(self, feature=None, value=None, left=None, right=None, *, output=None):
        # Split description: which column is tested and against what threshold
        self.feature, self.value = feature, value
        # Child nodes on either side of the split
        self.left, self.right = left, right
        # Value carried by the node itself (keyword-only)
        self.output = output

# Define the function to build the tree
def build_tree(X, y, depth=1, max_depth=5):
    """Recursively grow a regression tree and return its root ``TreeNode``.

    A leaf (a node created with only ``output=np.mean(y)``) is produced when:

    * ``depth`` has reached ``max_depth``,
    * every value in ``y`` is identical,
    * ``best_split`` reports no usable split (score of ``inf``), or
    * the chosen split would leave one child without any rows.

    Otherwise the rows are divided by ``X[:, feature] <= value`` (left) and
    ``X[:, feature] > value`` (right) and each side is built recursively.

    Parameters
    ----------
    X : 2-D numpy array of features, one row per sample
    y : numpy array of target values aligned with the rows of X
    depth : depth of the node being built (the root call uses 1)
    max_depth : depth at which nodes are forced to be leaves

    Returns
    -------
    TreeNode
    """
    # '>=' rather than '==' so that a max_depth below the starting depth
    # still stops the recursion instead of never matching.
    if depth >= max_depth or len(set(y)) == 1:
        return TreeNode(output=np.mean(y))

    feature, value, mse = best_split(X, y)

    if feature is None or mse == float('inf'):
        return TreeNode(output=np.mean(y))

    left_mask = X[:, feature] <= value
    right_mask = X[:, feature] > value

    # A split that sends every row to one side gains nothing: the empty side
    # would become a leaf whose output is the mean of no values (NaN) and the
    # other side would repeat the same work one level deeper.
    if not left_mask.any() or not right_mask.any():
        return TreeNode(output=np.mean(y))

    left_subtree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_subtree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return TreeNode(feature, value, left_subtree, right_subtree)


In [24]:
# Prepare the features (X) and target (y)
numeric_features = ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)',
                    'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)']

# get_dummies names each indicator column "<original column>_<category>".
# Matching on the name plus the underscore separator avoids picking up an
# unrelated column that merely begins with the same letters.
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
dummy_features = [col for col in data_encoded.columns if col.startswith(dummy_prefixes)]

selected_columns = numeric_features + dummy_features

# Cast to a single float dtype: depending on the pandas version the indicator
# columns may be bool, and mixing them with float measurements would otherwise
# produce a slow object-dtype array.
X = data_encoded[selected_columns].to_numpy(dtype=float)
y = data_encoded['Body Mass (g)'].values

# Build the tree with the selected features
decision_tree = build_tree(X, y, max_depth=5)


In [25]:
# Report the split chosen at the root of the tree
root_feature_index = decision_tree.feature
root_feature_value = decision_tree.value

if root_feature_index is None:
    # build_tree returns a node with only `output` set when it cannot or need
    # not split; such a node has no feature index to look up.
    root_feature_name = None
    print(f"Root is a leaf (no split); predicted value: {decision_tree.output}")
else:
    root_feature_name = selected_columns[root_feature_index]
    print(f"Root Feature: {root_feature_name}")
    print(f"Splitting Value: {root_feature_value}")


Root Feature: Culmen Length (mm)
Splitting Value: 38.9
