# Penguin Size: Decision Tree

In [4]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

In [5]:
# Read the penguin table, drop every row that has a missing value, and replace
# the species column with its integer category codes.
df = (
    pd.read_csv("penguins.csv")
    .dropna()
    .assign(species=lambda frame: frame['species'].astype('category').cat.codes)
)

# Feature columns fed to the classifier; the encoded species is the target.
features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = df[features].to_numpy()
y = df['species'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
def entropy(labels):
    """Return the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    labels : array-like
        Class labels. Any values that ``np.unique`` can sort are accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        Returns ``0.0`` for an empty input and for a single-class input.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        # No samples means no uncertainty; this also avoids dividing by zero.
        return 0.0

    # np.unique only reports labels that actually occur, so every count is > 0
    # and log2 is always defined.
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / labels.size

    # Adding 0.0 turns the -0.0 produced for a pure node into a plain 0.0.
    return float(-np.sum(probabilities * np.log2(probabilities))) + 0.0

In [7]:
def info_gain(parent_labels, left_labels, right_labels):
    """Return the entropy reduction obtained by splitting a node in two.

    The result is the parent's entropy minus the size-weighted average of the
    two children's entropies, where each child is weighted by its share of
    the parent's samples.
    """
    total = len(parent_labels)
    before_split = entropy(parent_labels)

    # Fraction of the parent's samples that ends up in each child.
    left_share = len(left_labels) / total
    right_share = len(right_labels) / total

    after_split = left_share * entropy(left_labels) + right_share * entropy(right_labels)
    return before_split - after_split

In [13]:
def best_split_feature_for_point(X_train, y_train, x_test):
    """Find the single threshold split of the training data with the highest information gain.

    Every feature column is scanned, and every distinct value in that column is
    tried as a ``<= threshold`` / ``> threshold`` cut. Candidates that leave one
    side empty are skipped.

    Parameters
    ----------
    X_train : np.ndarray
        2-D training matrix; columns are features.
    y_train : np.ndarray
        Training labels, indexable by a boolean mask over the rows of ``X_train``.
    x_test : object
        Not read by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate separates the data into two non-empty sides.
    """
    best_feature = None
    best_threshold = None
    best_ig = -1  # sentinel below any gain a valid split produces

    num_features = X_train.shape[1]

    for feature_idx in range(num_features):
        # Slice the column once instead of three times per threshold.
        column = X_train[:, feature_idx]

        for thresh in np.unique(column):
            left_mask = column <= thresh
            right_mask = column > thresh

            # .any() runs in NumPy; builtin sum() walks the mask element by
            # element in Python for every candidate threshold.
            if not left_mask.any() or not right_mask.any():
                continue

            ig = info_gain(y_train, y_train[left_mask], y_train[right_mask])
            if ig > best_ig:
                # Strict '>' keeps the first candidate found when gains tie.
                best_ig = ig
                best_feature = feature_idx
                best_threshold = thresh

    return best_feature, best_threshold

In [14]:
def predict_single_point(X_train, y_train, x_test, split=None):
    """Predict the label of one sample using a single best-split rule.

    The training data is cut at one (feature, threshold) pair and the majority
    training label on the sample's side of the cut is returned. Because there
    is only one binary split, at most two distinct labels can ever be predicted.

    Parameters
    ----------
    X_train : np.ndarray
        2-D training matrix; columns are features.
    y_train : np.ndarray
        Training labels aligned with the rows of ``X_train``.
    x_test : sequence
        Feature values of the sample to classify.
    split : tuple, optional
        Precomputed ``(feature_index, threshold)`` pair. When omitted, the
        split is searched for on every call, as before.

    Returns
    -------
    The most common training label on the sample's side of the split, or the
    most common training label overall when no split exists.
    """
    if split is None:
        split = best_split_feature_for_point(X_train, y_train, x_test)
    feature, thresh = split

    if feature is None:
        # No usable split: fall back to the overall majority class.
        return Counter(y_train).most_common(1)[0][0]

    column = X_train[:, feature]
    if x_test[feature] <= thresh:
        side_labels = y_train[column <= thresh]
    else:
        side_labels = y_train[column > thresh]
    return Counter(side_labels).most_common(1)[0][0]


# The best split depends only on the training data (best_split_feature_for_point
# never reads its x_test argument), so search for it once instead of once per
# test sample.
train_split = best_split_feature_for_point(X_train, y_train, None)
y_pred = [predict_single_point(X_train, y_train, x, split=train_split) for x in X_test]

In [15]:
def evaluate(y_true, y_pred):
    """Compute overall accuracy and per-class precision, recall and F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list or NumPy array).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, metrics)`` where ``metrics`` maps each label present in
        ``y_true`` to a dict with keys ``"Precision"``, ``"Recall"`` and
        ``"F1-Score"``. A ratio whose denominator is zero is reported as 0,
        and accuracy is 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Convert once up front. With a plain list, `y_true == label` is the scalar
    # False, which would silently zero out every per-class count.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    metrics = {}
    for label in np.unique(y_true):
        is_true = y_true == label
        is_pred = y_pred == label

        TP = int(np.count_nonzero(is_pred & is_true))
        FP = int(np.count_nonzero(is_pred & ~is_true))
        FN = int(np.count_nonzero(~is_pred & is_true))

        precision = TP / (TP + FP) if (TP + FP) else 0.0
        recall = TP / (TP + FN) if (TP + FN) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        metrics[label] = {"Precision": precision, "Recall": recall, "F1-Score": f1}

    # np.mean of an empty array is NaN (with a warning); report 0.0 instead.
    accuracy = float(np.mean(y_pred == y_true)) if y_true.size else 0.0
    return accuracy, metrics

In [16]:
# Score the test-set predictions, then print the overall accuracy followed by
# one line of precision / recall / F1 per class.
accuracy, class_metrics = evaluate(y_test, y_pred)

report_lines = [f"Accuracy: {accuracy:.3f}", "Class-wise Metrics:"]
for label, m in class_metrics.items():
    report_lines.append(
        f"Class {label} -> Precision: {m['Precision']:.2f}, Recall: {m['Recall']:.2f}, F1: {m['F1-Score']:.2f}"
    )
print("\n".join(report_lines))

Accuracy: 0.731
Class-wise Metrics:
Class 0 -> Precision: 0.65, Recall: 1.00, F1: 0.78
Class 1 -> Precision: 0.00, Recall: 0.00, F1: 0.00
Class 2 -> Precision: 0.95, Recall: 1.00, F1: 0.97


In [17]:
# Show the first few test samples side by side with their predictions.
print("\nExample predictions (Actual vs Predicted):")
# Cap at the number of available samples so a small test split does not raise IndexError.
n_examples = min(10, len(y_test), len(y_pred))
for i in range(n_examples):
    print(f"Actual: {y_test[i]}, Predicted: {y_pred[i]}")


Example predictions (Actual vs Predicted):
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 2, Predicted: 2
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 2, Predicted: 2
Actual: 2, Predicted: 2
Actual: 2, Predicted: 2
