# Lab 04 - Decision Trees

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

## Task 2: ID3 Algorithm (Loan Approval Prediction)

In [None]:
# Build the 15-row loan table used for the hand-rolled ID3 root selection.
# Every column is spelled out explicitly so the table can be checked by eye.
data = {
    'ID': range(1, 16),
    # Three age bands, five consecutive rows each
    'AGE': ['Young'] * 5 + ['Middle'] * 5 + ['Old'] * 5,
    'JOB_STATUS': [
        False, False, True, True, False,
        False, False, True, False, False,
        False, False, True, True, False,
    ],
    'OWNS_HOUSE': [
        False, False, False, True, False,
        False, False, True, True, True,
        True, True, False, False, False,
    ],
    'CREDIT_RATING': [
        'Fair', 'Good', 'Good', 'Fair', 'Fair',
        'Fair', 'Good', 'Good', 'Excellent', 'Excellent',
        'Excellent', 'Good', 'Good', 'Excellent', 'Fair',
    ],
    # Target column
    'CLASS': [
        'No', 'No', 'Yes', 'Yes', 'No',
        'No', 'No', 'Yes', 'Yes', 'Yes',
        'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}
df_loan = pd.DataFrame(data)

# Preview the first five rows
print("Loan Dataset:")
print(df_loan.head(5))

# Entropy Function
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the class labels in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Class labels. Missing labels (NaN/None) are ignored.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes. ``0.0`` when ``y``
        contains no labels or only a single class.
    """
    counts = y.value_counts()
    # Zero-count entries (e.g. unused categories of a categorical Series)
    # would evaluate log2(0); they carry no probability mass, so drop them.
    counts = counts[counts > 0]

    # value_counts() skips missing labels, so normalise by the number of
    # labels actually counted rather than len(y); otherwise the
    # probabilities would not sum to 1 when labels are missing.
    total = counts.sum()
    if total == 0:
        return 0.0

    probabilities = counts / total
    entropy = -np.sum(probabilities * np.log2(probabilities))
    # A pure sample yields -0.0 after the negation; adding 0.0 normalises
    # it to +0.0 without affecting any other value.
    return entropy + 0.0

# Information Gain Function
def calculate_information_gain(df, attribute, target_name):
    """Return the information gain obtained by splitting ``df`` on ``attribute``.

    The gain is the entropy of the ``target_name`` column minus the
    size-weighted entropy of that column within each distinct value of
    ``attribute``.
    """
    parent_entropy = calculate_entropy(df[target_name])

    split_column = df[attribute]
    n_rows = len(df)

    def branch_contribution(value):
        # Target labels of the rows that fall into this branch
        branch_labels = df.loc[split_column == value, target_name]
        return (len(branch_labels) / n_rows) * calculate_entropy(branch_labels)

    # Accumulate branch by branch, in order of first appearance of each value
    children_entropy = sum(
        branch_contribution(value) for value in split_column.unique()
    )
    return parent_entropy - children_entropy

# Score each candidate attribute and remember the one with the largest gain
target = 'CLASS'
attributes = ['AGE', 'JOB_STATUS', 'OWNS_HOUSE', 'CREDIT_RATING']

print("\nInformation Gains:")
# -1 is a sentinel starting value for the running best gain
root_node, best_ig = None, -1

for attr in attributes:
    ig = calculate_information_gain(df_loan, attr, target)
    print(f"{attr}: {ig:.4f}")
    # Strict comparison: on a tie the attribute listed first stays the root
    if ig > best_ig:
        root_node, best_ig = attr, ig

print(f"\nRoot Node for ID3: {root_node}")

## Task 3: Decision Tree Pipeline (Cancer Dataset)

In [None]:
# Load Data
# NOTE(review): absolute, machine-specific path; this cell only runs where
# the file exists at exactly this location.
df_cancer = pd.read_csv(r"C:\Ali\Programming\MLFall25\Lab04\cancer patient data sets.csv")

# EDA
print("--- EDA ---")
print("Missing Values:", df_cancer.isnull().sum().sum())
# Count duplicated rows once and reuse the result for the report and the clean-up
n_duplicates = df_cancer.duplicated().sum()
print("Duplicates:", n_duplicates)
if n_duplicates > 0:
    df_cancer = df_cancer.drop_duplicates()

print("Target Balance:")
print(df_cancer['Level'].value_counts())

# Feature Selection (Pearson)
# NOTE(review): LabelEncoder assigns codes in sorted label order, which may
# not match the ordinal meaning of 'Level'; confirm before reading the
# Pearson coefficients below as a ranking.
le = LabelEncoder()
df_cancer['Level_Encoded'] = le.fit_transform(df_cancer['Level'])

corr = df_cancer.corr(numeric_only=True)
print("\nCorrelation with Target (Top 5):")
# Exclude the target's correlation with itself (always 1.0) so that the
# five rows printed are five actual features.
target_corr = corr['Level_Encoded'].drop('Level_Encoded')
print(target_corr.abs().sort_values(ascending=False).head(5))

# Prepare X and y
X = df_cancer.drop(columns=['index', 'Patient Id', 'Level', 'Level_Encoded'], errors='ignore')
y = df_cancer['Level_Encoded']

# Split: Train 80%, Test 20%
# Splitting happens BEFORE scaling so that the scaler never sees held-out rows.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Validation Split: Train 70%, Val 30% (from Train split)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.3, random_state=0)

# Scaling
# Fit the scaler on the training rows only, then apply the same
# transformation everywhere else; fitting on the full dataset would leak
# validation/test statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
# Keep the remaining names available as scaled arrays
X_train_full = scaler.transform(X_train_full)
X_scaled = scaler.transform(X)

print(f"\nSplit Sizes: Train={X_train.shape[0]}, Val={X_val.shape[0]}, Test={X_test.shape[0]}")

# Decision Tree Model
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

print("\nDecision Tree Results:")
print(f"Validation Accuracy: {dt.score(X_val, y_val):.4f}")
print(f"Test Accuracy: {dt.score(X_test, y_test):.4f}")

## Task 4: CART Algorithm (Student Dataset)

In [None]:
# Build the 10-row student table used for the hand-rolled CART root selection.
data_student = {
    'Student': range(1, 11),
    'Prior_Experience': [
        'Yes', 'No', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes',
    ],
    'Course': [
        'Programming', 'Programming', 'History', 'Programming', 'English',
        'Programming', 'Programming', 'Mathematics', 'Programming', 'Programming',
    ],
    'Time': [
        'Day', 'Day', 'Night', 'Night', 'Day',
        'Day', 'Day', 'Night', 'Night', 'Night',
    ],
    # Target column
    'Liked': [
        'Yes', 'No', 'No', 'Yes', 'Yes',
        'No', 'No', 'Yes', 'Yes', 'No',
    ],
}
df_student = pd.DataFrame(data_student)

# Preview the first five rows
print("Student Dataset:")
print(df_student.head(5))

# Gini Impurity Function
def calculate_gini(y):
    """Return the Gini impurity of the class labels in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Class labels. Missing labels (NaN/None) are ignored.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed classes. ``0.0`` when ``y``
        contains no labels, since an empty sample has nothing to misclassify.
    """
    counts = y.value_counts()

    # value_counts() skips missing labels, so normalise by the number of
    # labels actually counted rather than len(y); otherwise the
    # probabilities would not sum to 1 when labels are missing.
    total = counts.sum()
    if total == 0:
        # Without this guard an empty sample would report 1 - 0 = 1,
        # i.e. the maximum impurity.
        return 0.0

    probabilities = counts / total
    gini = 1 - np.sum(probabilities ** 2)
    return gini

# Weighted Gini Impurity Function (returns the impurity after the split, not a gain)
# CART chooses the split that minimizes the weighted Gini impurity of children
def calculate_weighted_gini(df, attribute, target_name):
    """Return the size-weighted Gini impurity of splitting ``df`` on ``attribute``.

    Each distinct value of ``attribute`` forms one branch; a branch
    contributes the Gini impurity of its ``target_name`` labels multiplied
    by the fraction of rows that fall into it.
    """
    n_rows = len(df)
    impurity = 0

    for branch_value in df[attribute].unique():
        # Target labels of the rows belonging to this branch
        branch_labels = df.loc[df[attribute] == branch_value, target_name]
        branch_weight = len(branch_labels) / n_rows
        impurity += branch_weight * calculate_gini(branch_labels)

    return impurity

# Score each candidate attribute and remember the one with the lowest weighted Gini
target_student = 'Liked'
attributes_student = ['Prior_Experience', 'Course', 'Time']

print("\nWeighted Gini Impurities (Lower is better):")
# 1.0 is the starting value for the running minimum
root_node_cart, best_gini = None, 1.0

for attr in attributes_student:
    gini = calculate_weighted_gini(df_student, attr, target_student)
    print(f"{attr}: {gini:.4f}")
    # Strict comparison: on a tie the attribute listed first stays the root
    if gini < best_gini:
        root_node_cart, best_gini = attr, gini

print(f"\nRoot Node for CART: {root_node_cart}")