In [None]:
import numpy as np
import pandas as pd

# Hand-built eight-passenger Titanic sample. Each pair below is a column
# name followed by its eight values, listed in passenger order.
data = dict([
    ('PassengerId', list(range(1, 9))),
    ('Pclass', [3, 1, 3, 1, 3, 1, 2, 3]),
    ('Name', ["Name1", "Name2", "Name3", "Name4", "Name5", "Name6", "Name7", "Name8"]),
    ('Sex', ['male', 'male', 'female', 'female', 'female', 'male', 'female', 'female']),
    ('Age', [35, 22, 39, 30, 26, 54, 60, 23]),
    ('SibSp', [0, 1, 1, 1, 0, 0, 1, 0]),
    ('Parch', [0] * 8),
    ('Ticket', [113803, 113051, 330923, 113503, 3101298, 17453, 237736, 330923]),
    ('Fare', [53.10, 26.55, 8.05, 211.50, 7.93, 51.86, 66.60, 8.05]),
    # None marks passengers with no recorded cabin.
    ('Cabin', [None, 'C22', None, 'C62', None, 'E46', None, None]),
    ('Embarked', ['S', 'S', 'S', 'C', 'S', 'S', 'C', 'Q']),
    # Binary target column: 1 = survived, 0 = did not survive.
    ('Survived', [0, 0, 0, 1, 0, 0, 1, 1]),
])

# One DataFrame column per entry, in the order listed above.
df = pd.DataFrame(data)

# Impurity helpers: Gini impurity and entropy (information gain is derived from these further below)
def compute_gini(series):
    """Return the Gini impurity of a 0/1-valued series.

    The mean of the series is used as the share of ones; the impurity is
    one minus the sum of the squared shares of the two classes.
    """
    share_of_ones = series.mean()
    share_of_zeros = 1 - share_of_ones
    squared_shares = share_of_zeros**2 + share_of_ones**2
    return 1 - squared_shares

def compute_entropy(series):
    """Return the base-2 entropy of a 0/1-valued series.

    The mean of the series is used as the share of ones. A class whose
    share is not positive contributes a log term of 0, so log2 is never
    evaluated at zero.
    """
    share_of_ones = series.mean()
    share_of_zeros = 1 - share_of_ones
    # Guard each logarithm separately: take log2 only for a positive share.
    log_zeros = np.log2(share_of_zeros) if share_of_zeros > 0 else 0
    log_ones = np.log2(share_of_ones) if share_of_ones > 0 else 0
    return - (share_of_zeros * log_zeros + share_of_ones * log_ones)

# Baseline impurity of the 'Survived' target over all rows, before any split.
overall_gini, overall_entropy = (
    measure(df['Survived']) for measure in (compute_gini, compute_entropy)
)

# Emit the three report lines in one call; sep="\n" puts each on its own line.
print(
    "Overall Impurity Measures for 'Survived':",
    f"Overall Gini: {overall_gini:.5f}",
    f"Overall Entropy: {overall_entropy:.5f}\n",
    sep="\n",
)

# Computes weighted impurity after splitting on a given feature.
def weighted_impurity(df, feature, target='Survived', metric='gini'):
    """Compute the size-weighted impurity of `target` after grouping by `feature`.

    Rows are grouped by their value in the `feature` column. Each group's
    impurity is weighted by the group's share of all rows, and the weighted
    impurities are summed.

    Parameters:
        df: DataFrame containing the `feature` and `target` columns.
        feature: Name of the column to group by.
        target: Name of the binary column whose impurity is measured.
        metric: 'gini' to use compute_gini, 'entropy' to use compute_entropy.

    Returns:
        A tuple (weighted_value, details). `details` maps each group value to
        a tuple (n, impurity, weight) with the group's row count, its
        impurity, and its share of all rows.

    Raises:
        ValueError: If `metric` is neither 'gini' nor 'entropy'.
    """
    # Reject unknown metric names up front so a typo (e.g. 'Gini') cannot
    # silently fall through to the entropy calculation.
    impurity_functions = {'gini': compute_gini, 'entropy': compute_entropy}
    if metric not in impurity_functions:
        raise ValueError(
            f"Unknown metric {metric!r}; expected 'gini' or 'entropy'"
        )
    impurity_fn = impurity_functions[metric]

    total = len(df)
    weighted_value = 0.0
    details = {}  # per-group breakdown, useful for debugging
    # dropna=False keeps rows with a missing feature value as their own group.
    # Without it those rows are dropped from the groups but still counted in
    # `total`, so the weights would no longer sum to 1.
    for value, group in df.groupby(feature, dropna=False):
        n = len(group)
        impurity = impurity_fn(group[target])
        weight = n / total
        weighted_value += weight * impurity
        details[value] = (n, impurity, weight)
    return weighted_value, details

# Candidate split features: every column except the identifier-like fields
# (PassengerId, Name, Ticket, Cabin) and the target column itself.
excluded_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived']
features_to_evaluate = [col for col in df.columns if col not in excluded_columns]

# Score each candidate feature with both impurity measures.
print("Impurity Measures for each feature split:\n")
results = {}

for feature in features_to_evaluate:
    # Gini first, then entropy.
    (wgini, details_gini), (wentropy, details_entropy) = (
        weighted_impurity(df, feature, metric=name) for name in ('gini', 'entropy')
    )
    # Gain = baseline impurity minus the weighted impurity after the split.
    info_gain_gini = overall_gini - wgini
    info_gain_entropy = overall_entropy - wentropy

    row = {}
    row['Weighted Gini'] = wgini
    row['Info Gain (Gini)'] = info_gain_gini
    row['Weighted Entropy'] = wentropy
    row['Info Gain (Entropy)'] = info_gain_entropy
    row['Details Gini'] = details_gini
    row['Details Entropy'] = details_entropy
    results[feature] = row

    print(
        f"Feature: {feature}",
        f"  Weighted Gini: {wgini:.5f}  |  Info Gain (Gini): {info_gain_gini:.5f}",
        f"  Weighted Entropy: {wentropy:.5f}  |  Info Gain (Entropy): {info_gain_entropy:.5f}\n",
        sep="\n",
    )

# Pick the feature with the largest entropy-based information gain.
# On ties, max() keeps the first feature in evaluation order.
best_feature = max(results, key=lambda x: results[x]['Info Gain (Entropy)'])
print("Best Feature to split on (based on highest Info Gain from Entropy):", best_feature)

# Tabulate the results with one row per feature and one column per measure.
results_df = pd.DataFrame(results).T
print(results_df)



Overall Impurity Measures for 'Survived':
Overall Gini: 0.46875
Overall Entropy: 0.95443

Impurity Measures for each feature split:

Feature: Pclass
  Weighted Gini: 0.35417  |  Info Gain (Gini): 0.11458
  Weighted Entropy: 0.75000  |  Info Gain (Entropy): 0.20443

Feature: Sex
  Weighted Gini: 0.30000  |  Info Gain (Gini): 0.16875
  Weighted Entropy: 0.60684  |  Info Gain (Entropy): 0.34759

Feature: Age
  Weighted Gini: 0.00000  |  Info Gain (Gini): 0.46875
  Weighted Entropy: 0.00000  |  Info Gain (Entropy): 0.95443

Feature: SibSp
  Weighted Gini: 0.43750  |  Info Gain (Gini): 0.03125
  Weighted Entropy: 0.90564  |  Info Gain (Entropy): 0.04879

Feature: Parch
  Weighted Gini: 0.46875  |  Info Gain (Gini): 0.00000
  Weighted Entropy: 0.95443  |  Info Gain (Entropy): 0.00000

Feature: Fare
  Weighted Gini: 0.12500  |  Info Gain (Gini): 0.34375
  Weighted Entropy: 0.25000  |  Info Gain (Entropy): 0.70443

Feature: Embarked
  Weighted Gini: 0.00000  |  Info Gain (Gini): 0.46875
  Weig