# Lab 2: Decision Trees and Information Gain

In [None]:
import pandas as pd
import numpy as np
import json # For pretty printing the dictionary tree structure
from sklearn.metrics import f1_score # For F1 score calculation

## Section 1: Core Decision Tree Functions

### 1.1. Entropy Function

This function calculates the entropy of a given dataset based on the target column 'hospital_death'.
Entropy is a measure of impurity or disorder in a dataset.

**Formula:** E = - sum(p_i * log2(p_i)) for all classes i.

For a binary classification problem with target 'hospital_death' (classes 0 and 1):
E = - (p_0 * log2(p_0) + p_1 * log2(p_1))
where p_0 is the proportion of class 0 (e.g., survived) and p_1 is the proportion of class 1 (e.g., died in hospital).

If a proportion p_i is 0, then p_i * log2(p_i) is taken as 0.

In [None]:
def entropy(data):
    """
    Calculate the Shannon entropy (in bits) of the 'hospital_death' column.

    Args:
        data: pandas DataFrame expected to contain a 'hospital_death' column.

    Returns:
        The entropy as a float. Returns 0.0 when the frame is empty, the
        target column is missing, or at most one class is present.

    Rows whose target label is missing (NaN) are ignored: the class
    proportions are computed over the labelled rows only, so they always
    sum to 1.
    """
    target_column = 'hospital_death'
    if data.empty or target_column not in data.columns:
        return 0.0
    # value_counts() drops NaN labels, so the denominator must come from the
    # counts themselves rather than from len(data).
    counts = data[target_column].value_counts()
    counts = counts[counts > 0]
    total_count = counts.sum()
    if total_count == 0 or len(counts) <= 1:
        return 0.0
    proportions = counts.to_numpy() / total_count
    return float(-(proportions * np.log2(proportions)).sum())

#### Example Usage for Entropy

In [None]:
# Tiny hand-written dataset (8 rows) used by the example cells in this lab.
sample_data_for_lab2 = dict(
    age=[25, 30, 35, 40, 45, 50, 55, 60],
    gender=['Male', 'Female'] * 4,
    bmi=[22.5, 24.1, 26.3, 28.0, 29.5, 30.2, 25.5, 27.8],
    hospital_death=[0, 1, 0, 0, 1, 1, 0, 1],
)
sample_df_lab2 = pd.DataFrame(sample_data_for_lab2)
# print(f"Sample DataFrame for Lab 2:\n{sample_df_lab2}")
# print(f"\nCalculated Entropy for sample_df_lab2: {entropy(sample_df_lab2):.4f}")

### 1.2. Information Gain Function

In [None]:
def information_gain(data, mask):
    """
    Compute the information gain of splitting `data` with a boolean mask.

    Args:
        data: DataFrame containing the 'hospital_death' target column.
        mask: boolean selector; rows where it is True form the left child,
            the remaining rows (`~mask`) form the right child.

    Returns:
        Parent entropy minus the size-weighted entropy of the two children,
        or 0.0 for an empty frame.
    """
    if data.empty:
        return 0.0

    n_rows = len(data)
    if n_rows == 0:
        return 0.0

    impurity_before = entropy(data)
    child_true = data[mask]
    child_false = data[~mask]

    # Each child's entropy is weighted by its share of the parent's rows.
    share_true = len(child_true) / n_rows
    share_false = len(child_false) / n_rows
    impurity_after = (share_true * entropy(child_true)) + (share_false * entropy(child_false))
    return impurity_before - impurity_after

### 1.3. Find Best Split Function

In [None]:
def find_best_split(data, impl_part='basic'):
    """
    Search every numeric feature for the threshold with the highest
    information gain.

    Candidate thresholds are the midpoints between consecutive distinct
    values of a feature. Missing values (NaN) are excluded when building the
    candidates: sorting a sequence that contains NaN gives an unreliable
    order and NaN midpoints, which would cause valid splits to be skipped.

    Args:
        data: DataFrame containing the 'hospital_death' target column.
        impl_part: accepted for signature compatibility; not used here.

    Returns:
        Tuple (best_ig, best_threshold, best_feature). When no split is
        possible the tuple is (-1.0, None, None).
    """
    best_ig = -1.0
    best_threshold = None
    best_feature = None
    if data.empty or 'hospital_death' not in data.columns or len(data) < 2:
        return best_ig, best_threshold, best_feature

    for feature in data.columns.drop('hospital_death', errors='ignore'):
        column = data[feature]
        if not pd.api.types.is_numeric_dtype(column):
            continue
        candidates = sorted(column.dropna().unique())
        for lower, upper in zip(candidates, candidates[1:]):
            threshold = (lower + upper) / 2.0
            mask = column <= threshold
            # A threshold that sends every row to one side is not a split.
            if mask.all() or (~mask).all():
                continue
            current_ig = information_gain(data, mask)
            # Strict '>' keeps the first-found split on ties.
            if current_ig > best_ig:
                best_ig, best_threshold, best_feature = current_ig, threshold, feature
    return best_ig, best_threshold, best_feature

### 1.4. Make Partition Function

In [None]:
def make_partition(data, feature, threshold):
    """
    Split `data` into two frames on `feature <= threshold`.

    Args:
        data: DataFrame to partition.
        feature: name of the column to test.
        threshold: numeric cut point.

    Returns:
        Tuple (left_df, right_df). Rows satisfying `feature <= threshold` go
        left; every other row goes right. Rows whose feature value is NaN
        therefore go right, so no row is lost. Two empty DataFrames are
        returned when `feature` is not a column of `data`.
    """
    if feature not in data.columns:
        return pd.DataFrame(), pd.DataFrame()
    goes_left = data[feature] <= threshold
    # Use the complement rather than '> threshold': NaN fails both
    # comparisons and would otherwise be dropped from both children.
    return data[goes_left], data[~goes_left]

### 1.5. Build Tree Function

In [None]:
# Module-level trackers filled by build_tree when impl_part == 'basic'.
ans_features = []
ans_thresholds = []


def build_tree(data, max_depth, min_samples_split, current_depth, impl_part='basic'):
    """
    Recursively grow a binary decision tree for the 'hospital_death' target.

    Args:
        data: DataFrame with feature columns and 'hospital_death'.
        max_depth: recursion stops once current_depth reaches this value.
        min_samples_split: nodes with fewer rows than this become leaves.
        current_depth: depth of the node being built (0 for the root).
        impl_part: when equal to 'basic', every chosen split is recorded in
            the module-level lists ans_features / ans_thresholds.

    Returns:
        Either a leaf label (the most frequent target value of the node), or
        a dict of the form {"<feature> <= <threshold>": [left, right]} whose
        two children are themselves trees. Returns 0 when `data` is empty or
        lacks the target column.

    Side effects:
        Appends to ans_features / ans_thresholds in pre-order (node, then
        left subtree, then right subtree) when impl_part == 'basic'.
    """
    if data.empty or 'hospital_death' not in data.columns:
        return 0
    majority_class = data['hospital_death'].mode()
    leaf_value = majority_class[0] if not majority_class.empty else 0

    is_pure = len(data['hospital_death'].unique()) == 1
    if current_depth >= max_depth or len(data) < min_samples_split or is_pure:
        return leaf_value

    best_ig, best_threshold, best_feature = find_best_split(data, impl_part)
    if best_ig <= 0 or best_feature is None:
        return leaf_value

    left_data, right_data = make_partition(data, best_feature, best_threshold)
    if left_data.empty or right_data.empty:
        return leaf_value

    if impl_part == 'basic':
        ans_features.append(best_feature)
        ans_thresholds.append(best_threshold)

    next_depth = current_depth + 1
    left_subtree = build_tree(left_data, max_depth, min_samples_split, next_depth, impl_part)
    right_subtree = build_tree(right_data, max_depth, min_samples_split, next_depth, impl_part)

    # The threshold is parsed back out of this key at prediction time
    # (classify_data calls float() on it). Write it with full precision so
    # prediction uses exactly the threshold that partitioned the training
    # rows; rounding to 4 decimals could route borderline values to the
    # other side.
    question = f'{best_feature} <= {float(best_threshold)!r}'
    return {question: [left_subtree, right_subtree]}

### 1.6. Classify Data and Make Prediction Functions

In [None]:
def classify_data(instance, tree):
    """
    Route a single instance through a decision tree and return its label.

    Args:
        instance: mapping from feature name to value (e.g. a dict or a
            pandas Series for one row).
        tree: either a leaf label, or a dict of the form
            {"<feature> <= <threshold>": [left_subtree, right_subtree]}.

    Returns:
        The leaf label reached. Values that do not satisfy
        `value <= threshold` (including NaN) follow the right branch.

    Raises:
        ValueError: if a feature named in the tree is absent from `instance`.
    """
    node = tree
    while isinstance(node, dict):
        question = list(node.keys())[0]
        # Split from the right so feature names that contain spaces are
        # kept intact: "<feature> <= <threshold>".
        feature_name, _operator, value_str = question.rsplit(' ', 2)
        if feature_name not in instance:
            raise ValueError(f"Feature '{feature_name}' not in instance.")
        branches = node[question]
        node = branches[0] if instance[feature_name] <= float(value_str) else branches[1]
    return node

def make_prediction(tree, data):
    """
    Classify every row of `data` with `tree`.

    Returns a list with one label per row, or an empty list when `data`
    is empty.
    """
    if data.empty:
        return []
    row_labels = data.apply(lambda row: classify_data(row, tree), axis=1)
    return row_labels.tolist()

## Section 2: Train and Evaluate Decision Tree (Basic Part)

In [None]:
# This section is for the basic decision tree training and evaluation.
# It uses lab2_basic_input.csv and produces ans_features, ans_thresholds, y_pred_basic, ans_f1score_basic.
# For brevity, example outputs from this section are not repeatedly printed in later advanced sections.

### 1. Load Data

In [None]:
# Load the basic dataset; if the CSV is absent, fall back to a random
# 40-row stand-in with a few fixed rows.
try:
    input_data_basic = pd.read_csv('lab2_basic_input.csv')
except FileNotFoundError:
    data_dict_basic = {
        'age': np.random.randint(20, 80, 40),
        'bmi': np.random.uniform(18.5, 40.0, 40).round(2),
        'glucose_apache': np.random.randint(70, 250, 40),
        'hospital_death': np.random.randint(0, 2, 40),
    }
    input_data_basic = pd.DataFrame(data_dict_basic)
    # Pin the first four labels so both classes are present.
    input_data_basic.loc[[0, 1], 'hospital_death'] = 0
    input_data_basic.loc[[2, 3], 'hospital_death'] = 1
    # Pin age and bmi of the first two rows.
    input_data_basic.loc[[0, 1], 'age'] = [30, 60]
    input_data_basic.loc[[0, 1], 'bmi'] = [22.0, 30.0]
# print("Basic Input Data Head:\n", input_data_basic.head())

### 2. Split Data

In [None]:
# Take up to 30 rows for training and up to 10 of the following rows for
# validation, shrinking both when the dataset is smaller than 40 rows.
num_train_basic = min(30, len(input_data_basic))
num_validation_basic = min(10, len(input_data_basic) - num_train_basic)

training_data_basic = input_data_basic.iloc[:num_train_basic]
validation_data_basic = input_data_basic.iloc[
    num_train_basic:num_train_basic + num_validation_basic
]

# Features and labels of the validation slice.
x_validation_basic = validation_data_basic.drop(columns=['hospital_death'], errors='ignore')
if 'hospital_death' in validation_data_basic:
    y_validation_flat_basic = validation_data_basic['hospital_death'].values.flatten()
else:
    y_validation_flat_basic = np.array([])

### 3. Set Tree Parameters and Initialize Tracking Lists

In [None]:
# Hyper-parameters for the basic tree and the starting depth of the root.
max_depth_basic = 2
min_samples_split_basic = 2
depth_basic = 0

# Reset the split trackers before training.
ans_features, ans_thresholds = [], []

### 4. Train Decision Tree

In [None]:
# Train the basic tree only when there is labelled training data.
decision_tree_basic = None
if 'hospital_death' in training_data_basic.columns and not training_data_basic.empty:
    decision_tree_basic = build_tree(
        training_data_basic,
        max_depth_basic,
        min_samples_split_basic,
        depth_basic,
        impl_part='basic',
    )
    # print("Basic Decision Tree Structure:", json.dumps(decision_tree_basic, indent=2))
    # print(f"Features used (Basic): {ans_features}")
    # print(f"Thresholds used (Basic): {ans_thresholds}")

### 5. Make Predictions on Validation Data

In [None]:
# Predict on the validation features with the trained basic tree.
y_pred_basic = []
# Test only for "a tree was built" (not None). A tree that is a single leaf
# with label 0 is falsy but still a valid classifier, so it must not be
# filtered out by a truthiness check.
if decision_tree_basic is not None and not x_validation_basic.empty:
    y_pred_basic = make_prediction(decision_tree_basic, x_validation_basic)
    # print(f"Predictions (y_pred_basic) on basic validation data: {y_pred_basic}")

### 6. Calculate F1-Score

In [None]:
def calculate_score(y_true, y_pred):
    """
    Return the F1 score of `y_pred` against `y_true`.

    Both arguments must be lists or numpy arrays of equal, non-zero length;
    otherwise 0.0 is returned. The score is computed by sklearn's f1_score
    with zero_division=0.
    """
    accepted_types = (list, np.ndarray)
    if not (isinstance(y_true, accepted_types) and isinstance(y_pred, accepted_types)):
        return 0.0

    n_true, n_pred = len(y_true), len(y_pred)
    if n_true == 0 or n_pred == 0 or n_true != n_pred:
        return 0.0

    return f1_score(y_true, y_pred, zero_division=0)

# F1 score on the basic validation split, rounded to 4 decimals; remains
# 0.0 when there are no labels or the prediction count does not match.
ans_f1score_basic = 0.0
if len(y_validation_flat_basic) > 0 and len(y_validation_flat_basic) == len(y_pred_basic):
    ans_f1score_basic = round(calculate_score(y_validation_flat_basic, y_pred_basic), 4)
    # print(f"F1 Score on Basic Validation Data: {ans_f1score_basic}")

## Section 3: Advanced Random Forest (Optional Part)

### 3.1. Build Forest Function

Builds a random forest with bootstrapped data and feature subsets.

In [None]:
def build_forest(data, n_trees, n_features_per_tree, n_samples_per_tree, max_depth, min_samples_split):
    """
    Build a list of decision trees, each trained on a bootstrap sample of
    rows and a random subset of features.

    Args:
        data: DataFrame with feature columns and 'hospital_death'.
        n_trees: number of trees to build.
        n_features_per_tree: features drawn (without replacement) per tree;
            capped at the number of available features.
        n_samples_per_tree: rows drawn (with replacement) per tree.
        max_depth, min_samples_split: passed through to build_tree.

    Returns:
        List of trees as produced by build_tree (impl_part='advanced').
        Empty when `data` has no feature columns.
    """
    target = 'hospital_death'
    candidate_features = [col for col in data.columns if col != target]
    if not candidate_features:
        return []
    n_features_per_tree = min(n_features_per_tree, len(candidate_features))

    forest = []
    for _ in range(n_trees):
        # Bootstrap: draw row labels with replacement.
        row_labels = np.random.choice(data.index, size=n_samples_per_tree, replace=True)
        bootstrap_rows = data.loc[row_labels]
        # Random feature subset, drawn without replacement.
        chosen = np.random.choice(candidate_features, size=n_features_per_tree, replace=False).tolist()
        tree_frame = bootstrap_rows[chosen + [target]]
        forest.append(build_tree(tree_frame, max_depth, min_samples_split, 0, impl_part='advanced'))
    return forest

#### Example Usage for build_forest

In [None]:
# print("--- Example for build_forest ---")
# Example: build a one-tree forest on the sample frame using about half of
# the features and half of the rows.
n_trees_ex_forest = 1  # Small number for quick example run
max_depth_ex_forest = 2
min_samples_ex_forest = 2

all_feat_count_ex = len(sample_df_lab2.columns.drop('hospital_death', errors='ignore'))
if all_feat_count_ex > 0:
    n_feat_tree_ex = max(1, all_feat_count_ex // 2)
else:
    n_feat_tree_ex = 0
n_samp_tree_ex = max(1, len(sample_df_lab2) // 2)

example_forest_run = []
if not sample_df_lab2.empty and n_feat_tree_ex > 0 and n_samp_tree_ex > 0:
    example_forest_run = build_forest(
        sample_df_lab2,
        n_trees_ex_forest,
        n_feat_tree_ex,
        n_samp_tree_ex,
        max_depth_ex_forest,
        min_samples_ex_forest,
    )
    # print(f"\nNumber of trees in example_forest_run: {len(example_forest_run)}")
    # if example_forest_run and isinstance(example_forest_run[0], dict): print(json.dumps(example_forest_run[0], indent=2))
    # elif example_forest_run: print(f"First tree is a leaf: {example_forest_run[0]}")

### 3.2. Make Prediction with Forest Function

Aggregates predictions from all trees in a forest by majority vote.

In [None]:
def make_prediction_forest(forest, data):
    """
    Predict a 0/1 label for every row of `data` by majority vote over the
    trees in `forest`.

    A tree that is neither a dict nor a numeric leaf contributes a vote of 0
    for every row. Ties between the 0-votes and 1-votes resolve to 1.

    Returns:
        List with one prediction per row; empty when `forest` or `data`
        is empty.
    """
    if not forest or data.empty:
        return []

    n_rows = len(data)
    usable_types = (dict, int, float, np.number)
    votes_per_tree = []
    for tree in forest:
        if isinstance(tree, usable_types):
            votes_per_tree.append(make_prediction(tree, data))
        else:
            # Unusable tree: it votes 0 for every instance.
            votes_per_tree.append([0] * n_rows)

    if not votes_per_tree:
        return [0] * n_rows

    try:
        vote_matrix = np.array(votes_per_tree)
    except ValueError:
        # Trees returned lists of differing length: fall back to the first
        # complete set of predictions, or to all zeros.
        for votes in votes_per_tree:
            if votes and len(votes) == n_rows:
                return votes
        return [0] * n_rows

    # One row of the transposed matrix holds all tree votes for an instance.
    return [
        1 if np.count_nonzero(instance_votes == 1) >= np.count_nonzero(instance_votes == 0) else 0
        for instance_votes in vote_matrix.T
    ]

#### Example Usage for make_prediction_forest

In [None]:
# print("--- Example for make_prediction_forest ---")
# Example: run the example forest on the sample frame, but only when both
# objects exist and are non-empty.
if 'example_forest_run' in globals() and 'sample_df_lab2' in globals():
    if isinstance(example_forest_run, list) and example_forest_run and not sample_df_lab2.empty:
        forest_pred_data_ex = sample_df_lab2.drop(columns=['hospital_death'], errors='ignore')
        forest_preds_ex = make_prediction_forest(example_forest_run, forest_pred_data_ex)
        # print(f"Forest Predictions on sample_df_lab2: {forest_preds_ex}")

## Section 4: Train and Evaluate Random Forest (Advanced Part)

### 1. Load Advanced Training and Testing Data

In [None]:
def _dummy_adv_features(n_rows):
    """Return a dict of five random feature columns with `n_rows` values each."""
    return {
        'feature1': np.random.rand(n_rows) * 10,
        'feature2': np.random.rand(n_rows) * 5,
        'feature3': np.random.randint(0, 5, n_rows),
        'feature4': np.random.normal(50, 10, n_rows),
        'feature5': np.random.uniform(1, 10, n_rows),
    }


# Load the advanced train/test CSVs; if either file is missing, replace
# both with randomly generated stand-ins (100 labelled rows, 50 unlabelled).
try:
    advanced_training_data = pd.read_csv('lab2_advanced_training.csv')
    advanced_testing_data = pd.read_csv('lab2_advanced_testing.csv')
    print("Successfully loaded advanced training and testing data.")
except FileNotFoundError:
    print("Advanced data files not found. Creating dummy advanced data.")
    adv_train_dict = _dummy_adv_features(100)
    adv_train_dict['hospital_death'] = np.random.randint(0, 2, 100)
    advanced_training_data = pd.DataFrame(adv_train_dict)
    adv_test_dict = _dummy_adv_features(50)
    advanced_testing_data = pd.DataFrame(adv_test_dict)
    print("Created dummy advanced training and testing data.")

print(f"Advanced training data shape: {advanced_training_data.shape}")
print(f"Advanced testing data shape: {advanced_testing_data.shape}")

### 2. Split Training Data for Validation

In [None]:
def _features_and_labels(frame):
    """Split `frame` into (features, labels); labels are an empty Series if the target is absent."""
    features = frame.drop(columns=['hospital_death'], errors='ignore')
    labels = frame['hospital_death'] if 'hospital_death' in frame else pd.Series()
    return features, labels


# Hold out the rows not drawn by a seeded 80% sample as the validation set.
split_ratio_adv = 0.8
adv_train_df = advanced_training_data.sample(frac=split_ratio_adv, random_state=42)
adv_val_df = advanced_training_data.drop(index=adv_train_df.index)

x_adv_train, y_adv_train = _features_and_labels(adv_train_df)
x_adv_val, y_adv_val = _features_and_labels(adv_val_df)
y_adv_val_flat = y_adv_val.values.flatten()

print(f"Advanced training features shape: {x_adv_train.shape}")
print(f"Advanced validation features shape: {x_adv_val.shape}")

### 3. Set Random Forest Parameters

In [None]:
# Random forest hyper-parameters.
max_depth_rf = 5
min_samples_split_rf = 2
n_trees = 5  # Using a small number for quick example run

# Features per tree: floor(sqrt(number of features)), at least 1 when any
# feature exists, otherwise 0.
n_features_all_adv = x_adv_train.shape[1]
n_features_per_tree = max(1, int(np.sqrt(n_features_all_adv))) if n_features_all_adv > 0 else 0

# Bootstrap size: the training-split size, falling back to the full
# training set size when the split is empty.
n_samples_per_tree = len(adv_train_df) or len(advanced_training_data)

print(f"Random Forest Parameters: max_depth={max_depth_rf}, min_samples_split={min_samples_split_rf}, n_trees={n_trees}")
print(f"Features per tree: {n_features_per_tree}, Samples per tree: {n_samples_per_tree}")

### 4. Train Random Forest

In [None]:
# Train the forest when labelled training rows and at least one feature
# per tree are available; otherwise leave it empty.
adv_forest = []
if adv_train_df.empty or 'hospital_death' not in adv_train_df.columns or n_features_per_tree <= 0:
    print("Cannot build forest: Training data is empty, 'hospital_death' is missing, or no features to select for trees.")
else:
    adv_forest = build_forest(
        adv_train_df,
        n_trees,
        n_features_per_tree,
        n_samples_per_tree,
        max_depth_rf,
        min_samples_split_rf,
    )
    print(f"\nRandom Forest built with {len(adv_forest)} trees.")

### 5. Validate Random Forest on Validation Set

In [None]:
# Score the forest on the held-out validation rows (F1, 4 decimals).
f1_score_val_rf = 0.0
if not adv_forest:
    print("Forest not built. Skipping validation.")
elif x_adv_val.empty or y_adv_val_flat.size == 0:
    print("Validation data is empty or has no labels. Skipping F1 score calculation.")
else:
    y_pred_val_rf = make_prediction_forest(adv_forest, x_adv_val)
    f1_score_val_rf = round(calculate_score(y_adv_val_flat, y_pred_val_rf), 4)
    print(f"F1 Score of Random Forest on Advanced Validation Data: {f1_score_val_rf}")

### 6. Make Predictions on Test Data

In [None]:
# Final predictions for the test set; stays empty when the forest or the
# test data is missing.
advanced = []
if not adv_forest:
    print("Forest not built. Skipping predictions on test data.")
elif advanced_testing_data.empty:
    print("Advanced testing data is empty. No predictions made.")
else:
    y_pred_test_rf = make_prediction_forest(adv_forest, advanced_testing_data)
    advanced = y_pred_test_rf  # Store predictions in the 'advanced' list
    print(f"Predictions made on advanced testing data. First 10 predictions: {advanced[:10]}")

### 7. Write Output File for Test Predictions

In [None]:
# Write the test-set predictions to lab2_advanced.csv with sequential
# 1-based Ids, provided there is one prediction per test row.
_have_prediction_list = 'advanced' in globals() and isinstance(advanced, list)

if _have_prediction_list and advanced:
    if len(advanced) != len(advanced_testing_data):
        print(f"Error: Length of predictions ({len(advanced)}) does not match test data ({len(advanced_testing_data)}). Output file not saved.")
    else:
        # Assuming advanced_testing_data does not have an 'Id' column, so create sequential IDs
        output_df_adv = pd.DataFrame({
            'Id': np.arange(1, len(advanced) + 1),
            'hospital_death': advanced,
        })
        output_df_adv.to_csv('lab2_advanced.csv', index=False)
        print(f"Advanced predictions saved to lab2_advanced.csv. Total predictions: {len(advanced)}")
elif _have_prediction_list and not advanced_testing_data.empty:
    # Reaching this branch with a list means the list is empty.
    print("No predictions in 'advanced' list, but test data exists. Output file not saved.")
else:
    print("Error: 'advanced' predictions list not found or empty, or test data is empty. Cannot save output file.")