In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def entropy(target):
    """Return the Shannon entropy, in bits, of the class labels in ``target``.

    Args:
        target: One-dimensional array-like of class labels. Any label type
            that ``np.unique`` can sort is accepted (integers, including
            negative ones, strings, floats).

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed label
        frequencies. An empty ``target`` yields ``0.0``.
    """
    labels = np.asarray(target)
    if labels.size == 0:
        # An empty split carries no uncertainty; also avoids dividing by zero.
        return 0.0

    # np.unique counts occurrences for any sortable dtype, whereas
    # np.bincount only accepts non-negative integers. Every returned count
    # is >= 1, so log2 never sees a zero probability.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / labels.size
    return -np.sum(probs * np.log2(probs))

def information_gain(features, target, feature_idx):
    """Return the information gain of splitting ``target`` on one feature.

    Every distinct value in column ``feature_idx`` of ``features`` is treated
    as its own branch. The gain is the entropy of ``target`` minus the
    size-weighted entropy of the target subsets falling into each branch.

    Args:
        features: Two-dimensional array indexed as ``features[:, feature_idx]``.
        target: Array of labels aligned row-by-row with ``features``.
        feature_idx: Column index of the feature to evaluate.

    Returns:
        The entropy reduction achieved by the split.
    """
    baseline = entropy(target)

    column = features[:, feature_idx]
    n_samples = len(target)
    distinct_values, occurrences = np.unique(column, return_counts=True)

    # One weighted entropy term per distinct feature value.
    branch_terms = []
    for value, occurrence in zip(distinct_values, occurrences):
        branch_entropy = entropy(target[column == value])
        branch_terms.append((occurrence / n_samples) * branch_entropy)

    return baseline - np.sum(branch_terms)

def find_root_feature(features, target):
    """Return the column index of the feature with the largest information gain.

    Args:
        features: Two-dimensional array with one column per candidate feature.
        target: Array of labels aligned row-by-row with ``features``.

    Returns:
        The index chosen by ``np.argmax`` over the per-column gains.
    """
    gain_per_column = []
    for column_idx in range(features.shape[1]):
        gain_per_column.append(information_gain(features, target, column_idx))
    return np.argmax(gain_per_column)

def handle_null_values(df, feature):
    """Replace missing values in a numeric column with the column mean.

    The DataFrame is modified in place; nothing is returned.

    Args:
        df: DataFrame containing the column.
        feature: Name of the numeric column to impute.
    """
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on the selected column operates on an intermediate
    # object: it emits a FutureWarning in pandas 2.x and leaves ``df``
    # unchanged once Copy-on-Write is active.
    df[feature] = df[feature].fillna(df[feature].mean())

def handle_categorical_nulls(df, feature):
    """Replace missing values in a categorical column with its most frequent value.

    The DataFrame is modified in place; nothing is returned. When several
    values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used. A column with no non-null values is left
    untouched because there is nothing to impute with.

    Args:
        df: DataFrame containing the column.
        feature: Name of the categorical column to impute.
    """
    modes = df[feature].mode()
    if modes.empty:
        # All values are missing: mirror the mean-based imputer, which is a
        # no-op in the same situation, instead of failing on modes[0].
        return

    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on the selected column operates on an intermediate
    # object: it emits a FutureWarning in pandas 2.x and leaves ``df``
    # unchanged once Copy-on-Write is active.
    df[feature] = df[feature].fillna(modes.iloc[0])

def encode_labels(df, feature):
    """Overwrite a column with the codes produced by a freshly fitted LabelEncoder.

    The DataFrame is modified in place; nothing is returned. The encoder is
    local to this call and is not kept.

    Args:
        df: DataFrame containing the column.
        feature: Name of the column to encode.
    """
    encoded_column = LabelEncoder().fit_transform(df[feature])
    df[feature] = encoded_column

# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("weatherAUS.csv")

# Impute missing values: most frequent label for the categorical column,
# column mean for the two numeric columns.
handle_categorical_nulls(df, 'RainToday')
handle_null_values(df, 'Pressure9am')
handle_null_values(df, 'Humidity9am')

# Turn the RainToday labels into integer codes.
encode_labels(df, 'RainToday')

# Feature matrix (column 0 = Pressure9am, column 1 = Humidity9am) and labels.
features = df.loc[:, ['Pressure9am', 'Humidity9am']].to_numpy()
target = df.loc[:, 'RainToday'].to_numpy()

# Pick the feature with the highest information gain as the tree root.
root_feature_idx = find_root_feature(features, target)
print("Root node feature index:", root_feature_idx)


Root node feature index: 1
