In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

### Data

In [42]:
import numpy as np
import pandas as pd

# Set seed for reproducibility
np.random.seed(45) #anchor the results from the sample (same nb of WPM)

# Number of samples
n_samples = 50

# Generate random data
water = np.random.uniform(3, 6, n_samples)
protein = np.random.uniform(22, 36, n_samples)
ph = np.random.uniform(6, 7.4, n_samples)
fat = np.random.uniform(22, 30, n_samples)

# Define acceptable ranges for each feature
water_ok = (water >= 4*0.95) & (water <= 5*1.05)
protein_ok = (protein >= 24*0.95) & (protein <= 34*1.05)
ph_ok = (ph >= 6.5*0.95) & (ph <= 6.9*1.05)
fat_ok = (fat >= 26*0.95) & (fat <= 26*1.05)

# Create X: binary indicators for each feature
X = np.column_stack([water_ok, protein_ok, ph_ok, fat_ok]).astype(int)

# Create y: 1 if all four are within range, else 0
y = (water_ok & protein_ok & ph_ok & fat_ok).astype(int)

# Optional: create DataFrame for inspection
df = pd.DataFrame({
    'Water': water,
    'Protein': protein,
    'pH': ph,
    'Fat': fat,
    'WMP_1': y
})

print("Example:")
print(df.iloc[49])
print("\nX[49] =", X[49])
print("y[49] =", y[49])
print(type(X))
print("\nNumber of WMP:", df['WMP_1'].sum())


Example:
Water       3.200815
Protein    31.360904
pH          6.880067
Fat        29.011710
WMP_1       0.000000
Name: 49, dtype: float64

X[49] = [0 1 1 0]
y[49] = 0
<class 'numpy.ndarray'>

Number of WMP: 6


In [30]:
X

array([[0, 1, 1, 1],
       [1, 1, 1, 0],
       [1, 1, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 1],
       [0, 1, 1, 1],
       [0, 1, 1, 0],
       [1, 1, 0, 1],
       [0, 1, 0, 0],
       [1, 1, 1, 0],
       [0, 1, 0, 0],
       [1, 1, 0, 1],
       [1, 1, 0, 0],
       [1, 1, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 1, 0],
       [1, 1, 1, 1],
       [1, 1, 1, 0],
       [1, 1, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 1, 1],
       [1, 1, 1, 1],
       [0, 1, 1, 0],
       [1, 1, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 1, 1, 1],
       [1, 1, 0, 0],
       [0, 1, 1, 1],
       [1, 1, 1, 1],
       [1, 1, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 1, 1],
       [1, 1, 0, 0],
       [0, 1, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 1, 0],
       [1, 1, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 0, 1],
       [0, 1, 1, 0],
       [0, 0, 1, 0],
       [1, 1, 1, 1],
       [0, 1,

In [31]:
y

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0])

### Entropy, Split and Information Gain

In [None]:
def compute_entropy(y):
     """
    Computes the entropy
    """
     entropy = 0
     if len(y) != 0:
          p1 = len(y[y==1]) / len(y)
          if p1 != 0 and p1 != 1:
             entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
          else:
             entropy = 0
     return entropy
        

In [33]:
print("Entropy at root node: ", compute_entropy(y)) 

Entropy at root node:  0.5293608652873644


In [34]:
def split_dataset(X, node_indices, feature):
    """
    Splits the data at the given node into left and right branches
    """
    
    # You need to return the following variables correctly
    left_indices = []
    right_indices = []
    
    for i in node_indices:   
        if X[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
        
    return left_indices, right_indices

In [35]:
#counter = -1
root_indices = [int(counter) for counter in range(y.shape[0])]
print(root_indices[-5:])

feature = 0
left_indices, right_indices = split_dataset(X, root_indices, feature)

print("Left indices: ", left_indices)
print("Right indices: ", right_indices)

[45, 46, 47, 48, 49]
Left indices:  [1, 2, 4, 5, 9, 11, 13, 14, 15, 17, 18, 19, 20, 23, 25, 30, 32, 33, 36, 37, 40, 41, 43, 46]
Right indices:  [0, 3, 6, 7, 8, 10, 12, 16, 21, 22, 24, 26, 27, 28, 29, 31, 34, 35, 38, 39, 42, 44, 45, 47, 48, 49]


In [36]:
def compute_information_gain(X, y, node_indices, feature):
    
    """
    Compute the information of splitting the node on a given feature
    """    
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)
    
    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]
    
    # You need to return the following variables correctly
    information_gain = 0
    
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    
    # Weights 
    w_left = len(X_left) / len(X_node)
    w_right = len(X_right) / len(X_node)
    
    #Weighted entropy
    weighted_entropy = w_left * left_entropy + w_right * right_entropy
    
    #Information gain 
    information_gain = node_entropy - weighted_entropy
        
    return information_gain

In [37]:
info_gain0 = compute_information_gain(X, y, root_indices, feature=0)
print("Information Gain from splitting the root on water: ", info_gain0)
    
info_gain1 = compute_information_gain(X, y, root_indices, feature=1)
print("Information Gain from splitting the root on protein: ", info_gain1)

info_gain2 = compute_information_gain(X, y, root_indices, feature=2)
print("Information Gain from splitting the root on ph: ", info_gain2)

info_gain3 = compute_information_gain(X, y, root_indices, feature=3)
print("Information Gain from splitting the root on fat: ", info_gain3)


Information Gain from splitting the root on water:  0.13994736554698062
Information Gain from splitting the root on protein:  0.019502309389749506
Information Gain from splitting the root on ph:  0.05616258724834294
Information Gain from splitting the root on fat:  0.2108939554489931


In [38]:
def get_best_split(X, y, node_indices):   
    """
    Returns the optimal feature and threshold value to split the node data 
    """    
    
    # Some useful variables
    num_features = X.shape[1]
    
    # You need to return the following variables correctly
    best_feature = -1
    
    max_info_gain=0
    for feature in range(num_features):
        info_gain = compute_information_gain(X, y, node_indices, feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
                       
    return best_feature


In [39]:
best_feature = get_best_split(X, y, root_indices)
print("Best feature to split on: %d" % best_feature)

Best feature to split on: 3


### Build the decision tree

In [40]:
# Not graded
tree = []

def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that split the dataset into 2 subgroups at each node.
    This function just prints the tree.
    """ 

    # Maximum depth reached - stop splitting
    if current_depth == max_depth:
        formatting = " "*current_depth + "-"*current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return
   
    # Otherwise, get best split and split the data
    # Get the best feature and threshold at this node
    best_feature = get_best_split(X, y, node_indices) 
    tree.append((current_depth, branch_name, best_feature, node_indices))
    
    formatting = "-"*current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
    
    # Split the dataset at the best feature
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    
    # continue splitting the left and the right child. Increment current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)


In [41]:
build_tree_recursive(X, y, root_indices, "Root", max_depth=2, current_depth=0)

 Depth 0, Root: Split on feature: 3
- Depth 1, Left: Split on feature: 0
  -- Left leaf node with indices [5, 9, 13, 18, 20, 23, 32, 36, 43, 46]
  -- Right leaf node with indices [0, 6, 7, 22, 29, 31, 48]
- Depth 1, Right: Split on feature: -1
  -- Left leaf node with indices []
  -- Right leaf node with indices [1, 2, 3, 4, 8, 10, 11, 12, 14, 15, 16, 17, 19, 21, 24, 25, 26, 27, 28, 30, 33, 34, 35, 37, 38, 39, 40, 41, 42, 44, 45, 47, 49]
