In [2]:
import numpy as np
import pandas as pd
from collections import Counter

### Question 1

In [3]:
df = pd.read_csv('weather.csv',index_col = 'Day')
df

Unnamed: 0_level_0,Outlook,Temp,Humidity,Wind,Decision
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Sunny,85,85,Weak,No
2,Sunny,80,90,Strong,No
3,Overcast,83,78,Weak,Yes
4,Rain,70,96,Weak,Yes
5,Rain,68,80,Weak,Yes
6,Rain,65,70,Strong,No
7,Overcast,64,65,Strong,Yes
8,Sunny,72,95,Weak,No
9,Sunny,69,70,Weak,Yes
10,Rain,75,80,Weak,Yes


In [4]:
df = df.to_numpy()
df

array([['Sunny', 85, 85, 'Weak', 'No'],
       ['Sunny', 80, 90, 'Strong', 'No'],
       ['Overcast', 83, 78, 'Weak', 'Yes'],
       ['Rain', 70, 96, 'Weak', 'Yes'],
       ['Rain', 68, 80, 'Weak', 'Yes'],
       ['Rain', 65, 70, 'Strong', 'No'],
       ['Overcast', 64, 65, 'Strong', 'Yes'],
       ['Sunny', 72, 95, 'Weak', 'No'],
       ['Sunny', 69, 70, 'Weak', 'Yes'],
       ['Rain', 75, 80, 'Weak', 'Yes'],
       ['Sunny', 75, 70, 'Strong', 'Yes'],
       ['Overcast', 72, 90, 'Strong', 'Yes'],
       ['Overcast', 81, 75, 'Weak', 'Yes'],
       ['Rain', 71, 80, 'Strong', 'No']], dtype=object)

In [5]:
def entropy(data):
    target = data[:, -1]
    _, counts = np.unique(target, return_counts=True)
    probabilities = counts / counts.sum()
    return -sum(p * np.log2(p) for p in probabilities if p > 0)

In [6]:
def information_gain(data, split_attribute, base_entropy):
    values, counts = np.unique(data[:, split_attribute], return_counts=True)
    weighted_entropy = sum((counts[i] / sum(counts)) * entropy(data[data[:, split_attribute] == value]) for i, value in enumerate(values))
    return base_entropy - weighted_entropy

In [7]:
def gain_ratio(data, split_attribute):
    base_entropy = entropy(data)
    info_gain = information_gain(data, split_attribute, base_entropy)
    values, counts = np.unique(data[:, split_attribute], return_counts=True)
    split_entropy = -sum((counts[i] / sum(counts)) * np.log2(counts[i] / sum(counts)) for i in range(len(values)) if counts[i] > 0)
    if split_entropy == 0:
        return 0
    return info_gain / split_entropy

In [8]:
def best_attribute_to_split(data):
    num_attributes = data.shape[1] - 1
    gain_ratios = [gain_ratio(data, i) for i in range(num_attributes)]
    return np.argmax(gain_ratios)

def build_tree_c45(data, attributes, tree=None):
    target = data[:, -1]
    unique_targets = np.unique(target)
    
    if len(unique_targets) == 1:
        return unique_targets[0]
    
    if len(attributes) == 0:
        return Counter(target).most_common(1)[0][0]
    
    best_attr_index = best_attribute_to_split(data)
    best_attr = attributes[best_attr_index]
    
    if tree is None:
        tree = {}
        tree[best_attr] = {}

    attr_values = np.unique(data[:, best_attr_index])
    new_attributes = [attr for i, attr in enumerate(attributes) if i != best_attr_index]
    
    for value in attr_values:
        subset = data[data[:, best_attr_index] == value]
        subtree = build_tree_c45(subset, new_attributes)
        tree[best_attr][value] = subtree
    
    return tree

In [9]:
def classify_c45(tree, sample, attributes):
    if not isinstance(tree, dict):
        return tree
    root_attr = next(iter(tree))
    attr_index = attributes.index(root_attr)
    attr_value = sample[attr_index]
    subtree = tree[root_attr].get(attr_value, None)
    
    if subtree is None:
        print(f"Missing branch for {root_attr} = {attr_value}")
        return None
    
    return classify_c45(subtree, sample, attributes) if isinstance(subtree, dict) else subtree


In [10]:
attributes = ['Outlook', 'Temp', 'Humidity', 'Wind']

tree = build_tree_c45(df, attributes)
print("Decision Tree:", tree)
sample = ['Sunny', 85,85, 'Strong']
classification = classify_c45(tree, sample, attributes)
print("Classification of new sample:", classification)

Decision Tree: {'Temp': {64: 'Yes', 65: 'No', 68: 'Yes', 69: 'Yes', 70: 'Yes', 71: 'No', 72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}}, 75: 'Yes', 80: 'No', 81: 'Yes', 83: 'Yes', 85: 'No'}}
Classification of new sample: No


### Question 2

In [11]:
def gini_impurity(data):
    target = data[:, -1]
    _, counts = np.unique(target, return_counts=True)
    probabilities = counts / counts.sum()
    return 1 - sum(p ** 2 for p in probabilities)

def split_data(data, feature_index, threshold):
    left_split = data[data[:, feature_index] <= threshold]
    right_split = data[data[:, feature_index] > threshold]
    return left_split, right_split

In [12]:
def weighted_gini(data, feature_index, threshold):
    left_split, right_split = split_data(data, feature_index, threshold)
    n = len(data)
    weighted_impurity = (
        len(left_split) / n * gini_impurity(left_split) + 
        len(right_split) / n * gini_impurity(right_split)
    )
    return weighted_impurity

In [13]:
def best_split_for_feature(data, feature_index):
    unique_values = np.unique(data[:, feature_index])
    best_threshold, best_gini = None, float('inf')
    
    for threshold in unique_values:
        current_gini = weighted_gini(data, feature_index, threshold)
        if current_gini < best_gini:
            best_gini, best_threshold = current_gini, threshold
            
    return best_threshold, best_gini

def best_split(data):
    num_features = data.shape[1] - 1
    best_feature, best_threshold, best_gini = None, None, float('inf')
    
    for feature_index in range(num_features):
        threshold, gini = best_split_for_feature(data, feature_index)
        if gini < best_gini:
            best_feature, best_threshold, best_gini = feature_index, threshold, gini
            
    return best_feature, best_threshold

In [14]:
def build_tree_cart(data, depth=0, max_depth=3, min_samples_split=2):
    target = data[:, -1]
    unique_targets = np.unique(target)
    
    if len(unique_targets) == 1 or len(data) < min_samples_split or depth >= max_depth:
        return Counter(target).most_common(1)[0][0]
    
    feature_index, threshold = best_split(data)
    
    if feature_index is None:
        return Counter(target).most_common(1)[0][0]
    
    tree = {'feature': feature_index, 'threshold': threshold, 'left': None, 'right': None}
    
    left_split, right_split = split_data(data, feature_index, threshold)
    tree['left'] = build_tree_cart(left_split, depth + 1, max_depth, min_samples_split)
    tree['right'] = build_tree_cart(right_split, depth + 1, max_depth, min_samples_split)
    
    return tree

In [15]:
def classify_cart(tree, sample):
    if isinstance(tree, dict):
        feature_index, threshold = tree['feature'], tree['threshold']
        if sample[feature_index] <= threshold:
            return classify_cart(tree['left'], sample)
        else:
            return classify_cart(tree['right'], sample)
    else:
        return tree


In [16]:
attributes = ['Outlook', 'Temp', 'Humidity', 'Wind']
tree = build_tree_cart(df)
print("Decision Tree:", tree)
sample = ['Sunny', 85, 85, 'Strong']
classification = classify_cart(tree, sample)
print("Classification of new sample:", classification)

Decision Tree: {'feature': 0, 'threshold': 'Overcast', 'left': 'Yes', 'right': {'feature': 1, 'threshold': 75, 'left': {'feature': 1, 'threshold': 65, 'left': 'No', 'right': 'Yes'}, 'right': 'No'}}
Classification of new sample: No


### Question 3

In [17]:
data = [
    ['Low', 'Good', 'Yes'],
    ['Low', 'Bad', 'No'],
    ['Medium', 'Good', 'Yes'],
    ['Medium', 'Bad', 'Yes'],
    ['High', 'Good', 'Yes'],
    ['High', 'Bad', 'No']
]
df = np.array(data)
df

array([['Low', 'Good', 'Yes'],
       ['Low', 'Bad', 'No'],
       ['Medium', 'Good', 'Yes'],
       ['Medium', 'Bad', 'Yes'],
       ['High', 'Good', 'Yes'],
       ['High', 'Bad', 'No']], dtype='<U6')

In [18]:
attributes = ['Income','Credit','Loan Approved']
sample = ['Low','Good']

In [19]:
from sklearn.tree import DecisionTreeClassifier

#### C4.5 algorithm

In [20]:
tree_c45 = build_tree_c45(df, attributes)
print("Decision Tree:", tree)
classification = classify_c45(tree_c45, sample, attributes)
print("Classification of new sample:", classification)

Decision Tree: {'feature': 0, 'threshold': 'Overcast', 'left': 'Yes', 'right': {'feature': 1, 'threshold': 75, 'left': {'feature': 1, 'threshold': 65, 'left': 'No', 'right': 'Yes'}, 'right': 'No'}}
Classification of new sample: Yes


In [21]:
X = [row[:2] for row in data]
y = [row[2] for row in data]
le_X_income = LabelEncoder()
le_X_credit = LabelEncoder()
le_y = LabelEncoder()

X_encoded = np.column_stack((
    le_X_income.fit_transform([row[0] for row in X]),
    le_X_credit.fit_transform([row[1] for row in X])
))

y_encoded = le_y.fit_transform(y)

c4_5_model = DecisionTreeClassifier(criterion="entropy")
c4_5_model.fit(X_encoded, y_encoded)

sample_encoded = np.array([
    le_X_income.transform([sample[0]])[0],  
    le_X_credit.transform([sample[1]])[0]
]).reshape(1, -1)

# Predict using C4.5-like model
c4_5_prediction = c4_5_model.predict(sample_encoded)
c4_5_prediction_label = le_y.inverse_transform(c4_5_prediction)
print("C4.5-like Prediction:", c4_5_prediction_label[0])


NameError: name 'LabelEncoder' is not defined

#### CART algorithtm

In [22]:
tree_cart = build_tree_cart(df)
print("Decision Tree:", tree)
classification = classify_cart(tree_cart, sample)
print("Classification of new sample:", classification)

Decision Tree: {'feature': 0, 'threshold': 'Overcast', 'left': 'Yes', 'right': {'feature': 1, 'threshold': 75, 'left': {'feature': 1, 'threshold': 65, 'left': 'No', 'right': 'Yes'}, 'right': 'No'}}
Classification of new sample: Yes


In [23]:
cart_model = DecisionTreeClassifier(criterion="gini")
cart_model.fit(X_encoded, y_encoded)

cart_prediction = cart_model.predict(sample_encoded)
cart_prediction_label = le_y.inverse_transform(cart_prediction)
print("CART Prediction:", cart_prediction_label[0])

NameError: name 'X_encoded' is not defined