<a href="https://colab.research.google.com/github/Khushkataruka/Machine-Learning-SVNIT/blob/main/ML_LAB_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==========================
# Q1: Decision Tree (ID3 & C4.5) on playCricket.csv
# ==========================

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv("playCricket.csv")

# Binarise every numeric feature column (all columns except the last, which is
# used as the class label) at its median, storing the result as the strings
# "True"/"False" so the column becomes a two-valued categorical attribute.
# `is_numeric_dtype` is used instead of comparing the dtype with 'object':
# text columns are not guaranteed to have object dtype (e.g. pandas' dedicated
# string dtype), and calling .median() on such a column raises.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# cross-validation split — confirm that is acceptable for this exercise.
for col in data.columns[:-1]:
    if pd.api.types.is_numeric_dtype(data[col]):
        thr = data[col].median()
        data[col] = (data[col] > thr).astype(str)

# Helper functions
def entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels; anything ``np.unique`` accepts.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        Exactly 0.0 for an empty or single-class input.
    """
    _, counts = np.unique(y, return_counts=True)
    if counts.size == 0:
        return 0.0
    probs = counts / counts.sum()
    # np.unique only reports labels that actually occur, so every p is > 0 and
    # log2(p) is finite. No smoothing term is needed; adding one inside the log
    # makes pure nodes come out slightly negative and biases every other value.
    # The trailing `+ 0.0` turns the -0.0 of a pure node into a plain 0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0

def info_gain(X, y, feature):
    """Information gain obtained by partitioning ``y`` on ``X[feature]``.

    Computed as the entropy of ``y`` minus the size-weighted entropy of the
    label subsets selected by each distinct value of the feature column.

    Parameters
    ----------
    X : DataFrame-like holding the feature columns.
    y : labels, indexable by a boolean mask built from ``X``.
    feature : column key in ``X``.
    """
    column = X[feature]
    n_rows = len(X)
    remainder = 0
    for value in np.unique(column):
        members = column == value  # rows taking this attribute value
        weight = members.sum() / n_rows
        remainder = remainder + weight * entropy(y[members])
    return entropy(y) - remainder

def split_gain_ratio(X, y, feature):
    """Gain ratio of ``feature``: information gain divided by split information.

    Split information is ``-sum(f * log2(f))`` where ``f`` is the fraction of
    rows taking each distinct value of the feature. A small constant is added
    to the denominator so a single-valued feature does not divide by zero.
    """
    column = X[feature]
    n_rows = len(X)
    fractions = np.array(
        [(column == value).sum() / n_rows for value in np.unique(column)]
    )
    split_info = -np.sum(fractions * np.log2(fractions))
    return info_gain(X, y, feature) / (split_info + 1e-9)

# ID3 Algorithm
def id3(X, y):
    """Recursively build a decision tree, choosing splits by information gain.

    Parameters
    ----------
    X : DataFrame of categorical feature columns.
    y : Series of class labels, index-aligned with ``X``.

    Returns
    -------
    A single label when ``y`` is pure or no feature columns remain (the most
    frequent label in the latter case); otherwise a nested dict
    ``{feature: {value: subtree_or_label, ...}}``.
    """
    # Pure node: every remaining sample has the same label.
    if np.unique(y).size == 1:
        return y.iloc[0]
    # Nothing left to split on: fall back to the most frequent label.
    if X.shape[1] == 0:
        return y.mode()[0]

    # First column with the highest gain wins (ties keep column order).
    best = max(X.columns, key=lambda name: info_gain(X, y, name))

    branches = {}
    for val in np.unique(X[best]):
        rows = X[best] == val
        # The chosen attribute is removed so it is not reused further down.
        branches[val] = id3(X[rows].drop(columns=[best]), y[rows])
    return {best: branches}

# C4.5 Algorithm
def c45(X, y):
    """Recursively build a decision tree, choosing splits by gain ratio.

    Parameters
    ----------
    X : DataFrame of categorical feature columns.
    y : Series of class labels, index-aligned with ``X``.

    Returns
    -------
    A single label when ``y`` is pure or no feature columns remain (the most
    frequent label in the latter case); otherwise a nested dict
    ``{feature: {value: subtree_or_label, ...}}``.
    """
    # Pure node: every remaining sample has the same label.
    if np.unique(y).size == 1:
        return y.iloc[0]
    # Nothing left to split on: fall back to the most frequent label.
    if X.shape[1] == 0:
        return y.mode()[0]

    # First column with the highest gain ratio wins (ties keep column order).
    best = max(X.columns, key=lambda name: split_gain_ratio(X, y, name))

    branches = {}
    for val in np.unique(X[best]):
        rows = X[best] == val
        # The chosen attribute is removed so it is not reused further down.
        branches[val] = c45(X[rows].drop(columns=[best]), y[rows])
    return {best: branches}

# Simple predict function
def predict(tree, sample):
    """Classify ``sample`` by walking a nested-dict decision tree.

    Parameters
    ----------
    tree : dict or label
        Either a leaf label, or ``{feature: {value: subtree_or_label, ...}}``.
    sample : mapping-like
        Anything supporting ``sample[feature]`` (e.g. a pandas row or a dict).

    Returns
    -------
    The leaf label reached. When the sample's value for a node's feature has
    no branch, the node's first branch is followed instead.
    """
    if not isinstance(tree, dict):
        return tree
    feature = next(iter(tree))
    branches = tree[feature]
    val = sample[feature]
    if val in branches:
        return predict(branches[val], sample)
    # Unseen attribute value: fall back to the first branch, but keep
    # descending so that a class label — never a raw sub-tree dict — is
    # returned to the caller.
    return predict(list(branches.values())[0], sample)

# 5-Fold Cross Validation
X, y = data.iloc[:,:-1], data.iloc[:,-1]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Algorithm label -> tree builder; dict order fixes the order of the report.
builders = {"ID3": id3, "C4.5": c45}

for algo, grow_tree in builders.items():
    accs = []
    for train, test in kf.split(X):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        tree = grow_tree(X_train, y_train)
        # Classify the held-out rows one at a time.
        preds = []
        for _, row in X_test.iterrows():
            preds.append(predict(tree, row))
        accs.append(accuracy_score(y_test, preds))
    # Report the accuracy averaged over the folds.
    print(f"{algo} Accuracy (PlayCricket):", np.mean(accs))


ID3 Accuracy (PlayCricket): 0.39999999999999997
C4.5 Accuracy (PlayCricket): 0.4666666666666667


In [2]:
# ==========================
# Q2: Decision Tree (ID3 & C4.5) on drug_200.csv
# ==========================

data2 = pd.read_csv("drug_200.csv")

# Binarise every numeric feature column (all columns except the last, which is
# used as the class label) at its median, storing "True"/"False" strings.
# `is_numeric_dtype` is used instead of comparing the dtype with 'object':
# text columns are not guaranteed to have object dtype (e.g. pandas' dedicated
# string dtype), and calling .median() on such a column raises.
# NOTE(review): the median is computed on the full dataset, i.e. before the
# cross-validation split — confirm that is acceptable for this exercise.
for col in data2.columns[:-1]:
    if pd.api.types.is_numeric_dtype(data2[col]):
        thr = data2[col].median()
        data2[col] = (data2[col] > thr).astype(str)

X, y = data2.iloc[:,:-1], data2.iloc[:,-1]

# Same splitter settings as the PlayCricket experiment, defined here so this
# cell does not rely on a `kf` left behind by another cell.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for algo in ["ID3", "C4.5"]:
    accs = []
    for train, test in kf.split(X):
        X_train, y_train = X.iloc[train], y.iloc[train]
        X_test, y_test = X.iloc[test], y.iloc[test]
        tree = id3(X_train, y_train) if algo=="ID3" else c45(X_train, y_train)
        preds = [predict(tree, row) for _, row in X_test.iterrows()]
        accs.append(accuracy_score(y_test, preds))
    # Report the accuracy averaged over the folds.
    print(f"{algo} Accuracy (Drug200):", np.mean(accs))


ID3 Accuracy (Drug200): 0.8800000000000001
C4.5 Accuracy (Drug200): 0.8800000000000001


In [4]:
# ==========================
# Q3: Decision Tree Regression on petrol_consumption.csv
# ==========================

from sklearn.metrics import r2_score, mean_squared_error

data3 = pd.read_csv("petrol_consumption.csv")

# Work on plain NumPy arrays from here on: every column but the last is a
# feature, the last column is the regression target.
X = data3.iloc[:,:-1].values
y = data3.iloc[:,-1].values

class Node:
    """One node of the regression tree.

    As used in this notebook, an internal node stores a split (``feat``,
    ``thr``) plus its two children, and a leaf stores only a prediction in
    ``val``. Every attribute defaults to ``None``.
    """

    def __init__(self, feat=None, thr=None, left=None, right=None, val=None):
        # Split definition: feature column index and threshold.
        self.feat, self.thr = feat, thr
        # Child sub-trees.
        self.left, self.right = left, right
        # Predicted value held by a leaf.
        self.val = val

def mse(y):
    """Mean squared deviation of the values in ``y`` from their own mean."""
    deviations = y - np.mean(y)
    return np.mean(deviations * deviations)

def best_split(X, y):
    """Find the (feature, threshold) pair giving the largest MSE reduction.

    Every distinct value of every column is tried as a threshold; samples with
    ``value <= thr`` go left and the rest go right. Splits that leave one side
    empty are ignored.

    Parameters
    ----------
    X : 2-D NumPy array, one row per sample.
    y : 1-D NumPy array of targets, same length as ``X``.

    Returns
    -------
    (feat, thr)
        Column index and threshold of the best split, or ``(None, None)`` when
        no split separates the samples.
    """
    n_samples = len(y)
    if n_samples == 0:
        # No candidate thresholds exist; avoids taking the mean of nothing.
        return None, None

    # The parent impurity is the same for every candidate, so compute it once
    # instead of once per threshold.
    parent_mse = mse(y)

    best_feat, best_thr, best_gain = None, None, -1
    for feat in range(X.shape[1]):
        column = X[:, feat]
        for thr in np.unique(column):
            left, right = y[column <= thr], y[column > thr]
            if len(left) == 0 or len(right) == 0:
                continue
            gain = parent_mse - (len(left)/n_samples)*mse(left) - (len(right)/n_samples)*mse(right)
            # Strict '>' keeps the first candidate among equally good splits.
            if gain > best_gain:
                best_feat, best_thr, best_gain = feat, thr, gain
    return best_feat, best_thr

def build_tree(X, y, depth=0, max_depth=5):
    """Recursively grow a regression tree and return its root ``Node``.

    Growth stops, producing a leaf that predicts the mean of ``y``, when all
    targets are identical, when ``depth`` reaches ``max_depth``, or when
    ``best_split`` finds no usable split.
    """
    all_same = len(np.unique(y)) == 1
    if all_same or depth >= max_depth:
        return Node(val=np.mean(y))

    feat, thr = best_split(X, y)
    if feat is None:
        return Node(val=np.mean(y))

    # Partition the samples on the chosen split and grow each side.
    goes_left = X[:, feat] <= thr
    goes_right = X[:, feat] > thr
    next_depth = depth + 1
    left_child = build_tree(X[goes_left], y[goes_left], next_depth, max_depth)
    right_child = build_tree(X[goes_right], y[goes_right], next_depth, max_depth)
    return Node(feat=feat, thr=thr, left=left_child, right=right_child)

def predict_reg(node, x):
    """Return the prediction for feature vector ``x`` from the tree at ``node``.

    Descends from ``node`` until a node with a non-``None`` ``val`` is reached,
    going left when ``x[feat] <= thr`` and right otherwise.
    """
    current = node
    while current.val is None:
        current = current.left if x[current.feat] <= current.thr else current.right
    return current.val

kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2s, rmses = [], []

for train, test in kf.split(X):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    # Fit on the training fold, then score the held-out fold.
    tree = build_tree(X_train, y_train)
    preds = np.array([predict_reg(tree, x) for x in X_test])
    fold_mse = mean_squared_error(y_test, preds)
    r2s.append(r2_score(y_test, preds))
    rmses.append(np.sqrt(fold_mse))

# Report each metric averaged over the folds.
print("Regression Decision Tree:")
for label, scores in (("Average R2:", r2s), ("Average RMSE:", rmses)):
    print(label, np.mean(scores))


Regression Decision Tree:
Average R2: -0.15888089862827232
Average RMSE: 96.59973085862343
