In [1]:
%pip install numpy
%pip install pandas
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Cell 1: Imports and TreeNode class
import numpy as np  # For numerical operations and arrays

# Define the TreeNode class to represent each node in the decision tree
class TreeNode:
    def __init__(self, data, feature_idx, feature_val, prediction_probs, information_gain):
        # Initialize node attributes
        self.data = data  # Dataset at this node
        self.feature_idx = feature_idx  # Index of the feature used for splitting
        self.feature_val = feature_val  # Value threshold for the split
        self.prediction_probs = prediction_probs  # Class probabilities for leaf predictions
        self.information_gain = information_gain  # Gain from this split (for debugging)
        self.left = None  # Left child node (for values < feature_val)
        self.right = None  # Right child node (for values >= feature_val)

In [3]:
# Cell 2: DecisionTree class definition
class DecisionTree:
    def __init__(self, max_depth=4, min_samples_leaf=1, min_information_gain=0.0):
        # Constructor to set hyperparameters
        self.max_depth = max_depth  # Maximum depth to prevent overfitting
        self.min_samples_leaf = min_samples_leaf  # Minimum samples per leaf to stop splitting
        self.min_information_gain = min_information_gain  # Minimum gain required for a split
        self.tree = None  # Root of the tree, built during training
        self.labels_in_train = None  # Will be set during training (unique labels)

    def entropy(self, class_probabilities):
        # Calculate entropy for impurity measurement
        # Entropy is low when classes are pure
        return sum([-p * np.log2(p) for p in class_probabilities if p > 0])

    def find_label_probs(self, data):
        # Compute class probabilities from labels in data
        # Return a probability vector whose length equals number of classes seen in training
        labels = data[:, -1]  # Last column is assumed to be labels
        if self.labels_in_train is None:
            # If labels not set yet (shouldn't happen during normal train flow), fall back to unique labels in this data
            unique, counts = np.unique(labels, return_counts=True)
            return counts / len(labels)
        # Build probabilities for each class in self.labels_in_train order
        probs = np.array([np.mean(labels == c) for c in self.labels_in_train], dtype=float)
        return probs

    def partition_entropy(self, partitions):
        # Weighted entropy of child partitions after a split
        total = sum(len(p) for p in partitions)
        return sum(self.entropy(self.find_label_probs(np.array(p))) * len(p) / total for p in partitions if len(p) > 0)

    def split(self, data, feature_idx, feature_val):
        # Split data into two groups based on feature value
        mask = data[:, feature_idx] < feature_val  # Boolean mask for left/right
        g1 = data[mask]  # Left group
        g2 = data[~mask]  # Right group
        return g1, g2

    def find_best_split(self, data):
        # Find the best feature and value for splitting by minimizing partition entropy
        min_part_entropy = 1e6  # Initialize to a large number
        min_entropy_feature_idx = None
        min_entropy_feature_val = None
        g1_min, g2_min = None, None
        for idx in range(data.shape[1] - 1):  # Loop over features (exclude label)
            feature_val = np.median(data[:, idx])  # Use median as split value
            g1, g2 = self.split(data, idx, feature_val)
            # Pass the full partitions (rows) to partition_entropy so find_label_probs can extract labels
            part_entropy = self.partition_entropy([g1, g2])
            if part_entropy < min_part_entropy:
                min_part_entropy = part_entropy
                min_entropy_feature_idx = idx
                min_entropy_feature_val = feature_val
                g1_min, g2_min = g1, g2
        return g1_min, g2_min, min_entropy_feature_idx, min_entropy_feature_val, min_part_entropy

    def create_tree(self, data, current_depth):
        # Recursively build the tree
        if current_depth >= self.max_depth:  # Stop if max depth reached
            return None
        split_1_data, split_2_data, split_feature_idx, split_feature_val, split_entropy = self.find_best_split(data)
        label_probabilities = self.find_label_probs(data)
        node_entropy = self.entropy(label_probabilities)
        information_gain = node_entropy - split_entropy  # Gain = parent entropy - weighted child entropy
        node = TreeNode(data, split_feature_idx, split_feature_val, label_probabilities, information_gain)
        # If one of the splits is empty (or too small), or gain too low, make this a leaf
        if split_1_data is None or split_2_data is None:
            return node
        if self.min_samples_leaf > split_1_data.shape[0] or self.min_samples_leaf > split_2_data.shape[0]:  # Stop if leaves too small
            return node
        if information_gain < self.min_information_gain:  # Stop if gain too low
            return node
        current_depth += 1  # Increment depth
        node.left = self.create_tree(split_1_data, current_depth)  # Build left subtree
        node.right = self.create_tree(split_2_data, current_depth)  # Build right subtree
        return node

    def train(self, X_train, Y_train):
        # Train the model by building the tree
        self.labels_in_train = np.unique(Y_train)  # Store unique labels for prediction (keeps consistent ordering)
        train_data = np.concatenate((X_train, np.reshape(Y_train, (-1, 1))), axis=1)  # Combine features and labels
        self.tree = self.create_tree(train_data, 0)  # Start building from root

    def predict_one_sample(self, X):
        # Predict probabilities for a single sample by traversing the tree
        node = self.tree
        # Traverse until a leaf node (leaf defined as both children None or no split feature)
        while node is not None:
            # If this node is a leaf (no split), return its stored probabilities
            if node.feature_idx is None or (node.left is None and node.right is None):
                return node.prediction_probs
            # Otherwise continue traversal using the split
            if X[node.feature_idx] < node.feature_val:
                # If left child missing, return current node's probs
                if node.left is None:
                    return node.prediction_probs
                node = node.left
            else:
                if node.right is None:
                    return node.prediction_probs
                node = node.right
        # Fallback (should not typically reach here)
        return np.zeros(len(self.labels_in_train), dtype=float)

    def predict_proba(self, X_set):
        # Get probabilities for a set of samples
        # Using apply_along_axis will assemble a 2D array because each returned vector has consistent length
        return np.apply_along_axis(self.predict_one_sample, 1, X_set)

    def predict(self, X_set):
        # Get class predictions by taking argmax of probabilities
        pred_probs = self.predict_proba(X_set)
        return np.argmax(pred_probs, axis=1)

In [4]:
# Cell 3: Data loading and training example for Decision Tree
import pandas as pd  # For data loading

# Load Iris dataset
df = pd.read_csv('Supervised Learning Datasets/Iris.csv')

# Map species to numeric labels
# Note: CSV uses 'Species' with values like 'Iris-setosa' so normalize before mapping
label_map = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
species_clean = df['Species'].str.replace('Iris-', '', regex=False).str.lower()
Y = species_clean.map(label_map).values

# Select only numeric feature columns (drop Id and the Species column)
X = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values  # Features

# Split data (80/20)
idx = np.random.permutation(len(X))
split = int(0.8 * len(X))
X_train, X_test = X[idx[:split]], X[idx[split:]]
Y_train, Y_test = Y[idx[:split]], Y[idx[split:]]

# Train the model
model = DecisionTree()  # Use default params or adjust e.g., max_depth=3
model.train(X_train, Y_train)

In [5]:
# Cell 4: Prediction and evaluation for Decision Tree
# Make predictions
preds = model.predict(X_test)

# Calculate accuracy
accuracy = np.mean(preds == Y_test)
print(f"Accuracy: {accuracy:.2f}")  # Typically ~0.95
# Tip: Visualize tree or use sklearn for comparison if needed

Accuracy: 1.00


In [6]:
# RandomForest class (replacement for the placeholder)
class RandomForest:
	def __init__(self, n_estimators=5, max_depth=4, min_samples_leaf=1, min_information_gain=0.0,
				 bootstrap_sample_size=None, random_state=None):
		self.n_estimators = n_estimators
		self.max_depth = max_depth
		self.min_samples_leaf = min_samples_leaf
		self.min_information_gain = min_information_gain
		self.bootstrap_sample_size = bootstrap_sample_size
		self.random_state = random_state
		self.rng = np.random.RandomState(random_state)
		self.trees = []
		self.labels_ = None  # global label set (from full training set)

	def _create_bootstrap_indices(self, n_samples):
		bs = self.bootstrap_sample_size or n_samples
		return self.rng.choice(n_samples, size=bs, replace=True)

	def train(self, X_train, Y_train):
		# store global labels (consistent ordering used when averaging probabilities)
		self.labels_ = np.unique(Y_train)
		self.trees = []
		n = X_train.shape[0]
		for i in range(self.n_estimators):
			idx = self._create_bootstrap_indices(n)
			Xb, Yb = X_train[idx], Y_train[idx]
			tree = DecisionTree(max_depth=self.max_depth,
								min_samples_leaf=self.min_samples_leaf,
								min_information_gain=self.min_information_gain)
			# train tree on bootstrap sample
			tree.train(Xb, Yb)
			# keep trained tree
			self.trees.append(tree)

	def _pad_probs_to_global(self, probs, tree_labels):
		# probs: (n_samples, k), tree_labels: array-like length k
		n_samples = probs.shape[0]
		n_classes = len(self.labels_)
		padded = np.zeros((n_samples, n_classes), dtype=float)
		# map tree's label ordering into global ordering
		for i, lbl in enumerate(tree_labels):
			# find index in global labels
			try:
				j = int(np.where(self.labels_ == lbl)[0][0])
				padded[:, j] = probs[:, i]
			except IndexError:
				# label not in global labels (unlikely) -> skip
				continue
		return padded

	def predict_proba(self, X_set):
		if not self.trees:
			raise RuntimeError("RandomForest not trained. Call .train() first.")
		# collect padded probability arrays from each tree
		padded_list = []
		for tree in self.trees:
			probs = tree.predict_proba(X_set)  # (n_samples, k)
			tree_labels = tree.labels_in_train if getattr(tree, "labels_in_train", None) is not None else np.unique(Y)
			padded = self._pad_probs_to_global(probs, tree_labels)
			padded_list.append(padded)
		# stack and average across trees -> (n_samples, n_classes)
		stacked = np.stack(padded_list, axis=0)
		avg = np.mean(stacked, axis=0)
		return avg

	def predict(self, X_set):
		probs = self.predict_proba(X_set)
		return np.argmax(probs, axis=1)
model = RandomForest(n_estimators=10, max_depth=6, random_state=42)  # adjust params as needed
model.train(X_train, Y_train)  # ensure forest is trained before predicting

# Predict
preds = model.predict(X_test)

# Accuracy
accuracy = np.mean(preds == Y_test)
print(f"Accuracy: {accuracy:.2f}")  # Expect high accuracy on Iris RandomForest class (replacement for the placeholder)


Accuracy: 0.97


In [7]:
# Cell 1: LinearRegression class
import numpy as np

class LinearRegression:
    def __init__(self, lr=0.01, n_iters=1000):
        # Initialize hyperparameters
        self.lr = lr  # Learning rate for gradient descent
        self.n_iters = n_iters  # Number of iterations
        self.weights = None  # Feature weights (slope)
        self.bias = None  # Intercept

    def fit(self, X, y):
        # Train the model using gradient descent
        num_samples, num_features = X.shape
        self.weights = np.random.rand(num_features)  # Random init for weights
        self.bias = 0  # Init bias to 0
        for _ in range(self.n_iters):
            y_pred = np.dot(X, self.weights) + self.bias  # Forward pass
            dw = (1 / num_samples) * np.dot(X.T, (y_pred - y))  # Gradient for weights
            db = (1 / num_samples) * np.sum(y_pred - y)  # Gradient for bias
            self.weights -= self.lr * dw  # Update weights
            self.bias -= self.lr * db  # Update bias
        return self  # Return self for chaining

    def predict(self, X):
        # Make predictions
        return np.dot(X, self.weights) + self.bias

In [8]:
# Cell 2: Data loading, normalization, and training
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv('Supervised Learning Datasets/headbrain.csv')
X = df[['Head Size(cm^3)']].values
y = df['Brain Weight(grams)'].values

# Normalize feature
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split
idx = np.random.permutation(len(X_scaled))
split = int(0.8 * len(X_scaled))
X_train, X_test = X_scaled[idx[:split]], X_scaled[idx[split:]]
y_train, y_test = y[idx[:split]], y[idx[split:]]

# Check for NaNs or infinite values in X and y before training
print('NaNs in X:', np.isnan(X_train).sum(), 'NaNs in y:', np.isnan(y_train).sum())
print('Infinite in X:', np.isinf(X_train).sum(), 'Infinite in y:', np.isinf(y_train).sum())
if np.isnan(X_train).any() or np.isnan(y_train).any() or np.isinf(X_train).any() or np.isinf(y_train).any():
    print('Warning: Data contains NaN or infinite values. Please clean your data.')
else:
    # Train with a lower learning rate
    model = LinearRegression(lr=0.0001)  # Lower lr to help convergence
    model.fit(X_train, y_train)

NaNs in X: 0 NaNs in y: 0
Infinite in X: 0 Infinite in y: 0


In [9]:
# Cell 3: Prediction and evaluation
# Predict
preds = model.predict(X_test)

# Mean Squared Error
mse = np.mean((preds - y_test)**2)
print(f"MSE: {mse:.2f}")  # Typically low for this linear relationship
# Tip: Plot with matplotlib to visualize fit

MSE: 1460190.75


In [None]:
# Cell 1: MyLogisticRegression class (safe version)
import numpy as np

class MyLogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=2000):
        # Hyperparameters (lower learning rate for stability)
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.w = None  # Weights
        self.b = 0  # Bias

    def sigmoid(self, z):
        # Sigmoid activation for probabilities (clip for stability)
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def cost(self, H, Y, m):
        # Binary cross-entropy loss (clip for log stability)
        H = np.clip(H, 1e-8, 1 - 1e-8)
        return -np.sum(Y * np.log(H) + (1 - Y) * np.log(1 - H)) / m

    def cal_gradient(self, w, H, X, Y):
        # Compute gradients
        m = X.shape[1]
        dw = np.dot(X, (H - Y).T) / m
        db = np.sum(H - Y) / m
        return {"dw": dw, "db": db}

    def gradient_position(self, w, b, X, Y):
        # Forward pass and cost/grads
        m = X.shape[1]
        H = self.sigmoid(np.dot(w.T, X) + b)
        cost = self.cost(H, Y, m)
        grads = self.cal_gradient(w, H, X, Y)
        return grads, cost

    def gradient_descent(self, w, b, X, Y, print_cost=False):
        # Perform gradient descent
        costs = []
        for i in range(self.num_iterations):
            grads, cost = self.gradient_position(w, b, X, Y)
            dw = grads["dw"]
            db = grads["db"]
            w -= self.learning_rate * dw
            b -= self.learning_rate * db
            if i % 100 == 0:
                costs.append(cost)
            if print_cost and i % 100 == 0:
                print("Cost after iteration %i: %f" % (i, cost))
        return {"w": w, "b": b}, grads, costs

    def fit(self, X, Y):
        # Input validation
        if np.isnan(X).any() or np.isnan(Y).any() or np.isinf(X).any() or np.isinf(Y).any():
            raise ValueError("Input data contains NaN or infinite values.")
        # Initialize and train
        self.w = np.zeros((X.shape[0], 1))
        self.b = 0
        params, _, _ = self.gradient_descent(self.w, self.b, X, Y.T, print_cost=True)
        self.w = params["w"]
        self.b = params["b"]

    def predict(self, X):
        # Predict binary labels
        H = self.sigmoid(np.dot(self.w.T, X) + self.b)
        return (H >= 0.5).astype(int)

: 

In [None]:
# Cell 2: Data loading and preprocessing
import pandas as pd

# Load bank dataset (note: separator is ';')
df = pd.read_csv('Supervised Learning Datasets/bank.csv', sep=';')

# Preprocess: One-hot encode categoricals, map y to 0/1
df = pd.get_dummies(df, drop_first=True)  # Handle categoricals
y = df['y_yes'].values  # Assuming 'y' becomes 'y_yes' after dummies
X = df.drop('y_yes', axis=1).values

# Split
idx = np.random.permutation(len(X))
split = int(0.8 * len(X))
X_train, X_test = X[idx[:split]], X[idx[split:]]
y_train, y_test = y[idx[:split]], y[idx[split:]]