In [1]:
import numpy as np

class ModelSelection:
    def __init__(self, model, loss_function):
        """
        Initialize the model selector with a given model and loss function.

        Parameters:
        - model: A class with `fit` and `predict` methods.
        - loss_function: A callable that takes (y_true, y_pred) and returns a scalar loss.
        """
        self.model = model
        self.loss_function = loss_function

    def k_fold_cross_validation(self, X, y, k=5):
        """
        Perform k-fold cross-validation.

        Parameters:
        - X: Feature matrix (numpy array).
        - y: Target vector (numpy array).
        - k: Number of folds (default is 5).

        Returns:
        - mean_loss: The average loss across all folds.
        """
        n = len(y)
        indices = np.arange(n)
        np.random.shuffle(indices)
        fold_size = n // k
        losses = []

        for i in range(k):
            test_indices = indices[i * fold_size:(i + 1) * fold_size]
            train_indices = np.setdiff1d(indices, test_indices)

            X_train, X_test = X[train_indices], X[test_indices]
            y_train, y_test = y[train_indices], y[test_indices]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)
            loss = self.loss_function(y_test, y_pred)
            losses.append(loss)

        mean_loss = np.mean(losses)
        return mean_loss

    def bootstrap(self, X, y, B=100):
        """
        Perform bootstrap resampling to estimate prediction error.

        Parameters:
        - X: Feature matrix (numpy array).
        - y: Target vector (numpy array).
        - B: Number of bootstrap samples (default is 100).

        Returns:
        - mean_loss: The average loss across all bootstrap samples.
        """
        n = len(y)
        losses = []

        for _ in range(B):
            bootstrap_indices = np.random.choice(np.arange(n), size=n, replace=True)
            oob_indices = np.setdiff1d(np.arange(n), bootstrap_indices)

            if len(oob_indices) == 0:  # Skip iteration if no OOB samples
                continue

            X_train, X_test = X[bootstrap_indices], X[oob_indices]
            y_train, y_test = y[bootstrap_indices], y[oob_indices]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)
            loss = self.loss_function(y_test, y_pred)
            losses.append(loss)

        mean_loss = np.mean(losses)
        return mean_loss

    def evaluate_model(self, X, y, method='k_fold', **kwargs):
        """
        Evaluate the model using the specified method.

        Parameters:
        - X: Feature matrix (numpy array).
        - y: Target vector (numpy array).
        - method: 'k_fold' or 'bootstrap'.
        - kwargs: Additional parameters for the evaluation method.

        Returns:
        - loss: The evaluation loss.
        """
        if method == 'k_fold':
            return self.k_fold_cross_validation(X, y, **kwargs)
        elif method == 'bootstrap':
            return self.bootstrap(X, y, **kwargs)
        else:
            raise ValueError("Unsupported method. Choose 'k_fold' or 'bootstrap'.")


In [2]:
# Example of a simple linear regression model
class SimpleLinearModel:
    def fit(self, X, y):
        self.coef_ = np.linalg.pinv(X) @ y

    def predict(self, X):
        return X @ self.coef_

# Mean squared error loss function
def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

# Create synthetic data
np.random.seed(42)
X = np.random.rand(100, 3)
y = X @ np.array([1.5, -2.0, 1.0]) + np.random.randn(100) * 0.1

# Initialize model and model selector
model = SimpleLinearModel()
selector = ModelSelection(model, mean_squared_error)

# Perform k-fold cross-validation
k_fold_loss = selector.evaluate_model(X, y, method='k_fold', k=5)
print("K-Fold Cross-Validation Loss:", k_fold_loss)

# Perform bootstrap
bootstrap_loss = selector.evaluate_model(X, y, method='bootstrap', B=100)
print("Bootstrap Loss:", bootstrap_loss)

model.fit(X, y)

# Evaluate
predictions = model.predict(X)
mse = np.mean((y - predictions) ** 2)
print(f"Mean Squared Error: {mse:.4f}")
print("First 10 Predictions:", predictions[:10])

K-Fold Cross-Validation Loss: 0.010218721629392193
Bootstrap Loss: 0.010187779954984457
Mean Squared Error: 0.0093
First 10 Predictions: [-6.02845904e-01  7.50176355e-01 -1.04552162e+00  2.03622865e+00
  1.01601727e+00  2.05011940e-01  6.97539978e-01 -1.39914961e-03
 -6.96849252e-01 -3.76952506e-01]


In [4]:
#Dataset1
import numpy as np
import pandas as pd

# Load dataset
file_path = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(file_path)

# Features and Target
X = data.drop('medv', axis=1).values
y = data['medv'].values

# Train Linear Regression (First Principles)
class LinearRegression:
    def fit(self, X, y):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        self.coef_ = np.linalg.pinv(X.T @ X) @ X.T @ y

    def predict(self, X):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        return X @ self.coef_

model = LinearRegression()
model.fit(X, y)

# Evaluate
predictions = model.predict(X)
mse = np.mean((y - predictions) ** 2)
print(f"Mean Squared Error: {mse:.4f}")
print("First 10 Predictions:", predictions[:10])

Mean Squared Error: 21.8948
First 10 Predictions: [30.00384338 25.02556238 30.56759672 28.60703649 27.94352423 25.25628446
 23.00180827 19.53598843 11.52363685 18.92026211]


In [8]:
# Dataset 2: Multi-Class Logistic Regression
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Digits dataset
digits = load_digits()
X, y = digits.data, digits.target  # Features and target

# Normalize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Multi-class Logistic Regression with Softmax
class LogisticRegression:
    def __init__(self, lr=0.01, epochs=5000):
        self.lr = lr
        self.epochs = epochs
        self.coefficients = None

    def _softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # Prevent overflow
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y, num_classes):
        # Add bias term
        X = np.c_[np.ones(X.shape[0]), X]
        self.coefficients = np.random.randn(num_classes, X.shape[1]) * 0.01  # Random initialization

        for epoch in range(self.epochs):
            logits = X @ self.coefficients.T  # Linear combination
            probabilities = self._softmax(logits)  # Apply softmax activation
            
            # Create one-hot encoded target matrix
            y_one_hot = np.zeros((y.size, num_classes))
            y_one_hot[np.arange(y.size), y] = 1
            
            # Compute gradient
            gradient = X.T @ (probabilities - y_one_hot) / len(y)
            self.coefficients -= self.lr * gradient.T  # Update coefficients

    def predict(self, X):
        # Add bias term
        X = np.c_[np.ones(X.shape[0]), X]
        logits = X @ self.coefficients.T
        probabilities = self._softmax(logits)
        return np.argmax(probabilities, axis=1)

# Initialize and train the model
model = LogisticRegression(lr=0.01, epochs=5000)
num_classes = len(np.unique(y_train))
model.fit(X_train, y_train, num_classes)

# Evaluate
predictions = model.predict(X_test)
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy:.4f}")
print("First 10 Predictions:", predictions[:10])
print("Actual Labels:        ", y_test[:10])


Accuracy: 0.9648
First 10 Predictions: [6 9 3 7 2 2 5 2 5 2]
Actual Labels:         [6 9 3 7 2 1 5 2 5 2]


In [6]:
#Dataset3
# Load dataset
file_path = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigree', 'Age', 'Outcome']
data = pd.read_csv(file_path, names=columns)

# Features and Target
X = data.drop('Outcome', axis=1).values
y = data['Outcome'].values

# Train Perceptron (Binary Classification)
class Perceptron:
    def __init__(self, lr=0.1, epochs=100):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        self.weights = np.zeros(X.shape[1])
        for _ in range(self.epochs):
            for i in range(len(y)):
                prediction = 1 if X[i] @ self.weights > 0 else 0
                self.weights += self.lr * (y[i] - prediction) * X[i]

    def predict(self, X):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        return (X @ self.weights > 0).astype(int)

model = Perceptron(lr=0.1, epochs=100)
model.fit(X, y)

# Evaluate
predictions = model.predict(X)
accuracy = np.mean(predictions == y)
print(f"Accuracy: {accuracy:.4f}")
print("First 10 Predictions:", predictions[:10])

Accuracy: 0.6536
First 10 Predictions: [0 0 0 0 1 0 0 1 1 0]


In [7]:
#Dataset4
# Load dataset
file_path = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(file_path, sep=';')

# Features and Target
X = data.drop('quality', axis=1).values
y = data['quality'].values

# Train Ridge Regression (First Principles)
class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        I = np.eye(X.shape[1])
        I[0, 0] = 0  # Do not regularize bias term
        self.coef_ = np.linalg.pinv(X.T @ X + self.alpha * I) @ X.T @ y

    def predict(self, X):
        X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
        return X @ self.coef_

model = RidgeRegression(alpha=1.0)
model.fit(X, y)

# Evaluate
predictions = model.predict(X)
mse = np.mean((y - predictions) ** 2)
print(f"Mean Squared Error: {mse:.4f}")
print("First 10 Predictions:", predictions[:10])


Mean Squared Error: 0.4175
First 10 Predictions: [5.04323344 5.13506106 5.21178615 5.68140344 5.04323344 5.07773597
 5.1034784  5.32451785 5.32838568 5.63530842]
