# COMP47590: Advanced Machine Learning
# Assignment 1: Benchmarking Esemble Methods

Name(s): 

Student Number(s):

## Import Packages Etc

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot
%matplotlib inline
from random import randint
import math
import sklearn

## Task 1: Define HyperParamClassifier

HyperParamClassifier class.

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier

class HyperParamClassifier(BaseEstimator, ClassifierMixin):
    """An ensemble classifier that trains a single Decision Tree classifier 
       with specific hyperparameters.
    
    Parameters
    ----------
    param_grid : dict
        A dictionary defining the hyperparameter values for the Decision Tree.
        Example: {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]}
    
    Attributes
    ----------
    model : DecisionTreeClassifier
        The trained Decision Tree model.
    
    Notes
    -----
    This classifier trains a single DecisionTreeClassifier with a selected 
    hyperparameter configuration.
    
    See also
    --------
    DecisionTreeClassifier : The base model used in this classifier.
    
    Examples
    --------
    >>> param_grid = {'max_depth': [5], 'min_samples_split': [2]}
    >>> clf = HyperParamClassifier(param_grid)
    >>> clf.fit(X_train, y_train)
    >>> predictions = clf.predict(X_test)
    """

    def __init__(self, param_grid):
        """Initialize a HyperParamClassifier with a Decision Tree model.
        
        Parameters
        ----------
        param_grid : dict
            A dictionary specifying hyperparameter values for the Decision Tree.
        """
        self.param_grid = param_grid
        self.model = None  # Placeholder for the trained model

    def fit(self, X, y):
        """Train a Decision Tree classifier with the provided hyperparameters.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. 
        y : array-like, shape = [n_samples] 
            The target values (class labels) as integers or strings.
        
        Returns
        -------
        self : object
        """
        # Select the first set of hyperparameters from the dictionary
        best_params = {key: values[0] for key, values in self.param_grid.items()}
        
        # Initialize the Decision Tree model with the selected parameters
        self.model = DecisionTreeClassifier(**best_params)
        
        # Train the model using the training data
        self.model.fit(X, y)
        
        return self

    def predict(self, X):
        """Predict class labels for the input samples X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The input samples. 

        Returns
        -------
        p : array of shape = [n_samples]
            The predicted class labels of the input samples.
        """
        # Ensure the model has been trained before making predictions
        if self.model is None:
            raise ValueError("Model has not been trained. Call `fit` first.")

        # Use the trained DecisionTreeClassifier to make predictions
        return self.model.predict(X)


    def predict_proba(self, X):
        """Predict class probabilities for the input samples X.
        
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The input samples. 
        
        Returns
        -------
        p : array of shape = [n_samples, n_classes]
            The predicted class label probabilities of the input samples.
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call `fit` first.")
        
        return self.model.predict_proba(X)


In [6]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


data = pd.read_csv("mnist_train.csv")

# 1️⃣ Load the MNIST dataset (70,000 images of handwritten digits)

X = data.drop("label",axis = 1)  # X = images, y = digit labels
y = data["label"]
# Convert labels to integers
y = y.astype(np.int64)

# 2️⃣ Normalize pixel values (optional for tree-based models)
 # Decision trees don't need this, but it's useful for consistency

# 3️⃣ Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4️⃣ Define hyperparameters for DecisionTreeClassifier
param_grid = {
    'max_depth': [10],  # Limit tree depth to prevent overfitting
    'min_samples_split': [5]  # Minimum samples required to split a node
}

# 5️⃣ Initialize the HyperParamClassifier
clf = HyperParamClassifier(param_grid)

# 6️⃣ Train the model
clf.fit(X_train, y_train)

# 7️⃣ Make predictions
y_pred = clf.predict(X_test)

# 8️⃣ Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy on MNIST: {accuracy:.4f}")


Model Accuracy on MNIST: 0.8573


## Test the HyperParamClassifier

Perform a simple test using the HyperParamClassifier on the Iris dataset

In [5]:
from sklearn.datasets import load_iris
iris = load_iris()
clf = StackedEnsembleClassifier()
clf.fit(iris.data, iris.target)
y_pred = clf.predict(iris.data)
print(metrics.classification_report(iris.target, y_pred))
print("Confusion Matrix")
display(pd.crosstab(np.array(iris.target), y_pred, rownames=['True'], colnames=['Predicted'], margins=True))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      0.96      0.98        50
          2       0.96      1.00      0.98        50

avg / total       0.99      0.99      0.99       150

Confusion Matrix


Predicted,0,1,2,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50,0,0,50
1,0,48,2,50
2,0,0,50,50
All,50,48,52,150


Perform a cross validation experiment

In [6]:
scores = cross_val_score(clf, iris.data, iris.target, cv=10)
print(scores)
print(np.mean(scores), " +/- ", np.std(scores))

[1.         0.93333333 1.         0.93333333 0.93333333 0.93333333
 0.86666667 1.         1.         1.        ]
0.96  +/-  0.044221663871405324


## Task 2: Design the Evaluation Experiment

Describe datasets and expeimental apporach and setup infrastructure for experimentation. 

In [None]:
# Write your code here


## Task 3: Execute Evalution Experiment

In [None]:
# Write your code here


### Experiment Results Summary
Present a series of tables and graphs illustraitng experiment results. 

In [None]:
# Write your code here


## Task 4: Reflection
A short (less than 400 words) reflection on the results of the experiment and the experience of running it. 


*Write your reflection here*


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np

class HyperParamEnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, param_grid, voting='hard'):
        """
        Parameters:
        -----------
        param_grid : dict
            A dictionary defining the hyperparameter choices for the Decision Tree.
            Example: {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]}

        voting : str, default='hard'
            - 'hard': Uses majority voting.
            - 'soft': Uses averaged probabilities.
        """
        self.param_grid = param_grid
        self.voting = voting
        self.model = None  # Will store the trained DecisionTreeClassifier

    def fit(self, X, y):
        """
        Train a Decision Tree classifier using the best hyperparameter combination.

        Parameters:
        -----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix (training data).
        
        y : array-like, shape (n_samples,)
            Target labels.

        Returns:
        --------
        self : object
            Trained classifier.
        """
        # Select a set of hyperparameters (e.g., the first set in the grid)
        best_params = {key: values[0] for key, values in self.param_grid.items()}
        
        # Initialize a Decision Tree with the selected hyperparameters
        self.model = DecisionTreeClassifier(**best_params)
        
        # Train the model on the dataset
        self.model.fit(X, y)
        
        return self


sanketh