In [60]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

(a) Load Wisconsin Breast Cancer dataset from scikit-learn’s built-in datasets

In [61]:
#loads the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

(b) Split the dataset into train, validation, and test sets.

In [62]:
#splits the data to train + val and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#splits the train + val set into train and val 
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

(c) Report the size of each class in your training (+ validation) set.

In [67]:
#counts the number of the sample from the training + validation set
class_0_counter = len(y_train_val[y_train_val == 0])
class_1_counter = len(y_train_val[y_train_val == 1])
print("Class 0 has", class_0_counter, "samples")
print("Class 1 has", class_1_counter, "samples")

Class 0 has 169 samples
Class 1 has 286 samples


 (d) Train a binary logistic regression model using your implementation from problem 3. 
 Initialize the model weights randomly, sampling from a standard Gaussian distribution. 
 Experiment with different choices of fixed learning rate and batch size.

In [64]:
class LogisticRegression:
    def __init__(self, dim):
        #initializes the logistic regression model with the numbers of features in the dataset.
        self.w = np.random.randn(dim)

    def sigmoid(self, z):
        #outputs probilities after applying it to logistic sigmoid function.
        return 1 / (1 + np.exp(-z))

    
    def predict(self, X):
        #predicts the data points class by the threshold of 0.5.
        probabilities = self.sigmoid(np.dot(X, self.w))
        return (probabilities >= 0.5).astype(int)
    
    def cross_entropy_error(self, X, t):
        #computes the cross_entropy_error
        y = self.predict(X, return_proba=True)
        error = -np.sum(t * np.log(y) + (1 - t) * np.log(1 - y))
        return error
    
class MiniBatchSGD:
    def __init__(self, model, batch_size=64, learning_rate=0.001, max_iterations=10000):
        #initializes the mini-batch SGD parameters
        self.model = model
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations 
    
    def random_select_batch(self, X, t):
        #randomly selects a batch
        i = np.random.choice(X.shape[0], self.batch_size, replace=False)
        X_batch = X[i]
        t_batch = t[i]
        return X_batch, t_batch
        
    def iterate(self, X, t):
     for i in range(self.max_iterations):
            #select a random mini-batch
            X_batch, t_batch = self.random_select_batch(X, t)
            
            #predicts the probability of the batch
            z_batch = np.dot(X_batch, self.model.w)
            y_batch = self.model.sigmoid(z_batch)
            
            #computes the gradient of the selcted batch
            gradient = np.dot(X_batch.T, (y_batch - t_batch)) / self.batch_size
            
            #update the model's weights
            self.model.w -= self.learning_rate * gradient
            
            return self.model
     
#initialize the model with feature dimension
model = LogisticRegression(dim=X_train.shape[1])

#train the model with different learning rates and batch sizes
optimizer = MiniBatchSGD(model=model, batch_size=32, learning_rate=0.005, max_iterations=1000)
trained_model = optimizer.iterate(X_train, y_train)

  return 1 / (1 + np.exp(-z))


(e) Use the trained model to report the performance of the model on the test set. For evaluation
 metrics, use accuracy, precision, recall, and F1-score.

In [65]:
y_pred = trained_model.predict(X_test)

#computes accuracy, precision, recall, and f1-score of the prediction
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1_score = f1_score(y_test, y_pred, zero_division=1)

#prints each evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1_score)

Accuracy: 0.6228070175438597
Precision: 0.6228070175438597
Recall: 1.0
F1-Score: 0.7675675675675676


(f) Summarize your findings.

The F1-score shows the balance between the precision and recall. The Recall of 1.0 suggest that model is identifying all the benign tumor(Class 0, negative) correctly. However, precision of 62.3% suggests that many predictions is identifying benign tumor as maligant tumor(Class 1, positive) which is a Type I Error. In this case, we prefer having higher recall than precision as identifying maliganant tumor as benign tumor could potentially worsen patient's condition. Lastly, low accuracy of 62.3% indicates that the model struggles to make correct predictions for about one-third of the cases.