In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# 3

class LogisticRegression():
    """Binary logistic regression trained with mini-batch gradient descent.

    The model keeps one weight per feature plus a bias term stored at
    ``weights[0]``. Randomness (weight initialisation and batch sampling)
    comes from NumPy's global ``np.random`` generator, so call
    ``np.random.seed(...)`` beforehand for reproducible results.
    """

    def __init__(self, X_train, y_train, batch_size, learning_rate, max_iter):
        """Store the training data and hyperparameters and draw initial weights.

        Args:
            X_train: 2-D sequence/array of samples (rows) by features (columns).
            y_train: 1-D sequence/array of labels, one per row of ``X_train``
                (expected to be 0/1, matching what ``predict`` returns).
            batch_size: number of samples drawn for each gradient step.
            learning_rate: multiplier applied to the summed batch gradient.
            max_iter: number of gradient steps performed by ``fit``.
        """
        self.num_features = len(X_train[0])
        # Bias + one weight per feature, each drawn from a standard normal.
        self.weights = np.random.normal(0, 1, size=self.num_features + 1)
        self.X_train = X_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def sigmoid(self, val):
        """Return the logistic function 1 / (1 + exp(-val)) of a scalar or array.

        ``exp`` is only ever evaluated on non-positive numbers, so very
        negative inputs no longer trigger an overflow warning.
        """
        z = np.asarray(val, dtype=float)
        decay = np.exp(-np.abs(z))  # always in [0, 1]
        # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
        return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

    def fit(self):
        """Run ``max_iter`` mini-batch gradient steps, updating ``self.weights``.

        Each step draws ``batch_size`` row indices uniformly *with replacement*
        and moves the weights against the summed (not averaged) gradient of
        the log-loss over that batch.
        """
        features = np.asarray(self.X_train, dtype=float)
        targets = np.asarray(self.y_train, dtype=float)
        n_samples = len(features)
        # Prepend the constant bias column once instead of once per sample.
        design = np.hstack((np.ones((n_samples, 1)), features))

        for _ in range(self.max_iter):
            batch_idx = np.random.choice(n_samples, size=self.batch_size)
            batch = design[batch_idx]
            residual = self.sigmoid(np.matmul(batch, self.weights)) - targets[batch_idx]
            # residual @ batch == sum_j (sigmoid(w . x_j) - y_j) * x_j
            gradient = np.matmul(residual, batch)
            self.weights = self.weights - self.learning_rate * gradient

    def predict(self, X_test):
        """Return a 0/1 label for every row of ``X_test``.

        A sample is labelled 1 when its predicted probability is >= 0.5.
        Empty input yields an empty integer array instead of raising.
        """
        features = np.asarray(X_test, dtype=float)
        if features.size == 0:
            return np.array([], dtype=int)
        design = np.hstack((np.ones((len(features), 1)), features))
        probabilities = self.sigmoid(np.matmul(design, self.weights))
        return (probabilities >= 0.5).astype(int)

In [3]:
# 4a

# Load scikit-learn's bundled breast cancer dataset. Later cells read its
# 'data' (feature matrix) and 'target' (labels) entries.
breast_cancer_dataset = load_breast_cancer()

In [4]:
# 4b

# Fixed seed so both splits (and every metric computed from them) are
# identical on each Restart & Run All.
SPLIT_SEED = 42

# splits into train/holdout (75/25 split)

X_train, X_holdout, y_train, y_holdout = train_test_split(
    breast_cancer_dataset['data'], breast_cancer_dataset['target'],
    test_size=0.25, random_state=SPLIT_SEED)

# splits the holdout into val/test (15/10 split of the full dataset);
# the holdout keeps its own name so X_test only ever means the final test set

X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.4, random_state=SPLIT_SEED)

In [5]:
# 4c

# Report how many samples ended up in each split.
for split_label, split_targets in (
    ('Train size: ', y_train),
    ('Val size: ', y_val),
    ('Test size: ', y_test),
):
    print(split_label + str(len(split_targets)))

Train size: 426
Val size: 85
Test size: 58


In [6]:
# 4d

# Hyperparameters for the mini-batch gradient-descent run below.
BATCH_SIZE = 64
LEARNING_RATE = 0.001
MAX_ITER = 7500
TRAIN_SEED = 42

# Fit the scaler on the training split only; the same fitted scaler is
# reused to transform the validation and test splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# LogisticRegression draws its initial weights and its batches from NumPy's
# global generator, so seed it to make training repeatable.
np.random.seed(TRAIN_SEED)
LRModel = LogisticRegression(X_train_scaled, y_train, BATCH_SIZE, LEARNING_RATE, MAX_ITER)
LRModel.fit()

In [7]:
# used to find optimal hyperparameter

# Keep the scaled copy under its own name: reassigning X_val would scale the
# already-scaled data again every time this cell is re-run.
X_val_scaled = scaler.transform(X_val)
predictions = LRModel.predict(X_val_scaled)
actual = y_val

# Boolean masks for the confusion-matrix cells, with label 1 as the positive
# class. NOTE(review): check breast_cancer_dataset['target_names'] to confirm
# which diagnosis label 1 stands for before interpreting precision/recall.
TP = (predictions == 1) & (actual == 1)
FP = (predictions == 1) & (actual == 0)
TN = (predictions == 0) & (actual == 0)
FN = (predictions == 0) & (actual == 1)

# ndarray.sum() counts the True entries without a Python-level loop.
accuracy = (TP.sum() + TN.sum())/len(actual)
precision = TP.sum()/(TP.sum() + FP.sum())
recall = TP.sum()/(TP.sum() + FN.sum())

print('Accuracy: ' + str(accuracy))
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1: ' + str(2*precision*recall/(precision + recall)))

Accuracy: 0.9882352941176471
Precision: 1.0
Recall: 0.9818181818181818
F1: 0.9908256880733944


In [8]:
# 4e

# Keep the scaled copy under its own name: reassigning X_test would scale the
# already-scaled data again every time this cell is re-run.
X_test_scaled = scaler.transform(X_test)
predictions = LRModel.predict(X_test_scaled)
actual = y_test

# Boolean masks for the confusion-matrix cells, with label 1 as the positive class.
TP = (predictions == 1) & (actual == 1)
FP = (predictions == 1) & (actual == 0)
TN = (predictions == 0) & (actual == 0)
FN = (predictions == 0) & (actual == 1)

# ndarray.sum() counts the True entries without a Python-level loop.
accuracy = (TP.sum() + TN.sum())/len(actual)
precision = TP.sum()/(TP.sum() + FP.sum())
recall = TP.sum()/(TP.sum() + FN.sum())

print('Accuracy: ' + str(accuracy))
print('Precision: ' + str(precision))
print('Recall: ' + str(recall))
print('F1: ' + str(2*precision*recall/(precision + recall)))

Accuracy: 0.9827586206896551
Precision: 1.0
Recall: 0.9772727272727273
F1: 0.9885057471264368
