In [1]:
import numpy as np

In [2]:
class LogisticRegression:
    def __init__(self, n_iterations, learning_rate):
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.parameters = {}
        
    def sigmoid(self, x):
        s = 1/(1 + np.nan_to_num(np.exp(-x)))
        return s

    def initialize_parameters(self, n_features):
        W = np.zeros((1, n_features))
        b = np.zeros((1, 1))

        self.parameters = {
            'W': W,
            'b': b
        }

    def forward_propagation(self, X):
        W = self.parameters['W']
        b = self.parameters['b']

        Z = np.dot(W, X) + b
        A = self.sigmoid(Z)

        results = {
            'Z': Z,
            'A': A
        }

        return results

    def compute_cost(self, A, Y):
        m = Y.shape[1]
        
        A = np.nan_to_num(A)
        logA = np.nan_to_num(np.log(A))
        logA_ = np.nan_to_num(np.log(1-A))
        
        cost = (-1/m)*np.nan_to_num(np.dot(Y, logA.T) + np.dot(1-Y, logA_.T))
        cost = np.squeeze(cost)
    
        return cost

    def backward_propagation(self, A, X, Y):
        m = Y.shape[1]
        dZ = A - Y
        dW = np.dot(dZ, X.T)/m
        db = np.sum(dZ, axis=1, keepdims=True)

        results = {
            "dZ": dZ,
            "dW": dW,
            "db": db
        }

        return results

    def gradient_descent(self, X, Y):    
        for i in range(self.n_iterations):
            A = self.forward_propagation(X)['A']
            
            if (i+1)%100 == 0:
                print(f"Cost after {i+1} iterations: {self.compute_cost(A, Y)}")
                
            gradients = self.backward_propagation(A, X, Y)
            self.parameters['W'] = self.parameters['W'] - self.learning_rate*gradients['dW']
            self.parameters['b'] = self.parameters['b'] - self.learning_rate*gradients['db']
            
    def fit(self, X, Y):
        X = np.array(X).T
        Y = np.array(Y).reshape(1, -1)

        n_features = X.shape[0]
        self.initialize_parameters(n_features)

        self.gradient_descent(X, Y)

    def predict(self, X):
        X = np.array(X).T
        m = X.shape[1]
        
        W = self.parameters['W']
        b = self.parameters['b']

        Z = np.dot(W, X) + b
        A = self.sigmoid(Z)

        predictions = []
        for i in range(m):
            if A[0, i] > 0.5:
                predictions.append(1)
            else:
                predictions.append(0)

        return predictions

# Loading dataset

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

dataset = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(dataset['data'], dataset['target'], test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

In [4]:
X_train[0]

array([1.005e+01, 1.753e+01, 6.441e+01, 3.108e+02, 1.007e-01, 7.326e-02,
       2.511e-02, 1.775e-02, 1.890e-01, 6.331e-02, 2.619e-01, 2.015e+00,
       1.778e+00, 1.685e+01, 7.803e-03, 1.449e-02, 1.690e-02, 8.043e-03,
       2.100e-02, 2.778e-03, 1.116e+01, 2.684e+01, 7.198e+01, 3.840e+02,
       1.402e-01, 1.402e-01, 1.055e-01, 6.499e-02, 2.894e-01, 7.664e-02])

# Normalizing inputs

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train[0]

array([-1.15036482, -0.39064196, -1.12855021, -0.95876358,  0.3109837 ,
       -0.5959945 , -0.80259612, -0.80249002,  0.29453906,  0.0942515 ,
       -0.4950523 ,  1.48720153, -0.51448782, -0.49154005,  0.28149837,
       -0.60451206, -0.46900701, -0.61170002,  0.05798237, -0.35763702,
       -1.0431756 ,  0.21353282, -1.0360446 , -0.84880771,  0.34249851,
       -0.73009743, -0.81232053, -0.75798367, -0.01614761, -0.38503402])

# Training model

In [6]:
clf = LogisticRegression(n_iterations=1000, learning_rate=0.01)
clf.fit(X_train, y_train)

Cost after 100 iterations: 0.2340942258824932
Cost after 200 iterations: 0.1789840434891524
Cost after 300 iterations: 0.15408753529858576
Cost after 400 iterations: 0.13918800752674343
Cost after 500 iterations: 0.1290154319651974
Cost after 600 iterations: 0.12150668727507102
Cost after 700 iterations: 0.11567048018240134
Cost after 800 iterations: 0.11096497899716708
Cost after 900 iterations: 0.10706611101628746
Cost after 1000 iterations: 0.10376665838945613


# Computing accuracy on test set

In [7]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.956140350877193
