In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ***Data Exploration***

In [3]:
# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(r"breast_cancer.csv", encoding="latin1")

In [4]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Clump Thickness              683 non-null    int64
 1   Uniformity of Cell Size      683 non-null    int64
 2   Uniformity of Cell Shape     683 non-null    int64
 3   Marginal Adhesion            683 non-null    int64
 4   Single Epithelial Cell Size  683 non-null    int64
 5   Bare Nuclei                  683 non-null    int64
 6   Bland Chromatin              683 non-null    int64
 7   Normal Nucleoli              683 non-null    int64
 8   Mitoses                      683 non-null    int64
 9   Class                        683 non-null    int64
dtypes: int64(10)
memory usage: 53.5 KB


In [5]:
# List the column labels.
df.columns

Index(['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(683, 10)

In [7]:
# Count missing values per column.
df.isna().sum()

Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [8]:
# Preview the first five rows.
df.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


# ***Logistic Regression (from Scratch)***

In [10]:
class LogisticRegressionScratch:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes
    ----------
    coef_ : numpy.ndarray of shape (n_features, 1), or None before ``fit``
        Learned feature weights.
    intercept_ : float, or None before ``fit``
        Learned bias term.
    """

    def __init__(self):
        self.coef_ = None
        self.intercept_ = None

    @staticmethod
    def sigmoid(z):
        """Return the logistic function 1 / (1 + exp(-z)), element-wise.

        Only ``exp(-|z|)`` is ever evaluated, so the exponential cannot
        overflow (and emit a RuntimeWarning) for large-magnitude inputs.
        """
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        # z >= 0:  1 / (1 + exp(-z))
        # z <  0:  exp(z) / (1 + exp(z))   -- the same value, rearranged
        numerator = np.where(z >= 0, 1.0, decay)
        return numerator / (1.0 + decay)

    @staticmethod
    def classify(y):
        """Threshold probabilities at 0.5 and return integer labels (0 or 1)."""
        return (y >= 0.5).astype(int)

    def log_loss(self, y, y_pred):
        """Return the mean binary cross-entropy between labels and probabilities.

        Parameters
        ----------
        y : array-like
            True labels encoded as 0/1.
        y_pred : array-like
            Predicted probabilities, one per label.

        Both inputs are flattened before use, so a 1-D label vector can be
        paired with an (n, 1) probability column without broadcasting into an
        (n, n) matrix. Probabilities are clipped away from exactly 0 and 1 so
        the logarithms stay finite.
        """
        y = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()
        eps = 1e-15
        p = np.clip(y_pred, eps, 1 - eps)
        return -(1/len(y))*np.sum(y*np.log(p) + (1-y)*np.log(1-p))

    def fit(self, X, y, learning_rate=0.01, iterations=10000):
        """Fit ``coef_`` and ``intercept_`` by gradient descent on the log-loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; converted to a float ndarray.
        y : array-like of length n_samples
            Training labels, expected to be encoded as 0/1.
            NOTE(review): the ``df.head()`` preview in this notebook shows
            ``Class`` values of 2, so the labels presumably need remapping to
            0/1 before being passed here -- confirm against the caller.
        learning_rate : float
            Step size applied to each gradient update.
        iterations : int
            Number of full-batch gradient steps.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of samples.
        """
        # np.asarray lets lists and pandas objects through; a bare
        # ``y.reshape`` only works when y is already an ndarray.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        n = len(X)
        if y.shape[0] != n:
            # Without this, a length-1 y would silently broadcast over all rows.
            raise ValueError(
                f"X has {n} samples but y has {y.shape[0]}"
            )
        self.coef_ = np.zeros((X.shape[1], 1))
        self.intercept_ = 0.0
        for _ in range(iterations):
            z = X @ self.coef_ + self.intercept_
            y_pred = self.sigmoid(z)
            errors = y_pred - y
            # Gradients of the mean log-loss w.r.t. weights and bias.
            dw = (1/n) * (X.T @ errors)
            db = (1/n) * np.sum(errors)
            self.coef_ -= learning_rate * dw
            self.intercept_ -= learning_rate * db

    def predict(self, X):
        """Return 0/1 class labels for ``X``; must be called after ``fit``.

        For an ndarray ``X`` of shape (n_samples, n_features) the result has
        shape (n_samples, 1), because ``coef_`` is a column vector.
        """
        z = X @ self.coef_ + self.intercept_
        y_pred = self.sigmoid(z)
        return self.classify(y_pred)

# ***Feature / Target (X, y) Split***

In [11]:
# Preview the first rows again (same call as the earlier exploration cell).
df.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
