In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('fivethirtyeight')
np.random.seed(113)

In [2]:
# loading csv file
names=['sample', 'thickness', 'size', 'shape', 'adhesion', 'epithelial', 'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'status']
df = pd.read_csv('../datasets/breast-cancer-wisconsin.data', names=names, na_values='?')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sample      699 non-null    int64  
 1   thickness   699 non-null    int64  
 2   size        699 non-null    int64  
 3   shape       699 non-null    int64  
 4   adhesion    699 non-null    int64  
 5   epithelial  699 non-null    int64  
 6   nuclei      683 non-null    float64
 7   chromatin   699 non-null    int64  
 8   nucleoli    699 non-null    int64  
 9   mitoses     699 non-null    int64  
 10  status      699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


In [4]:
print(df.isnull().sum())
df.head(10)

sample         0
thickness      0
size           0
shape          0
adhesion       0
epithelial     0
nuclei        16
chromatin      0
nucleoli       0
mitoses        0
status         0
dtype: int64


Unnamed: 0,sample,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [5]:
y = df['status']
y[y==2] = 0; y[y==4] = 1
X = df.drop(columns=['status', 'sample'])
# 'status' goes for labels and number of 'sample' isn't important

print(X.dtypes)
X.head(10)

thickness       int64
size            int64
shape           int64
adhesion        int64
epithelial      int64
nuclei        float64
chromatin       int64
nucleoli        int64
mitoses         int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y==2] = 0; y[y==4] = 1


Unnamed: 0,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses
0,5,1,1,1,2,1.0,3,1,1
1,5,4,4,5,7,10.0,3,2,1
2,3,1,1,1,2,2.0,3,1,1
3,6,8,8,1,3,4.0,3,7,1
4,4,1,1,3,2,1.0,3,1,1
5,8,10,10,8,7,10.0,9,7,1
6,1,1,1,1,2,10.0,3,1,1
7,2,1,2,1,2,1.0,3,1,1
8,2,1,1,1,2,1.0,1,1,5
9,4,2,1,1,2,1.0,2,1,1


In [6]:
# cleaning
X.fillna(X.mean(), inplace=True)
# changing NaN values for mean values

print(X.isna().sum())

thickness     0
size          0
shape         0
adhesion      0
epithelial    0
nuclei        0
chromatin     0
nucleoli      0
mitoses       0
dtype: int64


In [17]:
# declaration of perceptron class
class Perceptron:
    
    def __init__(self, n_dim, learning_rate):
        self.weights = np.random.random(n_dim)
        self.bias    = 0
        self.n_dim   = n_dim
        self.learning_rate = learning_rate
        
    def ReLu(self, x):
        return np.maximum(x, 0, x)
    
    def perceptronComputing(self, X):
        z = np.dot(self.weights, X.T) + self.bias
        outputs = self.ReLu(z)
        return outputs
    
    def train(self, X, y, epochs=1000):
        X = X.astype(float)
        m = y.shape[0] # number of samples
        
        for i in range(epochs):
            y_preds = self.perceptronComputing(X)
            error   = (y - y_preds) / m
            adjustments   = np.dot( error, X )
            self.bias    += self.learning_rate * error.sum()
            self.weights += self.learning_rate * adjustments
                
        return self.bias, self.weights
    
    def predict(self, x):
        z = np.dot(self.weights, x.T) + self.bias
        preds = self.ReLu(z)
        return np.where(preds >= 0.0, 1, 0)

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [9]:
n_dim = X.shape[1]

# preprocessing
preprocesser = StandardScaler()
X = preprocesser.fit_transform(X)

In [18]:
perceptron = Perceptron(n_dim, 0.0001)
print("Perceptron weights before training:\n", perceptron.weights)

Perceptron weights before training:
 [0.50134692 0.68525156 0.58303705 0.22480071 0.76450769 0.63758369
 0.59276587 0.9753792  0.34053951]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [20]:
print("Perceptron weights after training:")
perceptron.train(X_train, y_train, 500)

Perceptron weights after training:


(-0.07052512676142018,
 array([0.42482708, 0.57742168, 0.47934003, 0.13757109, 0.66461125,
        0.55461449, 0.49678381, 0.87543544, 0.27037691]))

In [21]:
print("After training:\n", perceptron.weights)

After training:
 [0.42482708 0.57742168 0.47934003 0.13757109 0.66461125 0.55461449
 0.49678381 0.87543544 0.27037691]


In [22]:
y_pred = perceptron.predict(X_test)

In [23]:
def get_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / y_true.shape[0]

In [24]:
accuracy = get_accuracy(y_test, y_pred) * 100
print( f'Accuracy: {accuracy:.2f}%' )

Accuracy: 27.14%
