In [1]:
# Import necessary libraries
import numpy as np # Used for numerical operations
import pandas as pd # Used for data manipulation and analysis (DataFrames)
import torch # PyTorch library for building and training neural networks
from sklearn.model_selection import train_test_split # Used to split data into training and testing sets
from sklearn.preprocessing import StandardScaler # Used for feature scaling (standardization)
from sklearn.preprocessing import LabelEncoder # Used to encode categorical labels into numerical format

In [2]:
# Load the dataset from a URL into a pandas DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
# Display the first 5 rows of the DataFrame to get an overview of the data
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# Check the dimensions (number of rows and columns) of the DataFrame
df.shape

(569, 33)

In [4]:
# Drop unnecessary columns 'id' and 'Unnamed: 32' from the DataFrame
# 'id' is a unique identifier and not useful for prediction
# 'Unnamed: 32' appears to be an empty or irrelevant column based on initial inspection
df.drop(columns=['id', 'Unnamed: 32'], inplace= True) # inplace=True modifies the DataFrame directly

In [5]:
# Display the first 5 rows of the DataFrame again after dropping columns
# This helps to verify that the columns have been removed successfully
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### train test split

In [6]:
# Split the DataFrame into features (X) and target (y)
# df.iloc[:, 1:] selects all columns from the second column onwards as features (X)
# df.iloc[:, 0] selects the first column as the target variable (y, which is 'diagnosis')
# test_size=0.2 means 20% of the data will be used for the test set, and 80% for the training set
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

### scaling

In [7]:
# Initialize the StandardScaler. This scales features to have zero mean and unit variance.
scaler = StandardScaler()
# Fit the scaler on the training features and transform them
# fit_transform learns the scaling parameters (mean and std dev) from X_train and applies the transformation
X_train = scaler.fit_transform(X_train)
# Transform the test features using the scaling parameters learned from the training data
# It's crucial to only transform the test set and not fit on it to prevent data leakage
X_test = scaler.transform(X_test)

In [8]:
# Display the scaled training features. They are now NumPy arrays.
X_train

array([[ 1.13717267, -0.12595689,  1.07442752, ...,  0.94051194,
        -0.16661828, -0.12453553],
       [ 0.96475025,  0.91864777,  1.08651127, ...,  1.27249954,
        -0.98832968,  2.09229965],
       [-1.7909495 ,  1.41284414, -1.75961408, ..., -0.98501613,
        -0.20199529,  1.22574854],
       ...,
       [ 1.41249236, -0.19622178,  1.3483258 , ...,  1.41585782,
         0.99921491, -0.07188433],
       [-0.7038539 , -1.53828112, -0.71235595, ..., -0.46455011,
        -0.23094011,  0.18149961],
       [-0.67326283, -0.9269766 , -0.68456333, ..., -0.55282863,
        -0.24219643, -0.33952796]])

In [9]:
# Display the training target variable (diagnosis labels). These are still pandas Series.
y_train

Unnamed: 0,diagnosis
317,M
468,M
539,B
225,B
58,B
...,...
322,B
329,M
56,M
98,B


### Label Encoding

In [10]:
# Initialize the LabelEncoder
encoder = LabelEncoder()
# Fit the encoder on the training target labels and transform them
# 'M' (Malignant) and 'B' (Benign) will be converted to 0s and 1s
y_train = encoder.fit_transform(y_train)
# Transform the test target labels using the same encoder
y_test = encoder.transform(y_test)

In [11]:
# Display the label-encoded training target variable
# Now 'M' and 'B' are represented as numerical values (e.g., 0 and 1)
y_train

array([1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,

### Numpy arrays to PyTorch tensors

In [12]:
# Convert X_train (features) from NumPy array to PyTorch tensor
# .astype(np.float32) ensures the data type is float32, which is common for neural network inputs
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
# Convert X_test (features) from NumPy array to PyTorch tensor
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
# Convert y_train (target labels) from NumPy array to PyTorch tensor
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
# Convert y_test (target labels) from NumPy array to PyTorch tensor
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [13]:
# Check the shape of the X_train_tensor
# This should show (number_of_samples, number_of_features)
X_train_tensor.shape

torch.Size([455, 30])

In [14]:
# Check the shape of the y_train_tensor
# This should show (number_of_samples,)
y_train_tensor.shape

torch.Size([455])

### Defining the model

In [15]:
import torch.nn as nn # Import the neural network module from PyTorch


class MySimpleNN(nn.Module):
  # Define a simple neural network class that inherits from nn.Module

  def __init__(self, num_features):
    # Constructor for the neural network
    super().__init__() # Call the constructor of the parent class (nn.Module)
    self.linear = nn.Linear(num_features, 1) # Define a linear layer (fully connected layer) with 'num_features' input and 1 output
    self.sigmoid = nn.Sigmoid() # Define a Sigmoid activation function. It squashes the output to a range between 0 and 1, suitable for binary classification.

  def forward(self, features):
    # Defines the forward pass of the neural network (how data flows through the layers)
    out = self.linear(features) # Pass the input features through the linear layer
    out = self.sigmoid(out) # Apply the Sigmoid activation function to the output of the linear layer

    return out # Return the final output of the network

### Important Parameters

In [16]:
learning_rate = 0.1 # Defines the step size at each iteration while moving toward a minimum of a loss function
epochs = 25 # Defines the number of times the entire training dataset is passed forward and backward through the neural network

In [17]:
# Define the loss function
# BCELoss (Binary Cross-Entropy Loss) is suitable for binary classification problems
loss_function = nn.BCELoss()

### Training Pipeline

In [18]:
# Create an instance of the MySimpleNN model
# X_train_tensor.shape[1] gives the number of features in the input data
model = MySimpleNN(X_train_tensor.shape[1])

# Define the optimizer
# SGD (Stochastic Gradient Descent) is an optimization algorithm used to update model weights
# model.parameters() provides the parameters (weights and biases) of the model to be optimized
# lr (learning rate) determines the step size for parameter updates
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Start the training loop for a specified number of epochs
for epoch in range(epochs):

  # Forward pass: compute predicted outputs by passing training data through the model
  y_pred = model(X_train_tensor)

  # Calculate the loss: compare the predicted outputs with the true labels
  # y_train_tensor.view(-1, 1) reshapes the target tensor to match the output shape of y_pred
  loss = loss_function(y_pred, y_train_tensor.view(-1,1))

  # Clear previous gradients
  # Gradients accumulate by default, so they need to be zeroed before a new backward pass
  optimizer.zero_grad()

  # Backward pass: compute gradients of the loss with respect to model parameters
  loss.backward()

  # Update model parameters: adjust weights based on the computed gradients
  optimizer.step()

  # Print the loss for the current epoch to monitor training progress
  print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

Epoch: 1, Loss: 0.6594364643096924
Epoch: 2, Loss: 0.5088738203048706
Epoch: 3, Loss: 0.42824605107307434
Epoch: 4, Loss: 0.3773457407951355
Epoch: 5, Loss: 0.3415745794773102
Epoch: 6, Loss: 0.31468233466148376
Epoch: 7, Loss: 0.2935170531272888
Epoch: 8, Loss: 0.27629807591438293
Epoch: 9, Loss: 0.2619345188140869
Epoch: 10, Loss: 0.24971610307693481
Epoch: 11, Loss: 0.2391582429409027
Epoch: 12, Loss: 0.22991740703582764
Epoch: 13, Loss: 0.22174231708049774
Epoch: 14, Loss: 0.2144445776939392
Epoch: 15, Loss: 0.2078794538974762
Epoch: 16, Loss: 0.2019338607788086
Epoch: 17, Loss: 0.19651781022548676
Epoch: 18, Loss: 0.19155870378017426
Epoch: 19, Loss: 0.1869971752166748
Epoch: 20, Loss: 0.18278412520885468
Epoch: 21, Loss: 0.1788785755634308
Epoch: 22, Loss: 0.17524588108062744
Epoch: 23, Loss: 0.17185664176940918
Epoch: 24, Loss: 0.1686856746673584
Epoch: 25, Loss: 0.16571122407913208


### Evaluation

In [19]:
# Evaluate the model on the test dataset
# torch.no_grad() disables gradient calculation during evaluation, saving memory and computation
with torch.no_grad():
  # Perform a forward pass on the test features to get predictions
  y_pred = model.forward(X_test_tensor)
  # Convert predicted probabilities to binary class labels (0 or 1)
  # If probability > 0.5, predict 1 (Malignant); otherwise, predict 0 (Benign)
  y_pred = (y_pred > 0.5).float()
  # Calculate accuracy: compare predicted labels with true labels
  # (y_pred == y_test_tensor) creates a boolean tensor, .float() converts to 0s and 1s, .mean() calculates the average
  accuracy = (y_pred == y_test_tensor).float().mean()
  # Print the calculated accuracy
  print(f'Accuracy: {accuracy.item()}')

Accuracy: 0.5678670406341553
