In [1]:
#applying the mini batch insted of the gradient

# Numerical computing library used for handling arrays and performing mathematical operations
import numpy as np

# Data manipulation library used for handling tabular datasets (like CSV files, Excel, etc.)
import pandas as pd

# Deep learning framework for building and training neural networks
import torch

# Function to split datasets into training and testing sets for machine learning
from sklearn.model_selection import train_test_split

# Standardization tool to scale numerical features so that they have a mean of 0 and a standard deviation of 1
from sklearn.preprocessing import StandardScaler

# Converts categorical labels (e.g., "cat", "dog") into numerical values (e.g., 0, 1) for machine learning models
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head() #display the five rows of the dataset to see the data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [6]:
df.shape
# how many samples (rows) and features (columns) are in the dataset.

(569, 33)

In [7]:
df.drop(columns=['id', 'Unnamed: 32'], inplace= True)
# Drop unnecessary columns:
# 'id' - A unique identifier that does not contribute to the model.
# 'Unnamed: 32' - An extra column likely containing only NaN values.

In [8]:
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### train test split

In [9]:
# Split the dataset into training and testing sets:
# df.iloc[:, 1:] selects all columns from index 1 onwards (features excluding 'diagnosis')
# df.iloc[:, 0] selects the first column (target variable 'diagnosis')
# test_size=0.2 means 20% of the data will be used for testing, and 80% for training

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

# X_train: Features for the training set (80% of the data)
# X_test: Features for the testing set (20% of the data)
# y_train: Target labels for the training set (80% of the data)
# y_test: Target labels for the testing set (20% of the data)


### scaling

In [10]:
# Initialize a StandardScaler object to standardize features
scaler = StandardScaler()

# Fit the scaler on the training data and transform it (standardize the features)
X_train = scaler.fit_transform(X_train)

# Transform the testing data using the already fitted scaler (standardize using the same scale as training data)
X_test = scaler.transform(X_test)


In [13]:
X_train

array([[-0.3056004 ,  0.34980258, -0.33529435, ..., -0.75283343,
         0.50481555, -1.10113485],
       [ 0.1461648 , -1.30487202,  0.11080133, ..., -0.32406283,
        -0.43925651, -0.76429848],
       [ 0.97249481,  1.25615524,  1.01333813, ...,  0.86640614,
        -1.21107682,  0.47314934],
       ...,
       [ 1.75307645,  1.75581119,  1.74165761, ...,  1.50585295,
         0.62903556, -0.44339727],
       [ 1.12117703,  0.32191481,  1.08368717, ...,  1.24176587,
         0.27956327,  0.03520711],
       [-1.22428494,  0.15458816, -1.22086462, ..., -1.33367665,
        -1.58539311,  0.30170897]])

In [None]:
y_train

Unnamed: 0,diagnosis
109,B
354,B
10,M
173,B
73,M
...,...
531,B
534,B
85,M
187,B


### Label Encoding

In [16]:
# Initialize a LabelEncoder object to encode categorical target labels
encoder = LabelEncoder()
#fit: The encoder learns the unique labels in the training target variable (y_train).
#transform: After learning the mapping, the encoder converts each label to a corresponding numeric value (e.g., M becomes 0, B becomes 1).
# Fit the encoder on the training target labels (y_train) and transform them into numeric values
y_train = encoder.fit_transform(y_train)

# Transform the test target labels (y_test) using the same encoder (so it has the same mapping as y_train)
y_test = encoder.transform(y_test)


In [17]:
y_train

array([0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,

### Numpy arrays to PyTorch tensors

In [19]:
# Convert the training and testing feature data (X_train, X_test) into PyTorch tensors
# The .astype(np.float32) ensures that the data is of type float32 (which is compatible with PyTorch)

X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))

# Convert the training and testing target labels (y_train, y_test) into PyTorch tensors
# Again, we use .astype(np.float32) to ensure correct data type for PyTorch

y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))


In [20]:
X_train_tensor.shape

torch.Size([455, 30])

In [21]:
y_train_tensor.shape

torch.Size([455])

In [23]:
# Import Dataset and DataLoader from PyTorch’s utils module
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class inheriting from PyTorch's Dataset class
class CustomDataset(Dataset):

  def __init__(self, features, labels):
    """
    Initialize dataset with features and labels.

    :param features: Input data (e.g., X_train_tensor)
    :param labels: Corresponding target labels (e.g., y_train_tensor)
    """
    self.features = features  # Store input data
    self.labels = labels      # Store target labels

  def __len__(self):
    """
    Return the total number of samples in the dataset.
    """
    return len(self.features)  # Return the number of samples

  def __getitem__(self, idx):
    """
    Retrieve the feature-label pair for the sample at index `idx`.

    :param idx: Index of the sample
    :return: Tuple (feature, label) at the specified index
    """
    return self.features[idx], self.labels[idx]  # Return feature and label at index



In [25]:
# Create an instance of CustomDataset for the training data
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)

# Create an instance of CustomDataset for the test data
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)



In [26]:
train_dataset[10]   # Retrieve the 10th sample from the train_dataset

(tensor([ 0.6380, -0.8308,  0.5416,  0.4946, -0.1651, -0.7069, -0.4181, -0.0791,
         -0.3759, -0.9103, -0.7530, -1.3323, -0.8934, -0.4577,  0.1044, -0.7479,
         -0.2491,  0.4135, -0.8572, -0.5082,  0.2270, -1.2643,  0.0855,  0.0931,
          0.1249, -0.7446, -0.3583,  0.3264, -0.9792, -0.6643]),
 tensor(0.))

In [28]:
# Create a DataLoader for the training dataset with batch size of 32 and shuffling enabled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Create a DataLoader for the test dataset with batch size of 32 and shuffling enabled
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)


### Defining the model

In [34]:
# Import the necessary module for creating neural network layers
import torch.nn as nn

# Define a simple neural network class inheriting from nn.Module
class MySimpleNN(nn.Module):

  def __init__(self, num_features):
    """
    Initialize the neural network with one linear layer and a sigmoid activation.

    :param num_features: The number of input features to the network
    """
    super().__init__()  # Call the parent class (nn.Module) constructor

    # Define a linear layer that transforms the input features to one output
    self.linear = nn.Linear(num_features, 1)

    # Define a sigmoid activation function to output probabilities between 0 and 1
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):
    """
    Define the forward pass of the network.

    :param features: The input features to the network
    :return: The output after applying the linear transformation and sigmoid activation
    """
    out = self.linear(features)  # Apply the linear transformation to the input
    out = self.sigmoid(out)      # Apply the sigmoid activation to the output

    return out  # Return the final output


### Important Parameters

In [35]:
learning_rate = 0.1
epochs = 25

In [36]:
# Create an instance of the neural network model 1 mean take the number of features
model = MySimpleNN(X_train_tensor.shape[1])

# Define the optimizer (Stochastic Gradient Descent with the specified learning rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define the loss function (Binary Cross Entropy Loss for binary classification)
loss_function = nn.BCELoss()


### Training Pipeline

In [37]:

# Define the training loop
for epoch in range(epochs):

  # Iterate through batches in the training data
  for batch_features, batch_labels in train_loader:

    # Forward pass: Predict output based on current model and batch features
    y_pred = model(batch_features)

    # Calculate loss between predicted output and true labels
    loss = loss_function(y_pred, batch_labels.view(-1, 1))

    # Clear previous gradients to avoid accumulation
    optimizer.zero_grad()

    # Backward pass: Compute gradients for model parameters
    loss.backward()

    # Update model parameters using the optimizer
    optimizer.step()

  # Print the loss at the end of each epoch
  print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')


Epoch: 1, Loss: 0.3746975064277649
Epoch: 2, Loss: 0.06628835201263428
Epoch: 3, Loss: 0.07314001023769379
Epoch: 4, Loss: 0.31146374344825745
Epoch: 5, Loss: 0.10596656054258347
Epoch: 6, Loss: 0.14257940649986267
Epoch: 7, Loss: 0.15852907299995422
Epoch: 8, Loss: 0.164570614695549
Epoch: 9, Loss: 0.2556047737598419
Epoch: 10, Loss: 0.0761428028345108
Epoch: 11, Loss: 0.051103152334690094
Epoch: 12, Loss: 0.08020762354135513
Epoch: 13, Loss: 0.012436519376933575
Epoch: 14, Loss: 0.04701296612620354
Epoch: 15, Loss: 0.048021018505096436
Epoch: 16, Loss: 0.01991453766822815
Epoch: 17, Loss: 0.022957827895879745
Epoch: 18, Loss: 0.02667568251490593
Epoch: 19, Loss: 0.3126072585582733
Epoch: 20, Loss: 0.33070388436317444
Epoch: 21, Loss: 0.05106164142489433
Epoch: 22, Loss: 0.06419892609119415
Epoch: 23, Loss: 0.014061111025512218
Epoch: 24, Loss: 0.06161568686366081
Epoch: 25, Loss: 0.014662576839327812


### Evaluation

In [None]:
# Model evaluation using test_loader
model.eval()  # Set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        # Forward pass
        y_pred = model(batch_features)
        y_pred = (y_pred > 0.8).float()  # Convert probabilities to binary predictions

        # Calculate accuracy for the current batch
        batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
        accuracy_list.append(batch_accuracy)

# Calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy: {overall_accuracy:.4f}')
