In [1]:
from sklearn.datasets import make_classification
import torch

In [3]:
# Step 1 : Create a synthetic classification dataset using sklearn
X, y = make_classification(
    n_samples=10,             # Number of samples
    n_features = 2,           # Number of features
    n_classes = 2,            # Number of informative features
    n_informative = 2,        # Number of redundant features
    n_redundant = 0,          # Number of classes
    random_state = 42         # For reproducibility
)

In [4]:
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [5]:
X.shape

(10, 2)

In [6]:
y.shape

(10,)

In [8]:
# convert the data to pytorch tensor
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.long)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.long)


In [9]:
y

tensor([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [10]:
from torch.utils.data import Dataset, DataLoader

In [12]:
class CustomDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    # transform data in this section
    return self.features[idx], self.labels[idx]

In [13]:
dataset = CustomDataset(X, y)

In [14]:
len(dataset)

10

In [15]:
dataset[0]

(tensor([ 1.0683, -0.9701]), tensor(1))

In [16]:
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [18]:
for batch_features, batch_labels in dataloader:
  print(batch_features)
  print(batch_labels)
  print("_"*50)

tensor([[ 1.0683, -0.9701],
        [ 1.8997,  0.8344]])
tensor([1, 1])
__________________________________________________
tensor([[ 1.7273, -1.1858],
        [-2.8954,  1.9769]])
tensor([1, 0])
__________________________________________________
tensor([[ 1.7774,  1.5116],
        [-0.7206, -0.9606]])
tensor([1, 0])
__________________________________________________
tensor([[-0.5872, -1.9717],
        [-0.9382, -0.5430]])
tensor([0, 1])
__________________________________________________
tensor([[-1.9629, -0.9923],
        [-1.1402, -0.8388]])
tensor([0, 0])
__________________________________________________


# **A Note about samplers**
In PyTorch, the sampler in the DataLoader determines the strategy for selecting samples from the dataset during data loading. It controls how indices of the dataset are drawn for each batch. <br>
## **Type of Samplers**
PyTorch provides several predefined samplers, and you can create ones:
1. SequentialSampler:
  - Samples elements sequantially, in the order they appear in the dataset.
  - Default when shuffle-false.

2. RandomSampler:
  - Samples elements randomly without replacement.
  - Default when shuffle-True.

# **A note about collate_function**
The collate function in PyTorch's Dataloader is a function that specifies how to combine a list of samples from a dataset into a single batch. By default, the Dataloader uses a sample batch collation mechanism, But collate_fn allows you to customize how the data should be processed and batched.

# **DataLoader Important Parameters**
The DataLoader class in PyTorch comes with several parameters that allow you to customize how data is loaded, batched and preprocessed. Some of the most commonly used and important parameters include:
1. dataset(mandatory):
  - The Dataset from which the DataLoader will pull data.
  - Must be a subclass of torch.utils.data.Dataset that implements __ geitem __ and __ len __ .
2. batch_size:
  - How many samples per batch to load.
  - Default is 1.
  - Larger batch sizes can speed up training on GPUs but require more memory.
3. shuffle:
  - if True, the DataLoader will shuffle the dataset indices each epoch.
  - Helpful to avoid the model becoming to dependent on the order of samples.
4. num_workers:
  - The number of worker processes used to load data in paralled.
  - Setting num_workers > 0 can speed up data loading by levering multiple CPU cores, especially if I/O or proprecessing is a bottlneck.
5. pin_memory:
  - if True, the DataLoader will copy tensors into pinned (page-locked) memory before returning them.
  - This can improve GPU transfer speed and thus overall training throughout, particularly on CUDA systems.
6. drop_last:
  - if True, the DataLoader will drop the last incomplete batch if the total number of samples is not divisible by the batch size.
  - Useful when exact batch sizes are required (for example, in some batch normalization scenario).
7. collate_fn:
  - A collable that processes a list of sample into a batch (the default simple stacks tensors).
  - custom collate_fn can handle variable-length sequences, perfom cusotm batching loging or handle complex data structures.
8. sampler:
  - sampler defines the strategy for drawing samples (e.g for handling imbalanced classes or custom sampling strategies).
  - batch_sampler works at the batch level, controlling how batches are formed.
  - Typically, you dont need to specify these if you are using batch_size and shuffle,. However, they provide lower-level control if you have advanced requiements.

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
df.drop(columns = ['id', 'Unnamed: 32'],inplace = True)
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

X_train_tensor = torch.from_numpy(X_train).type(torch.float32)
X_test_tensor = torch.from_numpy(X_test).type(torch.float32)

In [5]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
  def __init__(self, features, labels):
    self.features = features
    self.labels = labels

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

In [6]:
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)

In [7]:
train_dataset[10]

(tensor([ 0.7120,  0.5309,  0.7201,  0.5751, -0.0042,  0.5865,  0.5718,  0.2957,
          0.3340, -0.8591,  0.1954, -0.5517,  0.0663,  0.0934, -0.4092, -0.0345,
          0.1434,  0.0271,  0.8667, -0.4123,  0.7518,  0.5405,  0.7607,  0.5888,
          1.0752,  0.8154,  1.2492,  1.0239,  3.2065,  0.1539]),
 tensor(1))

In [8]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# **Defining Model**

In [9]:
import torch.nn as nn
class MySimpleNN(nn.Module):
  def __init__(self, num_features):
    super().__init__()
    self.linear = nn.Linear(num_features, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, features):
    out = self.linear(features)
    out = self.sigmoid(out)
    return out


In [10]:
# parameter
Learining_rate = 0.1
epochs = 25

In [11]:
# create model
model = MySimpleNN(X_train_tensor.shape[1])

# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=Learining_rate)

# define loss function
loss_fn = nn.BCELoss()

In [12]:
# Training Pipeline

for epoch in range(epochs):
  for batch_features, batch_labels in train_loader:
    # forward pass
    output = model(batch_features)

    # loss calculate
    loss = loss_fn(output, batch_labels.unsqueeze(1).float())

    # backward pass
    optimizer.zero_grad()
    loss.backward()

    # update weights
    optimizer.step()

  print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 0.33313706517219543
Epoch: 2, Loss: 0.16434547305107117
Epoch: 3, Loss: 0.060491885989904404
Epoch: 4, Loss: 0.09261643141508102
Epoch: 5, Loss: 0.02303214743733406
Epoch: 6, Loss: 0.05541075021028519
Epoch: 7, Loss: 0.06661498546600342
Epoch: 8, Loss: 0.11640209704637527
Epoch: 9, Loss: 0.10987496376037598
Epoch: 10, Loss: 0.16720378398895264
Epoch: 11, Loss: 0.016352830454707146
Epoch: 12, Loss: 0.027312615886330605
Epoch: 13, Loss: 0.13039423525333405
Epoch: 14, Loss: 0.02174692414700985
Epoch: 15, Loss: 0.039615269750356674
Epoch: 16, Loss: 0.06116238981485367
Epoch: 17, Loss: 0.017276793718338013
Epoch: 18, Loss: 0.0471893809735775
Epoch: 19, Loss: 0.08078593760728836
Epoch: 20, Loss: 0.04389205202460289
Epoch: 21, Loss: 0.014134575612843037
Epoch: 22, Loss: 0.13520823419094086
Epoch: 23, Loss: 0.00958036445081234
Epoch: 24, Loss: 0.04249066114425659
Epoch: 25, Loss: 0.07219503074884415


In [20]:
# Evaluation

model.eval() #set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
  for batch_features, batch_labels in test_loader:

    # forward pass:
    y_pred = model(batch_features)
    y_pred = (y_pred > 0.5).float()

    # calculate accuray for the current batch
    batch_accuracy = (y_pred.view(-1) == batch_labels).float().mean().item()
    accuracy_list.append(batch_accuracy)

# calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f"Overall Accuracy: {overall_accuracy * 100:.2f}%")

Overall Accuracy: 94.53%
