In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
df.drop(columns=['id', 'Unnamed: 32'], inplace= True)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

## **understanding the use of data set and data loader classes**

### Why dataset class

1. Data set class provides you the structured way of organizing and accessing our data 

2. we can subclass(Inherit) Dataset class so that we can handle various data formats and accessing will be most easer

### customDataSet
we inherit the Dataset class such that we can have all the functionality that required for data loader.

we create a customclass to pass our data and make operations such as accessing , structuring etc. 

using __len__() we can find the length of the batch. such that as the mini batches are forming it will be so helpful in finding the length and checking if it reached the required size or not.

__getitem()__ will allow us in accessing the row at the perticular index both x,y labels

### Data Loader class
Over here we mention the required number of batches we want and shuffle so that it uses the customDataSet class inherited from dataset set class to collect row by row data until it reaches the batch size for entire size. 

In [8]:
from torch.utils.data import Dataset, DataLoader

class customDataSet(Dataset):
  def __init__(self, X, y):
    self.X = torch.tensor(X, dtype=torch.float32)
    self.y = torch.tensor(y, dtype=torch.float32)
  def __len__(self):
    return self.X.shape[0]
  def __getitem__(self, index):
    return self.X[index],self.y[index]


In [9]:
data = customDataSet(X_train, y_train)

In [10]:
len(data)

455

In [11]:
dataloader = DataLoader(data, batch_size=75, shuffle=True)

In [None]:
for batch_features, batch_labels in dataloader:

  print(batch_features)
  print(batch_labels)
  print("-"*50)

In [13]:
learning_rate = 0.01
epochs = 50

In [14]:
import torch.nn as nn

class nn_module_class(nn.Module):
    def __init__(self, num_features):
      #weights will be created automatically and from super class
      super().__init__()
      self.linear1 = nn.Linear(num_features, 3)
      self.relu = nn.ReLU()
      self.linear3 = nn.Linear(3,1)
      self.sigmoid = nn.Sigmoid()

    def forward(self, features):
      # this functions discribes what kind of activation functions we want to use
      out = self.linear1(features)
      out = self.relu(out)
      # out = self.linear2(out)
      # out = self.relu(out)
      out = self.linear3(out)
      out = self.sigmoid(out)

      return out

In [15]:
loss_function = nn.BCELoss()

# Mini Batch Gradient Decent

In [16]:
# create model
model = nn_module_class(X_train_tensor.shape[1])


optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# define loop
for epoch in range(epochs):
  for batch_features, batch_labels in dataloader:
    y_pred = model(batch_features)
    loss = loss_function(y_pred, batch_labels.view(-1, 1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')


Epoch: 1, Loss: 0.7257180213928223
Epoch: 2, Loss: 0.782181441783905
Epoch: 3, Loss: 0.6089656352996826
Epoch: 4, Loss: 0.7659962177276611
Epoch: 5, Loss: 0.6833186745643616
Epoch: 6, Loss: 0.643136203289032
Epoch: 7, Loss: 0.6118662357330322
Epoch: 8, Loss: 0.6198490262031555
Epoch: 9, Loss: 0.6434481143951416
Epoch: 10, Loss: 0.6378451585769653
Epoch: 11, Loss: 0.6669641137123108
Epoch: 12, Loss: 0.5992167592048645
Epoch: 13, Loss: 0.5353060960769653
Epoch: 14, Loss: 0.607020914554596
Epoch: 15, Loss: 0.6105437278747559
Epoch: 16, Loss: 0.6128842830657959
Epoch: 17, Loss: 0.6342984437942505
Epoch: 18, Loss: 0.4690264165401459
Epoch: 19, Loss: 0.5356364846229553
Epoch: 20, Loss: 0.4258726239204407
Epoch: 21, Loss: 0.5628027319908142
Epoch: 22, Loss: 0.6004625558853149
Epoch: 23, Loss: 0.40798407793045044
Epoch: 24, Loss: 0.6611684560775757
Epoch: 25, Loss: 0.49520254135131836
Epoch: 26, Loss: 0.4883471429347992
Epoch: 27, Loss: 0.5778844356536865
Epoch: 28, Loss: 0.6452085375785828
Ep

In [17]:
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [18]:
# model evaluation
with torch.no_grad():
  y_pred = model.forward(X_test_tensor)
  y_pred = (y_pred > 0.9).float()
  accuracy = (y_pred == y_test_tensor).float().mean()
  print(f'Accuracy: {accuracy.item()}')

Accuracy: 0.6228070259094238
