In [106]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 
import torch.nn.functional as F 
from torch.autograd import Variable
import torch.optim as optim
from tqdm import tqdm
from torchsummary import summary
from torchvision import datasets, transforms
import time
import os

In [107]:
df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
#             names=['code_number',
#               'clumb_thickness',
#               'cell_size_uniformity', 
#               'cell_shape_uniformity',
#               'marginal_adhesion',
#              'epithelial_sell_size', 
#              'bare_nuclei', 
#              'bland_chromatin',
#               'normal_nuleoli'
#              'mitoses',
#              'class'])


In [108]:
df.head()
# df.isnull().sum() # Returns the column names along with the number of
# NaN values in that particular column

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [109]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,7,8,9,10
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [110]:
df.shape

(699, 11)

In [111]:
np_arr = df.to_numpy()

In [112]:
X_replace = df.replace('?',0)

In [113]:
np_array = np.array(X_replace).astype(float)

In [114]:
df_tensor = torch.tensor(np_array)

In [115]:
class Breast_Cancer_Classifier(nn.Module):
    def __init__(self):
        super(Breast_Cancer_Classifier, self).__init__()
        self.linear_a = nn.Linear(10, 64)
        self.linear_b = nn.Linear(64, 10)
        self.linear_c = nn.Linear(10, 2)

    def forward(self, x):
        x = self.linear_a(x)
        x = F.relu(x)
        x = self.linear_b(x)
        x = F.relu(x)
        x = self.linear_c(x)
        output = F.softmax(x, dim=1)
        return output

model = Breast_Cancer_Classifier()

In [116]:
class BreastCancerDataset(Dataset):

    def __init__(self):  #Initialise the data, download etc
        df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
        df[10] = df[10].replace(2, 0)
        df[10] = df[10].replace(4, 1)
        X_replace = df.replace('?',0)
        np_array = np.array(X_replace).astype(float)
        self.len = df.shape[0]
        self.x_data = torch.from_numpy(np_array[:, :10])
        self.y_data = torch.from_numpy(np_array[:, 10])

    def __getitem__(self, index):  #return one item on the index
        return self.x_data[index], self.y_data[index]

    def __len__(self): #return the data length
        return self.len

dataset = BreastCancerDataset()
train_loader = DataLoader(dataset=dataset, 
                            batch_size=32, 
                            shuffle=True, 
                            num_workers=0)


In [117]:
criterion = torch.nn.CrossEntropyLoss(size_average=False)
optimizer = optim.SGD(model.parameters(), lr=0.001)
model.train()
        
start_time = time.time()
for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        # forward pass over the data: model.forward
        outputs = model(inputs.float())   
        
        # computing the loss
        loss = criterion(outputs, labels.long())
        loss.backward()   # back propagation
        optimizer.step() # update parameters based on loss

        #  reset the gradients to zero before moving forward, because PyTorch accumulates gradients.
        optimizer.zero_grad()

        # print statistics
        running_loss += loss.item()
        if i % 5 == 0:    # print every 4 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

end_time = time.time()
print('Finished Training')
print('Training Time: ', end_time - start_time)

[1,     1] loss: 0.011
[1,     6] loss: 0.053
[1,    11] loss: 0.050
[1,    16] loss: 0.054
[1,    21] loss: 0.053
[2,     1] loss: 0.012
[2,     6] loss: 0.055
[2,    11] loss: 0.054
[2,    16] loss: 0.050
[2,    21] loss: 0.052
[3,     1] loss: 0.011
[3,     6] loss: 0.049
[3,    11] loss: 0.049
[3,    16] loss: 0.056
[3,    21] loss: 0.054
[4,     1] loss: 0.009
[4,     6] loss: 0.054
[4,    11] loss: 0.053
[4,    16] loss: 0.053
[4,    21] loss: 0.053
[5,     1] loss: 0.012
[5,     6] loss: 0.056
[5,    11] loss: 0.048
[5,    16] loss: 0.055
[5,    21] loss: 0.052
Finished Training
Training Time:  0.1806955337524414


In [125]:
device = torch.device('cpu')

test_set = DataLoader(dataset=dataset, 
                            batch_size=100, 
                            shuffle=True, 
                            num_workers=0)

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
#             print(data, target)
#             print(".........................")
            data, target = data.float(), target.long()
            data, target = data.to(device), target.to(device)
            output = model(data)
#             print(output)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    
test(model=model, device=device, test_loader=test_set)


Test set: Average loss: -0.6552, Accuracy: 458/699 (66%)



In [121]:
# saving the model
torch.save(model.state_dict(), "predictions.pt")

In [None]:
# loading the model
model.load_state_dict(torch.load("predictions.pt"))
model.eval()