In [1]:
# https://www.nih.gov/news-events/news-releases/nih-clinical-center-provides-one-largest-publicly-available-chest-x-ray-datasets-scientific-community

import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import os
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms as tf
from os import walk
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import image
from torchvision import transforms
# Set ipython's max row display
pd.set_option('display.max_row', 1000)

# Set iPython's max column width to 50
pd.set_option('display.max_columns', 50)

In [2]:
train_df = pd.read_csv('train_df2.csv')
test_df = pd.read_csv('test_df2.csv')

In [3]:
train_df.sample(10)

Unnamed: 0,Finding Label,Image Index,h,w,x,y
45174,1,00013925_002.png,2048.0,2500.0,0.168,0.168
39526,1,00012176_013.png,2991.0,2992.0,0.143,0.143
76024,1,00025303_036.png,2021.0,2021.0,0.194311,0.194311
20339,8,00006391_001.png,2048.0,2500.0,0.171,0.171
25440,5,00007916_004.png,2048.0,2500.0,0.168,0.168
20854,1,00006547_003.png,2991.0,2992.0,0.143,0.143
46441,5,00014320_022.png,2048.0,2500.0,0.168,0.168
2905,1,00000980_001.png,2048.0,2500.0,0.171,0.171
53949,5,00016778_007.png,2544.0,3056.0,0.139,0.139
3609,1,00001221_003.png,2991.0,2846.0,0.143,0.143


In [113]:
train_df['Finding Label'].value_counts()

1     49232
4      8106
8      7795
5      5045
6      2284
3      2067
12     1933
0      1385
9      1155
13     1074
7       903
11      879
10      826
14      230
2        81
Name: Finding Label, dtype: int64

In [4]:
# (train_df['Image Index'] == '00030181_001.png').value_counts()
# 00028247_001.png

In [5]:
import os
import platform
my_path = "../Datasets/Lungs_Dataset/Xray" if platform.system() == 'Windows' else "datasets/data/images"
filename_list = []
for root, dirs, files in os.walk(my_path, topdown=True):
    for name in files:
        filename_list.append(name)
#         with Image.open(os.path.join("datasets/data/images", name)) as f:
#             print(len(f.size))

In [6]:
train_df['Image Index'].isin(filename_list).value_counts()

True     82995
False     3529
Name: Image Index, dtype: int64

In [7]:
train_df = train_df[train_df['Image Index'].isin(filename_list)]

In [8]:
train_df['Image Index'].isin(filename_list).value_counts()

True    82995
Name: Image Index, dtype: int64

In [9]:
train_df['Image Index'].isna().value_counts()

False    82995
Name: Image Index, dtype: int64

In [10]:
test_df = test_df[test_df['Image Index'].isin(filename_list)]

In [11]:
test_df.sample(10)

Unnamed: 0,Finding Label,Image Index,h,w,x,y
8572,5,00012263_007.png,2521.0,2794.0,0.143,0.143
14922,1,00017714_014.png,2544.0,3056.0,0.139,0.139
5920,4,00009608_042.png,2048.0,2500.0,0.168,0.168
26419,14,00022021_002.png,93.189418,154.954497,267.648677,493.037037
9914,12,00013615_029.png,2048.0,2500.0,0.168,0.168
20329,1,00025664_040.png,2544.0,3056.0,0.139,0.139
2352,13,00003528_017.png,2048.0,2500.0,0.168,0.168
19395,1,00022877_020.png,2544.0,3056.0,0.139,0.139
17954,10,00021047_009.png,2544.0,3056.0,0.139,0.139
17766,8,00020751_005.png,2991.0,2992.0,0.143,0.143


In [12]:
class RescaleImage(object):
    """Rescale the image in a sample to a given size.

    Args:
        output_size (tuple or int): Desired output size. If tuple, output is
            matched to output_size. If int, smaller of image edges is matched
            to output_size keeping aspect ratio the same.
            
        The samples coming into this class will have its images reduced assuming
        the input is a h, w, c numpy array
    """

    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size

    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        h, w = image.size

        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h / w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w / h
        else:
            new_h, new_w = self.output_size

        new_h, new_w = int(new_h), int(new_w)

        img = image.resize((new_h, new_w))

        return {'image': img, 'label': label}

In [68]:
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        image = np.array(image)
        if len(image.shape) > 2:
            image = image[:,:,0]

        # numpy image: H x W x C
        # torch image: C X H X W
        return {'image': torch.FloatTensor(image).unsqueeze(0),
#         return {'image': torch.from_numpy(image),                
                'label': torch.from_numpy(np.array(label))}

In [63]:
#  print([Image.open('../Datasets/Lungs_Dataset/Xray/'+train_df['Image Index'][each_import]).size for each_import in range(100)])

In [69]:
#Note this will return an Image object, of h and w, and its corresponding label
class CovidLungsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, dataframe, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.dataframe['Image Index'][idx])
        my_image = Image.open(img_name)        
        if len(my_image.size) > 2:
            assert len(my_image.size) > 2
        row = self.dataframe.iloc[idx]
        label = row['Finding Label']
        sample = {'image': my_image, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [70]:
my_train_set = CovidLungsDataset(train_df, my_path, transform=transforms.Compose([
        RescaleImage(200),
        ToTensor()
]))

In [76]:
'''The original data coming out of the Dataset is a dictionary, 
the image is an Image object with the corresponding label.
The size of all of these images are 1024 by 1024.
The RescaleImage will convert the Image object to an Image object with 200x200 in size, 
leaving the label alone.
The ToTensor will convert the Image object with 200x200 to tensors of 1x200x200'''
from random import randint
a_sample = my_train_set.__getitem__(randint(0,256))
my_image = a_sample['image']
my_label = a_sample['label']
#imshow will work if it's h,w,c or h,w
#torch is c,h,w
#line below can be used if you don't use ToTensor
# plt.imshow(my_image)
type(my_image), my_image.shape, my_label

(torch.Tensor, torch.Size([1, 200, 200]), tensor(0))

In [77]:
# plt.imshow(my_image.squeeze(0));

In [128]:
import platform
batch_loader_params = {
    "batch_size": 10,
    "shuffle": True,
    "num_workers": 0 if platform.system() == 'Windows' else 2
}
dataloader = DataLoader(my_train_set, **batch_loader_params)

In [None]:
# # iter(dataloader).next()
# for i, each in enumerate(dataloader):
#     print(each['image'].shape, each['label'].shape)

In [None]:
# batch_samples = iter(dataloader)
# samples = batch_samples.next()
# datset_batch = torchvision.utils.make_grid(samples['image'])

In [None]:
# plt.figure(figsize=(20,10))
# for index, each in enumerate(datset_batch):
#     plt.imshow(each.squeeze(0))

In [94]:
def spatial_size(input_size: int, kernel_size: int, stride: int = 1, padding: int = 0):
    # https://cs231n.github.io/convolutional-networks/
    spatial_size = (input_size - kernel_size + 2 * padding)/stride + 1
    assert spatial_size % 1 == 0
    assert spatial_size > 0
    return int(spatial_size)

In [100]:
print(spatial_size(50, 25))

26


In [136]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 200, 100, 1)  
        self.conv2 = nn.Conv2d(200, 50, 25, 1)  
        self.fc1 = nn.Linear(13*13*50, 32)
        self.fc2 = nn.Linear(32, 15)

    def forward(self, x):
        x = F.relu(self.conv1(x)) # torch.Size([5, 1, 200, 200]) ==> torch.Size([5, 200, 101, 101])
        x = F.max_pool2d(x, 2, 2) # torch.Size([5, 200, 101, 101]) ==> torch.Size([5, 200, 50, 50])
        x = F.relu(self.conv2(x)) # torch.Size([5, 200, 50, 50]) ==> torch.Size([5, 50, 26, 26])
        x = F.max_pool2d(x, 2, 2) # torch.Size([5, 50, 26, 26]) ==> torch.Size([5, 50, 13, 13])
        x = x.view(-1, 13*13*50)  # torch.Size([5, 50, 13, 13]) ==> torch.Size([5, 8450])
        x = F.relu(self.fc1(x))   # torch.Size([5, 8450]) ==> torch.Size([5, 32])
        x = self.fc2(x)           # torch.Size([5, 32]) ==> torch.Size([5, 15])
        # # There's no activation at the final layer because of the criterion of CEL
#         return x
        return torch.log_softmax(x, dim=-1)


net = Net()

In [137]:
# criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(net.parameters(), lr=0.01)

In [138]:
for epoch in range(1):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(dataloader):
        # get the inputs; data is a list of [inputs, labels]
#         inputs, labels = data['image'], data['label']
        
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(data['image'])
#         print(outputs.shape, data['label'].shape)
        loss = criterion(outputs, data['label'])
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

        print('[%d, %5d] loss: %.5f' %
              (epoch + 1, i + 1, running_loss / (epoch*i+1)))

[1,     1] loss: 16.42096
[1,     2] loss: 7048195.42096
[1,     3] loss: 9058073.42096
[1,     4] loss: 9203302.17096
[1,     5] loss: 9235659.11432
[1,     6] loss: 9407265.22369
[1,     7] loss: 9436942.47955
[1,     8] loss: 9716284.32330
[1,     9] loss: 9890782.10455
[1,    10] loss: 9957298.92486
[1,    11] loss: 9957301.53093
[1,    12] loss: 9957304.08569
[1,    13] loss: 9957306.63232
[1,    14] loss: 9958839.13891
[1,    15] loss: 9958841.75833
[1,    16] loss: 9958844.30677
[1,    17] loss: 9958846.79561
[1,    18] loss: 9977143.56514
[1,    19] loss: 9977146.07202
[1,    20] loss: 9977148.62819
[1,    21] loss: 9977151.12402
[1,    22] loss: 9977153.65680
[1,    23] loss: 9977156.20601
[1,    24] loss: 9977158.62785
[1,    25] loss: 9977161.03367
[1,    26] loss: 9977163.52749
[1,    27] loss: 9977165.95432
[1,    28] loss: 9978689.93894
[1,    29] loss: 9978692.39836
[1,    30] loss: 9978694.77047
[1,    31] loss: 9978697.06743
[1,    32] loss: 9978699.61559
[1,    33] lo

Traceback (most recent call last):
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/senhmo/anaconda3/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
   

KeyboardInterrupt: 