In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Load the metadata tables: one row per DICOM/JPEG file in `dicom_data`, and
# the mass case descriptions for the provided train and test sets.
dicom_data = pd.read_csv('./Dataset/archive (1)/csv/dicom_info.csv')
mass_data_train = pd.read_csv('./Dataset/archive (1)/csv/mass_case_description_train_set.csv')
mass_data_test = pd.read_csv('./Dataset/archive (1)/csv/mass_case_description_test_set.csv')

# Local folder holding the JPEG images. The `image_path` values in
# `dicom_data` start with 'CBIS-DDSM/jpeg' (see the displayed rows) and have to
# be re-rooted here before any file can be opened.
image_dir = './Dataset/archive (1)/jpeg'

In [28]:
# Notebook-wide hyperparameters.
image_size = 224  # presumably the square input resolution in pixels — TODO confirm it matches what the model accepts
batch_size = 32   # mini-batch size handed to both DataLoaders
epoch = 10        # number of training epochs; the training cell reads it as `n_epochs`
num_classes = 3   # pathology has three values: BENIGN, MALIGNANT, BENIGN_WITHOUT_CALLBACK

## Cleaning Part 1: Are those NaN values in my dataset?!

In [29]:
# Drop the DICOM metadata columns that are not used in this study.
# `DataFrame.drop` returns a new frame, so `dicom_data` is left untouched and
# re-running this cell always starts from the raw table.
dicom_cleaning = dicom_data.drop(columns=[
    'PatientBirthDate', 'AccessionNumber', 'Columns', 'ContentDate', 'ContentTime',
    'PatientSex', 'ReferringPhysicianName', 'Rows', 'SOPClassUID', 'SOPInstanceUID',
    'StudyDate', 'StudyID', 'StudyInstanceUID', 'StudyTime', 'InstanceNumber',
    'SeriesInstanceUID', 'SeriesNumber',
])

dicom_cleaning.head()

Unnamed: 0,file_path,image_path,BitsAllocated,BitsStored,BodyPartExamined,ConversionType,HighBit,LargestImagePixelValue,Laterality,Modality,...,PatientName,PatientOrientation,PhotometricInterpretation,PixelRepresentation,SamplesPerPixel,SecondaryCaptureDeviceManufacturer,SecondaryCaptureDeviceManufacturerModelName,SeriesDescription,SmallestImagePixelValue,SpecificCharacterSet
0,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.12930...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308...,16,16,BREAST,WSD,15,65535,R,MG,...,Mass-Training_P_01265_RIGHT_MLO_1,MLO,MONOCHROME2,0,1,MathWorks,MATLAB,cropped images,23078,ISO_IR 100
1,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.24838...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.248386...,16,16,BREAST,WSD,15,65535,R,MG,...,Mass-Training_P_01754_RIGHT_CC,CC,MONOCHROME2,0,1,MathWorks,MATLAB,full mammogram images,0,ISO_IR 100
2,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.26721...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.267213...,16,16,BREAST,WSD,15,65535,R,MG,...,Calc-Training_P_00232_RIGHT_CC,CC,MONOCHROME2,0,1,MathWorks,MATLAB,full mammogram images,0,ISO_IR 100
3,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.38118...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.381187...,16,16,BREAST,WSD,15,65535,L,MG,...,Calc-Test_P_00562_LEFT_CC_2,CC,MONOCHROME2,0,1,MathWorks,MATLAB,cropped images,32298,ISO_IR 100
4,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.38118...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.381187...,8,8,Left Breast,WSD,7,255,,MG,...,P_00562^P_00562,CC,MONOCHROME2,0,1,MathWorks,MATLAB,,0,ISO_IR 100


In [30]:
# Work on a copy so that later edits to `dicom_cleaning` do not leak into `df`.
df = dicom_cleaning.copy()
# Derive the join key (e.g. 'P_01265') from the PatientID string.
df['patient_id'] = df['PatientID'].str.extract(r'([A-Za-z]+_\d\d\d\d\d)')

# NOTE(review): `mass_col` is not used by any visible cell — confirm before removing.
mass_col = mass_data_test['patient_id']

# Attach the mass annotations to every image row of the same patient.
# The result gets a fresh RangeIndex that no longer matches the index of `df`.
# NOTE(review): `patient_id` repeats on both sides (the displayed rows show the
# same patient paired with several images and several annotations), so this is
# a many-to-many join — confirm that pairing every image of a patient with
# every mass annotation of that patient is intended.
full_mammogram_images = pd.merge(df, mass_data_train, on="patient_id")

# The former `full_mammogram_images.merge(mass_data_test, ...)` line was removed:
# its result was never assigned, so it only cost time and memory.

fmi_train, fmi_test = train_test_split(full_mammogram_images, test_size=0.2, random_state=42)

fmi_train.head()

Unnamed: 0,file_path,image_path,BitsAllocated,BitsStored,BodyPartExamined,ConversionType,HighBit,LargestImagePixelValue,Laterality,Modality,...,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
2254,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.78953...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.789536...,8,8,BREAST,WSD,7,255,R,MG,...,2,mass,OVAL,ILL_DEFINED,4,MALIGNANT,5,Mass-Training_P_01596_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_01596_RIGHT_MLO_2/1.3.6.1.4.1....,Mass-Training_P_01596_RIGHT_MLO_2/1.3.6.1.4.1....
63,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.38788...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.387883...,8,8,BREAST,WSD,7,255,L,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN_WITHOUT_CALLBACK,5,Mass-Training_P_01175_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_01175_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_01175_LEFT_MLO_1/1.3.6.1.4.1.9...
8021,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.37107...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.371077...,16,16,BREAST,WSD,15,65535,L,MG,...,1,mass,LOBULATED,CIRCUMSCRIBED-ILL_DEFINED,3,MALIGNANT,3,Mass-Training_P_00914_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00914_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00914_LEFT_CC_1/1.3.6.1.4.1.95...
3279,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.36170...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.361705...,8,8,BREAST,WSD,7,255,L,MG,...,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Training_P_01780_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_01780_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_01780_LEFT_MLO_1/1.3.6.1.4.1.9...
2697,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.13263...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.132637...,16,16,BREAST,WSD,15,65535,R,MG,...,1,mass,IRREGULAR,ILL_DEFINED,5,MALIGNANT,4,Mass-Training_P_00717_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00717_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00717_RIGHT_MLO_1/1.3.6.1.4.1....


In [31]:
# Preview of the merged frame. Nothing is removed or filled in this cell: every
# statement below except the final `head()` is commented out and kept only for
# reference.
# NOTE(review): `training_cleaning` is not defined in any visible cell.
#training_cleaning['mass margins'].ffill(axis=0, inplace=True)

# full_mammogram_images.isna().sum()
# full_mammogram_images.drop(['file_path', 'BitsAllocated', 'BitsStored', 'ConversionType',
#                             'HighBit', 'LargestImagePixelValue',
#                             'Laterality', 'PatientName', 'PatientOrientation',
#                          'PhotometricInterpretation', 'PixelRepresentation',
#                             'SamplesPerPixel', 'SecondaryCaptureDeviceManufacturer', 'SecondaryCaptureDeviceManufacturerModelName',
#                             'SeriesDescription', 'SmallestImagePixelValue',
#                             'SpecificCharacterSet'], axis=1, inplace=True)

full_mammogram_images.head()

Unnamed: 0,file_path,image_path,BitsAllocated,BitsStored,BodyPartExamined,ConversionType,HighBit,LargestImagePixelValue,Laterality,Modality,...,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.12930...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308...,16,16,BREAST,WSD,15,65535,R,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN,5,Mass-Training_P_01265_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...
1,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.12930...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.129308...,16,16,BREAST,WSD,15,65535,R,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN,4,Mass-Training_P_01265_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_01265_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_01265_RIGHT_MLO_1/1.3.6.1.4.1....
2,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.61363...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.613638...,16,16,BREAST,WSD,15,65535,R,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN,5,Mass-Training_P_01265_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...
3,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.61363...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.613638...,16,16,BREAST,WSD,15,65535,R,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN,4,Mass-Training_P_01265_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_01265_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_01265_RIGHT_MLO_1/1.3.6.1.4.1....
4,CBIS-DDSM/dicom/1.3.6.1.4.1.9590.100.1.2.15329...,CBIS-DDSM/jpeg/1.3.6.1.4.1.9590.100.1.2.153292...,8,8,BREAST,WSD,7,255,R,MG,...,1,mass,OVAL,CIRCUMSCRIBED,3,BENIGN,5,Mass-Training_P_01265_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_01265_RIGHT_CC_1/1.3.6.1.4.1.9...


In [32]:

# Forward-fill missing mass descriptors with the value of the preceding row.
# The result is assigned back to the column: calling `ffill(inplace=True)` on
# `frame[col]` is chained assignment, which emits a FutureWarning on pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is enabled.
# NOTE(review): forward-filling copies the descriptor of whatever row happens
# to come before — confirm this is acceptable for these annotation columns.
full_mammogram_images['mass shape'] = full_mammogram_images['mass shape'].ffill()
full_mammogram_images['mass margins'] = full_mammogram_images['mass margins'].ffill()
full_mammogram_images.isna().sum()

file_path                                       0
image_path                                      0
BitsAllocated                                   0
BitsStored                                      0
BodyPartExamined                                0
ConversionType                                  0
HighBit                                         0
LargestImagePixelValue                          0
Laterality                                     92
Modality                                        0
PatientID                                       0
PatientName                                     0
PatientOrientation                              0
PhotometricInterpretation                       0
PixelRepresentation                             0
SamplesPerPixel                                 0
SecondaryCaptureDeviceManufacturer              0
SecondaryCaptureDeviceManufacturerModelName     0
SeriesDescription                              92
SmallestImagePixelValue                         0


In [33]:
dicom_cleaning.isna().sum() # count the missing (NaN) entries in each column
# Laterality and SeriesDescription each have 566 NaN entries (see the output);
# they are forward-filled, not removed, in the following cell.

file_path                                        0
image_path                                       0
BitsAllocated                                    0
BitsStored                                       0
BodyPartExamined                                 0
ConversionType                                   0
HighBit                                          0
LargestImagePixelValue                           0
Laterality                                     566
Modality                                         0
PatientID                                        0
PatientName                                      0
PatientOrientation                               0
PhotometricInterpretation                        0
PixelRepresentation                              0
SamplesPerPixel                                  0
SecondaryCaptureDeviceManufacturer               0
SecondaryCaptureDeviceManufacturerModelName      0
SeriesDescription                              566
SmallestImagePixelValue        

In [34]:
# Forward-fill the two columns that contain NaN: each missing entry takes the
# last valid value above it (it is NOT replaced by 0).
# Assigning the result back avoids chained in-place modification, which warns
# on pandas 2.x and is a silent no-op once Copy-on-Write is enabled.
# `df` was copied from `dicom_cleaning` in an earlier cell, so this fill does
# not reach `df` or the frames merged from it.
dicom_cleaning['Laterality'] = dicom_cleaning['Laterality'].ffill()
dicom_cleaning['SeriesDescription'] = dicom_cleaning['SeriesDescription'].ffill()
dicom_cleaning.isna().sum()

file_path                                      0
image_path                                     0
BitsAllocated                                  0
BitsStored                                     0
BodyPartExamined                               0
ConversionType                                 0
HighBit                                        0
LargestImagePixelValue                         0
Laterality                                     0
Modality                                       0
PatientID                                      0
PatientName                                    0
PatientOrientation                             0
PhotometricInterpretation                      0
PixelRepresentation                            0
SamplesPerPixel                                0
SecondaryCaptureDeviceManufacturer             0
SecondaryCaptureDeviceManufacturerModelName    0
SeriesDescription                              0
SmallestImagePixelValue                        0
SpecificCharacterSet

## Cleaning Part 2: We should probably encode the data

In [35]:
# One-hot encode the pathology label: one indicator column per class.
full_mammogram_images_enc = pd.get_dummies(full_mammogram_images['pathology'])
# Keep the patient id so it can be re-attached to the encoded rows by index.
col = full_mammogram_images['patient_id']

# Order the indicator columns by first appearance of each class, then add the
# patient id back. From here on `full_mammogram_images` holds only the class
# indicators plus `patient_id`.
full_mammogram_images = (
    full_mammogram_images_enc
    .reindex(columns=full_mammogram_images['pathology'].unique())
    .join(col)
)

full_mammogram_images

Unnamed: 0,BENIGN,MALIGNANT,BENIGN_WITHOUT_CALLBACK,patient_id
0,True,False,False,P_01265
1,True,False,False,P_01265
2,True,False,False,P_01265
3,True,False,False,P_01265
4,True,False,False,P_01265
...,...,...,...,...
10099,False,True,False,P_00058
10100,False,True,False,P_00058
10101,True,False,False,P_00206
10102,True,False,False,P_00206


### Define transforms for the images

In [36]:
# Image preprocessing pipeline: a Compose holding a single ToTensor step.
# No resize or normalisation step is included here.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

In [37]:
# Row-wise 80/20 split of the encoded frame.
# NOTE(review): rows of the same patient can end up in both splits, which can
# inflate the reported test accuracy — confirm whether a split grouped by
# `patient_id` is required.
fmi_train, fmi_test = train_test_split(full_mammogram_images, test_size=0.2, random_state=42)

# Every column other than the id/path columns is a one-hot class indicator; the
# position of the set indicator is used as the integer class label.
label_columns = [c for c in full_mammogram_images.columns if c not in ('patient_id', 'image_path')]

# `full_mammogram_images` carries the RangeIndex produced by merging `df` with
# `mass_data_train`, not the index of `df`, so `df['image_path']` cannot be
# aligned to it by label. Re-running the join on the key column alone
# reproduces the same row order and gives correctly aligned paths.
merged_image_paths = pd.merge(
    df[['patient_id', 'image_path']], mass_data_train[['patient_id']], on='patient_id'
)['image_path']
assert len(merged_image_paths) == len(full_mammogram_images), \
    "image paths and encoded labels are out of sync"


def _with_local_image_paths(split):
    """Return a copy of `split` with an `image_path` column rooted at `image_dir`."""
    local_paths = merged_image_paths.loc[split.index].str.replace(
        'CBIS-DDSM/jpeg', image_dir, regex=False
    )
    return split.assign(image_path=local_paths)


class MammogramDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding `(image, label)` pairs for the DataLoader.

    image: float32 tensor of shape (3, image_size, image_size), values in [0, 1].
    label: int64 scalar tensor, the position of the set one-hot indicator
           within `label_columns`.
    """

    def __init__(self, frame, label_columns, image_size):
        self.paths = frame['image_path'].tolist()
        one_hot = frame[label_columns].to_numpy().astype('int64')
        self.labels = torch.as_tensor(one_hot.argmax(axis=1), dtype=torch.long)
        self.image_size = image_size

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index):
        # Decode as RGB so single-channel JPEGs are expanded to three channels.
        image = torchvision.io.read_image(self.paths[index], mode=torchvision.io.ImageReadMode.RGB)
        # Resize to a fixed square so that samples can be stacked into a batch.
        image = torchvision.transforms.functional.resize(
            image, [self.image_size, self.image_size], antialias=True
        )
        image = torchvision.transforms.functional.convert_image_dtype(image, torch.float32)
        return image, self.labels[index]


full_mammogram_images_train = _with_local_image_paths(fmi_train)
full_mammogram_images_test = _with_local_image_paths(fmi_test)

# NOTE(review): confirm the model accepts `image_size` x `image_size` inputs.
trainloader = torch.utils.data.DataLoader(
    MammogramDataset(full_mammogram_images_train, label_columns, image_size),
    batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not shuffled.
testloader = torch.utils.data.DataLoader(
    MammogramDataset(full_mammogram_images_test, label_columns, image_size),
    batch_size=batch_size, shuffle=False)


In [38]:
class BCIDModel(nn.Module):
    """Small two-convolution CNN classifier.

    Args:
        num_classes: width of the output layer. Defaults to 10, the value that
            was previously hard-coded, so `BCIDModel()` behaves as before.
            NOTE(review): the notebook config declares `num_classes = 3` —
            consider passing it explicitly.

    Input:  float tensor of shape (N, 3, H, W).
    Output: raw logits of shape (N, num_classes); no softmax is applied.

    The feature map is adaptively averaged to 16x16 before the first linear
    layer. For 32x32 inputs (the size the layer widths were written for) this
    step is an identity; for other sizes it keeps the flattened feature length
    at 8192 instead of failing with a shape mismatch.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=(3, 3), padding=1)
        self.act1 = nn.ReLU()
        self.drop1 = nn.Dropout(0.3)

        self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), padding=1)
        self.act2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))
        # Parameter-free, so existing state dicts still load.
        self.adapt = nn.AdaptiveAvgPool2d((16, 16))

        self.flat = nn.Flatten()

        self.fc3 = nn.Linear(32 * 16 * 16, 512)
        self.act3 = nn.ReLU()
        self.drop3 = nn.Dropout(0.5)

        self.fc4 = nn.Linear(512, num_classes)

    def forward(self, x):
        # (N, 3, H, W) -> (N, 32, H, W); padding=1 keeps the spatial size
        x = self.act1(self.conv1(x))
        x = self.drop1(x)
        # (N, 32, H, W) -> (N, 32, H, W)
        x = self.act2(self.conv2(x))
        # (N, 32, H, W) -> (N, 32, H//2, W//2)
        x = self.pool2(x)
        # -> (N, 32, 16, 16) whatever the input resolution
        x = self.adapt(x)
        # -> (N, 8192)
        x = self.flat(x)
        # -> (N, 512)
        x = self.act3(self.fc3(x))
        x = self.drop3(x)
        # -> (N, num_classes)
        x = self.fc4(x)
        return x

In [40]:
model = BCIDModel()
# CrossEntropyLoss takes raw logits and integer class indices.
loss_fn = nn.CrossEntropyLoss()
# NOTE(review): momentum=0.09 is unusually small (0.9 is the common choice) —
# confirm it is not a typo. Left unchanged so results stay comparable.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.09)

n_epochs = epoch
# A separate loop variable is used so the `epoch` config value is not
# overwritten; otherwise re-running this cell would train for one epoch fewer.
for epoch_idx in range(n_epochs):
    # Training pass: dropout active, gradients tracked.
    model.train()
    for inputs, labels in trainloader:
        # forward, backward, and then weight update
        optimizer.zero_grad()
        y_pred = model(inputs)
        loss = loss_fn(y_pred, labels)
        loss.backward()
        optimizer.step()

    # Evaluation pass: dropout disabled and no autograd graph is built.
    model.eval()
    correct = 0
    count = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            y_pred = model(inputs)
            correct += (torch.argmax(y_pred, 1) == labels).sum().item()
            count += len(labels)
    # Guard against an empty test loader instead of dividing by zero.
    acc = correct / count if count else float("nan")
    print("Epoch %d: model accuracy %.2f%%" % (epoch_idx, acc*100))
