In [2]:
import pydicom
from itertools import chain
import os
import numpy
from matplotlib import pyplot, cm
import cv2
import pandas as pd
from keras.utils import to_categorical, Sequence
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16
from keras import models
from keras import layers
from keras import optimizers

Using TensorFlow backend.


## Creation of Datasets and Dataframe, to use the Image Data Generator with keras flow_from_dataframe

### Also applying the CLAHE algorithm to the data

In [60]:
# Find every cropped image listed in the CSV files "paths_dicom_df" under the folder "pth_folder_dicom",
# apply the CLAHE algorithm if requested, and save the resulting JPEG images in "path_jpg".
# Also builds and returns the data frame required by the Keras image generator.
def dicom_to_jpg_with_df(paths_dicom_df, pth_folder_dicom, path_jpg, ENHANCE=True):
    """Convert the cropped DICOM images listed in case CSVs to JPEG files.

    For every row of every CSV in ``paths_dicom_df``, the first component of
    the "cropped image file path" column names a case folder below
    ``pth_folder_dicom``. Every DICOM file found in that folder whose
    ``SeriesDescription`` equals "cropped images" is optionally enhanced with
    CLAHE and written to ``path_jpg`` as ``<case folder>.jpg``.

    Parameters
    ----------
    paths_dicom_df : iterable of str
        Paths of the CSV files to read. Each must provide the columns
        "cropped image file path" and "pathology".
    pth_folder_dicom : str
        Root folder that contains one sub-folder per case.
    path_jpg : str
        Destination folder of the JPEG files.
    ENHANCE : bool, default True
        When true, apply CLAHE before saving; the tile grid is picked from
        the number of pixels of the image.

    Returns
    -------
    pandas.DataFrame
        One row per CSV row with the columns ``filename`` (the JPEG file name,
        extension included, so it matches the file written to ``path_jpg``)
        and ``pathology`` ('BENIGN_WITHOUT_CALLBACK' is merged into 'BENIGN').
    """
    clahe_small = clahe_medium = clahe_large = None
    if ENHANCE:
        # CLAHE (local contrast enhancement) with a coarser-to-finer tile grid;
        # only built when enhancement is actually requested.
        clahe_small = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(6, 6))
        clahe_medium = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        clahe_large = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(10, 10))
    # Pixel-count limits that select which CLAHE grid is used.
    threshold1 = 250 * 250
    threshold2 = 400 * 400

    dict_for_df = {'filename': [], 'pathology': []}
    for df_path in paths_dicom_df:
        df = pd.read_csv(df_path, sep=',', header=0, engine='python', quotechar='"')
        # Remove fully empty rows (axis given by keyword: the positional form
        # is rejected by recent pandas releases).
        df = df.dropna(axis=0, how='all').reset_index(drop=True)
        for _, row in df.iterrows():
            case_name = row["cropped image file path"].split('/')[0]
            jpg_name = case_name + ".jpg"
            case_dir = os.path.join(pth_folder_dicom, case_name)
            # Record the name of the file that is written below, extension
            # included, so the image generator can locate it on disk.
            dict_for_df['filename'].append(jpg_name)
            dict_for_df['pathology'].append(row.pathology)
            for dirName, _subdirs, fileList in os.walk(case_dir):
                for filename in fileList:
                    dic = pydicom.dcmread(os.path.join(dirName, filename))
                    # Keep only the cropped (segmented) image of the case.
                    if getattr(dic, "SeriesDescription", None) != "cropped images":
                        continue
                    pixels = dic.pixel_array
                    if ENHANCE:
                        size = pixels.size
                        if size < threshold1:
                            pixels = clahe_small.apply(pixels)
                        elif size < threshold2:
                            pixels = clahe_medium.apply(pixels)
                        else:
                            pixels = clahe_large.apply(pixels)
                    # NOTE(review): the /255 rescale presumably assumes 16-bit
                    # pixel data being brought into an 8-bit range - confirm.
                    cv2.imwrite(os.path.join(path_jpg, jpg_name), pixels / 255.,
                                [int(cv2.IMWRITE_JPEG_QUALITY), 100])

    df_returned = pd.DataFrame(data=dict_for_df)
    # .loc writes into the frame itself; the chained form
    # `df.pathology[mask] = ...` may assign to a temporary copy.
    is_no_callback = df_returned['pathology'] == 'BENIGN_WITHOUT_CALLBACK'
    df_returned.loc[is_no_callback, 'pathology'] = 'BENIGN'
    return df_returned
    

In [61]:
# Case-description CSV files of the CBIS-DDSM data set (mass and calcification cases).
path_train_mass_df = "D:\\datas\\Mammographies\\mass_case_description_train_set.csv"
path_train_calc_df = "D:\\datas\\Mammographies\\calc_case_description_train_set.csv"
path_test_mass_df = "D:\\datas\\Mammographies\\mass_case_description_test_set.csv"
# The test split must read the *test* calcification CSV; pointing it at the
# train CSV would put training cases into the test set.
path_test_calc_df = "D:\\datas\\Mammographies\\calc_case_description_test_set.csv"

# Root folder of the DICOM files (every backslash escaped explicitly).
path_dicom_folder = "D:\\datas\\Mammographies\\CBIS-DDSM"

# Output folders for the converted JPEG crops.
path_train_crop_jpg = "D:\\datas\\Mammographies\\mass_and_calc_crop_train\\"
path_test_crop_jpg = "D:\\datas\\Mammographies\\mass_and_calc_crop_test\\"

# Convert each split and save its label table next to the notebook.
df_test = dicom_to_jpg_with_df([path_test_mass_df, path_test_calc_df], path_dicom_folder, path_test_crop_jpg)
df_test.to_csv("test_labels.csv", index=False)

df_train = dicom_to_jpg_with_df([path_train_mass_df, path_train_calc_df], path_dicom_folder, path_train_crop_jpg)
df_train.to_csv("train_labels.csv", index=False)

## Feature extraction

In [63]:
def extract_features(dataframe, directory, sample_count, subset):
    """Run images through the convolutional base and collect its outputs.

    Relies on the notebook-level names ``datagen`` (the image data generator),
    ``conv_base`` (the convolutional base) and ``batch_size``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a "filename" and a "pathology" column.
    directory : str
        Folder containing the image files named in ``dataframe``.
    sample_count : int
        Number of samples to return. Batches are drawn until at least this
        many samples were seen, then the result is trimmed to exactly
        ``sample_count``.
    subset : str
        Subset name forwarded to ``flow_from_dataframe``
        ("training" or "validation").

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The features predicted by ``conv_base`` and the matching categorical
        labels, both with ``sample_count`` rows.
    """
    # Preprocess data
    generator = datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col="filename", y_col="pathology", class_mode="categorical",
        target_size=(224, 224),
        batch_size=batch_size, shuffle=True,
        interpolation="bilinear",
        subset=subset)

    # Pass data through the convolutional base. Batches are gathered in lists
    # and concatenated once, and every batch is counted, so no extra batch is
    # pulled beyond what is needed.
    feature_batches = []
    label_batches = []
    collected = 0
    if sample_count > 0:
        for inputs_batch, labels_batch in generator:
            feature_batches.append(conv_base.predict(inputs_batch))
            label_batches.append(labels_batch)
            collected += len(inputs_batch)
            if collected >= sample_count:
                break

    if not feature_batches:
        # Nothing requested: return empty arrays with the shapes this notebook
        # expects (conv-base output 7x7x512, two classes).
        return numpy.zeros(shape=(0, 7, 7, 512)), numpy.zeros(shape=(0, 2))

    features = numpy.concatenate(feature_batches, axis=0)[:sample_count]
    labels = numpy.concatenate(label_batches, axis=0)[:sample_count]
    return features, labels

def compute_num_samples(data_size, validation_ratio, aug_factor):
    """Return the number of training samples available after augmentation.

    The share of ``data_size`` that is not held out for validation is
    multiplied by ``aug_factor`` and truncated to an integer.
    """
    kept_fraction = 1 - validation_ratio
    augmented_total = kept_fraction * data_size * aug_factor
    return int(augmented_total)

In [None]:
# Settings shared by the feature-extraction and training cells.
valid_ratio = 0.3    # share of the training images held out for validation
batch_size = 32
augmentation = 3     # factor applied to the data set size to account for augmentation
# The table of training images is `df_train`; no `df` exists at notebook level.
data_size = df_train.shape[0]
train_size = compute_num_samples(data_size, valid_ratio, augmentation)

# Convolutional base: VGG16 with ImageNet weights and without its dense top.
conv_base = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Generator that rescales pixels to [0, 1], flips images randomly and splits
# the data into "training" / "validation" subsets.
datagen = ImageDataGenerator(
         rescale=1/255.,
         vertical_flip=True,
         horizontal_flip=True,
         validation_split=valid_ratio)


Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5
 22388736/553467096 [>.............................] - ETA: 2:17:44

ConnectionResetError: [WinError 10054] Une connexion existante a dû être fermée par l’hôte distant

In [None]:
# Number of (augmented) validation samples: the validation share of df_train
# multiplied by the augmentation factor.
valid_size = int(valid_ratio * df_train.shape[0] * augmentation)

# Both subsets are read from the training folder; the generator's
# validation split decides which images belong to which subset.
train_features, train_labels = extract_features(df_train, path_train_crop_jpg, train_size, "training")
validation_features, validation_labels = extract_features(df_train, path_train_crop_jpg, valid_size, "validation")

In [None]:
epochs = 100

# Small dense classifier trained on the features extracted by the convolutional base.
model = models.Sequential()
model.add(layers.Flatten(input_shape=(7, 7, 512)))
# The input size follows from the Flatten layer, so no input_dim is needed here.
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.5))
# The labels are one-hot vectors over two mutually exclusive classes, so the
# output is a softmax distribution trained with categorical cross-entropy
# (two independent sigmoid units would not sum to one).
model.add(layers.Dense(2, activation='softmax'))
model.summary()

# Compile model
model.compile(optimizer=optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['acc'])

# Train model
history = model.fit(train_features, train_labels,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(validation_features, validation_labels))

## 2 - Linear SVM classifier

In [None]:
# Stack the training and validation splits along the sample axis to form the SVM data set
svm_features = numpy.concatenate([train_features, validation_features], axis=0)
svm_labels = numpy.concatenate([train_labels, validation_labels], axis=0)

In [None]:
# Build model
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Flatten every 7x7x512 feature map into one row per sample.
X_train = svm_features.reshape(svm_features.shape[0], 7*7*512)
# LinearSVC expects 1-D class labels, so the one-hot rows are turned into class indices.
y_train = svm_labels.argmax(axis=1)

# Standardise the features; the transformed array is kept explicitly instead
# of relying on in-place modification.
scaler = StandardScaler(copy=False)
X_train = scaler.fit_transform(X_train)

# Regularisation strengths explored by the grid search.
param = [{
          "C": [0.01, 0.1, 1, 10, 100]
         }]

svm = LinearSVC(penalty='l2', loss='squared_hinge', max_iter=1000)  # As in Tang (2013)
clf = GridSearchCV(svm, param, cv=10)
clf.fit(X_train, y_train)