# Library

In [1]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')

import IPython.display

import os
import math
import time
import random
import shutil
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from PIL import Image
from glob import glob
import scipy as sp
import numpy as np
import pandas as pd
#import Pyvips

from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold, train_test_split
from skimage.filters import sobel
from skimage import segmentation
from skimage.color import label2rgb
from skimage.color import rgb2hed, hed2rgb
from skimage.exposure import rescale_intensity
from skimage.measure import regionprops, regionprops_table
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.preprocessing import StandardScaler
from scipy import ndimage as ndi
from matplotlib.patches import Rectangle
from pytorch_lightning.callbacks.early_stopping import EarlyStopping



import torchvision

from tqdm.auto import tqdm
from tqdm import trange
from time import sleep
from functools import partial
import tifffile as tiff

import cv2 as cv
from openslide import OpenSlide
import seaborn as sns
from matplotlib import pyplot as plt
from pprint import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
import torchvision.transforms as transforms
import torch.optim as optim
import gc
import torchvision.models as models
import copy


OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

from torch.cuda.amp import autocast, GradScaler
Image.MAX_IMAGE_PIXELS = None
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Loading data

In [2]:
transformed_train = pd.read_csv('../input/mayo-clinic-output/new_train.csv')
test = pd.read_csv('../input/mayo-clinic-strip-ai/test.csv')
train, valid = train_test_split(transformed_train, test_size=0.2)
classes_name = ["LAA","CE"]



# Test Transformation 

In [3]:
def rezie_image(image):
    resized_image = cv.resize(image,(int(image.shape[1]/33),int(image.shape[0]/33)),interpolation= cv.INTER_LINEAR)
    return resized_image


In [4]:
def grey_resize(image):
    gray_resized_image = cv.cvtColor(image, cv.COLOR_RGB2GRAY)    
    return gray_resized_image


In [5]:
def labeled_segment(grey_resized_image):
    elevation_map = sobel(grey_resized_image)
    markers = np.zeros_like(grey_resized_image)
    markers[grey_resized_image >= grey_resized_image.mean()] = 1
    markers[grey_resized_image < grey_resized_image.mean()] = 2
    segmented_img = segmentation.watershed(elevation_map, markers)
    filled_segments = ndi.binary_fill_holes(segmented_img - 1)
    labeled_segments, _ = ndi.label(filled_segments)
    return labeled_segments


In [6]:
def plot_labeled_segments(labeled_segments, resized_gray_img):
    image_label_overlay = label2rgb(labeled_segments, image=resized_gray_img, bg_label=0)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.imshow(image_label_overlay, cmap=plt.cm.gray)
    ax.set_title('segmentation')
    ax.axis('off')


In [7]:
def get_object_coordinates(labeled_segments):
    properties =['area','bbox','convex_area','bbox_area', 'major_axis_length', 'minor_axis_length', 'eccentricity']
    df = pd.DataFrame(regionprops_table(labeled_segments, properties=properties))
    standard_scaler = StandardScaler()
    scaled_area = standard_scaler.fit_transform(df.area.values.reshape(-1,1))
    df['scaled_area'] = scaled_area
    df.sort_values(by="scaled_area", ascending=False, inplace=True)
    objects = df[df['scaled_area']>=.75]
    object_coordinates = [(row['bbox-0'],row['bbox-1'],row['bbox-2'],row['bbox-3'] )for index, row in objects.iterrows()]
    return object_coordinates


In [8]:
def plot_object_coordinates(object_coordinates, resized_image):
    fig, ax = plt.subplots(1,1, figsize=(18, 16), dpi = 80)
    for blob in object_coordinates:
        width = blob[3] - blob[1]
        height = blob[2] - blob[0]
        patch = Rectangle((blob[1],blob[0]), width, height, edgecolor='r', facecolor='none')


In [9]:
def save_objects(object_coordinates, image, image_name, count):
    plt.figure(figsize=(10,18))
    for i in range(len(object_coordinates)):
        coordinates = object_coordinates[i]
        object_image = image[int(coordinates[0]):int(coordinates[2]), int(coordinates[1]):int(coordinates[3])]
        image_new_name = image_name + "_" + str(i)
        new_test["image_name"].append(image_new_name)
        new_test["image_count"].append(count)
        cv.imwrite(os.path.join("./", f"{image_new_name}.jpg"), object_image)


In [10]:
test_path = "../input/mayo-clinic-strip-ai/test"
Image_names = test['image_id'].values
new_test={"image_count":[],"image_name":[]}
count = 1
scale = 4
for image_name in Image_names:
    image = tiff.imread(os.path.join(test_path, f"{image_name}.tif"))
    resized_image=rezie_image(image)
    del image
    gc.collect()
    grey_resized_image = grey_resize(resized_image)
    labeled_segments = labeled_segment(grey_resized_image)
    object_coordinates = get_object_coordinates(labeled_segments)
    save_objects(object_coordinates, resized_image, image_name,count)
    
new_test=pd.DataFrame.from_dict(new_test)


<Figure size 720x1296 with 0 Axes>

<Figure size 720x1296 with 0 Axes>

<Figure size 720x1296 with 0 Axes>

<Figure size 720x1296 with 0 Axes>

# Data Loader

In [11]:
class TrainDataset(Dataset):
    def __init__(self, path, df,phase, transform=None):
        self.df = df
        self.path = path
        self.Image_names = df['image_name'].values
        self.phase = phase
        if phase =="train":
            self.labels = df['label'].values
        self.transform = transform
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.Image_names[idx]
        img= Image.open(os.path.join(self.path, f"{file_name}.jpg"))
        if self.transform:
            image=self.transform(img)
        if self.phase =="train":
            label = self.labels[idx]
            return image, torch.tensor(label), file_name
        else:
            return image, file_name
        

# Transforms

In [12]:
batch_size=64
data_transform = transforms.Compose([
        transforms.Resize((256,256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
train_dataset = TrainDataset("../input/mayo-clinic-output/", train,phase="train", transform = data_transform)
valid_dataset = TrainDataset("../input/mayo-clinic-output/", valid,phase="train", transform = data_transform)
test_dataset = TrainDataset("./", new_test,phase="test", transform = data_transform)

train_dl = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=batch_size, shuffle=True,
                                             num_workers=0)
valid_dl = torch.utils.data.DataLoader(valid_dataset,
                                             batch_size=batch_size, shuffle=True,
                                             num_workers=0)
test_dl = torch.utils.data.DataLoader(test_dataset,
                                             batch_size=1, shuffle=True,
                                             num_workers=0)


In [13]:
def imshow(axis, inp):
    """Denormalize and show"""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    axis.imshow(inp)


# PreTrained Model

In [14]:
use_gpu = torch.cuda.is_available()
model_ft = torchvision.models.resnet50(pretrained=False)
model_ft.load_state_dict(torch.load("../input/pretrained-model-weights-pytorch/resnet50-19c8e357.pth"))
#model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft=model_ft.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_ft.parameters(), lr=0.01, momentum=0.9)
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)



# Training

In [15]:
def train_model(dataloders, model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
    best_model_wts = model.state_dict()
    best_acc = 0.0
    dataset_sizes = {'train': len(dataloders['train'].dataset), 
                     'valid': len(dataloders['valid'].dataset)}

    for epoch in range(num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                scheduler.step()
                model.train(True)
            else:
                model.train(False)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels, _ in dataloders[phase]:
                if use_gpu:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                else:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                optimizer.zero_grad()

                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                running_loss += loss.data
                running_corrects += torch.sum(preds == labels.data)
                del inputs, labels
                gc.collect()
            
            if phase == 'train':
                train_epoch_loss = running_loss / dataset_sizes[phase]
                train_epoch_acc = running_corrects / dataset_sizes[phase]
            else:
                valid_epoch_loss = running_loss / dataset_sizes[phase]
                valid_epoch_acc = running_corrects / dataset_sizes[phase]
                
            if phase == 'valid' and valid_epoch_acc > best_acc:
                best_acc = valid_epoch_acc
                best_model_wts = model.state_dict()

        print('Epoch [{}/{}] train loss: {:.4f} acc: {:.4f} ' 
              'valid loss: {:.4f} acc: {:.4f}'.format(
                epoch, num_epochs - 1,
                train_epoch_loss, train_epoch_acc, 
                valid_epoch_loss, valid_epoch_acc))
            
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model

In [16]:
dloaders = {'train':train_dl, 'valid':valid_dl}
start_time = time.time()
model = train_model(dloaders, model_ft, criterion, optimizer, exp_lr_scheduler, num_epochs=12)
del model_ft
gc.collect()
print('Training time: {:10f} minutes'.format((time.time()-start_time)/60))

Epoch [0/11] train loss: 0.0125 acc: 0.6297 valid loss: 0.0100 acc: 0.6749
Epoch [1/11] train loss: 0.0086 acc: 0.7283 valid loss: 0.0113 acc: 0.5782
Epoch [2/11] train loss: 0.0077 acc: 0.7715 valid loss: 0.0107 acc: 0.6493
Epoch [3/11] train loss: 0.0063 acc: 0.8274 valid loss: 0.0099 acc: 0.6815
Epoch [4/11] train loss: 0.0045 acc: 0.8838 valid loss: 0.0139 acc: 0.7242
Epoch [5/11] train loss: 0.0030 acc: 0.9248 valid loss: 0.0117 acc: 0.7261
Epoch [6/11] train loss: 0.0010 acc: 0.9844 valid loss: 0.0098 acc: 0.7934
Epoch [7/11] train loss: 0.0004 acc: 0.9972 valid loss: 0.0103 acc: 0.8000
Epoch [8/11] train loss: 0.0002 acc: 0.9988 valid loss: 0.0105 acc: 0.8076
Epoch [9/11] train loss: 0.0001 acc: 0.9993 valid loss: 0.0117 acc: 0.8009
Epoch [10/11] train loss: 0.0001 acc: 0.9998 valid loss: 0.0119 acc: 0.8038
Epoch [11/11] train loss: 0.0001 acc: 0.9998 valid loss: 0.0119 acc: 0.8057
Best val Acc: 0.807583
Training time:   9.275694 minutes


In [17]:
def visualize_model(dataloders, model, num_images=25):
    cnt = 0
    fig = plt.figure(1, figsize=(20, 20))
    grid = ImageGrid(fig, 111, nrows_ncols=(5, 5), axes_pad=0.05)
    for i, (inputs, labels, file_name) in enumerate(dataloders['valid']):
        if use_gpu:
            inputs = inputs.to(device)
            labels = labels.to(device)
        else:
            inputs = inputs.to(device)
            labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        sm = torch.nn.Softmax()
        probabilities = sm(outputs).tolist()
        rounded_prob = [np.round(num, 1) for num in probabilities]
        for j in range(inputs.size()[0]):
            ax = grid[cnt]
            imshow(ax, inputs.cpu().data[j])
            ax.text(10, 20, 'ID: {} '.format(file_name[j]),color='k', backgroundcolor='w', alpha=0.8,size="small")
            ax.text(10, 230, 'Predicted {} || Actual {}'.format(classes_name[preds[j]], classes_name[labels.data[j]]), 
                    color='k', backgroundcolor='w', alpha=0.8,size="small")
            ax.text(10, 245,rounded_prob[j],color='k', backgroundcolor='w', alpha=0.8,size="small")
            cnt += 1
            if cnt == num_images:
                del inputs, labels
                gc.collect()
                return

In [18]:
#visualize_model(dloaders, model)


# Submission

In [19]:
def submission(dataloders, model):
    prob_data = {"ID":[],"prob":[]}
    if dataloders ==dloaders:
        dataloders = dataloders['valid']
        
    for i, (inputs, file_name) in enumerate(dataloders):
        if use_gpu:
            inputs = inputs.to(device)
            
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        sm = torch.nn.Softmax()
        probabilities = sm(outputs).tolist()
        rounded_prob = [np.round(num, 1) for num in probabilities]
        
        for i in range(inputs.size()[0]):
            prob_data["ID"].append(file_name[i])
            prob_data["prob"].append(rounded_prob[i])
            
        del inputs
        gc.collect()
    prob_data= pd.DataFrame.from_dict(prob_data)
    prob_data['patient_id'], prob_data['image_nu'],prob_data['split_nu'] = prob_data['ID'].str.split('_').str
    prob_data[['LAA','CE']] = pd.DataFrame(prob_data.prob.tolist(), index= prob_data.index)
    prob_data_submission = prob_data.drop(['ID', 'prob','image_nu','split_nu'], axis=1)
    avg = prob_data_submission.groupby('patient_id',as_index=False).mean()
    avg = avg[['patient_id','CE','LAA']]
    return avg
    

In [20]:
sub=submission(test_dl, model)


In [21]:
sub.to_csv('submission.csv',index=False)


In [22]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  280135 KB |   13970 MB |   23702 GB |   23702 GB |
|       from large pool |  227072 KB |   13917 MB |   23678 GB |   23678 GB |
|       from small pool |   53063 KB |     145 MB |      24 GB |      24 GB |
|---------------------------------------------------------------------------|
| Active memory         |  280135 KB |   13970 MB |   23702 GB |   23702 GB |
|       from large pool |  227072 KB |   13917 MB |   23678 GB |   23678 GB |
|       from small pool |   53063 KB |     145 MB |      24 GB |      24 GB |
|---------------------------------------------------------------