<a href="https://colab.research.google.com/github/LL-Jan/Kaggle/blob/main/pytorch_keras_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install segmentation-models-pytorch==0.1.3
!pip install albumentations==0.5.2 
!pip install netron
!pip install plotly==4.14.3

In [3]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import json 
import numpy as np
import pandas as pd
# pd.set_option("display.max_rows", 101)
# pd.set_option("expand_frame_repr", True)
# pd.set_option("mode.use_inf_as_na", True)
# pd.options.plotting.backend = 'plotly'

from random import shuffle

import time
from datetime import datetime as dt
from pytz import timezone
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
# plt.style.use('dark_background')
# %matplotlib widget
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go 
# template = 'plotly_dark'
template = 'plotly'
from plotly.subplots import make_subplots

from sklearn.model_selection import train_test_split

import cv2 as cv

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset 
from torchvision.datasets import ImageFolder 
from torchvision import transforms as tfs 
import torchvision.models as models 

import albumentations as amt
from albumentations.pytorch import ToTensorV2

import segmentation_models_pytorch as smp
import netron

In [4]:
torch.cuda.is_available(), torch.cuda.get_device_name(0)

(True, 'Tesla T4')

In [5]:
random_state = 618
torch.manual_seed(random_state)

<torch._C.Generator at 0x7f435d2acc90>

# EDA

In [None]:
# Google Colab
nb_path = "/content/drive/MyDrive/Colab/Kaggle/severstal-steel-defect-detection"
out_path = np_path

In [6]:
# # Kaggle
# nb_path = "../input/severstal-steel-defect-detection"
# out_path = "./"

In [7]:
train = pd.read_csv(os.path.join(nb_path, "train.csv"))
# train = pd.read_csv("./severstal-steel-defect-detection/train.csv")                

In [8]:
train.head()

Unnamed: 0,ImageId,ClassId,EncodedPixels
0,0002cc93b.jpg,1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0007a71bf.jpg,3,18661 28 18863 82 19091 110 19347 110 19603 11...
2,000a4bcdd.jpg,1,37607 3 37858 8 38108 14 38359 20 38610 25 388...
3,000f6bf48.jpg,4,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,0014fce06.jpg,3,229501 11 229741 33 229981 55 230221 77 230468...


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7095 entries, 0 to 7094
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ImageId        7095 non-null   object
 1   ClassId        7095 non-null   int64 
 2   EncodedPixels  7095 non-null   object
dtypes: int64(1), object(2)
memory usage: 166.4+ KB


In [10]:
train['ImageId'].nunique(), train['EncodedPixels'].nunique()

(6666, 7095)

In [11]:
train['ClassId'].nunique(), train['ClassId'].unique()

(4, array([1, 3, 4, 2]))

In [12]:
train_image_path = os.path.join(nb_path, "train_images")
test_image_path = os.path.join(nb_path, "test_images")

# train_image_path = "./severstal-steel-defect-detection/train_images/"
# test_image_path = "./severstal-steel-defect-detection/test_images/"

print(f"{len(os.listdir(train_image_path))} images in training set")
print(f"{len(os.listdir(test_image_path))} images in test set")

13011 images in training set
5702 images in test set


# Preprocess

## Alignment

In [13]:
# Images without defects are not included in the train.csv, we should concat them.
df = train.pivot(index='ImageId', columns='ClassId', values='EncodedPixels')
df = df.merge(pd.DataFrame(index=os.listdir(train_image_path)), 
              left_index=True, 
              right_index=True, 
              how='right', 
              validate='one_to_one')

df['num_defect'] = df.count(axis=1)
df['num_defect'] = df['num_defect'].astype(np.uint8)

In [14]:
df.head()

Unnamed: 0,1,2,3,4,num_defect
ea970bedf.jpg,,,63939 1 64195 2 64451 5 64706 9 64962 11 65218...,,1
eb4225311.jpg,,,303400 7 303654 15 303908 19 304163 19 304187 ...,,1
ec0e3a1c2.jpg,,,148340 2 148594 4 148848 6 149102 8 149357 9 1...,,1
eaf7443a7.jpg,,,70658 255 70914 255 71170 255 71426 255 71682 ...,,1
ea7752e39.jpg,,,,,0


In [15]:
df['num_defect'].value_counts().sort_index()

0    6345
1    6239
2     425
3       2
Name: num_defect, dtype: int64

## Split to training/validation

In [16]:
train_df, valid_df = train_test_split(df, 
                                      test_size=0.2, 
                                      random_state=random_state, 
                                      shuffle=True, 
                                      stratify=df['num_defect'])
train_df = train_df.drop(columns=['num_defect'])
valid_df = valid_df.drop(columns=['num_defect'])
train_df.shape, valid_df.shape

((10408, 4), (2603, 4))

In [17]:
train_id = [] # List of image id like 'xxx.jpg' for training set
train_id.extend(train_df.index.to_list())
train_id.extend([s.split('.')[0]+'_HF.jpg' for s in train_df.index]) # Add image id for HorizontalFlip
train_id.extend([s.split('.')[0]+'_VF.jpg' for s in train_df.index]) # Add image id for VerticalFlip
train_id.extend([s.split('.')[0]+'_HVF.jpg' for s in train_df.index]) # Add image id for HorizontalFlip and VerticalFlip

# train_id = train_id[:16]
valid_id = valid_df.index.to_list() # List of image id like 'xxx.jpg' for validation set

In [18]:
len(train_id), len(valid_id)

(41632, 2603)

In [19]:
shuffle(train_id)
# train_id[0]

In [20]:
def get_img_id(idx, img_id_list):
    img_id = img_id_list[idx]
    if '_' not in img_id:
        img_id, augment = img_id, None
    elif '_HF' in img_id: 
        img_id, augment = img_id.replace('_HF',''), 'HF'
    elif '_VF' in img_id: 
        img_id, augment = img_id.replace('_VF',''), 'VF'
    elif '_HVF' in img_id: 
        img_id, augment = img_id.replace('_HVF',''), 'HVF'
    return img_id, augment

## Image Augmentation

In [21]:
# Define the pipeline for augmentation, Normalizing and totensor transform
def take_trfm(mean, std, augment):
    if not augment: 
        trfms = amt.Compose([amt.Normalize(mean=mean, std=std, p=1), 
                             ToTensorV2(transpose_mask=True)])
    elif augment == 'HF': 
        trfms = amt.Compose([amt.HorizontalFlip(p=1), 
                             amt.Normalize(mean=mean, std=std, p=1), 
                             ToTensorV2(transpose_mask=True)])
    elif augment == 'VF': 
        trfms = amt.Compose([amt.VerticalFlip(p=1), 
                             amt.Normalize(mean=mean, std=std, p=1), 
                             ToTensorV2(transpose_mask=True)])
    elif augment == 'HVF': 
        trfms = amt.Compose([amt.HorizontalFlip(p=1), 
                             amt.VerticalFlip(p=1), 
                             amt.Normalize(mean=mean, std=std, p=1), 
                             ToTensorV2(transpose_mask=True)])
    return trfms

## Utility Functions

In [22]:
def get_mask(img_id, df):
    
    mask = np.zeros((256, 1600, 4))
    defects = [] 
    
    for i, label in enumerate(df.loc[img_id,:].to_list()):
        if label is not np.nan: 
            label = [int(x) for x in label.split()]
            pix_starts, pix_lengths = label[::2], label[1::2]
            mask_ = np.zeros((256*1600, 1))
            for start, length in zip(pix_starts, pix_lengths):
                mask_[start:(start+length)] = 1
            mask[:,:,i] = mask_.reshape((256, 1600), order='F') 
            defects.append(i+1)
            
    return mask

# DataLoader

## Map-style Dataset → DataLoader

In [23]:
# Inherit torch.utils.data.Dataset and override '__getitem__' and '__len__'
# Preprocessing 
class SteelDataset(Dataset):
    def __init__(self, 
                 df, 
                 image_path, 
                 img_id_list, 
                 mean=(0.485, 0.456, 0.406), 
                 std=(0.229, 0.224, 0.225)):
        self.df = df
        self.image_path = image_path 
        self.img_id_list = img_id_list
        self.mean = mean 
        self.std = std
    
    def __getitem__(self, idx): 
        img_id, augment = get_img_id(idx, self.img_id_list)
        img_path = os.path.join(self.image_path, img_id)
        img = cv.imread(img_path)
        mask = get_mask(img_id, self.df)
        trfm = take_trfm(self.mean, self.std, augment)
        amted = trfm(image=img, mask=mask)
        img, mask = amted['image'], amted['mask'] 
        return img, mask
    
    def __len__(self):
        return len(self.img_id_list)

In [24]:
train_dataset = SteelDataset(train_df, 
                             image_path=train_image_path, 
                             img_id_list=train_id)
valid_dataset = SteelDataset(valid_df, 
                             image_path=train_image_path, 
                             img_id_list=valid_id)

In [25]:
len(train_dataset), len(valid_dataset)

(41632, 2603)

In [26]:
batch_size = 8
train_dataloader = DataLoader(train_dataset, 
                              batch_size=batch_size, 
                              shuffle=True)
valid_dataloader = DataLoader(valid_dataset, 
                              batch_size=batch_size, 
                              shuffle=True)

In [27]:
len(train_dataloader), len(valid_dataloader)

(5204, 326)

In [28]:
for batch, (X, y) in enumerate(train_dataloader): 
    print(batch, X.shape, y.shape)
    break

0 torch.Size([8, 3, 256, 1600]) torch.Size([8, 4, 256, 1600])


## TensorData → DataLoader

## ImageFolder → DataLoader

# Evaluation

In [29]:
# Sum of dice in a batch
def cal_dice(output, mask_gt, threshold=0.5): 
    '''Calculate the dice of a batch (between ground truth mask and output tensor)'''
    
    batch_size = len(mask_gt)
    prob = torch.sigmoid(output)
#     mask_pred = (prob > threshold).astype('torch.uint8')
    mask_pred = (prob > threshold).int()
    
    assert mask_gt.shape == mask_pred.shape 
    
    mask_gt = mask_gt.reshape(shape=(batch_size, -1))
    mask_pred = mask_pred.reshape(shape=(batch_size, -1))
    
    idx_neg = torch.nonzero(mask_gt.sum(-1)==0)
    idx_pos = torch.nonzero(mask_gt.sum(-1)>=1)
    
    dice_neg = (mask_pred.sum(-1)==0).float()
    dice_pos = 2 * (mask_gt * mask_pred).sum(-1) / (mask_gt + mask_pred).sum(-1)
    
    dice = torch.cat([dice_pos[idx_pos], dice_neg[idx_neg]])
        
#     dice = dice.numpy() # torch.tensor to numpy.ndarray
#     dice = np.nanmean(dice) # Calculate the mean dice of a batch, ignore nan
    
    return dice.sum()

# Neural Network

In [30]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


In [38]:
# state_file = "../input/scores/model_state.pth"
# state_file = os.path.join(nb_path, "model_state.pth")
# model = smp.Unet(encoder_name='resnet34', 
#                  encoder_weights='imagenet', 
#                  in_channels=3, 
#                  classes=4,)
# if os.path.isfile(state_file):
#     model.load_state_dict(torch.load(state_file))

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /root/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth


HBox(children=(FloatProgress(value=0.0, max=87306240.0), HTML(value='')))




## Visiualization

In [32]:
# model_path = 'C:/Users/ll/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth'
# model_path = 'C:/Users/ll/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth'

# torch.onnx.export(model, torch.rand(8, 3, 256, 1600), 'onnx_model.onnx') 
# torch.onnx.export(model_, torch.rand(8, 3, 256, 1600), 'onnx_model_.onnx')

# netron.start('onnx_model.onnx')

# Train & Validate

In [33]:
def train(dataloader, model, loss_fn, optimizer, batch_size=batch_size):
    t0 = time.time()
    size = len(dataloader.dataset) # Total number of images in dataset
    n_batch = len(dataloader)
    cum_loss, cum_dice = 0, 0

    model.train() # Very important

    for batch, (X, y) in enumerate(dataloader): 
        X, y = X.to(device), y.to(device)
        
        output = model(X)
        loss = loss_fn(output, y)
#         output = output.detach().cpu()
#         y = y.detach().cpu()
        dice = cal_dice(output, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        cum_loss = cum_loss + loss.item()
        cum_dice = cum_dice + dice.item()
        
        if batch % 300 == 0:
            t1 = time.time()
            print(f'Training Batch: {batch}/{n_batch}. Loss: {loss.item():.4f}. Dice: {dice.item()/batch_size:.4f}. Cost: {int(t1-t0)}s') 
    
    return cum_loss/size, cum_dice/size

In [34]:
def validate(dataloader, model, loss_fn, batch_size=batch_size):
    t0 = time.time()
    size = len(dataloader.dataset)
    n_batch = len(dataloader)
    cum_loss, cum_dice = 0, 0

    model.eval() # Very important

    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader): 
            X, y = X.to(device), y.to(device)
        
            output = model(X)
            loss = loss_fn(output, y)
#             output = output.detach().cpu()
#             y = y.detach().cpu()
            dice = cal_dice(output, y) 
            
            cum_loss = cum_loss + loss.item()
            cum_dice = cum_dice + dice.item()

            if batch % 30 == 0:
                t1 = time.time()
                print(f'Validation Batch: {batch}/{n_batch}. Loss: {loss.item():.4f}. Dice: {dice.item()/batch_size:.4f}. Cost: {int(t1-t0)}s') 
                
    return cum_loss/size, cum_dice/size

In [35]:
score_file = os.path.join(nb_path, "scores.csv")
if os.path.isfile(score_file): 
    score_df = pd.read_csv(score_file, index_col=['epoch'])
else:
    score_df = pd.DataFrame(index=range(1,21), 
                            columns=['train_loss', 'train_dice', 'valid_loss', 'valid_dice'], 
                            dtype='float32')
    score_df = score_df.rename_axis(mapper='epoch', axis='index')
epoch_untrained = score_df[score_df['train_loss'].isna()].index.to_list()

In [36]:
f"{min(epoch_untrained)-1} Epochs Trained."

'0 Epochs Trained.'

In [None]:
for t in tqdm(epoch_untrained[:1]):
    
    print(f"Epoch {t} starts at {dt.now(tz=timezone('Asia/Shanghai')).strftime('%Y-%m-%d %H:%M:%S')}")
    t0 = time.time()

    # model_file = f"../input/scores/model_trained_{t-1}.pth"
    model_file = os.path.join(nb_path, f"model_trained_{t-1}.pth")
    if os.path.isfile(model_file): 
        model = torch.load(model_file)
        print(f"Use existed trained model.")
    else: 
        model = smp.Unet(encoder_name='resnet34', 
                         encoder_weights='imagenet', 
                         in_channels=3, 
                         classes=4,)
        print(f"Use pretrained model.")

    model = model.to(device) # Very important

    loss = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    train_loss, train_dice = train(train_dataloader, 
                                   model=model, 
                                   loss_fn=loss, 
                                   optimizer=optimizer) 
    valid_loss, valid_dice = validate(valid_dataloader, 
                                      model=model, 
                                      loss_fn=loss) 
    
    score_df.loc[t,:] = [train_loss,train_dice,train_loss,valid_dice]
    torch.cuda.empty_cache()

    torch.save(model, os.path.join(out_path, f"model_trained_{t}.pth")) 
    print(f"Epoch {t} Model Saved!")
    torch.save(model.state_dict(), os.path.join(out_path, f"model_state_{t}.pth"))
    print(f"Epoch {t} Model State Saved!")
    score_df.to_csv(os.path.join(out_path, f"scores.csv"))
    score_df.to_csv(os.path.join(out_path, f"scores_{t}.csv"))
    print(f"Epoch {t} Scores Saved!")

    t1 = time.time()
    print(f"Epoch {t} ends at {dt.now(tz=timezone('Asia/Shanghai')).strftime('%Y-%m-%d %H:%M:%S')}. Total Cost: {(t1-t0)/60:.2f}min")

print("Done!")

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1 starts at 2021-05-24 19:03:20
Training Batch: 0/5204. Loss: 1.0213. Dice: 0.0018. Cost: 1s
Training Batch: 300/5204. Loss: 0.2136. Dice: 0.1250. Cost: 290s
Training Batch: 600/5204. Loss: 0.0996. Dice: 0.3750. Cost: 587s
Training Batch: 900/5204. Loss: 0.0791. Dice: 0.2500. Cost: 882s
Training Batch: 1200/5204. Loss: 0.0359. Dice: 0.7500. Cost: 1178s
Training Batch: 1500/5204. Loss: 0.0247. Dice: 0.4657. Cost: 1473s
Training Batch: 1800/5204. Loss: 0.0210. Dice: 0.4589. Cost: 1769s
Training Batch: 2100/5204. Loss: 0.0147. Dice: 0.8786. Cost: 2064s
Training Batch: 2400/5204. Loss: 0.0619. Dice: 0.5120. Cost: 2359s
Training Batch: 2700/5204. Loss: 0.0132. Dice: 0.4974. Cost: 2655s
Training Batch: 3000/5204. Loss: 0.0078. Dice: 0.7209. Cost: 2951s
Training Batch: 3300/5204. Loss: 0.0112. Dice: 0.7317. Cost: 3246s
