# Catalyst Prediction

From: https://github.com/chizhu/kaggle-severstal

Applies 5-class classification (the four defect classes plus a background class), followed by 4-class segmentation.

Classification: resnet50, efficientnet-b3 and se-resnext50.

Segmentation: Unet with resnet18, PSPNet with resnet18 and FPN with resnet50.

In [None]:
# Attach Google Drive to the Colab VM and define the project folder paths.
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

# root of the mounted drive, and the project folder inside it (both end with '/')
root_dir = "/content/gdrive/My Drive/"
base_dir = f"{root_dir}Steel Segmentation/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


## Copy support files

In [None]:
import os

In [None]:
# Directory that receives the prediction outputs (see KAGGLE_WORK_DIR below).
# exist_ok=True keeps this cell safe to re-run, unlike a plain `mkdir`.
os.makedirs('working', exist_ok=True)

In [None]:
# copy the specified folder from google drive
def copy_support_directory( name ):
  """Replace the local directory `name` with a fresh copy of `base_dir + name`.

  Any existing local `name` is removed first, then every non-hidden entry of
  the Google Drive folder is copied into a newly created local directory
  (hidden entries are skipped, matching the `cp -r dir/*` glob this replaces).

  Args:
    name: directory name, relative to both `base_dir` and the current
      working directory.

  Raises:
    FileNotFoundError: if `base_dir + name` is not a directory. This is checked
      before the local copy is deleted, so a bad name cannot wipe a good copy.
  """
  import shutil  # local import keeps this cell self-contained

  gd_dir = base_dir + name
  if not os.path.isdir(gd_dir):
    raise FileNotFoundError(f"support directory not found on Google Drive: {gd_dir}")

  # drop any stale local copy (directory, plain file or dangling link)
  if os.path.isdir(name) and not os.path.islink(name):
    shutil.rmtree(name)
  elif os.path.lexists(name):
    os.remove(name)
  os.mkdir(name)

  for entry in os.listdir(gd_dir):
    if entry.startswith('.'):
      continue  # the shell glob `*` never matched dot-files either
    source = os.path.join(gd_dir, entry)
    target = os.path.join(name, entry)
    if os.path.isdir(source):
      shutil.copytree(source, target)
    else:
      shutil.copy(source, target)

In [None]:
# Pull each helper package from Google Drive into the working directory.
for support_package in ('utils', 'schedulers', 'transforms',
                        'losses', 'optimizers', 'models'):
  copy_support_directory( support_package )

In [None]:
# fetch the `datasets` package (provides datasets.dataset_factory.make_loader, imported below)
copy_support_directory( 'datasets' )

In [None]:
# fetch the model configuration files; the inference code below reads
# config/cls and config/seg, plus a base 'yml' file directly inside config/
copy_support_directory( 'config' )

# To leave a model out of the ensemble, delete its config before running
# inference. The lines below are kept as a template; they were used while
# some models had not yet been trained.
# !rm config/cls/002_efnet_b3_cls_BCE_5class_fold1.yml
# !rm config/cls/003_seresnext50_cls_BCE_5class_fold2.yml
# !rm config/seg/002_resnet18_PSPNet_fold0.yml
# !rm config/seg/003_resnet50_fpn_fold0.yml

In [None]:
# show the classification configs; run_cls loads one model per entry in this directory
!ls config/cls

001_resnet50_BCE_5class_fold0.yml      003_seresnext50_cls_BCE_5class_fold2.yml
002_efnet_b3_cls_BCE_5class_fold1.yml


In [None]:
%%time

# get the image zip files using gdown as drive mapping was timing out
import gdown

# the directory containing the original competition data
data_dir = 'SteelDefect'

# test if the images are already here
if os.path.isdir(data_dir):
  !rm -r {data_dir}

!gdown --id 1fE3ITnDMGWdyckgynmLhWEN4DzJKDxtw
!mkdir -p {data_dir}
!unzip -q severstal-steel-defect-detection.zip -d {data_dir}
!rm severstal-steel-defect-detection.zip

Downloading...
From: https://drive.google.com/uc?id=1fE3ITnDMGWdyckgynmLhWEN4DzJKDxtw
To: /content/severstal-steel-defect-detection.zip
1.68GB [00:20, 82.7MB/s]
CPU times: user 240 ms, sys: 68.7 ms, total: 309 ms
Wall time: 54.3 s


## Copy created files

In [None]:
# Copy the previously generated CSVs from the Google Drive 'Input' folder:
# - orig_train.csv goes next to the competition data in data_dir
# - folds.csv goes into the current working directory
input_dir = base_dir + 'Input/'
!cp '{input_dir}orig_train.csv' '{data_dir}/.'
!cp '{input_dir}folds.csv' .

In [None]:
# Per-model directories from Google Drive: the three classifiers first, then the
# three segmenters. The inference log shows checkpoints/best.pth being read from each.
classifier_dirs = ('001_resnet50_BCE_5class_fold0',
                   '002_efnet_b3_cls_BCE_5class_fold1',
                   '003_seresnext50_cls_BCE_5class_fold2')
segmenter_dirs = ('001_resnet18_Unet_fold0',
                  '002_resnet18_PSPNet_fold0',
                  '003_resnet50_fpn_fold0')

for model_dir in classifier_dirs + segmenter_dirs:
  copy_support_directory( model_dir )

In [None]:
# sanity check: list the working directory after all the copies above
!ls

001_resnet18_Unet_fold0		      datasets	   schedulers
001_resnet50_BCE_5class_fold0	      folds.csv    SteelDefect
002_efnet_b3_cls_BCE_5class_fold1     gdrive	   transforms
002_resnet18_PSPNet_fold0	      losses	   utils
003_resnet50_fpn_fold0		      models	   working
003_seresnext50_cls_BCE_5class_fold2  optimizers
config				      sample_data


In [None]:
# for augmentations
!pip install albumentations -q

# for pretrained segmentation models fo PyTorch
!pip install segmentation-models-pytorch -q

# for TTA
!pip install ttach==0.0.2 -q

# for Catalyst
!pip install -U catalyst -q

[K     |████████████████████████████████| 634kB 2.8MB/s 
[?25h  Building wheel for imgaug (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 51kB 1.7MB/s 
[K     |████████████████████████████████| 61kB 3.4MB/s 
[?25h  Building wheel for pretrainedmodels (setup.py) ... [?25l[?25hdone
  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 399kB 2.9MB/s 
[K     |████████████████████████████████| 317kB 12.7MB/s 
[K     |████████████████████████████████| 163kB 14.4MB/s 
[K     |████████████████████████████████| 71kB 7.2MB/s 
[?25h

In [None]:
# import argparse
import json
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from pathlib import Path
from tqdm import tqdm

warnings.filterwarnings("ignore")

from models import MultiClsModels, MultiSegModels
from utils import predict_batch
from utils.utils import mask2rle, post_process, load_model
from utils.config import load_config


# from datasets import make_loader
from datasets.dataset_factory import make_loader

from transforms import get_transforms

In [None]:
gd_old_style_submission_csv = base_dir + 'Input/orig_submission.csv'

def create_original_submission():
  """Copy or create the original (old-style) sample submission CSV.

  The old format has a single 'ImageId_ClassId' column, as opposed to separate
  image and class columns, plus an empty 'EncodedPixels' column.

  If the file is already cached on Google Drive it is copied into SteelDefect/.
  Otherwise it is built from the names in SteelDefect/test_images (four defect
  classes per image), written to Google Drive for later runs, and written to
  SteelDefect/orig_submission.csv for use now.
  """
  import shutil  # local import keeps this cell self-contained

  # reuse the cached copy from Google Drive when there is one
  if os.path.exists(gd_old_style_submission_csv):
    print(f"Copying {gd_old_style_submission_csv}")
    shutil.copy(gd_old_style_submission_csv, 'SteelDefect/')
    return

  # one '<image file name>_<defect>' entry per test image and defect class 1-4,
  # built in a single pass; growing the frame with pd.concat per row is quadratic
  fnames = os.listdir('SteelDefect/test_images')
  image_class_ids = sorted(
      f"{fname}_{defect}" for fname in fnames for defect in range(1, 5))

  # reindex adds the (empty) EncodedPixels column in the original column order
  columns_titles = ["ImageId_ClassId", "EncodedPixels"]
  df_masks = pd.DataFrame({"ImageId_ClassId": image_class_ids})
  df_masks = df_masks.reindex(columns=columns_titles)

  # cache on Google Drive so later sessions can simply copy it
  df_masks.to_csv(gd_old_style_submission_csv, index=False)

  # save a version to the input directory for use now
  df_masks.to_csv('SteelDefect/orig_submission.csv', index=False)

In [None]:
# Make the old-style submission and training CSVs the current ones, keeping
# the downloaded files under a 'new_format_' prefix.
#
# The guards make the cell safe to re-run: a second pass must not overwrite the
# new_format_* backups with files that are already old-style.

create_original_submission()

for current_name, old_style_name in (('train.csv', 'orig_train.csv'),
                                     ('sample_submission.csv', 'orig_submission.csv')):
  current_path = f'SteelDefect/{current_name}'
  backup_path = f'SteelDefect/new_format_{current_name}'
  old_style_path = f'SteelDefect/{old_style_name}'

  if not os.path.exists(old_style_path):
    # nothing to swap in (already swapped earlier, or the copy never happened)
    print(f"{old_style_path} not found - leaving {current_path} untouched")
    continue

  # back up the downloaded file once only
  if os.path.exists(current_path) and not os.path.exists(backup_path):
    os.replace(current_path, backup_path)

  os.replace(old_style_path, current_path)

Copying /content/gdrive/My Drive/Steel Segmentation/Input/orig_submission.csv


In [None]:
# confirm the renamed CSVs and the image folders are in place
!ls SteelDefect

new_format_sample_submission.csv  sample_submission.csv  train.csv
new_format_train.csv		  test_images		 train_images


In [None]:
KAGGLE_WORK_DIR = './working'


def run_cls(config_dir):
    """Run the classification ensemble over the test set and save its predictions.

    One model is loaded per entry in `<config_dir>/cls` and wrapped in
    `MultiClsModels`. Data, loader and device settings come from the first
    (alphabetically) file directly inside `config_dir` whose name contains 'yml'.

    Writes:
        all_preds.npy                    - raw prediction array
        cls_preds.csv                    - predictions indexed by test file name
        KAGGLE_WORK_DIR/cls_preds.csv    - copy of the above

    Args:
        config_dir: directory holding the base config and the `cls` sub-directory.

    Raises:
        FileNotFoundError: if no base 'yml' config exists directly in `config_dir`.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # ------------------------------------------------------------------------------------------------------------
    # 1. classification inference
    # ------------------------------------------------------------------------------------------------------------
    # os.listdir order is filesystem dependent; sorting makes both the model
    # order and the choice of base config deterministic
    config_root = Path(config_dir) / 'cls'
    config_paths = [config_root / p for p in sorted(os.listdir(config_root))]
    base_config_paths = [Path(config_dir) / p for p in sorted(os.listdir(config_dir)) if 'yml' in p]
    if not base_config_paths:
        raise FileNotFoundError(f"no base 'yml' config found directly inside {config_dir}")
    config = load_config(base_config_paths[0])

    model = MultiClsModels([load_model(c) for c in config_paths])

    testloader = make_loader(
        data_folder=config.data.test_dir,
        df_path=config.data.sample_submission_path,
        phase='test',
        batch_size=config.test.batch_size,
        num_workers=config.num_workers,
        transforms=get_transforms(config.transforms.test),
        num_classes=config.data.num_classes,
    )

    all_fnames = []
    all_predictions = []
    with torch.no_grad():
        for batch_fnames, batch_images in tqdm(testloader):
            batch_images = batch_images.to(config.device)
            # assumes predict_batch returns a numpy-compatible array per batch - TODO confirm
            batch_preds = predict_batch(model, batch_images, tta=config.test.tta, task='cls')

            all_fnames.extend(batch_fnames)
            all_predictions.append(batch_preds)

    all_predictions = np.concatenate(all_predictions)

    np.save('all_preds', all_predictions)
    df = pd.DataFrame(data=all_predictions, index=all_fnames)

    # run_seg looks for 'cls_preds.csv' in the current directory
    df.to_csv('cls_preds.csv')

    # make sure the output directory exists before writing the second copy
    os.makedirs(KAGGLE_WORK_DIR, exist_ok=True)
    df.to_csv(KAGGLE_WORK_DIR + '/cls_preds.csv')


def run_seg(config_dir):
    """Run the segmentation ensemble over the test set and write submission.csv.

    One model is loaded per entry in `<config_dir>/seg` and wrapped in
    `MultiSegModels`. Data, loader and device settings come from the first
    (alphabetically) file directly inside `config_dir` whose name contains 'yml'.

    If 'cls_preds.csv' exists in the current directory (written by `run_cls`),
    the loader is built from it with phase 'filtered_test'; otherwise the whole
    sample submission is used with phase 'test'.

    Writes:
        submission.csv                   - 'ImageId_ClassId', 'EncodedPixels'
        KAGGLE_WORK_DIR/submission.csv   - copy of the above

    Args:
        config_dir: directory holding the base config and the `seg` sub-directory.

    Raises:
        FileNotFoundError: if no base 'yml' config exists directly in `config_dir`.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # ------------------------------------------------------------------------------------------------------------
    # 2. segmentation inference
    # ------------------------------------------------------------------------------------------------------------
    # os.listdir order is filesystem dependent; sorting makes both the model
    # order and the choice of base config deterministic
    config_root = Path(config_dir) / 'seg'
    config_paths = [config_root / p for p in sorted(os.listdir(config_root))]
    base_config_paths = [Path(config_dir) / p for p in sorted(os.listdir(config_dir)) if 'yml' in p]
    if not base_config_paths:
        raise FileNotFoundError(f"no base 'yml' config found directly inside {config_dir}")
    config = load_config(base_config_paths[0])

    model = MultiSegModels([load_model(c) for c in config_paths])

    # only the image list and the phase differ between the two loader variants
    if os.path.exists('cls_preds.csv'):
        # presumably limits segmentation to images the classifiers flagged - TODO confirm
        loader_source = dict(df_path='cls_preds.csv', phase='filtered_test')
    else:
        loader_source = dict(df_path=config.data.sample_submission_path, phase='test')

    testloader = make_loader(
        data_folder=config.data.test_dir,
        batch_size=config.test.batch_size,
        num_workers=config.num_workers,
        transforms=get_transforms(config.transforms.test),
        **loader_source
    )

    # per-class minimum sizes: from a saved threshold search when there is one,
    # otherwise the defaults in the config
    threshold_path = config.work_dir + '/threshold_search.json'
    if os.path.exists(threshold_path):
        with open(threshold_path) as json_file:
            data = json.load(json_file)
        df = pd.DataFrame(data)
        min_sizes = list(df.T.idxmax().values.astype(int))
        print('load best threshold from validation:', min_sizes)
    else:
        min_sizes = config.test.min_size
        print('load default threshold:', min_sizes)

    predictions = []
    with torch.no_grad():
        for batch_fnames, batch_images in tqdm(testloader):
            batch_images = batch_images.to(config.device)
            batch_preds = predict_batch(model, batch_images, tta=config.test.tta)

            for fname, preds in zip(batch_fnames, batch_preds):
                # first axis of `preds` indexes the class; ids in the CSV are 1-based
                for cls in range(preds.shape[0]):
                    mask = preds[cls, :, :]
                    mask, _ = post_process(mask, config.test.best_threshold, min_sizes[cls])
                    rle = mask2rle(mask)
                    name = fname + f"_{cls + 1}"
                    predictions.append([name, rle])

    # ------------------------------------------------------------------------------------------------------------
    # submission
    # ------------------------------------------------------------------------------------------------------------
    sub_df = pd.DataFrame(predictions, columns=['ImageId_ClassId', 'EncodedPixels'])

    # Take only the id column from the sample submission so the merge never
    # depends on the '_x' / '_y' suffixes, and works even if the sample file
    # has no 'EncodedPixels' column. The left join keeps the sample row order.
    sample_submission = pd.read_csv(config.data.sample_submission_path)
    df_merged = pd.merge(sample_submission[['ImageId_ClassId']], sub_df,
                         on='ImageId_ClassId', how='left')

    # ids that were not predicted get an empty mask
    df_merged['EncodedPixels'] = df_merged['EncodedPixels'].fillna('')
    df_merged = df_merged[['ImageId_ClassId', 'EncodedPixels']]

    df_merged.to_csv("submission.csv", index=False)

    # make sure the output directory exists before writing the second copy
    os.makedirs(KAGGLE_WORK_DIR, exist_ok=True)
    df_merged.to_csv(KAGGLE_WORK_DIR + "/submission.csv", index=False)


def parse_args(args=None):
    """Parse the command-line options of the original script.

    Args:
        args: optional list of argument strings. When None, argparse reads
            sys.argv. Inside a notebook pass an explicit list, because the
            kernel's own argv is not meant for this parser.

    Returns:
        argparse.Namespace with a `config_dir` attribute (None when not given).
    """
    # the notebook's top-level `import argparse` is commented out, so import here
    import argparse

    parser = argparse.ArgumentParser(description='Severstal')
    parser.add_argument('--config_dir', default=None, type=str)
    return parser.parse_args(args)


def main_ensemble(config_dir):
    """Run the full pipeline: classification first, then segmentation.

    The order matters: run_cls writes cls_preds.csv, and run_seg checks for
    that file when building its loader.
    """
    for stage in (run_cls, run_seg):
        stage(config_dir)

In [None]:
# run classification, then segmentation, using the configs under ./config
main_ensemble('config')

./001_resnet50_BCE_5class_fold0/checkpoints/best.pth


Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))


./002_efnet_b3_cls_BCE_5class_fold1/checkpoints/best.pth
./003_seresnext50_cls_BCE_5class_fold2/checkpoints/best.pth


Downloading: "http://data.lip6.fr/cadene/pretrainedmodels/se_resnext50_32x4d-a260b3a4.pth" to /root/.cache/torch/checkpoints/se_resnext50_32x4d-a260b3a4.pth


HBox(children=(FloatProgress(value=0.0, max=110559176.0), HTML(value='')))




100%|██████████| 345/345 [16:37<00:00,  2.89s/it]


./002_resnet18_PSPNet_fold0/checkpoints/best.pth
./001_resnet18_Unet_fold0/checkpoints/best.pth
./003_resnet50_fpn_fold0/checkpoints/best.pth


  0%|          | 0/179 [00:00<?, ?it/s]

load default threshold: [500, 500, 1000, 2000]


100%|██████████| 179/179 [06:08<00:00,  2.06s/it]


In [None]:
# keep a copy of the ensemble submission on Google Drive
# (base_dir already ends in '/', so the target path contains a harmless '//')
!cp submission.csv '{base_dir}/Output/submission_3classifiers_3segmenters.csv'