In [120]:
%reload_ext autoreload
%autoreload 2
## sys package
import os, sys
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"  # specify which GPU(s) to be used
sys.path.append("../")
## warning off
import warnings
warnings.filterwarnings("ignore")

## general package
import random
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from torch.utils.data import *

## customized package
from input.inputPipeline_seg import *
from model.deeplabv3 import *

In [121]:
# DATA = "../input/prostate-cancer-grade-assessment"
image_dir = "../input/panda-32x256x256-tiles-data/train/"
csv_file = "../input/panda-16x128x128-tiles-data/4_fold_train.csv"

In [122]:
mean = torch.tensor([0.5, 0.5, 0.5])
std = torch.tensor([0.5, 0.5, 0.5])
## dataset, can fetch data by dataset[idx]
dataset = PandaPatchDatasetSegInfer(csv_file, image_dir, [mean, std], N = 12)

### Segmentation models

In [123]:
models = []
weights = [f'../train/weights/Deeplabv3Res50_12patch_radboud/Deeplabv3Res50_12patch_radboud_{i}_best.pth.tar' for i in range(4)]
for path in weights:
    state_dict = torch.load(path)
    model = Model(arch='deeplabv3_resnet50', n=6).cuda()
    model.load_state_dict(state_dict)
    model.float()
    model.eval()
    model.cuda()
    models.append(model)

del state_dict

### Inference

In [129]:
bs = 1
sz = 256
dataloader = DataLoader(dataset, batch_size=bs,
                            shuffle=False, num_workers=0, collate_fn=dataloader_collte_fn_infer)
names,preds = [],[] ## record image names and predictions
with torch.no_grad():
    for idx, data in enumerate(dataloader):
        img, name = data
        img = img.float().cuda()
        bs,N,C,h,w = img.shape
        img = torch.stack([img,img.flip(-1),img.flip(-2),img.flip(-1,-2),
                  img.transpose(-1,-2),img.transpose(-1,-2).flip(-1),
                  img.transpose(-1,-2).flip(-2),img.transpose(-1,-2).flip(-1,-2)],1)
        img = img.view(-1, N, C, h, w) # (bs * 8, N, C, w, h)
        p = [model(img) for model in models] # [4, bs * 8, 6]
        print(p[0].shape)
        break

RuntimeError: CUDA out of memory. Tried to allocate 768.00 MiB (GPU 0; 7.93 GiB total capacity; 5.16 GiB already allocated; 596.19 MiB free; 6.37 GiB reserved in total by PyTorch)

## Radboud

In [79]:
df_train = pd.read_csv(os.path.join(DATA, "train.csv"))
df_train = df_train[df_train.data_provider == 'radboud']
for label in range(0,6):
    df_train[f'percent_{label}'] = None
    df_train[f'count_{label}'] = None

for i in tqdm(range(len(df_train))):
    idx = df_train.iloc[i, 0]
    isup = df_train.iloc[i, 2]
    gleason = df_train.iloc[i, 3]
    mask_files = [os.path.join(MASK, f'{idx}_{j}.png') for j in range(12)]
    if os.path.exists(mask_files[0]):
        masks = [np.expand_dims(open_image(fname),0) for fname in mask_files]
        masks = np.concatenate(masks)
        cnt, feat = extract_features(masks)
        for label in range(0,6):
            df_train[f'count_{label}'].iloc[i] = cnt[label-1]
            df_train[f'percent_{label}'].iloc[i] = feat[label-1]
    else:
        continue

HBox(children=(FloatProgress(value=0.0, max=5160.0), HTML(value='')))




In [80]:
df_train = df_train.replace(to_replace='None', value=np.nan).dropna()
df_train.reset_index(drop=True)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,percent_0,count_0,percent_1,count_1,percent_2,count_2,percent_3,count_3,percent_4,count_4,percent_5,count_5
0,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,0,0,0.612134,442740,0.00271267,1962,0,0,0.385153,278571,0,0
1,004dd32d9cd167d9cc31c13b704498af,radboud,1,3+3,0,0,0.957644,629782,0.0423562,27855,0,0,0,0,0,0
2,0068d4c7529e34fd4c9da863ce01a161,radboud,3,4+3,0,0,0.959698,487514,0.0402235,20433,7.87422e-05,40,0,0,0,0
3,006f6aa35a78965c92fffd1fbd53a058,radboud,3,4+3,0,0,0.708292,284666,0,0,0.0564487,22687,0.23526,94552,0,0
4,007433133235efc27a39f11df6940829,radboud,0,negative,0,0,0.955717,654403,0.0442835,30322,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5055,ffd2841373b39792ab0c84cccd066e31,radboud,0,negative,0,0,0.861589,641361,0.138411,103032,0,0,0,0,0,0
5056,ffdc59cd580a1468eac0e6a32dd1ff2d,radboud,5,4+5,0.0788637,59924,0.863513,656134,0.000686984,522,0,0,0.0569368,43263,0.0788637,59924
5057,ffe06afd66a93258f8fabdef6044e181,radboud,0,negative,0,0,0.918352,677851,0.0816483,60266,0,0,0,0,0,0
5058,ffe236a25d4cbed59438220799920749,radboud,2,3+4,0,0,0.615882,440984,0.0210162,15048,0.296363,212202,0.0667384,47786,0,0


In [83]:
skf = StratifiedKFold(4, shuffle=True, random_state=2020)
splits = list(skf.split(df_train, df_train.isup_grade))

#features = [f"percent_{label}" for label in range(1, 6)] 
features = [f"percent_{label}" for label in range(0, 6)] + [f"count_{label}" for label in range(0, 6)]
target = 'isup_grade'

In [84]:
from sklearn.neighbors import KNeighborsClassifier

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    model = KNeighborsClassifier(n_neighbors=5)
    
    model.fit(train[features], train[target])
    
    preds = model.predict(valid[features])
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.7717
Fold = 1, QWK = 0.7589
Fold = 2, QWK = 0.7770
Fold = 3, QWK = 0.7502
Mean = 0.7645


In [85]:
# rfc
from sklearn.ensemble import RandomForestClassifier

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    model = RandomForestClassifier(random_state=42)
    
    model.fit(train[features], train[target])
    
    preds = model.predict(valid[features])
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.9372
Fold = 1, QWK = 0.9265
Fold = 2, QWK = 0.9339
Fold = 3, QWK = 0.9139
Mean = 0.9279


In [87]:
df_train[features] = df_train[features].apply(pd.to_numeric, errors = 'coerce')

In [88]:
# lgb
import lightgbm as lgb

def QWK(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.rint(preds)
    score = cohen_kappa_score(preds, labels, weights='quadratic')
    return ("QWK", score, True)

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    train_dataset = lgb.Dataset(train[features], train[target])
    valid_dataset = lgb.Dataset(valid[features], valid[target])
    
    params = {
                "objective": 'regression',
                "metric": 'rmse',
                "seed": 42,
                "learning_rate": 0.01,
                "boosting": "gbdt",
            }
        
    model = lgb.train(
                params=params,
                num_boost_round=1000,
                early_stopping_rounds=200,
                train_set=train_dataset,
                valid_sets=[train_dataset, valid_dataset],
                verbose_eval=100,
                feval=QWK,
            )
        
    
    preds = model.predict(valid[features], num_iteration=model.best_iteration)
    preds = np.rint(preds)
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 0.839282	training's QWK: 0.809975	valid_1's rmse: 0.836875	valid_1's QWK: 0.802039
[200]	training's rmse: 0.601463	training's QWK: 0.93593	valid_1's rmse: 0.610676	valid_1's QWK: 0.930569
[300]	training's rmse: 0.549123	training's QWK: 0.946905	valid_1's rmse: 0.57589	valid_1's QWK: 0.938021
[400]	training's rmse: 0.530139	training's QWK: 0.949147	valid_1's rmse: 0.575231	valid_1's QWK: 0.936839
Early stopping, best iteration is:
[278]	training's rmse: 0.556135	training's QWK: 0.94573	valid_1's rmse: 0.578479	valid_1's QWK: 0.93911
Fold = 0, QWK = 0.9391
Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 0.832482	training's QWK: 0.793946	valid_1's rmse: 0.850386	valid_1's QWK: 0.78581
[200]	training's rmse: 0.591465	training's QWK: 0.938665	valid_1's rmse: 0.63373	valid_1's QWK: 0.922456
[300]	training's rmse: 0.538515	training's QWK: 0.947418	valid_1's rmse: 0.601053	

## Karolinska

In [98]:
df_train = pd.read_csv(os.path.join(DATA, "train.csv"))
df_train = df_train[df_train.data_provider == 'karolinska']
for label in range(0,3):
    df_train[f'percent_{label}'] = None
    df_train[f'count_{label}'] = None

for i in tqdm(range(len(df_train))):
    idx = df_train.iloc[i, 0]
    isup = df_train.iloc[i, 2]
    gleason = df_train.iloc[i, 3]
    mask_files = [os.path.join(MASK, f'{idx}_{j}.png') for j in range(12)]
    if os.path.exists(mask_files[0]):
        masks = [np.expand_dims(open_image(fname),0) for fname in mask_files]
        masks = np.concatenate(masks)
        cnt, feat = extract_features(masks)
        for label in range(0,3):
            df_train[f'count_{label}'].iloc[i] = cnt[label-1]
            df_train[f'percent_{label}'].iloc[i] = feat[label-1]
    else:
        continue

HBox(children=(FloatProgress(value=0.0, max=5456.0), HTML(value='')))




In [99]:
df_train = df_train.replace(to_replace='None', value=np.nan).dropna()
df_train.reset_index(drop=True)

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,percent_0,count_0,percent_1,count_1,percent_2,count_2
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,0,0,1,706016,0,0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,0,0,1,545488,0,0
2,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,0,0,0,0,1,738528
3,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,0,0,1,704624,0,0
4,002a4db09dad406c85505a00fb6f6144,karolinska,0,0+0,0,0,1,693268,0,0
...,...,...,...,...,...,...,...,...,...,...
5431,ffb182fa5b636005e21eec384bbf406b,karolinska,0,0+0,0,0,1,771360,0,0
5432,ffc005d56a21efbd034425623f596984,karolinska,2,3+4,0,0,0.0235566,3264,0.976443,135296
5433,ffc0cbbe28a6345a179d6210ef5c579f,karolinska,4,4+4,0,0,0.777093,429944,0.222907,123328
5434,ffcd99c47e57ad2934dc6bbf5edf6675,karolinska,0,0+0,0,0,1,620768,0,0


In [100]:
skf = StratifiedKFold(4, shuffle=True, random_state=2020)
splits = list(skf.split(df_train, df_train.isup_grade))

#features = [f"percent_{label}" for label in range(1, 6)] 
features = [f"percent_{label}" for label in range(0, 3)] + [f"count_{label}" for label in range(0, 3)]
target = 'isup_grade'

In [101]:
from sklearn.neighbors import KNeighborsClassifier

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    model = KNeighborsClassifier(n_neighbors=5)
    
    model.fit(train[features], train[target])
    
    preds = model.predict(valid[features])
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.5406
Fold = 1, QWK = 0.5726
Fold = 2, QWK = 0.5597
Fold = 3, QWK = 0.5671
Mean = 0.5600


In [102]:
# rfc
from sklearn.ensemble import RandomForestClassifier

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    model = RandomForestClassifier(random_state=42)
    
    model.fit(train[features], train[target])
    
    preds = model.predict(valid[features])
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.5503
Fold = 1, QWK = 0.5687
Fold = 2, QWK = 0.5735
Fold = 3, QWK = 0.5608
Mean = 0.5633


In [103]:
# lgb
import lightgbm as lgb
df_train[features] = df_train[features].apply(pd.to_numeric, errors = 'coerce')
def QWK(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.rint(preds)
    score = cohen_kappa_score(preds, labels, weights='quadratic')
    return ("QWK", score, True)

scores = []
for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]
    
    train_dataset = lgb.Dataset(train[features], train[target])
    valid_dataset = lgb.Dataset(valid[features], valid[target])
    
    params = {
                "objective": 'regression',
                "metric": 'rmse',
                "seed": 42,
                "learning_rate": 0.01,
                "boosting": "gbdt",
            }
        
    model = lgb.train(
                params=params,
                num_boost_round=1000,
                early_stopping_rounds=200,
                train_set=train_dataset,
                valid_sets=[train_dataset, valid_dataset],
                verbose_eval=100,
                feval=QWK,
            )
        
    
    preds = model.predict(valid[features], num_iteration=model.best_iteration)
    preds = np.rint(preds)
    
    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    
    print(f"Fold = {fold}, QWK = {score:.4f}")
    
print(f"Mean = {np.mean(scores):.4f}")

Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 1.02338	training's QWK: 0.48069	valid_1's rmse: 1.07574	valid_1's QWK: 0.405391
[200]	training's rmse: 0.934803	training's QWK: 0.727815	valid_1's rmse: 1.02592	valid_1's QWK: 0.66293
[300]	training's rmse: 0.908863	training's QWK: 0.74317	valid_1's rmse: 1.02514	valid_1's QWK: 0.668036
[400]	training's rmse: 0.893952	training's QWK: 0.75206	valid_1's rmse: 1.02963	valid_1's QWK: 0.672874
Early stopping, best iteration is:
[256]	training's rmse: 0.918438	training's QWK: 0.734015	valid_1's rmse: 1.02399	valid_1's QWK: 0.66339
Fold = 0, QWK = 0.6634
Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 1.03602	training's QWK: 0.486891	valid_1's rmse: 1.0502	valid_1's QWK: 0.461811
[200]	training's rmse: 0.949962	training's QWK: 0.711479	valid_1's rmse: 0.987478	valid_1's QWK: 0.694018
[300]	training's rmse: 0.924728	training's QWK: 0.722137	valid_1's rmse: 0.982392	valid_1'