In [20]:
%reload_ext autoreload
%autoreload 2
## sys package
import os, sys
# Device selection is done through environment variables before any project
# module is imported below.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="4"  # specify which GPU(s) to be used
# Put the parent directory on the import path; the `input.*` and `model.*`
# imports further down presumably resolve from there.
sys.path.append("../")
## warning off
# NOTE: this silences every warning for the whole notebook.
import warnings
warnings.filterwarnings("ignore")

## general package
# NOTE(review): later cells call `torch.tensor`, `torch.load` and
# `torch.no_grad`, but `torch` is never imported by name in this cell;
# presumably it arrives through one of the star imports below -- confirm.
import random
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
from torch.utils.data import *
from sklearn.metrics import cohen_kappa_score
# NOTE: the recorded run of this cell stopped here with
# "ModuleNotFoundError: No module named 'lightgbm'". When this import fails,
# the project imports after it are not executed by this cell.
import lightgbm as lgb

## customized package
from input.inputPipeline_seg import *
from model.deeplabv3 import *
from model.maskClassifier import extract_features

ModuleNotFoundError: No module named 'lightgbm'

In [2]:
# NOTE(review): `DATA` is commented out here, yet the Karolinska section further
# down still calls os.path.join(DATA, "train.csv"); that cell cannot run until
# DATA is defined again.
# DATA = "../input/prostate-cancer-grade-assessment"
# Directory passed to the inference dataset as its image location.
image_dir = "../input/panda-32x256x256-tiles-data/train/"
# CSV with slide metadata; later cells use its `image_id`, `isup_grade` and
# `split` columns.
# NOTE(review): this file lives under the 16x128x128 tile dataset while
# `image_dir` points at the 32x256x256 one -- confirm the mix is intentional.
csv_file = "../input/panda-16x128x128-tiles-data/4_fold_train.csv"

In [3]:
# Normalisation statistics handed to the dataset: 0.5 for each of the three
# entries, for both mean and std.
norm_stats = [torch.tensor([0.5] * 3), torch.tensor([0.5] * 3)]
mean, std = norm_stats

# Inference dataset built from the CSV and the image directory; individual
# samples can be fetched with dataset[idx].
dataset = PandaPatchDatasetSegInfer(csv_file, image_dir, norm_stats, N=12)

### Segmentation models

In [4]:
# Build the segmentation ensemble: one network per checkpoint file (four files,
# suffixes 0..3), each put in eval mode on the GPU and collected in `models`.
models = []
weights = [f'../train/weights/Deeplabv3Res50_12patch_radboud/Deeplabv3Res50_12patch_radboud_{i}_best.pth.tar' for i in range(4)]
for path in weights:
    # Deserialize onto the CPU first. Without map_location, torch.load tries to
    # restore tensors onto the device they were saved from, which fails when
    # that device index is not visible in this process (only one GPU is exposed
    # through CUDA_VISIBLE_DEVICES).
    state_dict = torch.load(path, map_location='cpu')
    model = Model(arch='deeplabv3_resnet50', n=6)
    model.load_state_dict(state_dict)
    model.float()
    model.eval()
    # Single transfer to the GPU after the weights are in place.
    model.cuda()
    models.append(model)

# Release the last checkpoint dict; the weights now live inside the models.
del state_dict

### Inference

In [5]:
# Run the ensemble over every slide and store the values returned by
# `extract_features` in the count_*/percent_* columns of `df_train`.
bs = 32
sz = 256
df_train = pd.read_csv(csv_file).set_index('image_id')

# Pre-create the feature columns; rows that never get a prediction keep None.
for label in range(1,6):
    df_train[f'percent_{label}'] = None
    df_train[f'count_{label}'] = None

dataloader = DataLoader(dataset, batch_size=bs,
                            shuffle=False, num_workers=0, collate_fn=dataloader_collte_fn_infer)
names,preds = [],[] ## record image names and predictions
with torch.no_grad():
    for img, name in tqdm(dataloader):
        img = img.float().cuda()
        # Use a dedicated name for the actual batch size (the last batch may be
        # smaller) instead of overwriting the configured `bs`.
        n_img, N, C, h, w = img.shape
        # Flip/transpose test-time augmentation is currently disabled, so the
        # batch goes to the models exactly as loaded.
        # One output per model, stacked on a new axis 1 -- presumably
        # [n_img * N, n_models, n_classes, h, w]; confirm against the model.
        p = torch.stack([model(img)['out'] for model in models], 1)
        # Average over the models, then take the arg-max class per pixel.
        p = p.mean(1).argmax(1)
        # Regroup the predicted masks per slide before moving them to the CPU.
        p = p.view(n_img, -1, h, w).cpu()
        for i in range(n_img):
            cnt, feat = extract_features(p[i,...])
            for label in range(1,6):
                # Write through df.loc[row, col]: the chained form
                # df[col].loc[row] = value may assign into a temporary copy and
                # is a no-op under pandas copy-on-write.
                df_train.loc[name[i], f'count_{label}'] = cnt[label-1]
                df_train.loc[name[i], f'percent_{label}'] = feat[label-1]

HBox(children=(FloatProgress(value=0.0, max=329.0), HTML(value='')))




In [6]:
# Preview the first rows to check that the percent_*/count_* columns were filled.
df_train.head()

Unnamed: 0_level_0,data_provider,isup_grade,gleason_score,split,percent_1,count_1,percent_2,count_2,percent_3,count_3,percent_4,count_4,percent_5,count_5
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,2,0.789948,580706,0.096262,70764,0.00371232,2729,0.110077,80920,0.0,0
000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,3,0.771695,440228,0.0400407,22842,0.0491,28010,0.139164,79389,0.0,0
0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,2,0.575941,414081,0.00198063,1424,0.0,0,0.422078,303459,0.0,0
001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,1,0.514769,393213,0.0,0,0.0,0,0.484548,370128,0.000683369,522
001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,1,0.809235,598630,0.0566247,41888,0.104219,77096,0.029921,22134,0.0,0


In [8]:
# Persist the feature table (index = image_id); the "Radboud" section below
# reloads this file instead of re-running the segmentation models.
df_train.to_csv('../input/panda-32x256x256-tiles-data/rad_seg_feature_train.csv')

## Radboud

In [9]:
# Reload the feature table written by the inference section. No index column is
# requested, so `image_id` comes back as an ordinary column.
df_train = pd.read_csv('../input/panda-32x256x256-tiles-data/rad_seg_feature_train.csv')
# --- Disabled legacy code -------------------------------------------------
# The commented block below filled the same kind of columns from mask PNG
# tiles on disk (via `open_image` and a `MASK` directory) instead of from
# model predictions. It is kept for reference only and is not executed.
# df_train = df_train[df_train.data_provider == 'radboud']
# for label in range(0,6):
#     df_train[f'percent_{label}'] = None
#     df_train[f'count_{label}'] = None

# for i in tqdm(range(len(df_train))):
#     idx = df_train.iloc[i, 0]
#     isup = df_train.iloc[i, 2]
#     gleason = df_train.iloc[i, 3]
#     mask_files = [os.path.join(MASK, f'{idx}_{j}.png') for j in range(12)]
#     if os.path.exists(mask_files[0]):
#         masks = [np.expand_dims(open_image(fname),0) for fname in mask_files]
#         masks = np.concatenate(masks)
#         cnt, feat = extract_features(masks)
#         for label in range(0,6):
#             df_train[f'count_{label}'].iloc[i] = cnt[label-1]
#             df_train[f'percent_{label}'].iloc[i] = feat[label-1]
#     else:
#         continue

In [10]:
# Drop every row that has a missing value (after mapping the literal string
# 'None' to NaN) and renumber the rows.
# The reset_index result is now assigned: previously it was only displayed, so
# `df_train` itself kept the gappy pre-dropna index.
df_train = (
    df_train
    .replace(to_replace='None', value=np.nan)
    .dropna()
    .reset_index(drop=True)
)
df_train

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,split,percent_1,count_1,percent_2,count_2,percent_3,count_3,percent_4,count_4,percent_5,count_5
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,2,0.789948,580706,0.096262,70764,0.003712,2729,0.110077,80920,0.000000,0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,3,0.771695,440228,0.040041,22842,0.049100,28010,0.139164,79389,0.000000,0
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,2,0.575941,414081,0.001981,1424,0.000000,0,0.422078,303459,0.000000,0
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,1,0.514769,393213,0.000000,0,0.000000,0,0.484548,370128,0.000683,522
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,1,0.809235,598630,0.056625,41888,0.104219,77096,0.029921,22134,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10510,ffd2841373b39792ab0c84cccd066e31,radboud,0,negative,2,0.867852,647577,0.106699,79617,0.025449,18990,0.000000,0,0.000000,0
10511,ffdc59cd580a1468eac0e6a32dd1ff2d,radboud,5,4+5,2,0.874899,664547,0.001984,1507,0.000000,0,0.055447,42116,0.067670,51400
10512,ffe06afd66a93258f8fabdef6044e181,radboud,0,negative,1,0.920841,677996,0.075156,55336,0.000956,704,0.003046,2243,0.000000,0
10513,ffe236a25d4cbed59438220799920749,radboud,2,3+4,0,0.610516,436516,0.022168,15850,0.276704,197842,0.090612,64787,0.000000,0


In [13]:
# No StratifiedKFold split is generated in this section: the cross-validation
# cells below select folds through the `split` column of the table.

# Model inputs: the five percent_* columns followed by the five count_* columns.
# (Restrict the prefixes to ("percent",) to train on the percentages only.)
features = [
    f"{prefix}_{label}"
    for prefix in ("percent", "count")
    for label in range(1, 6)
]
target = 'isup_grade'

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# 4-fold cross-validation of a 5-nearest-neighbour classifier. Fold membership
# comes from the `split` column; each fold is scored with quadratic-weighted
# Cohen's kappa (QWK) and the mean over folds is printed at the end.
scores = []
for fold in range(4):
    in_valid = df_train['split'] == fold
    train, valid = df_train[~in_valid], df_train[in_valid]

    model = KNeighborsClassifier(n_neighbors=5).fit(train[features], train[target])
    preds = model.predict(valid[features])

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.4908
Fold = 1, QWK = 0.4666
Fold = 2, QWK = 0.4802
Fold = 3, QWK = 0.4843
Mean = 0.4805


In [18]:
# Random forest baseline
from sklearn.ensemble import RandomForestClassifier

# 4-fold cross-validation with a fixed random_state; folds come from the
# `split` column and each one is scored with quadratic-weighted kappa.
scores = []
for fold in range(4):
    in_valid = df_train['split'] == fold
    train, valid = df_train[~in_valid], df_train[in_valid]

    model = RandomForestClassifier(random_state=42).fit(train[features], train[target])
    preds = model.predict(valid[features])

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")

Fold = 0, QWK = 0.5378
Fold = 1, QWK = 0.5247
Fold = 2, QWK = 0.5357
Fold = 3, QWK = 0.5308
Mean = 0.5323


In [None]:
# Convert every feature column to a numeric dtype before the LightGBM cell;
# values that cannot be parsed become NaN (errors='coerce').
df_train[features] = df_train[features].apply(pd.to_numeric, errors = 'coerce')

In [19]:
# lgb
import lightgbm as lgb

def QWK(preds, dtrain):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Args:
        preds: raw predictions; they are rounded to the nearest integer
            before scoring.
        dtrain: object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``("QWK", score, True)`` where ``score`` is the
        quadratic-weighted Cohen's kappa between the rounded predictions and
        the labels; the trailing ``True`` marks the metric as
        higher-is-better.
    """
    rounded = np.rint(preds)
    kappa = cohen_kappa_score(rounded, dtrain.get_label(), weights='quadratic')
    return "QWK", kappa, True

# 4-fold cross-validation of a LightGBM regressor on the segmentation features.
# Predictions are rounded to the nearest integer grade and scored with
# quadratic-weighted kappa; folds come from the `split` column.
scores = []

# The hyper-parameters do not depend on the fold, so build them once.
params = {
    "objective": 'regression',
    "metric": 'rmse',
    "seed": 42,
    "learning_rate": 0.01,
    "boosting": "gbdt",
}

for fold in range(4):
    train = df_train[df_train['split'] != fold]
    valid = df_train[df_train['split'] == fold]

    train_dataset = lgb.Dataset(train[features], train[target])
    valid_dataset = lgb.Dataset(valid[features], valid[target])

    # The early_stopping_rounds= / verbose_eval= keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callbacks below are the supported
    # equivalent (stop after 200 rounds without improvement, log every 100).
    model = lgb.train(
        params=params,
        num_boost_round=1000,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        feval=QWK,
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict with the best iteration found by early stopping, then round the
    # regression output to integer grades.
    preds = model.predict(valid[features], num_iteration=model.best_iteration)
    preds = np.rint(preds)

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)

    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")

ModuleNotFoundError: No module named 'lightgbm'

## Karolinska

In [None]:
# Build count_*/percent_* features for the Karolinska slides from mask PNG
# tiles on disk (12 tiles per slide, named "<image_id>_<j>.png").
# NOTE(review): `DATA` and `MASK` are not defined in any visible cell (the only
# `DATA` assignment is commented out near the top), and `open_image` is not
# imported by name -- presumably it comes from a star import. Define them
# before running this cell.
df_train = pd.read_csv(os.path.join(DATA, "train.csv"))
# .copy() makes the filtered frame independent, so the column and cell
# assignments below modify this frame and not a view of the full table.
df_train = df_train[df_train.data_provider == 'karolinska'].copy()
for label in range(0,3):
    df_train[f'percent_{label}'] = None
    df_train[f'count_{label}'] = None

for i in tqdm(range(len(df_train))):
    idx = df_train.iloc[i, 0]  # first column is used as the slide identifier
    mask_files = [os.path.join(MASK, f'{idx}_{j}.png') for j in range(12)]
    # Slides without mask tiles keep None and are dropped by the next cell.
    if not os.path.exists(mask_files[0]):
        continue
    masks = np.concatenate([np.expand_dims(open_image(fname),0) for fname in mask_files])
    cnt, feat = extract_features(masks)
    for label in range(0,3):
        # NOTE(review): for label 0 this reads cnt[-1] / feat[-1], i.e. the
        # LAST entry. The `label-1` offset matches the 1..5 label range used in
        # the inference cell; confirm it is intended for labels 0..2.
        # Positional write through df.iloc[row, col]: the chained form
        # df[col].iloc[i] = value may assign into a temporary copy.
        df_train.iloc[i, df_train.columns.get_loc(f'count_{label}')] = cnt[label-1]
        df_train.iloc[i, df_train.columns.get_loc(f'percent_{label}')] = feat[label-1]

In [None]:
# Drop every row that has a missing value (after mapping the literal string
# 'None' to NaN) and renumber the rows.
# The reset_index result is now assigned: previously it was only displayed, so
# `df_train` itself kept the gappy pre-dropna index.
df_train = (
    df_train
    .replace(to_replace='None', value=np.nan)
    .dropna()
    .reset_index(drop=True)
)
df_train

In [None]:
# StratifiedKFold was used here without ever being imported, so this cell
# raised NameError on a fresh kernel.
from sklearn.model_selection import StratifiedKFold

# Four shuffled folds stratified on the ISUP grade; `splits[fold]` holds the
# (train positions, validation positions) pair consumed by the cells below.
skf = StratifiedKFold(4, shuffle=True, random_state=2020)
splits = list(skf.split(df_train, df_train.isup_grade))

# Model inputs for this section: percent_0..2 followed by count_0..2.
features = [f"percent_{label}" for label in range(0, 3)] + [f"count_{label}" for label in range(0, 3)]
target = 'isup_grade'

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 4-fold cross-validation of a 5-nearest-neighbour classifier using the
# positional train/validation indices stored in `splits`; each fold is scored
# with quadratic-weighted Cohen's kappa (QWK).
scores = []
for fold in range(4):
    train_pos, valid_pos = splits[fold]
    train, valid = df_train.iloc[train_pos], df_train.iloc[valid_pos]

    model = KNeighborsClassifier(n_neighbors=5).fit(train[features], train[target])
    preds = model.predict(valid[features])

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")

In [None]:
# Random forest baseline
from sklearn.ensemble import RandomForestClassifier

# 4-fold cross-validation with a fixed random_state, using the positional
# train/validation indices stored in `splits`; each fold is scored with
# quadratic-weighted kappa.
scores = []
for fold in range(4):
    train_pos, valid_pos = splits[fold]
    train, valid = df_train.iloc[train_pos], df_train.iloc[valid_pos]

    model = RandomForestClassifier(random_state=42).fit(train[features], train[target])
    preds = model.predict(valid[features])

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)
    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")

In [None]:
# lgb
import lightgbm as lgb
# Convert every feature column to a numeric dtype before building the LightGBM
# datasets; values that cannot be parsed become NaN (errors='coerce').
df_train[features] = df_train[features].apply(pd.to_numeric, errors = 'coerce')
def QWK(preds, dtrain):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Args:
        preds: raw predictions; they are rounded to the nearest integer
            before scoring.
        dtrain: object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``("QWK", score, True)`` where ``score`` is the
        quadratic-weighted Cohen's kappa between the rounded predictions and
        the labels; the trailing ``True`` marks the metric as
        higher-is-better.
    """
    rounded = np.rint(preds)
    kappa = cohen_kappa_score(rounded, dtrain.get_label(), weights='quadratic')
    return "QWK", kappa, True

# 4-fold cross-validation of a LightGBM regressor on the mask features, using
# the positional train/validation indices stored in `splits`. Predictions are
# rounded to the nearest integer grade and scored with quadratic-weighted kappa.
scores = []

# The hyper-parameters do not depend on the fold, so build them once.
params = {
    "objective": 'regression',
    "metric": 'rmse',
    "seed": 42,
    "learning_rate": 0.01,
    "boosting": "gbdt",
}

for fold in range(4):
    train = df_train.iloc[splits[fold][0]]
    valid = df_train.iloc[splits[fold][1]]

    train_dataset = lgb.Dataset(train[features], train[target])
    valid_dataset = lgb.Dataset(valid[features], valid[target])

    # The early_stopping_rounds= / verbose_eval= keyword arguments were removed
    # from lgb.train in LightGBM 4.0; the callbacks below are the supported
    # equivalent (stop after 200 rounds without improvement, log every 100).
    model = lgb.train(
        params=params,
        num_boost_round=1000,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        feval=QWK,
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict with the best iteration found by early stopping, then round the
    # regression output to integer grades.
    preds = model.predict(valid[features], num_iteration=model.best_iteration)
    preds = np.rint(preds)

    score = cohen_kappa_score(preds, valid[target], weights='quadratic')
    scores.append(score)

    print(f"Fold = {fold}, QWK = {score:.4f}")

print(f"Mean = {np.mean(scores):.4f}")