In [1]:
!pip install -q torchviz
!pip install -q colored

In [2]:
import os
import gc
import cv2
import time

import colored
from colored import fg, bg, attr

import numpy as np
import pandas as pd
from random import randint
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torchviz import make_dot
torch.backends.cudnn.benchmark = True
from torchvision import transforms

import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchvision.models import resnet18, densenet121, mobilenet_v2

In [3]:
# Training hyper-parameters.
EPOCHS = 2  # number of passes over the training split
SPLIT = 0.8  # fraction of train_df rows used for training; the rest is the validation split
LR = (1e-4, 1e-3)  # learning rates: LR[0] for the ResNet backbone, LR[1] for the two dense layers
MODEL_SAVE_PATH = "resnet_model"  # NOTE(review): not referenced anywhere else in the visible notebook

# Input geometry, batching and data location.
W = 64  # slices are resized to H x W
H = 64
BATCH_SIZE = 32  # subjects per training batch (each subject expands to `deep` slices)
VAL_BATCH_SIZE = 32  # subjects per batch for validation and inference loaders
DATA_PATH = '../input/trends-assessment-prediction/'
deep = 63  # slices per subject: targets are repeated, and predictions averaged, in groups of this size

In [4]:
# Per-channel normalisation for the 63-slice input: the standard ImageNet RGB
# mean/std triple is tiled 21 times (21 * 3 = 63 channels, matching `deep`).
normalize = transforms.Normalize(mean=np.tile([0.485, 0.456, 0.406], 21), std=np.tile([0.229, 0.224, 0.225], 21))
normalize

Normalize(mean=[0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406
 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406
 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406
 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406
 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406 0.485 0.456 0.406
 0.485 0.456 0.406], std=[0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225
 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225
 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225
 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225
 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225 0.229 0.224 0.225
 0.229 0.224 0.225])

In [5]:
# Directories holding the per-subject fMRI map files.
TEST_MAP_PATH = os.path.join(DATA_PATH, 'fMRI_test/')
TRAIN_MAP_PATH = os.path.join(DATA_PATH, 'fMRI_train/')

# Tabular inputs shipped with the competition data.
FEAT_PATH = os.path.join(DATA_PATH, 'fnc.csv')
TARG_PATH = os.path.join(DATA_PATH, 'train_scores.csv')
SAMPLE_SUB_PATH = os.path.join(DATA_PATH, 'sample_submission.csv')

# Subject ids (as strings): sorted file names with the 4-character extension removed.
TEST_IDS = [file_name[:-4] for file_name in sorted(os.listdir(TEST_MAP_PATH))]
TRAIN_IDS = [file_name[:-4] for file_name in sorted(os.listdir(TRAIN_MAP_PATH))]

In [6]:
# Regression targets; missing scores are imputed with the per-column mean.
targets = pd.read_csv(TARG_PATH)
targets = targets.fillna(targets.mean())
sample_submission = pd.read_csv(SAMPLE_SUB_PATH)

# FNC feature table, split into train/test rows according to which fMRI map
# files exist on disk. The ids parsed from the file names are strings, while
# the CSV `Id` column is presumably numeric, so the comparison is done on the
# string form explicitly. The previous `query('Id in [...]')` relied on pandas
# implicitly casting between the two, which differs between pandas versions.
features = pd.read_csv(FEAT_PATH)
feature_ids = features['Id'].astype(str)
test_df = features[feature_ids.isin(TEST_IDS)].reset_index(drop=True)
train_df = features[feature_ids.isin(TRAIN_IDS)].reset_index(drop=True)

In [7]:
# Load the FNC and loading feature tables.
fnc_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/fnc.csv")
loading_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/loading.csv")


# Feature names are every column except the first one of each table.
# NOTE(review): presumably the first column is `Id`, the key used by the merges below — confirm.
fnc_features, loading_features = list(fnc_df.columns[1:]), list(loading_df.columns[1:])
df_full = fnc_df.merge(loading_df, on="Id")


# Tag every labelled row so that, after the left merge, unlabelled (test) rows
# can be told apart: they end up with a missing `is_train` value.
labels_df = pd.read_csv("/kaggle/input/trends-assessment-prediction/train_scores.csv")
labels_df["is_train"] = True

df = df_full.merge(labels_df, on="Id", how="left")
# From here on `df_full` is indexed by Id (all subjects, no labels).
df_full = df_full.set_index('Id')

# Split into labelled rows (`df`) and unlabelled rows (`test_df_x`).
test_df_x = df[df["is_train"] != True].copy()
df = df[df["is_train"] == True].copy()

# Bare expression: it is not the last statement of the cell, so nothing is displayed.
df.shape, test_df_x.shape
# Giving less importance to FNC features since they are easier to overfit due to high dimensionality.
FNC_SCALE = 1/500

# Apply the same down-scaling to all three frames so they stay comparable.
df[fnc_features] *= FNC_SCALE
test_df_x[fnc_features] *= FNC_SCALE
df_full[fnc_features]  *= FNC_SCALE

# Note: rebinds `features` (previously the FNC DataFrame) to the list of feature column names.
features = loading_features + fnc_features

#######
# TODO: do not forget to do something about the mean.
# Select the best features.

from sklearn.feature_selection import SelectKBest, f_regression

targets_list = ["age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"]

# For every target, record the positions (within `features`) of the 128 columns
# that score highest under the univariate `f_regression` test against that target.
best_columns = {}
for target in targets_list:
    # Impute missing scores with the column mean by assigning back to `df`.
    # Calling `fillna(..., inplace=True)` on a selected column is chained
    # assignment, and whether it updates the parent frame is not reliable.
    df[target] = df[target].fillna(df[target].mean())
    X, y = df[features], df[target]
    selector = SelectKBest(f_regression, k=128)
    # Only the fitted support mask is needed, so the transformed matrix is not built.
    selector.fit(X, y)
    best_columns[target] = selector.get_support(indices=True)

In [8]:
# Pool the selected column positions of every target into one flat list
# (a position chosen for several targets appears several times).
column_values = [
    column_idx
    for target_name in targets_list
    for column_idx in best_columns[target_name]
]

In [9]:
from collections import Counter

# Count how often each column position was selected across targets and keep
# the names of the 128 most frequently selected columns.
c = Counter(column_values)
most_selected = [position for position, _count in c.most_common(128)]
selected_frame = df[features].iloc[:, most_selected]
final_columns = list(selected_frame.columns)

In [10]:
# Restrict the Id-indexed feature table to the selected columns and display it.
df_full = df_full[final_columns]
df_full

Unnamed: 0_level_0,IC_06,IC_15,IC_21,IC_28,SMN(2)_vs_SCN(53),SCN(99)_vs_SCN(98),DMN(40)_vs_CON(96),DMN(17)_vs_CON(37),DMN(17)_vs_CON(38),IC_05,...,CON(88)_vs_ADN(56),SMN(9)_vs_SMN(3),SMN(11)_vs_SMN(3),SMN(27)_vs_SMN(3),SMN(54)_vs_SMN(3),SMN(72)_vs_SMN(9),CON(81)_vs_SMN(9),CBN(13)_vs_SMN(2),SMN(54)_vs_SMN(11),CBN(18)_vs_SMN(11)
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,0.005033,0.005123,0.009177,-0.013929,-0.000191,0.000769,0.000902,0.000332,-0.000460,0.004136,...,0.000872,-0.000163,0.000169,-0.000272,-0.000002,0.001273,0.000670,0.000130,0.001074,-0.000712
10002,0.004605,0.008819,0.012004,-0.011814,-0.000067,0.001098,0.001179,0.000982,-0.000030,0.007049,...,0.000358,0.000751,0.000433,0.000679,0.000503,0.000519,0.000122,0.000217,0.000850,0.000456
10003,0.015042,0.012548,0.018184,-0.010469,-0.000421,0.000945,0.000344,-0.000273,-0.000538,0.010444,...,0.000495,0.000802,0.001352,0.000948,0.000879,0.000767,0.000973,-0.000993,0.001320,-0.001337
10004,0.011755,0.006837,0.005956,-0.010595,0.000204,0.001291,0.000983,0.000622,0.000154,0.006154,...,0.000397,0.000985,0.001058,-0.000135,0.000268,0.001019,0.000767,-0.000233,0.000761,-0.000555
10005,0.010679,0.005255,0.005454,-0.008591,-0.000310,0.001260,0.000916,-0.000020,0.000163,0.009051,...,-0.000006,0.000261,0.000500,0.000360,0.000670,0.000589,0.000507,-0.000187,0.000473,0.000129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21750,0.020201,0.012912,0.006448,-0.007203,0.000305,0.000805,0.000581,0.000611,-0.000049,0.012353,...,0.000287,0.000812,0.001079,0.000938,0.000722,0.001362,0.000851,-0.000728,0.001279,-0.000841
21751,0.012396,0.020112,0.012326,-0.012152,0.000243,0.001193,-0.000008,-0.000020,-0.000497,0.015428,...,0.000898,0.000401,0.000150,0.000395,0.000322,0.000207,0.000026,-0.000192,0.000711,-0.000083
21752,0.013499,0.010305,0.001160,-0.007140,-0.000354,0.001437,0.000873,0.000717,0.000305,0.010957,...,0.000519,0.000758,0.001065,0.000951,0.000922,0.000669,0.000284,-0.000930,0.000659,-0.000723
21753,0.008602,0.017471,0.020715,-0.008130,-0.000388,0.000995,0.000960,0.000692,0.000374,0.014143,...,0.000796,0.000500,0.000814,0.000227,0.000146,0.000552,0.000107,-0.000078,0.000527,0.000334


In [11]:
import cv2
import nibabel as nib

In [12]:
import h5py
import nilearn as nl
import nibabel as nib
# The code is the same as for the pytorch data, but it took too long to run in this script.
# I left the code here for reference.
def load_nii(file_path):
    """Load the NIfTI image at `file_path` and return its data array (via `get_fdata`)."""
    return nib.load(file_path).get_fdata()
# Get the paths of the preprocessed files.
import os
import fnmatch

BASEPATH = '/kaggle/input/'


def _index_files(directories, pattern):
    """Map each matching file's name (minus its 4-character extension) to its full path.

    Args:
        directories: dataset directory names, relative to BASEPATH.
        pattern: `fnmatch`-style file-name pattern, e.g. '*.npy'.

    Returns:
        dict of file stem -> path. When the same stem occurs in several
        directories, the directory listed last wins.

    Raises:
        OSError: if one of the directories cannot be listed.
    """
    index = {}
    for directory in directories:
        folder = BASEPATH + directory + '/'
        for file_name in os.listdir(folder):
            if fnmatch.fnmatch(file_name, pattern):
                index[file_name[:-4]] = folder + file_name
    return index


# Preprocessed arrays (*.npy).
dir_list = ['pytorch-data-train-1', 'pytorch-data-train-2', 'pytorch-data-train-3', 'pytorch-data-train-4', 'pytorch-data-train-5', 'pytorch-data-train-6', 'pytorch-data-train-7',
           'pytorch-data-test-1', 'pytorch-data-test-2', 'pytorch-data-test-3', 'pytorch-data-test-4', 'pytorch-data-test-5', 'pytorch-data-test-6', 'pytorch-data-test-7',
           'missing-files-1', 'missing-files-2', 'missing-files-3', 'missing-files-4']
files = _index_files(dir_list, '*.npy')

# NIfTI volumes from the 'cf*' datasets.
dir_list = ['cf-1001fourth',
 'cf10004fourth',
 'cf1002fourth-test',
 'cf1002fourth',
 'cf1003forth',
 'cf1004fourth-test',
 'cf-1001fourth-test',
 'cf1003forth-test']
files_c = _index_files(dir_list, '*.nii')

# NIfTI volumes from the 'clustering-function-rena' datasets.
dir_list = ['clustering-function-rena-train', 'clustering-function-rena-test']
files_rena = _index_files(dir_list, '*.nii')

# Original competition *.mat files; this is the lookup TReNDSDataset reads from.
dir_list = ['trends-assessment-prediction/fMRI_train', 'trends-assessment-prediction/fMRI_test']
files_mat = _index_files(dir_list, '*.mat')

# import mask image
mask_niimg = nl.image.load_img(BASEPATH + 'trends-assessment-prediction/fMRI_mask.nii')


class TReNDSDataset(Dataset):
    """Dataset yielding one subject's mean fMRI map as a stack of resized 2-D slices.

    Args:
        data: DataFrame with an `Id` column, one row per subject. Rows are
            looked up as `data.Id[idx]`, so the index is expected to be the
            default 0..len-1 range.
        targets: DataFrame queried by `Id`; every column after the first is
            returned as a target (reshaped to rows of 5). Only stored when
            `is_train` is true.
        map_path: directory of the subject maps. It is only stored; the files
            themselves are resolved through the module-level `files_mat` lookup.
        is_train: when true, `__getitem__` returns `(image, targets)`;
            otherwise only `image`.
    """

    def __init__(self, data, targets, map_path, is_train):
        self.data = data
        self.is_train = is_train
        self.map_path = map_path
        self.map_id = self.data.Id
        if is_train: self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # (An earlier, disabled variant built the features from the `files_rena`
        # cluster volumes via `load_nii` instead of the raw .mat files.)
        subject_id = self.map_id[idx]
        path = files_mat[str(subject_id)]
        # Read the whole array inside a context manager so the HDF5 handle is
        # closed immediately instead of lingering until garbage collection.
        with h5py.File(path, 'r') as mat_file:
            all_maps = mat_file['SM_feature'][()]
        all_maps = np.moveaxis(all_maps, [0,1,2,3], [3,2,1,0])
        # load image into nifti file
        subject_niimg = nl.image.new_img_like(mask_niimg, all_maps, affine=mask_niimg.affine, copy_header=True)
        # Compute the mean image once; it was previously computed twice per item.
        mean_volume = nl.image.mean_img(subject_niimg).get_fdata()
        # NOTE(review): `reshape` reinterprets the memory layout rather than
        # permuting axes; if the mean volume is not already laid out as
        # (63, 52, 53) this scrambles voxels — confirm against the mask's shape.
        slices = cv2.resize(mean_volume.reshape(63, 52, 53).transpose(1, 2, 0), (H, W))
        # Reverse the axis order so the last (channel) axis of the resized array comes first.
        features = np.moveaxis(slices, [0,1,2], [2,1,0])
        image = normalize(torch.FloatTensor(features))

        if not self.is_train:
            return image

        targets = self.targets.query('Id == {}'.format(subject_id)).values
        # Drop the first column and repeat the scores once per slice so every
        # slice carries its subject's targets.
        targets = np.repeat(targets[:, 1:], deep, 0).reshape(-1, 5)
        return image, torch.FloatTensor(targets)

In [13]:
# path = files_mat['10003']
# all_maps = h5py.File(path, 'r')['SM_feature'][()]
# all_maps = np.moveaxis(all_maps, [0,1,2,3], [3,2,1,0])
# # load image into nifti file
# subject_niimg = nl.image.new_img_like(mask_niimg, all_maps, affine=mask_niimg.affine, copy_header=True)
# features = nl.image.mean_img(subject_niimg)
# features = cv2.resize(nl.image.mean_img(subject_niimg).get_fdata().reshape(63, 52, 53).transpose(1, 2, 0), (H, W))
# features = np.moveaxis(features, [0,1,2], [2,1,0])

In [14]:
class ResNetModel(nn.Module):
    """Pretrained ResNet-18 backbone followed by a two-layer linear head with 5 outputs.

    Every H x W slice of the input is treated as an independent single-channel
    image, so an input holding N slices in total yields an (N, 5) output.
    """

    def __init__(self):
        super(ResNetModel, self).__init__()

        # No-op output activation. nn.Identity (unlike a lambda attribute) keeps
        # the module picklable and adds no parameters.
        self.identity = nn.Identity()
        self.dense_out = nn.Linear(16, 5)
        self.dense_in = nn.Linear(512, 16)
        self.resnet = resnet18(pretrained=True, progress=True)
        # Drop the last child module (the classification layer) and keep the rest as the feature extractor.
        self.resnet = nn.Sequential(*list(self.resnet.children())[:-1])

    def forward(self, img):
        # Fold all leading dimensions into the batch axis: one single-channel image per slice.
        img = img.reshape(-1, 1, H, W)
        # Replicate the single channel three times to form the 3-channel input fed to the backbone.
        feat = self.resnet(img.repeat(1, 3, 1, 1))
        # Flatten everything after the batch axis. Unlike squeeze(), this keeps
        # the batch axis when N == 1, so the output is always 2-D.
        conc = self.dense_in(torch.flatten(feat, 1))
        return self.identity(self.dense_out(conc))

In [15]:
def weighted_nae(inp, targ):
    """Weighted normalized absolute error over the five targets.

    For each target column, the mean absolute error is divided by the mean of
    the true values; the columns are then combined with weights 0.3 (first
    column) and 0.175 (each of the remaining four).

    Args:
        inp: predictions, shape (N, 5).
        targ: ground-truth values, shape (N, 5), on the same device as `inp`.

    Returns:
        Scalar tensor holding the loss.
    """
    # Create the weights on the targets' device so the loss works on CPU as
    # well as on GPU (they used to be moved with a hard-coded `.cuda()`).
    weights = torch.tensor([0.3, 0.175, 0.175, 0.175, 0.175],
                           dtype=torch.float32, device=targ.device)
    per_sample = torch.matmul(torch.abs(inp - targ), weights / torch.mean(targ, dim=0))
    return torch.mean(per_sample)

In [16]:
def print_metric(data, batch, epoch, start, end, metric, typ):
    """Print one colour-highlighted progress line for a training batch or a validation epoch.

    Args:
        data: scalar tensor-like with `.item()`; shown rounded to 3 decimals.
        batch: batch counter; `batch - 1` is shown when `typ == "Train"`.
        epoch: zero-based epoch; `epoch + 1` is shown when `typ == "Val"`.
        start, end: timestamps; their difference is shown rounded to 0.1 s.
        metric: metric name to display.
        typ: "Train" or "Val"; selects the line prefix.
    """
    # Colour-on / reset pair substituted for every "%s" placeholder pair below.
    highlight = (fg(216), attr('reset'))

    elapsed = np.round(end - start, 1)
    elapsed_part = "Time: %s{}%s s".format(elapsed) % highlight

    if typ == "Train":
        prefix = "BATCH %s" + str(batch-1) + "%s  "
    if typ == "Val":
        prefix = "EPOCH %s" + str(epoch+1) + "%s  "

    score = np.round(data.item(), 3)
    metric_part = "{} {}: {}{}{}".format(typ, metric, "%s", score, "%s") % highlight

    print(prefix % highlight, end='')
    print(metric_part + "  " + elapsed_part)

In [17]:
# Shape that targets are flattened to before the loss: one row of 5 scores per slice.
train_out_shape = (-1, 5)
val_out_shape = (-1, 5)

# Ordered (unshuffled) split: the first SPLIT fraction of rows trains, the rest validates.
split = int(len(train_df) * SPLIT)
train = train_df.iloc[:split].reset_index(drop=True)
val = train_df.iloc[split:].reset_index(drop=True)

# Unlabelled loaders used for inference after training: the test subjects,
# and ("_p") every training subject.
test_set = TReNDSDataset(test_df, None, TEST_MAP_PATH, False)
test_loader = DataLoader(test_set, batch_size=VAL_BATCH_SIZE)

test_set_p = TReNDSDataset(train_df, None, TRAIN_MAP_PATH, False)
test_loader_p = DataLoader(test_set_p, batch_size=VAL_BATCH_SIZE)

In [18]:
def _predict_subject_means(network, loader):
    """Run `network` over an unlabelled `loader` and average the predictions per subject.

    Predictions come out one row per slice; each consecutive group of `deep`
    rows is averaged into a single row.

    Returns:
        numpy array with one row of 5 predictions per group.
    """
    slice_preds = []
    with torch.no_grad():
        for images in loader:
            slice_preds.append(network(images.cuda()))
    slice_preds = torch.cat(slice_preds, dim=0)
    subject_means = [slice_preds[idx:idx+deep].mean(dim=0)
                     for idx in range(0, len(slice_preds), deep)]
    return torch.stack(subject_means, dim=0).detach().cpu().numpy()


def train_resnet18():
    """Train `ResNetModel` on the train split, validate each epoch, then predict.

    Uses the module-level `train`, `val`, `targets`, `test_loader` and
    `test_loader_p` objects and requires a CUDA device.

    Returns:
        `(train_preds, test_preds)`: numpy arrays of per-subject predictions
        for `test_loader_p` (all training subjects) and `test_loader` (test
        subjects). Returns None if TRAIN_MAP_PATH does not exist.
    """
    val_set = TReNDSDataset(val, targets, TRAIN_MAP_PATH, True)
    val_loader = DataLoader(val_set,  batch_size=VAL_BATCH_SIZE)
    train_set = TReNDSDataset(train, targets, TRAIN_MAP_PATH, True)
    train_loader = DataLoader(train_set,  batch_size=BATCH_SIZE, shuffle=True)

    network = ResNetModel().cuda()
    # Smaller learning rate for the pretrained backbone than for the new dense layers.
    optimizer =  Adam([{'params': network.resnet.parameters(), 'lr': LR[0]},
                      {'params': network.dense_in.parameters(), 'lr': LR[1]},
                      {'params': network.dense_out.parameters(), 'lr': LR[1]}])

    scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.8,
                                  patience=4, verbose=True, eps=1e-6)
    fonts = (fg(216), attr('reset'))
    start = time.time()
    for epoch in range(EPOCHS):
        print(("EPOCH %s" + str(epoch+1) + "%s") % fonts)

        network.train()
        for batch, (train_img, train_targs) in enumerate(train_loader, start=1):
            train_preds = network(train_img.cuda())
            train_targs = train_targs.reshape(train_out_shape)
            train_loss = weighted_nae(train_preds, train_targs.cuda())

            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            # print_metric displays `batch - 1`, so pass the 1-based index plus one.
            print_metric(train_loss, batch + 1, epoch, start, time.time(), metric="loss", typ="Train")

        print("\n")

        network.eval()
        # Accumulate across ALL validation batches. These lists used to be
        # re-created inside the loop, so only the last batch was ever scored.
        val_preds, val_targs = [], []
        with torch.no_grad():
            for img, targ in val_loader:
                val_preds.append(network(img.cuda()))
                val_targs.append(targ)

        val_preds = torch.cat(val_preds, dim=0)
        val_targs = torch.cat(val_targs, dim=0).reshape(val_out_shape)
        # Slice-level loss: drives the learning-rate scheduler.
        val_loss = weighted_nae(val_preds, val_targs.cuda())

        # Subject-level loss: average each group of `deep` slice rows first. This is the value reported.
        chunk_starts = range(0, len(val_preds), deep)
        avg_preds = torch.stack([val_preds[idx:idx+deep].mean(dim=0) for idx in chunk_starts], dim=0)
        avg_targs = torch.stack([val_targs[idx:idx+deep].mean(dim=0) for idx in chunk_starts], dim=0)
        loss = weighted_nae(avg_preds, avg_targs.cuda())

        end = time.time()
        scheduler.step(val_loss)
        print_metric(loss, None, epoch, start, end, metric="loss", typ="Val")

        print("\n")

    network.eval()
    if not os.path.exists(TRAIN_MAP_PATH):
        # NOTE(review): preserved from the original, which returned nothing in
        # this case; callers that unpack two values will fail on it.
        return None

    # Same order as before: test subjects first, then all training subjects.
    test_subject_preds = _predict_subject_means(network, test_loader)
    train_subject_preds = _predict_subject_means(network, test_loader_p)
    return train_subject_preds, test_subject_preds

In [19]:
print("STARTING TRAINING ...\n")

# Train the model; the result is unpacked into the per-subject predictions for
# the training subjects and for the test subjects, in that order.
train_preds_final, test_preds_final = train_resnet18()
    
print("ENDING TRAINING ...")

STARTING TRAINING ...



Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/checkpoints/resnet18-5c106cde.pth


HBox(children=(FloatProgress(value=0.0, max=46827520.0), HTML(value='')))


EPOCH [38;5;216m1[0m
BATCH [38;5;216m1[0m  Train loss: [38;5;216m0.999[0m  Time: [38;5;216m20.9[0m s
BATCH [38;5;216m2[0m  Train loss: [38;5;216m0.991[0m  Time: [38;5;216m37.0[0m s
BATCH [38;5;216m3[0m  Train loss: [38;5;216m0.983[0m  Time: [38;5;216m52.6[0m s
BATCH [38;5;216m4[0m  Train loss: [38;5;216m0.972[0m  Time: [38;5;216m67.8[0m s
BATCH [38;5;216m5[0m  Train loss: [38;5;216m0.964[0m  Time: [38;5;216m83.2[0m s
BATCH [38;5;216m6[0m  Train loss: [38;5;216m0.954[0m  Time: [38;5;216m101.2[0m s
BATCH [38;5;216m7[0m  Train loss: [38;5;216m0.944[0m  Time: [38;5;216m117.0[0m s
BATCH [38;5;216m8[0m  Train loss: [38;5;216m0.933[0m  Time: [38;5;216m133.2[0m s
BATCH [38;5;216m9[0m  Train loss: [38;5;216m0.919[0m  Time: [38;5;216m149.6[0m s
BATCH [38;5;216m10[0m  Train loss: [38;5;216m0.909[0m  Time: [38;5;216m165.9[0m s
BATCH [38;5;216m11[0m  Train loss: [38;5;216m0.901[0m  Time: [38;5;216m181.9[0m s
BATCH [38;5;216m12[0

In [20]:
# Write the predictions for the training subjects to CSV: rows keyed by Id,
# one column per target (every `targets` column after the first).
train_pred_frame = pd.DataFrame(train_preds_final, index=train_df['Id'], columns=targets.columns[1:])
train_pred_frame.to_csv('mean_image_train.csv')

In [21]:
# Write the predictions for the test subjects to CSV: rows keyed by Id,
# one column per target (every `targets` column after the first).
test_pred_frame = pd.DataFrame(test_preds_final, index=test_df['Id'], columns=targets.columns[1:])
test_pred_frame.to_csv('mean_image_test.csv')