In [1]:
# Notebook-wide settings:
#   batch_size - number of samples per DataLoader batch
#   ncomps     - number of PCA components (also embedded in the output file names)
#   crop_size  - size handed to CenterCrop; flattened images are reshaped to
#                (crop_size, crop_size) for display
batch_size, ncomps, crop_size = 64, 500, 180

In [2]:

import os
import sys
import random
import numpy as np
import pandas as pd
import torch
from monai.data import DataLoader
from monai.transforms import Compose, NormalizeIntensity, ToTensor
import matplotlib.pyplot as plt
# import config_file as cfg
# from utils import get_model
# from torchsummary import summary

import torchvision.transforms.functional as F

In [3]:
from pathlib import Path

# The project root is the current working directory; it is appended to sys.path
# so that project-local packages can be imported from the notebook.
macaw_path = Path(os.getcwd())
sys.path.append(str(macaw_path) + '/')

# Name of the experiment; it selects the sub-directory holding images and outputs.
exp_name = 'moin_bias'
exp_dir = macaw_path / exp_name

# Split definitions (CSV files).
train_csv_path = macaw_path / 'splits2/exp199/train.csv'
val_csv_path = macaw_path / 'splits2/exp199/val.csv'
test_csv_path = macaw_path / 'splits2/exp199/test.csv'

# Image folders, one per split.
train_img_path = exp_dir / 'train'
val_img_path = exp_dir / 'val'
test_img_path = exp_dir / 'test'

# Output pickles: the fitted PCA and the per-split encoded data.
pca_path = exp_dir / f'train_hc_pca_{ncomps}.pkl'

train_path = exp_dir / f'train_hc_data_PCA_{ncomps}.pkl'
val_path = exp_dir / f'val_hc_data_PCA_{ncomps}.pkl'
test_path = exp_dir / f'test_hc_data_PCA_{ncomps}.pkl'

In [4]:
# Relative working directory of the experiment, used to build per-image file paths.
home_dir = './'
working_dir = ''.join((home_dir, exp_name, '/'))

# Read the training split, then derive its file paths and class labels.
df_train = pd.read_csv(train_csv_path)
train_fpaths = df_train['filename'].map(
    lambda name: os.path.join(working_dir, "train", name)
).tolist()
train_class_label = df_train['class_label']

# Same for the validation split.
df_val = pd.read_csv(val_csv_path)
val_fpaths = df_val['filename'].map(
    lambda name: os.path.join(working_dir, "val", name)
).tolist()
val_class_label = df_val['class_label']

In [5]:
# First five rows of the training split table.
df_train.head(n=5)

In [6]:
# First five rows of the validation split table.
df_val.head(n=5)

In [7]:
# For the 'no_bias' experiment the bias label is forced to zero in both split tables.
# The column is assigned through the DataFrame instead of writing into `.values`:
# the array returned by `.values` is not guaranteed to be a writable view of the
# column (it is read-only under pandas Copy-on-Write), so the in-place write can
# raise or fail to reach the frame.
if exp_name == 'no_bias':
    df_train['bias_label'] = 0
    df_val['bias_label'] = 0

# Mean bias label per split: training is printed, validation is the cell output.
print(df_train.bias_label.mean())
df_val.bias_label.mean()

In [8]:
def seed_worker(worker_id):
    """Re-seed NumPy and the stdlib `random` module from torch's initial seed.

    Written to be passed as a DataLoader `worker_init_fn`. The seed is
    `torch.initial_seed()` reduced modulo 2**32.

    Args:
        worker_id: index supplied by the caller; not used by this function.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

In [9]:
# Fixed seed for every random number source used in this notebook (any integer works).
seed = 1

# Seed the stdlib, NumPy and torch global generators.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Dedicated generator that is handed to the DataLoaders.
g = torch.Generator()
g.manual_seed(seed)

In [10]:
from utils.datasets import EmmaDataset
from monai.data import pad_list_data_collate
from torchvision.transforms import CenterCrop
from utils.customTransforms import ToFloatUKBB

# Per-image preprocessing pipeline: tensor conversion, centre crop to crop_size,
# then the project's ToFloatUKBB transform.
transforms = Compose([
    ToTensor(),
    CenterCrop(crop_size),
    ToFloatUKBB(),
])

# Training dataset; the third positional argument is True only for the 'no_bias' experiment.
train_ds = EmmaDataset(train_csv_path, train_img_path, exp_name == 'no_bias', transforms)

# Shuffled training loader, collated with pad_list_data_collate and seeded through
# the shared generator `g`.
train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g,
    pin_memory=torch.cuda.is_available(),
    collate_fn=pad_list_data_collate,
)

In [11]:
# Draw one training batch and show its second image (batch element 2 holds the images).
preview_batch = next(iter(train_loader))
plt.imshow(preview_batch[2][1], cmap='gray')

In [12]:
# One full pass over the training loader keeping only the images (batch element 2),
# stacked along the sample axis and flattened to one row per image.
imgs = np.concatenate([batch[2].numpy() for batch in train_loader], axis=0)
imgs = imgs.reshape(len(imgs), -1)
imgs_dim = imgs.shape[-1]

In [13]:
# Sanity check: restore the first flattened image to (crop_size, crop_size) and display it.
plt.imshow(np.reshape(imgs[0], (crop_size, crop_size)), cmap='gray')

## Dimensionality reduction

In [14]:
import pickle
from sklearn.decomposition import PCA

# Fit a PCA with `ncomps` components on the flattened training images.
# The model is always refit here; an existing pickle at pca_path is not reused
# and gets overwritten below.
pca = PCA(n_components=ncomps).fit(imgs)

# Persist the fitted PCA.
with pca_path.open('wb') as pca_file:
    pickle.dump(pca, pca_file)

In [15]:
# Per-batch accumulators for the training split.
disease_list, bias_list, imgs_list, train_img_names_list = [], [], [], []

# Walk the training loader once and keep every field of each batch
# (index 0: disease, 1: bias, 2: images, 3: file names) so their rows stay aligned.
for batch in train_loader:
    disease_list.append(batch[0].numpy())
    bias_list.append(batch[1].numpy())
    imgs_list.append(batch[2].numpy())
    train_img_names_list.append(batch[3])

# Stack the batches and flatten each image to a single row.
imgs = np.concatenate(imgs_list, axis=0)
imgs = imgs.reshape(len(imgs), -1)
imgs_dim = imgs.shape[-1]

disease = np.concatenate(disease_list, axis=0)
bias = np.concatenate(bias_list, axis=0)
train_img_names = np.concatenate(train_img_names_list, axis=0)

# Project the flattened images onto the fitted PCA components.
encoded_data = pca.transform(imgs)

In [16]:
# Bundle the training arrays, the fitted PCA and the file names, then pickle them.
train_payload = {
    'imgs': imgs,
    'disease': disease,
    'bias': bias,
    'pca': pca,
    'encoded_data': encoded_data,
    'img_names': train_img_names,
}
with open(train_path, 'wb') as out_file:
    pickle.dump(train_payload, out_file)

In [17]:
# Validation dataset; the third positional argument is True only for the 'no_bias' experiment.
val_ds = EmmaDataset(val_csv_path, val_img_path, exp_name == 'no_bias', transforms)

# Validation loader, configured like the training loader (shuffled, shared generator `g`).
val_loader = DataLoader(
    val_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g,
    pin_memory=torch.cuda.is_available(),
    collate_fn=pad_list_data_collate,
)

In [18]:
# Per-batch accumulators for the validation split.
imgs_list = []
disease_list = []
bias_list = []

# Walk the *validation* loader. Iterating train_loader here would store training
# data under the validation names and, later, in the validation pickle.
for d in val_loader:
    disease_list.append(d[0].numpy())   # batch element 0: disease labels
    bias_list.append(d[1].numpy())      # batch element 1: bias labels
    imgs_list.append(d[2].numpy())      # batch element 2: images

# Stack the batches and flatten each image to a single row.
imgs = np.concatenate(imgs_list, axis=0)
imgs = imgs.reshape(imgs.shape[0], -1)
imgs_dim = imgs.shape[1]

disease = np.concatenate(disease_list, axis=0)
bias = np.concatenate(bias_list, axis=0)

# Encode with the PCA that was fitted on the training images.
encoded_data = pca.transform(imgs)

In [19]:
# Bundle the current arrays and the fitted PCA, then pickle them to the validation file.
val_payload = {
    'imgs': imgs,
    'disease': disease,
    'bias': bias,
    'pca': pca,
    'encoded_data': encoded_data,
}
with open(val_path, 'wb') as out_file:
    pickle.dump(val_payload, out_file)

In [20]:
# Test dataset; the third positional argument is True only for the 'no_bias' experiment.
test_ds = EmmaDataset(test_csv_path, test_img_path, exp_name == 'no_bias', transforms)

# Test loader: not shuffled, so batches follow the dataset order.
test_loader = DataLoader(
    test_ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    worker_init_fn=seed_worker,
    generator=g,
    pin_memory=torch.cuda.is_available(),
    collate_fn=pad_list_data_collate,
)

In [21]:
# Look up the table row(s) of one specific test image by file name.
target_mask = test_ds.df['filename'] == '00176_0.26_S_-0.941_D_1.539_B.tiff'
test_ds.df.loc[target_mask]

In [22]:
# found = False
# 
# j=0
# 
# while not found:
#     d,b,img,names=next(iter(test_loader))
#     j+=1
#     
#     for i in range(len(d)):
#         # print(names[i])
#         if names[i] == '00176_0.26_S_-0.941_D_1.539_B.tiff':
#             found = True
#             print(d[i].item())
#             print(b[i].item())
#             print(i,j)

In [23]:
# Fetch the first test batch and unpack its four fields
# (by the names used here: disease labels, bias labels, images, file names).
first_test_batch = next(iter(test_loader))
d, b, img, names = first_test_batch

In [24]:
# Number of missing values in each column of the test table.
missing_mask = test_ds.df.isna()
missing_mask.sum()

In [25]:
# Show the second image of a freshly drawn batch with the default colormap.
# NOTE(review): this draws from train_loader although the neighbouring cells work on
# the test split - confirm whether test_loader was intended.
fresh_batch = next(iter(train_loader))
plt.imshow(fresh_batch[2][1])

In [26]:
# One pass over the test loader keeping only the images (batch element 2),
# stacked along the sample axis and flattened to one row per image.
test_imgs = np.concatenate([batch[2].numpy() for batch in test_loader], axis=0)
test_imgs = test_imgs.reshape(len(test_imgs), -1)
test_imgs_dim = test_imgs.shape[-1]

In [27]:
# Restore the second flattened test image to (crop_size, crop_size) and display it.
plt.imshow(np.reshape(test_imgs[1], (crop_size, crop_size)), cmap='gray')

In [28]:
# Round-trip the first five test images through the PCA: project, then reconstruct.
sample_imgs = test_imgs[:5]
t = pca.transform(sample_imgs)
X_recon = pca.inverse_transform(t)

In [29]:
import utils.visualize as vis
# Labels of the five test images displayed below. They are read from the first test
# batch: test_loader does not shuffle and batch_size is larger than five, so its first
# rows line up with test_imgs[:5]. The module-level `disease`/`bias` arrays are not
# used here because at this point they still hold the labels of a different split.
first_test_batch = next(iter(test_loader))
print("Disease:", first_test_batch[0][:5].numpy())
print("Bias:", first_test_batch[1][:5].numpy())

plt.rcParams["figure.figsize"] = 20,5

# Reconstruction error, and the reconstruction with the error added back.
diff = sample_imgs - X_recon
rr = X_recon + diff

# Three image rows: originals, PCA reconstructions, and the difference between them.
fig = vis.img_grid([d.reshape(crop_size,crop_size) for d in sample_imgs], clim=(0,1), cols=5)
fig = vis.img_grid([d.reshape(crop_size,crop_size) for d in X_recon], clim=(0,1), cols=5)
fig = vis.img_grid([d.reshape(crop_size,crop_size) for d in diff],clim=(-.5,.5),cols=5, cmap='seismic')

In [30]:
# Largest reconstruction error of the second sample image.
# ndarray.max() is used instead of the builtin max(): it reduces in NumPy rather than
# iterating element by element in Python, and it propagates NaN instead of depending
# on comparison order.
diff[1].max()

In [31]:
# Per-batch accumulators for the test split.
disease_list, bias_list, test_imgs_list, test_img_names_list = [], [], [], []

# Walk the test loader once and keep every field of each batch
# (index 0: disease, 1: bias, 2: images, 3: file names).
for batch in test_loader:
    disease_list.append(batch[0].numpy())
    bias_list.append(batch[1].numpy())
    test_imgs_list.append(batch[2].numpy())
    test_img_names_list.append(batch[3])

# Stack the batches and flatten each image to a single row.
test_imgs = np.concatenate(test_imgs_list, axis=0)
test_imgs = test_imgs.reshape(len(test_imgs), -1)
test_imgs_dim = test_imgs.shape[-1]

disease = np.concatenate(disease_list, axis=0)
bias = np.concatenate(bias_list, axis=0)
test_img_names = np.concatenate(test_img_names_list, axis=0)

# Encode with the PCA that was fitted on the training images.
encoded_data = pca.transform(test_imgs)

In [32]:
# Largest value across the reconstructed sample images.
X_recon.max()

In [33]:
# Bundle the test arrays, the fitted PCA and the file names, then pickle them.
test_payload = {
    'imgs': test_imgs,
    'disease': disease,
    'bias': bias,
    'pca': pca,
    'encoded_data': encoded_data,
    'img_names': test_img_names,
}
with open(test_path, 'wb') as out_file:
    pickle.dump(test_payload, out_file)