# MUDI data preprocessing

In [None]:
import os
from pathlib import Path
from typing import Optional

import h5py
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
from nilearn import image, plotting
from nilearn.masking import apply_mask
from plotly.subplots import make_subplots

from autoencoder.logger import logger, set_log_level

In [None]:
# 10 is the numeric value of logging.DEBUG in the standard library; presumably
# set_log_level uses the same scale — confirm against autoencoder.logger.
set_log_level(10)

In [None]:
# Dataset root (absolute, machine-specific path).
root_dir = "/media/maarten/disk1/MUDI/"
# Image file name; joined below as <root_dir>/<subject>/<img_file>.
img_file = "MB_Re_t_moco_registered_applytopup.nii.gz"
# Brain-mask file name; joined below as <root_dir>/<subject>/<msk_file>.
msk_file = "brain_mask.nii.gz"
# Base name (without ".txt") of the acquisition-scheme file; the scheme is read
# from <root_dir>/parameters_new.txt further down.
scheme_file = "parameters_new"

## The MUDI data

In [None]:
# Pull four volumes out of subject cdmri0011 and draw each one.
example_scan_path = str(Path(root_dir, "cdmri0011", img_file))
example_volume_ids = np.array([1, 10, 100, 1000])
selected_imgs = image.index_img(example_scan_path, example_volume_ids)

for volume in image.iter_img(selected_imgs):
    # every item yielded here is an in-memory 3D image
    plotting.plot_anat(volume)

Each scan comes with a brain mask that is used to select the brain voxels.

In [None]:
# Draw the brain mask of subject cdmri0011 (the trailing semicolon suppresses
# the notebook's echo of the return value).
plotting.plot_anat(str(Path(root_dir, "cdmri0011", msk_file)));

In [None]:
# Load the acquisition scheme. The file name is built from the `scheme_file`
# setting so that it is configured in one place, like the image and mask names.
scheme = np.loadtxt(Path(root_dir, f"{scheme_file}.txt"))


def set_dir(r):
    """Zero the direction of a scheme row whose b-value is 0.

    Args:
        r: 1-D array holding one scheme row. Elements 0-2 are treated as the
            direction components and element 3 as the b-value.

    Returns:
        A new array equal to ``r``, except that elements 0-2 are set to 0.0
        when element 3 is exactly 0.0. The input array is left untouched.
    """
    # Work on a copy: the callback may receive a view of the caller's array
    # (e.g. under np.apply_along_axis), so writing into `r` could silently
    # modify the source data.
    row = np.array(r, copy=True)
    if row[3] == 0.0:
        row[0:3] = 0.0
    return row


# Apply set_dir to every row so that all b = 0 rows share the direction (0, 0, 0).
scheme = np.apply_along_axis(set_dir, 1, scheme)

# 3D scatter of the scheme: columns 0-2 give the position, column 3 the color.
b_value_marker = dict(color=scheme[:, 3], colorscale="Bluered", showscale=True)
q_space_points = go.Scatter3d(
    x=scheme[:, 0],
    y=scheme[:, 1],
    z=scheme[:, 2],
    mode="markers",
    marker=b_value_marker,
)
fig = go.Figure(data=q_space_points)

# Text label anchored at the origin of the scene.
origin_label = dict(
    showarrow=False,
    x=0,
    y=0,
    z=0,
    text="b = 0",
    xanchor="left",
    xshift=10,
    opacity=0.7,
)
fig.update_layout(
    title_text="Q-space colored by b-values.",
    margin=dict(l=0, r=0, b=0),
    width=1000,
    height=1000,
    scene=dict(annotations=[origin_label]),
)
fig.show()

## Load data

In [None]:
# Masked data of every subject. Kept as a plain list rather than one numpy
# array because the scans are irregularly shaped.
subject_names = ["cdmri0011", "cdmri0012", "cdmri0013", "cdmri0014", "cdmri0015"]
scans = []
for name in subject_names:
    img_path = str(Path(root_dir, name, img_file))
    msk_path = str(Path(root_dir, name, msk_file))
    # apply_mask extracts the values inside the mask; the result is transposed
    # before being stored.
    scan = np.transpose(apply_mask(imgs=img_path, mask_img=msk_path))

    logger.debug("Loaded scan with shape: %s", scan.shape)

    scans.append(scan)

In [None]:
# One histogram per scan over all of its values, on a shared y axis.
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 10))
for panel, values in zip(axes, scans):
    panel.hist(values.flatten())

### Distribution of the 95th percentile of the data

In [None]:
# 95th percentile of the first scan only; used as the upper histogram bound below.
first_scan_values = scans[0].flatten()
max_data = np.percentile(first_scan_values, 95)
logger.info("95th percentile max: %f", max_data)

In [None]:
# Same histograms as before, restricted to the range [0, max_data].
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 10))
for panel, values in zip(axes, scans):
    panel.hist(values.flatten(), range=[0, max_data])

## Harmonize the data
We want the different scans to have a similar distribution of values. We look at different techniques to achieve this.

### Median across all images

In [None]:
# Median along axis 0 of each scan, one result array per scan.
median_scans = [np.median(scans[k], axis=0) for k in range(5)]

In [None]:
# One row per scan. Column 0: least-squares factor that maps the scan's medians
# onto those of the first scan; column 1: subject number (11-15).
scan_lstsq_coefs = np.empty((5, 2), dtype=np.float32)
scan_lstsq_coefs[0] = np.asarray(
    [1.0, 11]
)  # initialize with 1 because the coef of itself is 1

for i in range(1, 5):
    # Solve median_scans[i] * coef ~= median_scans[0] (single unknown, no
    # intercept term).
    scan_lstsq_coef, _, _, _ = np.linalg.lstsq(
        median_scans[i][:, np.newaxis], median_scans[0], rcond=-1
    )
    scan_lstsq_coefs[i] = np.asarray([scan_lstsq_coef[0], 11 + i])

    # The solution is a length-1 array; log its single element. Formatting the
    # array itself with %f relies on implicit array-to-scalar conversion,
    # which NumPy has deprecated for arrays with ndim > 0.
    logger.info("lstsq coefficient for scan %d: %f", 11 + i, scan_lstsq_coef[0])

In [None]:
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 5))

colors = ["bo", "go", "ro", "co", "mo"]
reference_medians = median_scans[0]
for ax, marker_fmt, medians, coef_row in zip(
    axes, colors, median_scans, scan_lstsq_coefs
):
    # Markers: scan medians against the reference medians.
    # Dotted line: scan medians times the fitted factor.
    # Solid line: identity.
    ax.plot(
        medians,
        reference_medians,
        marker_fmt,
        medians,
        medians * coef_row[0],
        ":k",
        medians,
        medians,
        "-k",
    )
    ax.set(aspect="equal")

### Median across b=0 images

In [None]:
# Reload the unmodified acquisition scheme. The file name is built from the
# `scheme_file` setting so that it is configured in one place.
scheme = np.loadtxt(Path(root_dir, f"{scheme_file}.txt"))
# True for the rows whose column 3 is exactly 0.0 (the b = 0 measurements).
mask = scheme[:, 3] == 0.0

In [None]:
# Median along axis 0 of each scan, restricted to the columns selected by `mask`.
median_scans = [np.median(scans[k][:, mask], axis=0) for k in range(5)]

In [None]:
# One row per scan. Column 0: least-squares factor that maps the scan's medians
# onto those of the first scan; column 1: subject number (11-15).
scan_lstsq_coefs = np.empty((5, 2), dtype=np.float32)
scan_lstsq_coefs[0] = np.asarray(
    [1.0, 11]
)  # initialize with 1 because the coef of itself is 1

for i in range(1, 5):
    # Solve median_scans[i] * coef ~= median_scans[0] (single unknown, no
    # intercept term).
    scan_lstsq_coef, _, _, _ = np.linalg.lstsq(
        median_scans[i][:, np.newaxis], median_scans[0], rcond=-1
    )
    scan_lstsq_coefs[i] = np.asarray([scan_lstsq_coef[0], 11 + i])

    # The solution is a length-1 array; log its single element. Formatting the
    # array itself with %f relies on implicit array-to-scalar conversion,
    # which NumPy has deprecated for arrays with ndim > 0.
    logger.info("lstsq coefficient for scan %d: %f", 11 + i, scan_lstsq_coef[0])

In [None]:
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 5))

colors = ["bo", "go", "ro", "co", "mo"]
reference_medians = median_scans[0]
for ax, marker_fmt, medians, coef_row in zip(
    axes, colors, median_scans, scan_lstsq_coefs
):
    # Markers: scan medians against the reference medians.
    # Dotted line: scan medians times the fitted factor.
    # Solid line: identity.
    ax.plot(
        medians,
        reference_medians,
        marker_fmt,
        medians,
        medians * coef_row[0],
        ":k",
        medians,
        medians,
        "-k",
    )
    ax.set(aspect="equal")

This technique seems to get us closer to scan 0, so we use this.

### Normalize the data

#### Normalize according to the 99th percentile of Subject 11 and save in one big file

In [None]:
# NOTE(review): masked_data11..masked_data15 and a12..a15 are not assigned in
# the visible part of this notebook. Presumably they correspond to
# scans[0..4] and scan_lstsq_coefs[1..4, 0] — confirm before running this cell.
#
# Scale every subject by its factor and by the 99th percentile of subject 11,
# then clip the result to [0, 1].
# max_data = masked_data11.max()
max_data = np.percentile(masked_data11, 99)
masked_data11n = masked_data11.astype("float32") / max_data
masked_data11n = np.clip(masked_data11n, 0, 1)
masked_data12n = masked_data12.astype("float32") * a12 / max_data
masked_data12n = np.clip(masked_data12n, 0, 1)
masked_data13n = masked_data13.astype("float32") * a13 / max_data
masked_data13n = np.clip(masked_data13n, 0, 1)
masked_data14n = masked_data14.astype("float32") * a14 / max_data
masked_data14n = np.clip(masked_data14n, 0, 1)
masked_data15n = masked_data15.astype("float32") * a15 / max_data
masked_data15n = np.clip(masked_data15n, 0, 1)

In [None]:
# Show the normalization constant and the shape of the subject-11 data.
print(max_data)
print(masked_data11.shape)

In [None]:
# plt.hist(masked_data11, bins = 'auto')
# plt.show()

In [None]:
# Integer label vectors: one entry per row of each subject's data, filled with
# the subject number.
subj11 = np.full((masked_data11.shape[0],), 11, dtype=int)
subj12 = np.full((masked_data12.shape[0],), 12, dtype=int)
subj13 = np.full((masked_data13.shape[0],), 13, dtype=int)
subj14 = np.full((masked_data14.shape[0],), 14, dtype=int)
subj15 = np.full((masked_data15.shape[0],), 15, dtype=int)

In [None]:
# Stack the label vectors and the normalized data of all five subjects along
# axis 0, in the same subject order, and print the resulting shapes.
subj = np.concatenate((subj11, subj12, subj13, subj14, subj15), axis=0)
print(subj.shape)
masked_data = np.concatenate(
    (masked_data11n, masked_data12n, masked_data13n, masked_data14n, masked_data15n),
    axis=0,
)
print(masked_data.shape)

In [None]:
import pandas as pd

In [None]:
# Frame whose first column is the subject label, followed by the data columns.
df = pd.DataFrame(np.concatenate((subj[:, np.newaxis], masked_data), axis=1))

In [None]:
# Display the frame in the notebook output.
df

In [None]:
# Write the frame to "data.csv" in the current working directory.
df.to_csv("data.csv")

#### Normalize according to the 95th percentile of Subject 11 and save as separate files

In [None]:
# NOTE(review): masked_data11..masked_data15 and a12..a15 are not assigned in
# the visible part of this notebook. Presumably they correspond to
# scans[0..4] and scan_lstsq_coefs[1..4, 0] — confirm before running this cell.
#
# Scale every subject by its factor and by the 95th percentile of subject 11,
# then clip the result to [0, 1].
# max_data = masked_data11.max()
max_data = np.percentile(masked_data11, 95)
masked_data11n = masked_data11.astype("float32") / max_data
masked_data11n = np.clip(masked_data11n, 0, 1)
masked_data12n = masked_data12.astype("float32") * a12 / max_data
masked_data12n = np.clip(masked_data12n, 0, 1)
masked_data13n = masked_data13.astype("float32") * a13 / max_data
masked_data13n = np.clip(masked_data13n, 0, 1)
masked_data14n = masked_data14.astype("float32") * a14 / max_data
masked_data14n = np.clip(masked_data14n, 0, 1)
masked_data15n = masked_data15.astype("float32") * a15 / max_data
masked_data15n = np.clip(masked_data15n, 0, 1)

In [None]:
# Show the normalization constant and the shape of the subject-11 data.
print(max_data)
print(masked_data11.shape)

In [None]:
# Histogram of each subject's normalized values over [0, 1], on a shared y axis.
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 10))
normalized_sets = (
    masked_data11n,
    masked_data12n,
    masked_data13n,
    masked_data14n,
    masked_data15n,
)
for panel, values in zip(axes, normalized_sets):
    panel.hist(values.flatten(), range=[0, 1])
fig.show()

In [None]:
import pandas as pd

In [None]:
def _frame_with_zero_column(labels, values):
    """Return a DataFrame of `values` preceded by a zero column shaped like `labels`."""
    zero_column = np.zeros_like(labels[:, np.newaxis])
    return pd.DataFrame(np.concatenate((zero_column, values), axis=1))


# One frame per subject: a leading all-zero column followed by the normalized data.
df11 = _frame_with_zero_column(subj11, masked_data11n)
df12 = _frame_with_zero_column(subj12, masked_data12n)
df13 = _frame_with_zero_column(subj13, masked_data13n)
df14 = _frame_with_zero_column(subj14, masked_data14n)
df15 = _frame_with_zero_column(subj15, masked_data15n)

In [None]:
# Display the subject-11 frame in the notebook output.
df11

In [None]:
# Write every per-subject frame to its own CSV file in the working directory.
csv_targets = (
    (df11, "data11.csv"),
    (df12, "data12.csv"),
    (df13, "data13.csv"),
    (df14, "data14.csv"),
    (df15, "data15.csv"),
)
for frame, csv_name in csv_targets:
    frame.to_csv(csv_name)

#### Normalize according to the 95th percentile of Subject 11, don't clip, and save in two big files (one 'header' and one 'data') (USE THIS)

In [None]:
# Normalization constant: 95th percentile over all values of the first scan.
max_data = np.percentile(scans[0], 95)
logger.info("95th percentile: %f", max_data)

# Each scan as float32, multiplied by its least-squares factor and divided by
# the shared constant. No clipping is applied.
normalized_scans = [
    scans[k].astype("float32") * scan_lstsq_coefs[k, 0] / max_data for k in range(5)
]

In [None]:
# Histogram of each normalized scan over [0, 1], on a shared y axis.
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 10))

for panel, values in zip(axes, normalized_scans):
    panel.hist(values.flatten(), range=[0, 1])

Save the scan data in one file. This makes training easier later on. We also store an index that records which scan each row belongs to, so that the scans can be told apart afterwards.

In [None]:
def save_data(name: str, max_rows: Optional[int] = None):
    """Write the normalized scans and their metadata to ``<root_dir>/<name>.hdf5``.

    The function reads the notebook-level variables ``scans``,
    ``normalized_scans``, ``scheme``, ``scan_lstsq_coefs``, ``max_data`` and
    ``root_dir``. The file receives the datasets ``data``, ``index`` and
    ``scheme`` plus a group ``normalization_data`` holding ``lstsq_coef`` and
    ``max_data``. An existing file with the same name is overwritten.

    Args:
        name (str): file name without the ``.hdf5`` extension.
        max_rows (int, optional): keep only the first ``max_rows`` rows of each
            scan, which is useful for testing. ``None`` keeps every row.
            Defaults to None.
    """
    # Subject number (11-15) repeated once per kept row of each scan.
    label_parts = []
    for i in range(5):
        row_count = scans[i].shape[0]
        label_parts.append(np.full((row_count,), i + 11, dtype=int)[:max_rows])
    indexes = np.concatenate(label_parts, axis=0)

    # Kept rows of every normalized scan, stacked in the same order.
    data_parts = []
    for normalized_scan in normalized_scans:
        data_parts.append(normalized_scan[:max_rows, :])
    data = np.concatenate(data_parts, axis=0)

    target_path = Path(root_dir, f"{name}.hdf5")
    with h5py.File(target_path, "w") as hdf5_f:
        hdf5_f.create_dataset("data", data=data)
        hdf5_f.create_dataset("index", data=indexes)
        hdf5_f.create_dataset("scheme", data=scheme)

        # Values needed to reproduce or invert the normalization.
        norm_group = hdf5_f.create_group("normalization_data")
        norm_group.create_dataset("lstsq_coef", data=scan_lstsq_coefs)
        norm_group.create_dataset("max_data", data=np.asarray([max_data]))

Save some fake data to test the correctness of the neural network

In [None]:
# Small file with the first 500 rows of each scan.
save_data("data_fake", max_rows=500)

Proceed saving the real data

In [None]:
# Full file with every row of each scan.
save_data("data")

# Test data

In [None]:
# Text file holding the indices of the selected measurements (absolute,
# machine-specific path).
selecf = "/home/sapct5/Documents/Code/MUDI/MUDI_CA_LR/Run(lr=0.001, batch_size=100)K=500_epoch=800_testnone_unique.txt"
selecind = np.sort(np.loadtxt(selecf, dtype=int))
print(selecind)

# Rows of the full scheme whose column 3 is exactly 0.0 (the b = 0 entries).
mask = scheme[:, 3] == 0.0

# Boolean mask over the full scheme, True at the selected indices.
mask_ = np.zeros(np.shape(mask), dtype=bool)
mask_[selecind] = True
print(mask_.shape)

# Rows that are both b = 0 and selected, indexed over the full scheme.
mask3 = mask & mask_
print(mask3.shape)

# Scheme restricted to the selected rows.
scheme_ = scheme[mask_]
print(scheme_.shape)

# b = 0 rows, indexed over the restricted scheme.
mask2 = scheme_[:, 3] == 0.0
print(mask2.shape)

In [None]:
# Load test subject 16: extract the values inside its brain mask, then transpose.
direc16 = "./data"
img16_path = os.path.join(direc16, "16_MB_RE_t.nii.gz")
msk16_path = os.path.join(direc16, "brain_mask-testing1.nii.gz")
masked_data16 = np.transpose(apply_mask(imgs=img16_path, mask_img=msk16_path))

In [None]:
# Display the shape of the subject-16 data.
masked_data16.shape

In [None]:
# Load test subject 17: extract the values inside its brain mask, then transpose.
direc17 = "./data"
img17_path = os.path.join(direc17, "17_MB_RE_t.nii.gz")
msk17_path = os.path.join(direc17, "brain_mask-testing2.nii.gz")
masked_data17 = np.transpose(apply_mask(imgs=img17_path, mask_img=msk17_path))

In [None]:
# Median along axis 0 of the test subjects, restricted to the columns selected
# by mask2 (the b = 0 rows of the restricted scheme).
med16 = np.median(masked_data16[:, mask2], axis=0)
med17 = np.median(masked_data17[:, mask2], axis=0)

In [None]:
# Reference medians, restricted to the columns selected by mask3.
# NOTE(review): masked_data11 is not assigned in the visible part of this
# notebook; presumably it corresponds to scans[0] — confirm.
med11_ = np.median(masked_data11[:, mask3], axis=0)

In [None]:
# Least-squares factors that map the medians of subjects 16 and 17 onto the
# reference medians. rcond is passed explicitly, with the same value as the
# earlier lstsq cells, so the singular-value cutoff does not depend on the
# NumPy version's default and no FutureWarning is raised.
a16, _, _, _ = np.linalg.lstsq(med16[:, np.newaxis], med11_, rcond=-1)
a17, _, _, _ = np.linalg.lstsq(med17[:, np.newaxis], med11_, rcond=-1)
print(a16, a17)

In [None]:
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(20, 5))
# Per panel: the subject's medians, its fitted factor and the marker format.
panels = ((med16, a16, "yo"), (med17, a17, "bo"))
for ax, (medians, factor, marker_fmt) in zip(axes, panels):
    # Markers: medians against the reference; dotted: scaled medians; solid: identity.
    ax.plot(
        medians, med11_, marker_fmt, medians, medians * factor, ":k", medians, medians, "-k"
    )
    ax.set(aspect="equal")
fig.show()

In [None]:
# Plot four of the selected volumes with a fixed intensity window of 0 to 15.
# NOTE(review): direc11 is not assigned in the visible part of this notebook;
# presumably it is the directory of subject 11 — confirm.
selected_imgs = image.index_img(
    os.path.join(direc11, img_file), np.array(selecind[[1, 10, 100, 300]])
)
for img in image.iter_img(selected_imgs):
    # img is now an in-memory 3D img
    plotting.plot_anat(img, vmin=0, vmax=15)

In [None]:
# Plot four volumes of subject 16 with a fixed intensity window of 0 to 15.
subject16_path = os.path.join(direc16, "16_MB_RE_t.nii.gz")
subject16_volume_ids = np.array([1, 10, 100, 300])
selected_imgs = image.index_img(subject16_path, subject16_volume_ids)
for volume in image.iter_img(selected_imgs):
    # every item yielded here is an in-memory 3D image
    plotting.plot_anat(volume, vmin=0, vmax=15)

In [None]:
# Scale the test subjects by their factors and by max_data.
# NOTE(review): max_data is assigned in several cells; the value used here is
# whichever of them ran last — confirm it is the intended one.
masked_data16n = masked_data16.astype("float32") * a16 / max_data
masked_data17n = masked_data17.astype("float32") * a17 / max_data

In [None]:
# Integer label vectors: one entry per row of each test subject's data.
subj16 = np.full((masked_data16.shape[0],), 16, dtype=int)
subj17 = np.full((masked_data17.shape[0],), 17, dtype=int)

In [None]:
# Stack the labels and the normalized data of both test subjects along axis 0,
# in the same subject order, and print the resulting shapes.
subj = np.concatenate((subj16, subj17), axis=0)
print(subj.shape)
masked_data = np.concatenate((masked_data16n, masked_data17n), axis=0)
print(masked_data.shape)

In [None]:
# Running row index over both test subjects; the last line displays its shape.
ind = np.arange(len(subj16) + len(subj17))
ind.shape

In [None]:
# Header frame with two columns: running row index and subject label.
df1 = pd.DataFrame(np.concatenate((ind[:, np.newaxis], subj[:, np.newaxis]), axis=1))

In [None]:
# Write the header frame to "header_test.csv" in the working directory.
df1.to_csv("header_test.csv")

In [None]:
import h5py

# Write the stacked test data to "data_test.hdf5" as dataset "data1". A context
# manager is used so the file is always flushed and closed; a bare `h5f.close`
# without parentheses only references the method and leaves the file open.
with h5py.File("data_test.hdf5", "w") as h5f:
    h5f.create_dataset("data1", data=masked_data)

# (not working yet) Log transform, normalize according to the 95th percentile of Subject 11, don't clip, and save in two big files (one 'header' and one 'data')

In [None]:
# NOTE(review): masked_data11..masked_data15 and a12..a15 are not assigned in
# the visible part of this notebook. Presumably they correspond to
# scans[0..4] and scan_lstsq_coefs[1..4, 0] — confirm before running this cell.
# NOTE(review): the section title mentions a log transform, but no logarithm is
# applied in this cell.
#
# Scale every subject by its factor and by the 95th percentile of subject 11,
# without clipping.
# max_data = masked_data11.max()
max_data = np.percentile(masked_data11, 95)
masked_data11n = masked_data11.astype("float32") / max_data
masked_data12n = masked_data12.astype("float32") * a12 / max_data
masked_data13n = masked_data13.astype("float32") * a13 / max_data
masked_data14n = masked_data14.astype("float32") * a14 / max_data
masked_data15n = masked_data15.astype("float32") * a15 / max_data

In [None]:
# Show the normalization constant and the shape of the subject-11 data.
print(max_data)
print(masked_data11.shape)

In [None]:
# Histogram of each subject's normalized values over [0, 1], on a shared y axis.
fig, axes = plt.subplots(1, 5, sharey=True, figsize=(20, 10))
normalized_sets = (
    masked_data11n,
    masked_data12n,
    masked_data13n,
    masked_data14n,
    masked_data15n,
)
for panel, values in zip(axes, normalized_sets):
    panel.hist(values.flatten(), range=[0, 1])
fig.show()

In [None]:
# Integer label vectors: one entry per row of each subject's data, filled with
# the subject number.
subj11 = np.full((masked_data11.shape[0],), 11, dtype=int)
subj12 = np.full((masked_data12.shape[0],), 12, dtype=int)
subj13 = np.full((masked_data13.shape[0],), 13, dtype=int)
subj14 = np.full((masked_data14.shape[0],), 14, dtype=int)
subj15 = np.full((masked_data15.shape[0],), 15, dtype=int)

In [None]:
# Stack the label vectors and the normalized data of all five subjects along
# axis 0, in the same subject order, and print the resulting shapes.
subj = np.concatenate((subj11, subj12, subj13, subj14, subj15), axis=0)
print(subj.shape)
masked_data = np.concatenate(
    (masked_data11n, masked_data12n, masked_data13n, masked_data14n, masked_data15n),
    axis=0,
)
print(masked_data.shape)

In [None]:
# Running row index over all five subjects; the last line displays its shape.
ind = np.arange(len(subj11) + len(subj12) + len(subj13) + len(subj14) + len(subj15))
ind.shape

In [None]:
import pandas as pd

In [None]:
# Header frame with two columns: running row index and subject label.
df1 = pd.DataFrame(np.concatenate((ind[:, np.newaxis], subj[:, np.newaxis]), axis=1))

In [None]:
# Write the header frame to "header_.csv" in the working directory.
df1.to_csv("header_.csv")

In [None]:
# Display the header frame in the notebook output.
df1

In [None]:
# Data frame holding only the stacked normalized values (no label column).
df = pd.DataFrame(masked_data)

In [None]:
# Write the data frame to "data_.csv" in the working directory.
df.to_csv("data_.csv")