# Pre-processing


In [7]:
import scipy.io as scio
import numpy as np 
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

## Get data - without baseline
Baseline removal method from Gniadecka.

In [2]:
# MATLAB files, one per tissue sample. Each loaded dict is later indexed with
# the 'IDX' and 'bcc' keys.
tissue_paths = [
    '../Data/tissue3_1.mat',
    '../Data/tissue6.mat',
    '../Data/tissue30.mat',
    '../Data/tissue31.mat',
    '../Data/tissue34_1.mat',
    '../Data/tissue34_2.mat',
    '../Data/tissue36.mat',
    '../Data/tissue37.mat',
    '../Data/tissue39_1.mat',
]
tissue_data = [scio.loadmat(mat_path) for mat_path in tissue_paths]

# Keep the individual per-tissue names available as well.
(tissue3, tissue6, tissue30, tissue31, tissue34_1,
 tissue34_2, tissue36, tissue37, tissue39_1) = tissue_data

# Flatten the nested 'Cal' entry of the calibration file into a plain list.
# NOTE(review): presumably this is the calibrated spectral axis - confirm.
calibration = scio.loadmat('../Data/cal_14.mat')
x = [value for row in calibration['Cal'][0][0][0] for value in row]

# Text files read with np.loadtxt, listed in the same order as tissue_data.
# NOTE(review): '_nobl' presumably stands for "no baseline" - confirm.
file_names = [
    'map_t3_nobl.txt',
    'map_t6_nobl.txt',
    'map_t30_nobl.txt',
    'map_t31_nobl.txt',
    'map_t34_1_nobl.txt',
    'map_t34_2_nobl.txt',
    'map_t36_nobl.txt',
    'map_t37_nobl.txt',
    'map_t39_1_nobl.txt',
]

In [3]:
# Build one record per tissue: (t_map, t_bcc, t_IDX, width, height).
if len(tissue_data) != len(file_names):
    raise ValueError('tissue_data and file_names must have the same length')

tissues = []
# zip keeps every loaded .mat dict paired with its map file, so no manual
# counter is needed to index file_names.
for item, file_name in zip(tissue_data, file_names):
    t_IDX = item['IDX']
    t_bcc = item['bcc']
    width, height = t_bcc.shape[0], t_bcc.shape[1]
    # One row of 1024 values per pixel; other cells address pixel (i, j) as
    # row i*height + j. reshape raises if the file does not hold exactly
    # width*height*1024 values.
    t_map = np.loadtxt(file_name).reshape(width*height,1024)
    tissues.append((t_map,t_bcc,t_IDX,width,height))

# tissues: (t_map, t_bcc, t_IDX, width, height)

In [4]:
## Train PCA only on relevant BCC data
# Collect up to 1000 spectra from pixels whose bcc label equals 1, visiting the
# tissues in order. Each tissue is scanned using its own bcc mask, so tissues
# of different sizes are handled correctly (no reliance on width/height values
# left over from another cell).
# If fewer than 1000 such pixels exist, the remaining rows stay zero and
# `count` holds the number of rows actually filled.
pca_training_data = np.zeros((1000,1024))
count = 0

for tissue in tissues:
    if count >= 1000:
        break
    tissue_map, tissue_bcc = tissue[0], tissue[1]
    # flatnonzero works on the row-major flattened mask, so pixel (i, j) maps
    # to position i*height + j - the same row indexing used for the maps
    # elsewhere in this notebook.
    bcc_rows = np.flatnonzero(tissue_bcc == 1)
    bcc_rows = bcc_rows[:1000 - count]
    pca_training_data[count:count + len(bcc_rows)] = tissue_map[bcc_rows, :]
    count += len(bcc_rows)

## PCA dimensionality reduction

In [5]:
# Fit PCA to data
# normalize() with its defaults rescales every row (spectrum) to unit L2 norm.
norm_data = normalize(pca_training_data)
# random_state pins the fit: for a 1000 x 1024 input and 200 components the
# default solver policy selects the randomized SVD, which would otherwise give
# slightly different components on every run.
pca = PCA(n_components=200, random_state=0)
pca.fit(norm_data)

PCA(n_components=200)

In [6]:
## Reshape to regain spatial information
# For every tissue, build a (200, 200, 1024) raw map and a (200, 200, 200)
# PCA-reduced map, paired with the bcc / IDX labels cropped to 200 x 200.
shaped_tissue = []

for (t_map,t_bcc,t_IDX,width,height) in tissues:
    # Transform data with PCA. The PCA was fitted on row-normalised spectra,
    # so the same normalisation is applied before projecting.
    transformed = pca.transform(normalize(t_map))

    # Restore the true pixel grid first: row i*height + j of the flat arrays is
    # pixel (i, j), where height is this tissue's own height. Cropping after
    # this step keeps the spectra aligned with the cropped label arrays.
    grid_map = t_map.reshape(width, height, 1024)
    grid_pca = transformed.reshape(width, height, 200)
    rows = min(width, 200)
    cols = min(height, 200)

    # Impose width and height to be 200 for uniform input shape
    width  = 200
    height = 200

    # Tissues smaller than 200 in a dimension are zero-padded; larger ones
    # are cropped to the top-left 200 x 200 region.
    new_map     = np.zeros((width,height,1024))
    new_map_pca = np.zeros((width,height,200))
    new_map[:rows, :cols, :]     = grid_map[:rows, :cols, :]
    new_map_pca[:rows, :cols, :] = grid_pca[:rows, :cols, :]

    shaped_tissue.append((new_map,new_map_pca,t_bcc[:200,:200],t_IDX[:200,:200]))