In [None]:
# https://www.geeksforgeeks.org/implementing-an-autoencoder-in-pytorch/
import torch
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
import loader as load
import processor as pr
import config
from torch.utils.data import Dataset, DataLoader

import random
import numpy as np
import pandas as pd


# Seed every random number generator the notebook relies on so that
# "Restart Kernel -> Run All" reproduces the same shuffling order and the
# same initial weights.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Safe to call without a GPU; covers every visible CUDA device.
torch.cuda.manual_seed_all(SEED)

# cuDNN: request deterministic kernels and switch off the auto-tuner, which
# may otherwise pick different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:

# Label to predict and project code forwarded to pr.splitData.
target = "tumor"
c = "STAD"

# Stage information is only requested from the loader when it is the target.
data, files_names = load.loadAll(includeStage=(target == "stage"), sameSamples=True, skipGenes=True)
# Only the last loaded table (and its name) is used in this notebook.
ge_genus, ge_genus_name = data[-1], files_names[-1]

# Attach the label column that matches `target`, using the same rule as
# OverlapDataset.__init__ so both code paths stay in agreement.
if target == "tumor":
    ge_genus = load.attachTumorStatus(ge_genus)
else:
    ge_genus = load.attachStageStatus(ge_genus)

x, y = pr.splitData(ge_genus, target=target, project=c)
# x = x.drop(x.iloc[:, 5:5216], axis=1)

# Preview a few rows instead of rendering the whole feature table.
x.head()

In [None]:
# https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00

class OverlapDataset(Dataset):
    """Genus + GE dataset.

    Map-style dataset over the feature table and targets returned by
    ``pr.splitData``. Each item is a dict with two keys:

    * ``'features'``: the values of one row of the feature table
      (a numpy array),
    * ``'target'``: the matching entry of the target column.
    """

    def __init__(self, target, cancer):
        """Load the data and split it into features and targets.

        Args:
            target (string): Label to predict. ``"tumor"`` attaches the
                tumor status with ``load.attachTumorStatus``; any other
                value attaches the stage status with
                ``load.attachStageStatus``. Stage data is only requested
                from the loader when ``target == "stage"``.
            cancer (string): Project identifier, forwarded to
                ``pr.splitData`` as ``project``.
        """
        data, _ = load.loadAll(includeStage=(target == "stage"), sameSamples=True, skipGenes=True)
        # Only the last loaded table is used by this dataset.
        ge_genus = data[-1]

        if target == "tumor":
            ge_genus = load.attachTumorStatus(ge_genus)
        else:
            ge_genus = load.attachStageStatus(ge_genus)

        x, y = pr.splitData(ge_genus, target=target, project=cancer)
        # x = x.drop(x.iloc[:, 20:5201], axis=1)

        self.modality_features = x
        self.targets = y

    def __len__(self):
        """Return the number of samples (rows of the feature table)."""
        return len(self.modality_features)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a features/target dict.

        ``idx`` is positional (``iloc``); tensor indices are converted to
        plain Python values first.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample_features = self.modality_features.iloc[idx].values
        sample_target = self.targets.iloc[idx]
        return {'features': sample_features, 'target': sample_target}

In [None]:
overlapped = OverlapDataset("tumor", "STAD")

# First sample of the dataset: a dict with 'features' and 'target'.
print('\nFirst iteration of data set: ', next(iter(overlapped)), '\n')
# Number of samples in the dataset.
print('Length of data set: ', len(overlapped), '\n')
# First two single-sample batches from an unshuffled loader. Pairing the
# loader with range(2) stops after two batches instead of materialising
# every sample just to slice the list afterwards.
print('Entire data set: ', [batch for _, batch in zip(range(2), DataLoader(overlapped))], '\n')

# Shuffled mini-batch loader used by the training loop.
loader = torch.utils.data.DataLoader(dataset=overlapped,
                                     batch_size=16,
                                     shuffle=True)

# Peek at the first two shuffled mini-batches only.
print('Batched data set: ', [batch for _, batch in zip(range(2), loader)], '\n')

In [None]:
# https://watermark.silverchair.com/1248.pdf?token=AQECAHi208BE49Ooan9kkhW_Ercy7Dm3ZL_9Cf3qfKAc485ysgAAArwwggK4BgkqhkiG9w0BBwagggKpMIICpQIBADCCAp4GCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMM6y26O5A0lDKtXXvAgEQgIICb8n1ZwkcrxPuUifgr-Ud12gTf4jj9M7PdE0hTB1Wa3QllSw6ZyWaQagFfvUpTjgpYrEs1z9ZG3K3rdOjyf6wfpPsmYwm31SXna_VzaNB7KpelOTufAa1n21qxSdlcEAhZBX_taNOaI8dyvL45JK8NMB9lS_u_o9s_nlYooUYSUBquE3rXTFj-B0X0nFUzXKrHOnvvH_IqN4QUlXAnq-M1Cf_4WJMHDG5WKor_7iCUOM2ggFI5QYKMbKFr7-bua_ahStqgZIWf2ha7yvAl2W0Uh7osjfX_E6dsSyQIel-iHMtlvdNyrJpewesW3hDNva6ZZT7eCQWSq1STtN2qyDaIN6owPs3Nk2JWazJOq6m-kWEUwi0xRoufdKfMPttlt97VdLUly-HPn6TLthicSi-JWcKIjyAl808KW5EgGpq-TfwPPgto9CRR3qOGj4OxoQsIvMaUySpZIztbOmjI38GRrUkLPYRDY3qj1dRdccOPEIW378tEaNQMEImqWOhpHw_YYz4jPVTNtxB8usRfjZUf572W-pRtPRPd7Ysv-SNxlsdL4CHjPDFeJqdOnm8xV99jGvn962nNH4o3uNl87hPCBENoprDcesN3yMams69_vjzaCBUEv6ry6DSr96t82wb1KKSDlY2xp-3aLKz_1NgQRNnRMzSo0lMYtvhnAJHtPWnUdrvNanhp1pPlK0JgvwZKVjMdyVBfbpnGG6KLecXTycwHB7hcdNQNx6uatOYGCuGccW6Al_Nm3guDytzmCAs9eT85rck1ZOpc5P7R1qcxlHgFOVrlQ-AIeEA7e77MtKpc4uqnubdWBn6Lot_TmqG
# Fully connected autoencoder as a PyTorch module
# input_features ==> hidden_features ==> input_features
class AE(torch.nn.Module):
    """Fully connected autoencoder.

    The encoder narrows ``input_features`` through 128, 64, 36 and 18 units
    down to ``hidden_features``; the decoder mirrors those widths back up to
    ``input_features``. ReLU separates the Linear layers, the bottleneck
    output has no activation, and the decoder ends with a Sigmoid, so every
    reconstructed value lies between 0 and 1.
    """

    def __init__(self, input_features, hidden_features):
        super().__init__()
        nn = torch.nn

        # Layer widths from the input down to the bottleneck.
        widths = [input_features, 128, 64, 36, 18, hidden_features]

        # Encoder: Linear/ReLU pairs; the ReLU after the bottleneck layer
        # is dropped so the latent code is unconstrained.
        enc_modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            enc_modules.extend([nn.Linear(fan_in, fan_out), nn.ReLU()])
        enc_modules.pop()
        self.encoder = nn.Sequential(*enc_modules)

        # Decoder: the same widths in reverse; the activation after the
        # last Linear layer is a Sigmoid instead of a ReLU.
        mirrored = widths[::-1]
        dec_modules = []
        for fan_in, fan_out in zip(mirrored, mirrored[1:]):
            dec_modules.extend([nn.Linear(fan_in, fan_out), nn.ReLU()])
        dec_modules[-1] = nn.Sigmoid()
        self.decoder = nn.Sequential(*dec_modules)

    def forward(self, x):
        """Return ``(latent code, reconstruction)`` for the input ``x``."""
        latent = self.encoder(x)
        return latent, self.decoder(latent)


In [None]:
# Model initialisation. The input width is read from the dataset's feature
# table so it always matches the data fed to the network, instead of relying
# on a hard-coded column count.
n_input_features = overlapped.modality_features.shape[1]
model = AE(input_features=n_input_features, hidden_features=3)

# Reconstruction error: mean squared error between input and reconstruction.
# NOTE(review): the decoder ends in a Sigmoid, so it can only reproduce
# values in [0, 1] -- confirm the features are scaled to that range.
loss_function = torch.nn.MSELoss()

# Adam optimizer with lr = 1e-2 and a very small weight decay.
optimizer = torch.optim.Adam(model.parameters(),
                             lr=1e-2,
                             weight_decay=1e-8)

In [None]:
epochs = 10
outputs = []   # (epoch index, input batch, detached reconstruction) per step
losses = []    # one Python float per optimisation step
hidden_representation = None

for epoch in range(epochs):
    for batch_id, batched_samples in enumerate(loader):
        # The default collate function yields float64 tensors for float64
        # numpy rows; the model's parameters are float32.
        features = batched_samples["features"].float()

        # Forward pass: latent code and reconstruction.
        hidden, reconstructed = model(features)

        # Reconstruction loss against the input itself.
        loss = loss_function(reconstructed, features)

        # Reset the gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Store plain floats / detached tensors so that the autograd graph
        # of every batch is not kept alive for the whole run.
        losses.append(loss.item())
        # Record the index of the current epoch (not the total `epochs`).
        outputs.append((epoch, features, reconstructed.detach()))
        hidden_representation = hidden[-1].detach()

        print("last hidden item: ", hidden_representation)

# Defining the plot style
plt.style.use('fivethirtyeight')
plt.xlabel('Iterations')
plt.ylabel('Loss')

# Plotting the last 100 loss values
plt.plot(losses[-100:]);

In [None]:
# Persist only the learned parameters (the state dict), not the module
# object itself, at the path given by config.model_state_path.
torch.save(obj=model.state_dict(), f=config.model_state_path)

In [None]:
# Rebuild the network from the checkpoint. The input and bottleneck widths
# are read from the saved weights (first and last encoder Linear layers), so
# this cell cannot drift out of sync with the training configuration.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust
# (recent PyTorch versions also accept weights_only=True).
state_dict = torch.load(config.model_state_path)
model = AE(
    input_features=state_dict["encoder.0.weight"].shape[1],
    hidden_features=state_dict["encoder.8.weight"].shape[0],
)
model.load_state_dict(state_dict)
# Switch to evaluation mode; as the last expression this also displays the
# module structure.
model.eval()

In [None]:
def extractFeatures(features):
    """Encode samples with the trained autoencoder's encoder.

    Args:
        features: pandas Series holding one sample, or a pandas DataFrame
            with one sample per row. Values must be convertible to float32.

    Returns:
        For a Series: a pandas Series with the bottleneck activations.
        For a DataFrame: a DataFrame with one row of activations per input
        row, indexed like ``features``.
    """
    # Explicit float32 conversion also covers object-dtype numeric rows.
    values = np.asarray(features.values, dtype=np.float32)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        hidden_representation, _ = model(torch.tensor(values))
    # Convert to numpy explicitly rather than wrapping a raw tensor in pandas.
    encoded = hidden_representation.numpy()
    if encoded.ndim == 1:
        return pd.Series(encoded)
    return pd.DataFrame(encoded, index=features.index)


# Encode a two-row preview in a single batched forward pass.
x_s = x.head(2)
x_integrated = extractFeatures(x_s)
x_integrated