In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
import tensorflow as tf
from matplotlib import pyplot
from tensorflow.keras.utils import plot_model
import seaborn as sns

In [2]:
    def two_step_normalization(data: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize samples (rows) with the two-step scheme of Izonin et al., 2022,
        "Two-Step Data Normalization Approach for Improving Classification Accuracy
        in the Medical Diagnosis Domain", then standardize the result.

        Steps:
        1. Scale every column by its maximum absolute value (MaxAbsScaler).
        2. Divide every row by its L2 norm.
        3. Append the row norms as an extra ``norm`` column.
        4. Standardize every column, ``norm`` included (StandardScaler).

        Both scalers are fitted on ``data`` itself, so the output depends on
        every row that is passed in.

        Parameters
        ----------
        data : pd.DataFrame
            One sample per row, one feature per column. The values are handed
            to scikit-learn scalers, so they must be numeric.

        Returns
        -------
        pd.DataFrame
            Frame with the index of ``data`` and the columns of ``data``
            followed by ``norm``.
        """
        # Step 1, Equation (1): column-wise max-abs scaling.
        dataset = MaxAbsScaler().fit_transform(data.values)

        # Step 2, Equation (2): L2 norm of every row vector.
        norms = np.sqrt(np.sum(dataset ** 2, axis=1))

        # Equation (3): divide every row by its norm. A row with norm 0 is all
        # zeros after step 1; dividing it by 1 keeps it at zero instead of
        # turning it into NaN through 0/0.
        safe_norms = np.where(norms == 0, 1.0, norms)
        dataset = dataset / safe_norms[:, np.newaxis]

        # Step 3: keep the magnitude that the row normalization removed as a feature.
        dataset = np.column_stack((dataset, norms))

        # Step 4: zero-mean / unit-variance scaling of every column.
        dataset = StandardScaler().fit_transform(dataset)

        # Back to a labelled frame: original columns plus the appended 'norm'.
        columns = [*data.columns, 'norm']
        return pd.DataFrame(dataset, index=data.index, columns=columns)


In [3]:
# Path of the input CSV, kept exactly as in the original call (including the
# "trainign" spelling) so it still points at the same file on disk.
CTS_PATH = "/home/karen/Documents/phd/Data/trainign_dataset_51270_microarray_unadjusted.csv"

# Read the table once. The cell previously repeated this call on a second,
# indented line, which either fails to parse or re-reads the same file.
cts = pd.read_csv(CTS_PATH)


In [4]:
# Annotation columns of every row, in the row order of cts.
metadata = cts[["Experiment", "Sample", "Age"]]

# The same three columns as plain Python lists.
sample_order, experiment_order, age_order = (
    cts[column].tolist() for column in ("Sample", "Experiment", "Age")
)

# Number of rows per experiment; value_counts sorts by descending count.
df = pd.DataFrame(experiment_order)
count_experiments = dict(df[0].value_counts())

# Experiment identifiers in that same order, shown as the cell output.
experiments = list(count_experiments.keys())
experiments

['GSE13070',
 'GSE47969',
 'GSE48278',
 'GSE9103',
 'GSE59880',
 'GSE9676',
 'GSE161643',
 'GSE8157',
 'GSE1428',
 'GSE38718',
 'GSE6348',
 'GSE40551',
 'GSE87105',
 'GSE674_2']

In [5]:
# Work on an independent copy so that cts stays as loaded.
data = cts.copy()

# Keep the batch (experiment) labels and the ages before those columns are removed.
batch_info, age_list = data["Experiment"], data["Age"]

# Drop the annotation columns and index the remaining ones by sample id.
data = data.drop(columns=["Age", "Sex", "Status", "Experiment"]).set_index("Sample")

In [6]:
# Show the normalized frame as the cell output; the result is not assigned to any name.
two_step_normalization(data)

Unnamed: 0_level_0,U48705,M87338,X51757,X69699,L36861,L13852,X55005,X79510,M21121,J02843,...,AI654857,W22117,AI028241,AI571298,AA149545,C18318,AI219073,AI205180,AI363375,norm
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM342608,-0.297180,-0.182937,-0.237906,-0.211709,-0.242486,-0.232988,-0.243877,-0.239849,-0.265139,-0.233102,...,1.460189,-0.665390,0.203383,0.354878,0.854325,0.361689,-0.555322,-0.683317,0.005944,1.132345
GSM342609,-0.051121,-0.226303,-0.231740,-0.219484,-0.236915,-0.071544,-0.242499,-0.237025,-0.343052,-0.220636,...,0.841751,0.549154,1.127557,0.185602,2.160857,-0.121307,-0.872802,-0.665822,1.070054,1.030051
GSM342610,-0.252214,-0.235101,-0.235618,-0.207457,-0.234668,-0.066373,-0.248741,-0.242002,-0.328875,-0.272446,...,-0.076305,-0.523345,1.284368,0.008651,0.610010,1.507221,-0.102184,-0.355738,1.105189,1.092841
GSM342611,-0.179713,-0.174421,-0.237730,-0.253331,-0.248305,-0.225445,-0.242739,-0.244196,-0.297326,-0.246715,...,0.476120,1.247565,1.010719,-0.083720,0.766862,0.294131,-1.015306,-0.516633,1.740503,1.029151
GSM342614,-0.324220,-0.168625,-0.230150,-0.184496,-0.241334,-0.168880,-0.248677,-0.239550,-0.260365,-0.245102,...,1.377538,-0.768526,0.972613,-0.260372,0.718306,0.935710,-0.356877,0.147241,0.913642,1.271210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GSM244612,1.973101,4.448237,4.363251,4.777202,4.824548,6.504738,5.164151,5.688954,1.459373,6.034319,...,-1.622278,-0.928914,-1.641816,-1.267168,-1.046166,-1.401854,-1.300649,-1.494349,-1.476452,-0.900845
GSM244613,1.475983,2.575072,4.118005,3.433660,5.036078,4.245286,3.407948,4.990705,2.462621,4.593786,...,-1.622278,-0.928914,-1.641816,-1.267168,-1.046166,-1.401854,-1.300649,-1.494349,-1.476452,-0.932382
GSM244614,2.033832,3.413587,4.238622,4.686407,5.095194,4.421702,3.983562,5.044907,2.884083,5.129896,...,-1.622278,-0.928914,-1.641816,-1.267168,-1.046166,-1.401854,-1.300649,-1.494349,-1.476452,-0.891300
GSM244615,1.966616,3.020723,3.683360,4.068144,5.243932,5.407543,3.242771,4.314668,2.606182,5.396385,...,-1.622278,-0.928914,-1.641816,-1.267168,-1.046166,-1.401854,-1.300649,-1.494349,-1.476452,-0.848867
