In [1]:
import pandas as pd
import numpy as np

In [2]:
def convert_to_categorical(data_frame):
    """Round every value of ``data_frame`` to one decimal place, in place.

    Despite the name, no categorical dtype is produced: each column is simply
    replaced by its values passed through ``round(value, 1)``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose values support ``round(value, 1)``. It is modified in
        place.

    Returns
    -------
    None
    """
    def _one_decimal(value):
        return round(value, 1)

    for name in data_frame.columns.tolist():
        rounded = data_frame[name].map(_one_decimal)
        # Write back through .loc so the caller's frame itself is updated.
        data_frame.loc[:, name] = rounded

In [3]:
def fill_na(data_frame):
    """Replace missing values with the median of their column, in place.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with numeric columns. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    The median is taken with ``Series.median``, which skips NaN. (``np.median``
    propagates NaN, so it would yield NaN for exactly the columns that need
    filling.) A column consisting only of NaN has no median and is left
    unchanged.
    """
    # One pass over the frame to find the columns that actually need work.
    missing_by_column = data_frame.isna().any()
    columns_to_fill = missing_by_column.index[missing_by_column.to_numpy(dtype=bool)]

    for column in columns_to_fill:
        series = data_frame[column]
        # fillna returns a new Series; assign it back so the frame is updated.
        data_frame[column] = series.fillna(series.median())

In [4]:
def variance_filtering(array):
    """Flag the columns of ``array`` whose variance is strictly positive.

    ``np.var`` is evaluated separately on every 1-D slice taken along axis 0,
    i.e. on each column of a 2-D array. A column containing NaN has a NaN
    variance, and ``NaN > 0`` is False, so such columns are flagged False too.

    Parameters
    ----------
    array : numpy.ndarray
        Numeric array; for the 2-D case, rows are observations.

    Returns
    -------
    numpy.ndarray of bool
        True where the variance along axis 0 is greater than zero.
    """
    column_variances = np.apply_along_axis(np.var, 0, array)
    return np.greater(column_variances, 0)

In [5]:
# Load the raw matrix; the first CSV column is used as the row index.
cll = pd.read_csv("data/raw/Subset_CLL_healthy.csv", index_col=0)
cll.shape

(377791, 141)

In [6]:
# Load the sample annotation sheet; the first column is used as the index.
table = pd.read_excel("data/raw/Table_info.xlsx", index_col=0)
table.shape

(141, 2)

In [7]:
# Preview the raw layout: cg* identifiers as rows, GSM* samples as columns.
cll.head()

Unnamed: 0,GSM3516780,GSM3516781,GSM3516782,GSM3516783,GSM3516784,GSM3516785,GSM3516786,GSM3516787,GSM3516788,GSM3516789,...,GSM4056754,GSM4056755,GSM4056756,GSM4056759,GSM4056760,GSM4056762,GSM4056765,GSM4056766,GSM4056767,GSM4056768
cg00000029,0.391044,0.418785,0.395356,0.381291,0.407951,0.502557,0.395726,0.473019,0.416764,0.544809,...,0.836975,0.859731,0.947987,0.619287,0.907105,0.835069,0.834935,0.36919,0.373798,0.525558
cg00000109,0.94494,0.972309,0.958343,0.952636,0.933867,0.952705,0.96062,0.958447,0.939295,0.957803,...,0.914387,0.924648,0.92027,0.543636,0.925531,0.938895,0.91234,0.919451,0.891999,0.92693
cg00000165,0.168399,0.17885,0.10514,0.077278,0.091846,0.116646,0.183579,0.110159,0.104638,0.122046,...,0.526013,0.415611,0.159583,0.151504,0.12737,0.25697,0.131953,0.383655,0.140984,0.110868
cg00000236,0.785584,0.809741,0.75632,0.74359,0.755508,0.810732,0.773841,0.778512,0.7074,0.746812,...,0.945938,0.909755,0.860567,0.921357,0.919216,0.894677,0.877624,0.936717,0.95295,0.464696
cg00000289,0.777683,0.739181,0.766035,0.904918,0.865612,0.891836,0.870348,0.805715,0.939223,0.960015,...,0.570014,0.672622,0.624529,0.373654,0.48435,0.686215,0.463467,0.650734,0.514229,0.585756


In [8]:
# Preview the annotation columns (IGHV_100, healthy_disease) per sample.
table.head()

Unnamed: 0_level_0,IGHV_100,healthy_disease
Sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
GSM4056740,0.0,D
GSM4056718,0.0,D
GSM4056710,0.0,D
GSM4056713,0.0,D
GSM4056732,0.0,D


In [9]:
# Count how many samples carry each healthy_disease label.
table["healthy_disease"].value_counts()

D    72
H    69
Name: healthy_disease, dtype: int64

In [10]:
# Flip the frame so each GSM sample becomes a row and each cg* id a column.
# Not idempotent: running this cell a second time flips the frame back.
cll = cll.T

In [11]:
# Round every value to one decimal place; cll is modified in place.
convert_to_categorical(cll)

In [12]:
# Boolean mask over columns: True where the column variance is > 0.
selected_columns = variance_filtering(cll.to_numpy())
# Keep only the flagged columns.
cll = cll.loc[:, selected_columns]

In [13]:
# Shape after the variance filter.
cll.shape

(141, 351989)

In [14]:
# Fill remaining missing values per column.
# NOTE(review): variance_filtering flags NaN-containing columns as False
# (NaN > 0 is False), so they were probably dropped in the previous step and
# this call likely has nothing to fill. Confirm the intended order.
fill_na(cll)

In [18]:
# Preview the rounded, filtered frame (samples as rows).
cll.head()

Unnamed: 0,cg00000029,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000658,cg00000714,...,cg27665715,cg27665754,cg27665767,cg27665769,cg27665829,cg27665860,cg27665925,cg27665985,cg27666046,cg27666123
GSM3516780,0.4,0.9,0.2,0.8,0.8,0.9,0.2,0.3,0.9,0.1,...,1.0,0.8,0.4,0.2,0.9,1.0,0.0,0.2,0.6,0.9
GSM3516781,0.4,1.0,0.2,0.8,0.7,0.9,0.2,0.3,0.9,0.1,...,1.0,0.9,0.4,0.2,0.9,0.9,0.0,0.3,0.5,0.8
GSM3516782,0.4,1.0,0.1,0.8,0.8,0.9,0.1,0.2,0.9,0.1,...,1.0,0.9,0.4,0.2,1.0,1.0,0.0,0.2,0.6,0.9
GSM3516783,0.4,1.0,0.1,0.7,0.9,0.8,0.1,0.2,0.9,0.1,...,1.0,0.8,0.3,0.3,1.0,1.0,0.0,0.1,0.6,0.9
GSM3516784,0.4,0.9,0.1,0.8,0.9,0.9,0.2,0.3,0.9,0.1,...,1.0,0.8,0.3,0.3,0.9,1.0,0.0,0.2,0.4,0.9


In [19]:
# Preview the label column, indexed by Sample_name.
table["healthy_disease"].head()

Sample_name
GSM4056740    D
GSM4056718    D
GSM4056710    D
GSM4056713    D
GSM4056732    D
Name: healthy_disease, dtype: object

In [17]:
# Append the label column to the feature frame. concat aligns the two pieces
# on their index (sample id), so the row order of `table` does not matter.
healthy_disease_data = pd.concat(
    [cll, table["healthy_disease"]],
    axis="columns",
    sort=False,
)
healthy_disease_data.head()

Unnamed: 0,cg00000029,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000658,cg00000714,...,cg27665754,cg27665767,cg27665769,cg27665829,cg27665860,cg27665925,cg27665985,cg27666046,cg27666123,healthy_disease
GSM3516780,0.4,0.9,0.2,0.8,0.8,0.9,0.2,0.3,0.9,0.1,...,0.8,0.4,0.2,0.9,1.0,0.0,0.2,0.6,0.9,H
GSM3516781,0.4,1.0,0.2,0.8,0.7,0.9,0.2,0.3,0.9,0.1,...,0.9,0.4,0.2,0.9,0.9,0.0,0.3,0.5,0.8,H
GSM3516782,0.4,1.0,0.1,0.8,0.8,0.9,0.1,0.2,0.9,0.1,...,0.9,0.4,0.2,1.0,1.0,0.0,0.2,0.6,0.9,H
GSM3516783,0.4,1.0,0.1,0.7,0.9,0.8,0.1,0.2,0.9,0.1,...,0.8,0.3,0.3,1.0,1.0,0.0,0.1,0.6,0.9,H
GSM3516784,0.4,0.9,0.1,0.8,0.9,0.9,0.2,0.3,0.9,0.1,...,0.8,0.3,0.3,0.9,1.0,0.0,0.2,0.4,0.9,H


In [20]:
# True if any cell is missing, e.g. from sample ids that failed to align.
healthy_disease_data.isna().any().any()

False

In [21]:
# Rename the label column to "Trait". Done in place to avoid copying the
# very wide frame.
healthy_disease_data.rename(columns={"healthy_disease": "Trait"}, inplace=True)

In [22]:
# Confirm the label column now appears as "Trait".
healthy_disease_data.head()

Unnamed: 0,cg00000029,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000658,cg00000714,...,cg27665754,cg27665767,cg27665769,cg27665829,cg27665860,cg27665925,cg27665985,cg27666046,cg27666123,Trait
GSM3516780,0.4,0.9,0.2,0.8,0.8,0.9,0.2,0.3,0.9,0.1,...,0.8,0.4,0.2,0.9,1.0,0.0,0.2,0.6,0.9,H
GSM3516781,0.4,1.0,0.2,0.8,0.7,0.9,0.2,0.3,0.9,0.1,...,0.9,0.4,0.2,0.9,0.9,0.0,0.3,0.5,0.8,H
GSM3516782,0.4,1.0,0.1,0.8,0.8,0.9,0.1,0.2,0.9,0.1,...,0.9,0.4,0.2,1.0,1.0,0.0,0.2,0.6,0.9,H
GSM3516783,0.4,1.0,0.1,0.7,0.9,0.8,0.1,0.2,0.9,0.1,...,0.8,0.3,0.3,1.0,1.0,0.0,0.1,0.6,0.9,H
GSM3516784,0.4,0.9,0.1,0.8,0.9,0.9,0.2,0.3,0.9,0.1,...,0.8,0.3,0.3,0.9,1.0,0.0,0.2,0.4,0.9,H


In [23]:
# Write the prepared frame (features plus Trait) to the interim data folder.
healthy_disease_data.to_csv("data/interim/healthy_disease_data_prepared.csv")