In [1]:
import pandas as pd
import shutil
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.sections import get_sections
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

In [16]:
# --- Configuration and phenotype loading -------------------------------------
# Dataset key used to look up its row in datasets.xlsx.
dataset = "GSEUNN"
# Root folder that holds datasets.xlsx and the per-platform dataset folders.
path = "D:/YandexDisk/Work/pydnameth/datasets"

# The platform name is read from the 'platform' column of the dataset's row.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Everything belonging to this dataset lives under <path>/<platform>/<dataset>.
dataset_dir = f"{path}/{platform}/{dataset}"
path_save = f"{dataset_dir}/special/056_GEO_transplantation"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Phenotype table; its first spreadsheet column becomes the index.
df = pd.read_excel(f"{dataset_dir}/pheno.xlsx", index_col=0)

In [17]:
# Normalise the transplantation status column and keep only the two time points
# of interest.
#
# The original used `df['Transplantation'].replace(..., inplace=True)`, i.e. an
# in-place call on a column selection. That chained pattern is deprecated in
# pandas 2.x and does not modify `df` under Copy-on-Write, so the relabelling
# could silently be lost. Assigning the result back is explicit and safe.
df = df.rename(columns={'TR_status': 'Transplantation'})
df['Transplantation'] = df['Transplantation'].replace({'before': 'Before', '1y': 'After'})

# Rows with any other status are dropped. `.copy()` makes the filtered frame
# independent, because later cells add columns to it.
df = df.loc[df['Transplantation'].isin(['Before', 'After']), :].copy()

In [18]:
# Build the new row key "<Sentrix_ID>_<Sentrix_Position>" from the two array
# coordinates, both rendered as strings.
sentrix_id = df["Sentrix_ID"].astype(str)
sentrix_position = df["Sentrix_Position"].astype(str)
df['Sample_ID'] = sentrix_id + '_' + sentrix_position

# Keep the previous index labels in an 'index' column before they are replaced.
df['index'] = df.index.values
df = df.set_index('Sample_ID')

In [20]:
# Assemble the per-sample metadata sheet and write it to
# <path_save>/samples/df.xlsx.
# NOTE(review): the column names look like a GEO submission template (the output
# folder is named "056_GEO_transplantation") -- confirm against the template.
#
# Requires these columns to already be present in `df`: 'index' (added by the
# Sample_ID cell), 'Transplantation', 'Age', 'Sex', 'Status', 'Region',
# 'index_origin', 'Sentrix_ID' and 'Sentrix_Position'.
df.index.name = 'Sample name'

# Constant descriptors shared by every sample.
df['title'] = 'genomic DNA from Whole Blood for ' + df.index + ' sample'
df['source name'] = 'Whole Blood'
df['organism'] = 'Homo sapiens'
df['sample type'] = 'genomic'

# Raw file names are derived from the sample name (the index).
df['idat file Grn'] = df.index + '_Grn.idat'
df['idat file Red'] = df.index + '_Red.idat'

# Per-sample characteristics copied from the phenotype columns.
df['characteristics: PatientID'] = df['index']
df['characteristics: Transplantation'] = df['Transplantation']
df['characteristics: Age'] = df['Age']
df['characteristics: Sex'] = df['Sex']

df['molecule'] = 'genomic DNA'
df['label'] = 'Cy5 and Cy3'
df['description'] = df.index + ' is ' + df['Status'] + ' sample from ' + df['Region']
df['platform'] = platform

# Columns written out as empty strings.
df['Sample_Well'] = ''
df['Sample_Plate'] = ''
df['Sample_Group'] = ''
df['Pool_ID'] = ''

# Keep only the columns listed here, in exactly this order.
geo_columns = [
    'title',
    'source name',
    'organism',
    'sample type',
    'idat file Grn',
    'idat file Red',
    'characteristics: PatientID',
    'characteristics: Transplantation',
    'characteristics: Age',
    'characteristics: Sex',
    'molecule',
    'label',
    'description',
    'platform',
    'index',
    'index_origin',
    'Sentrix_ID',
    'Sentrix_Position',
    'Sample_Well',
    'Sample_Plate',
    'Sample_Group',
    'Pool_ID',
]
df = df.loc[:, geo_columns]

# to_excel does not create missing parent folders, and only `path_save` itself
# was created earlier, so make sure the 'samples' sub-folder exists first.
pathlib.Path(f"{path_save}/samples").mkdir(parents=True, exist_ok=True)
df.to_excel(f"{path_save}/samples/df.xlsx", index_label='Sample name')

In [28]:
# Reload the sample sheet from disk so that `df` matches the written file;
# the first spreadsheet column becomes the index.
samples_sheet_path = f"{path_save}/samples/df.xlsx"
df = pd.read_excel(samples_sheet_path, index_col=0)

In [22]:
# Load the four tab-separated tables from <path_save>/data, each indexed by its
# 'ID_REF' column. The files are read in the order beta, pval, unmeth, meth.
betas, pvals, unmeth, meth = (
    pd.read_csv(
        f"{path_save}/data/{stem}_table.txt",
        delimiter="\t",
        index_col='ID_REF',
    )
    for stem in ("beta", "pval", "unmeth", "meth")
)

In [23]:
# Cast to 32-bit dtypes: floats for the beta and p-value tables, integers for
# the unmethylated and methylated signal tables.
betas, pvals = (table.astype('float32') for table in (betas, pvals))
unmeth, meth = (table.astype('int32') for table in (unmeth, meth))

In [29]:
# Sanity check: the sample sheet rows and the columns of every data table must
# list the same sample ids in the same order.
pheno_ids = df.index.tolist()
betas_ids, pvals_ids, unmeth_ids, meth_ids = (
    list(table.columns.values) for table in (betas, pvals, unmeth, meth)
)

order_matches = all(
    ids == pheno_ids for ids in (betas_ids, pvals_ids, unmeth_ids, meth_ids)
)
if not order_matches:
    raise ValueError("Warning! Order is not the same!")
print("Order is fine")

Order is fine


In [30]:
# Processed matrix: for every sample, its beta column immediately followed by
# its "<sample> Detection Pval" column.
pvals_ids_new = [f'{sample} Detection Pval' for sample in pvals_ids]
pvals_ids_dict = {old: new for old, new in zip(pvals_ids, pvals_ids_new)}
# Note: this renames the columns of `pvals` itself.
pvals.rename(columns=pvals_ids_dict, inplace=True)

# Interleave the column names sample by sample.
mtx_proc_ids = []
for pos, beta_col in enumerate(betas_ids):
    mtx_proc_ids.extend((beta_col, pvals_ids_new[pos]))

mtx_proc = pd.concat([betas, pvals], axis=1)[mtx_proc_ids]
mtx_proc.index.name = 'ID_REF'

In [31]:
# Signal matrix: for every sample, its "<sample> Unmethylated Signal" column
# immediately followed by its "<sample> Methylated Signal" column.
unmeth_ids_new = [f'{sample} Unmethylated Signal' for sample in unmeth_ids]
unmeth_ids_dict = {old: new for old, new in zip(unmeth_ids, unmeth_ids_new)}
# Note: this renames the columns of `unmeth` itself.
unmeth.rename(columns=unmeth_ids_dict, inplace=True)

meth_ids_new = [f'{sample} Methylated Signal' for sample in meth_ids]
meth_ids_dict = {old: new for old, new in zip(meth_ids, meth_ids_new)}
# Note: this renames the columns of `meth` itself.
meth.rename(columns=meth_ids_dict, inplace=True)

# Interleave the column names sample by sample.
mtx_signal_ids = []
for pos, unmeth_col in enumerate(unmeth_ids_new):
    mtx_signal_ids.extend((unmeth_col, meth_ids_new[pos]))

mtx_signal = pd.concat([unmeth, meth], axis=1)[mtx_signal_ids]
mtx_signal.index.name = 'ID_REF'

In [32]:
# Write the sample sheet and both assembled matrices into <path_save>/data,
# keeping the index column in every file.
data_dir = f"{path_save}/data"
df.to_excel(f"{data_dir}/samples.xlsx", index=True)
for stem, matrix in (("mtx_proc", mtx_proc), ("mtx_signal", mtx_signal)):
    matrix.to_csv(f"{data_dir}/{stem}.csv", index=True)

In [21]:
# Copy every green and red IDAT file named in the sample sheet from the
# dataset's raw/idat folder into <path_save>/idat.
idat_src_dir = pathlib.Path(f"{path}/{platform}/{dataset}/raw/idat")
idat_dst_dir = pathlib.Path(f"{path_save}/idat")

# The destination must exist as a directory before copying. Otherwise
# shutil.copy2 treats ".../idat" as a target *file name*, and each copy
# overwrites the previous one in a single file called "idat".
idat_dst_dir.mkdir(parents=True, exist_ok=True)

# Green-channel files first, then red-channel files.
for fn in [*df.loc[:, 'idat file Grn'].values, *df.loc[:, 'idat file Red'].values]:
    shutil.copy2(idat_src_dir / fn, idat_dst_dir)