In [10]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import shutil
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
from scipy.stats import mannwhitneyu
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
import plotly.io as pio
pio.kaleido.scope.mathjax = None

In [2]:
# --- Configuration: which dataset is processed and where its files live ---
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"

# The datasets registry is indexed by dataset name; its 'platform' column
# gives the platform used to build the folder layout below.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform)

# Working folder for this notebook; created (with parents) when missing.
path_save = f"{path}/{platform}/{dataset}/special/026_data_for_GEO"
pathlib.Path(path_save).mkdir(exist_ok=True, parents=True)

# --- Build the per-sample metadata sheet from the phenotype table ---
# Constant values shared by every sample.
source_name = 'Whole Blood'
organism = 'Homo sapiens'
sample_type = 'genomic'

pheno = pd.read_csv(f"{path_save}/data/part(v2_ipAGE_159).csv", index_col='Sample_ID')
pheno.index.name = 'Sample name'

# The sample identifier doubles as the title and as the IDAT file-name stem.
pheno['title'] = pheno.index

# Columns are appended in the order they are assigned, so the order of the
# entries below determines the column order of the resulting sheet.
for column, constant in (
    ('source name', source_name),
    ('organism', organism),
    ('sample type', sample_type),
):
    pheno[column] = constant

for column, suffix in (
    ('idat file Grn', '_Grn.idat'),
    ('idat file Red', '_Red.idat'),
):
    pheno[column] = pheno['title'] + suffix

# Copy the phenotype columns under their 'characteristics: ...' names;
# the originals are dropped at the end of the cell.
for column, origin in (
    ('characteristics: Age', 'Age'),
    ('characteristics: Sex', 'Sex'),
    ('characteristics: Group', 'Group'),
    ('characteristics: ipAGE_ID', 'ipAGE_ID'),
):
    pheno[column] = pheno[origin]

pheno['molecule'] = 'genomic DNA'
pheno['label'] = 'Cy5 and Cy3'

# Free-text description: "<source name> from <Group> participant <title>".
description_head = pheno['source name'] + ' from ' + pheno['Group']
pheno['description'] = description_head + ' participant ' + pheno['title']
pheno['platform'] = platform

# Remove the array bookkeeping columns and the phenotype columns that were
# duplicated above; a missing column raises KeyError.
pheno = pheno.drop(
    columns=[
        'Sentrix_ID', 'Sentrix_Position', 'Sample_Well', 'Sample_Plate',
        'Sample_Group', 'Pool_ID', 'Age', 'Sex', 'Group', 'ipAGE_ID',
    ]
)

In [3]:
# --- Load the four tab-separated tables, each indexed by the ID_REF column ---
# Order of the paths matches the order of the names on the left-hand side.
table_paths = (
    f"{path_save}/data/beta_table.txt",
    f"{path_save}/data/pval_table.txt",
    f"{path_save}/data/unmeth_table.txt",
    f"{path_save}/data/meth_table.txt",
)
betas, pvals, unmeth, meth = [
    pd.read_csv(table_path, delimiter="\t", index_col='ID_REF')
    for table_path in table_paths
]

In [4]:
# --- Sanity check: every table must list the samples in the sample-sheet order ---
pheno_ids = pheno.index.tolist()
betas_ids = betas.columns.tolist()
pvals_ids = pvals.columns.tolist()
unmeth_ids = unmeth.columns.tolist()
meth_ids = meth.columns.tolist()

# Compare each table's column order with the sample sheet; stop at the first mismatch.
if not all(pheno_ids == table_ids for table_ids in (betas_ids, pvals_ids, unmeth_ids, meth_ids)):
    raise ValueError("Warning! Order is not the same!")
print("Order is fine")

Order is fine


In [8]:
# --- Processed matrix: each sample's beta column followed by its detection p-value ---
# NOTE: `pvals` is renamed in place, so re-running the sample-order check after
# this cell will fail because the p-value columns no longer equal the sample names.
pvals_ids_new = ['Detection Pval ' + sample for sample in pvals_ids]
pvals_ids_dict = {old: new for old, new in zip(pvals_ids, pvals_ids_new)}
pvals.rename(columns=pvals_ids_dict, inplace=True)

# Interleave the column names: beta_1, pval_1, beta_2, pval_2, ...
mtx_proc_ids = []
for position, beta_column in enumerate(betas_ids):
    mtx_proc_ids.extend((beta_column, pvals_ids_new[position]))

# Concatenate side by side, then reorder the columns into the interleaved layout.
mtx_proc = pd.concat([betas, pvals], axis=1)[mtx_proc_ids]
mtx_proc.index.name = 'ID_REF'

In [6]:
# --- Signal matrix: each sample's unmethylated signal followed by its methylated signal ---
# NOTE: `unmeth` and `meth` are renamed in place, so re-running the sample-order
# check after this cell will fail because their columns no longer equal the sample names.
unmeth_ids_new = [sample + ' Unmethylated Signal' for sample in unmeth_ids]
unmeth_ids_dict = {old: new for old, new in zip(unmeth_ids, unmeth_ids_new)}
unmeth.rename(columns=unmeth_ids_dict, inplace=True)

meth_ids_new = [sample + ' Methylated Signal' for sample in meth_ids]
meth_ids_dict = {old: new for old, new in zip(meth_ids, meth_ids_new)}
meth.rename(columns=meth_ids_dict, inplace=True)

# Interleave the column names: unmeth_1, meth_1, unmeth_2, meth_2, ...
mtx_signal_ids = []
for position, unmeth_column in enumerate(unmeth_ids_new):
    mtx_signal_ids.extend((unmeth_column, meth_ids_new[position]))

# Concatenate side by side, then reorder the columns into the interleaved layout.
mtx_signal = pd.concat([unmeth, meth], axis=1)[mtx_signal_ids]
mtx_signal.index.name = 'ID_REF'

In [7]:
# --- Write the sample sheet and both matrices into the data folder ---
# The index (sample name / ID_REF) is written as the first column of each file.
pheno.to_excel(f"{path_save}/data/samples.xlsx", index=True)
for matrix, csv_path in (
    (mtx_proc, f"{path_save}/data/mtx_proc.csv"),
    (mtx_signal, f"{path_save}/data/mtx_signal.csv"),
):
    matrix.to_csv(csv_path, index=True)

In [11]:
# --- Copy the raw IDAT pair (green and red channel) of every sample ---
# shutil.copy2 treats the destination as a directory only when that directory
# already exists. Without creating it first, every IDAT would be written to a
# single file literally named "idat", each copy overwriting the previous one.
# mkdir raises FileExistsError if such a stray file is already present.
idat_source_dir = f"{path}/{platform}/{dataset}/raw/idat"
idat_target_dir = f"{path_save}/idat"
pathlib.Path(idat_target_dir).mkdir(parents=True, exist_ok=True)

# File names come from the sample sheet: all green-channel files first, then all red.
idat_file_names = pheno['idat file Grn'].tolist() + pheno['idat file Red'].tolist()
for fn in idat_file_names:
    # A missing source file raises FileNotFoundError and aborts the copy.
    shutil.copy2(f"{idat_source_dir}/{fn}", idat_target_dir)