In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import shutil
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
from scipy.stats import mannwhitneyu
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from scripts.python.routines.sections import get_sections
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import matplotlib

In [None]:
# --- Configuration: dataset selection and working directories ---
dataset = "GSEUNN"
path = "D:/YandexDisk/Work/pydnameth/datasets"

# The platform of the dataset is looked up in the datasets registry
# (datasets.xlsx, indexed by its 'dataset' column).
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Root folder for everything this notebook reads/writes; created if missing.
path_save = f"{path}/{platform}/{dataset}/special/026_data_for_GEO"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

In [None]:
# Sample lists of the two studies: ipAGE is keyed by 'Sample_ID',
# Yakutia by the 'index' column.
df_ipage = pd.read_csv(f"{path_save}/samples/ipAGE.csv", index_col='Sample_ID')
df_yakutia = pd.read_excel(f"{path_save}/samples/Yakutia_245.xlsx", index_col='index')

# Phenotype table, initially keyed by its 'index' column.
df = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")

# Flag Yakutia membership while the table is still keyed by 'index'.
df['Sample in Yakutia work?'] = 'No'
df.loc[df_yakutia.index.values, 'Sample in Yakutia work?'] = 'Yes'

# Re-key the table by "<Sentrix_ID>_<Sentrix_Position>", keeping the previous
# key as a regular 'index' column.
df['Sample_ID'] = df["Sentrix_ID"].astype(str) + '_' + df["Sentrix_Position"].astype(str)
df['index'] = df.index.values
df = df.set_index('Sample_ID')

# Flag ipAGE membership now that the table is keyed by 'Sample_ID'.
df['Sample in ipAGE work?'] = 'No'
df.loc[df_ipage.index.values, 'Sample in ipAGE work?'] = 'Yes'

# Output folder for the control-group overlap artifacts.
pathlib.Path(f"{path_save}/samples/controls_intersection").mkdir(parents=True, exist_ok=True)

# Control samples of each study; for Yakutia only controls from the 'Central' region are used.
ctrl_ipage = df.index[(df['Sample in ipAGE work?'] == 'Yes') & (df['Status'] == 'Control')].values
ctrl_yakutia = df.index[(df['Sample in Yakutia work?'] == 'Yes') & (df['Status'] == 'Control') & (df['Region'] == 'Central')].values

# Venn diagram of the two control groups.
fig, ax = plt.subplots()
venn = venn2(
    subsets=(set(ctrl_ipage), set(ctrl_yakutia)),
    set_labels=('ipAGE', 'Yakutia'),
    set_colors=('r', 'g'),
    alpha=0.5
)
venn2_circles(subsets=(set(ctrl_ipage), set(ctrl_yakutia)))

# Label entries are None for regions that are not drawn (e.g. an empty
# intersection), so skip them instead of failing on `None.set_fontsize`.
for text in venn.set_labels:
    if text is not None:
        text.set_fontsize(16)
for text in venn.subset_labels:
    if text is not None:
        text.set_fontsize(25)

plt.savefig(f"{path_save}/samples/controls_intersection/venn.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/samples/controls_intersection/venn.pdf", bbox_inches='tight')
# Close (not just clear) the figure so it does not stay registered in pyplot.
plt.close(fig)

# Split the two control sets into sections via get_sections and write the
# sample ids of every section to its own Excel file.
sections = get_sections([set(ctrl_ipage), set(ctrl_yakutia)])
for sec in sections:
    pd.DataFrame(index=list(sections[sec])).to_excel(
        f"{path_save}/samples/controls_intersection/{sec}.xlsx",
        index_label='Sample_ID',
    )

# Keep only samples used in at least one of the two studies.
# `.copy()` makes the result an independent frame: the column assignments
# below would otherwise be done on a slice of the original table and raise
# pandas' SettingWithCopyWarning.
df = df.loc[(df['Sample in ipAGE work?'] == 'Yes') | (df['Sample in Yakutia work?'] == 'Yes'), :].copy()
df.index.name = 'Sample name'

# --- Sample sheet columns ---
# Constant descriptors shared by all samples.
df['title'] = 'genomic DNA from Whole Blood for ' + df.index + ' sample'
df['source name'] = 'Whole Blood'
df['organism'] = 'Homo sapiens'
df['sample type'] = 'genomic'

# Raw data file names are derived from the sample id (the frame index).
df['idat file Grn'] = df.index + '_Grn.idat'
df['idat file Red'] = df.index + '_Red.idat'

# Phenotype columns re-exposed under the "characteristics: " prefix.
df['characteristics: Sample in ipAGE work?'] = df['Sample in ipAGE work?']
df['characteristics: Sample in Yakutia work?'] = df['Sample in Yakutia work?']
df['characteristics: Age'] = df['Age']
df['characteristics: Sex'] = df['Sex']
df['characteristics: Status'] = df['Status']
df['characteristics: Region'] = df['Region']

df['molecule'] = 'genomic DNA'
df['label'] = 'Cy5 and Cy3'
df['description'] = df.index + ' is ' + df['Status'] + ' sample from ' + df['Region']
df['platform'] = platform

# Columns written out empty.
df['Sample_Well'] = ''
df['Sample_Plate'] = ''
df['Sample_Group'] = ''
df['Pool_ID'] = ''

# Restrict to the exported columns, in output order.
# ('index_origin', 'Sentrix_ID' and 'Sentrix_Position' must already be
# present in the phenotype table.)
df = df.loc[:,
     [
         'title',
         'source name',
         'organism',
         'sample type',
         'idat file Grn',
         'idat file Red',
         'characteristics: Sample in ipAGE work?',
         'characteristics: Sample in Yakutia work?',
         'characteristics: Age',
         'characteristics: Sex',
         'characteristics: Status',
         'characteristics: Region',
         'molecule',
         'label',
         'description',
         'platform',
         'index',
         'index_origin',
         'Sentrix_ID',
         'Sentrix_Position',
         'Sample_Well',
         'Sample_Plate',
         'Sample_Group',
         'Pool_ID'
     ]
     ]

df.to_excel(f"{path_save}/samples/df.xlsx", index_label='Sample name')

In [None]:
# Select target ids: keep only the samples flagged as part of the Yakutia work
df = df[df['characteristics: Sample in Yakutia work?'] == 'Yes']

In [None]:
# Load the four tab-separated tables (beta, pval, unmeth, meth), each indexed
# by its 'ID_REF' column.
betas, pvals, unmeth, meth = (
    pd.read_csv(f"{path_save}/data/{table}_table.txt", delimiter="\t", index_col='ID_REF')
    for table in ('beta', 'pval', 'unmeth', 'meth')
)

In [None]:
# Cast to compact dtypes: float32 for betas / p-values, int32 for the two signal tables.
betas, pvals = (table.astype('float32') for table in (betas, pvals))
unmeth, meth = (table.astype('int32') for table in (unmeth, meth))

In [None]:
# Sample ids as they appear in the sample sheet (rows) and in every data
# table (columns).
pheno_ids = df.index.tolist()
betas_ids = list(betas.columns.values)
pvals_ids = list(pvals.columns.values)
unmeth_ids = list(unmeth.columns.values)
meth_ids = list(meth.columns.values)

# All tables must list exactly the same samples in exactly the same order.
if not all(pheno_ids == ids for ids in (betas_ids, pvals_ids, unmeth_ids, meth_ids)):
    raise ValueError("Warning! Order is not the same!")
print("Order is fine")

In [None]:
# Suffix every p-value column so it can sit next to the beta column of the
# same sample without a name clash.
pvals_ids_new = [f'{x} Detection Pval' for x in pvals_ids]
pvals_ids_dict = dict(zip(pvals_ids, pvals_ids_new))
pvals.rename(columns=pvals_ids_dict, inplace=True)

# Interleaved column order: <sample>, <sample> Detection Pval, <next sample>, ...
mtx_proc_ids = [
    ids[s_id]
    for s_id in range(len(betas_ids))
    for ids in (betas_ids, pvals_ids_new)
]
mtx_proc = pd.concat([betas, pvals], axis=1)[mtx_proc_ids]
mtx_proc.index.name = 'ID_REF'

In [None]:
# Suffix the columns of both signal tables so that each sample contributes
# two distinctly named columns.
unmeth_ids_new = [f'{x} Unmethylated Signal' for x in unmeth_ids]
meth_ids_new = [f'{x} Methylated Signal' for x in meth_ids]
unmeth_ids_dict = dict(zip(unmeth_ids, unmeth_ids_new))
meth_ids_dict = dict(zip(meth_ids, meth_ids_new))
unmeth.rename(columns=unmeth_ids_dict, inplace=True)
meth.rename(columns=meth_ids_dict, inplace=True)

# Interleaved column order:
# <sample> Unmethylated Signal, <sample> Methylated Signal, <next sample> ...
mtx_signal_ids = [
    ids[s_id]
    for s_id in range(len(unmeth_ids))
    for ids in (unmeth_ids_new, meth_ids_new)
]
mtx_signal = pd.concat([unmeth, meth], axis=1)[mtx_signal_ids]
mtx_signal.index.name = 'ID_REF'

In [None]:
# Write the sample sheet and both matrices; the index is written by default.
df.to_excel(f"{path_save}/data/samples.xlsx")
mtx_proc.to_csv(f"{path_save}/data/mtx_proc.csv")
mtx_signal.to_csv(f"{path_save}/data/mtx_signal.csv")

In [None]:
# Copy the Grn and Red idat file of every selected sample into the output
# `idat` folder.
# The folder must exist before copying: if it does not, shutil.copy2 treats
# the destination as a file name and every copy overwrites a single file
# called `idat`.
idat_dir = pathlib.Path(f"{path_save}/idat")
idat_dir.mkdir(parents=True, exist_ok=True)
for fn in list(df.loc[:, 'idat file Grn'].values) + list(df.loc[:, 'idat file Red'].values):
    shutil.copy2(f"{path}/{platform}/{dataset}/raw/idat/{fn}", idat_dir)

# Fixing existing tables

In [None]:
# Load the existing processed matrix; its first column is used as the index
# and named 'ID_REF'.
processed = pd.read_csv(
    "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/026_data_for_GEO/Matrix_processed.csv",
    index_col=0,
)
processed.index.name = 'ID_REF'

In [None]:
# Columns to remove: for each of the five listed samples, the value column
# (bare sample id) followed by the matching "<sample id> Detection Pval" column.
cols_to_del = [
    f"{sample_id}{suffix}"
    for suffix in ("", " Detection Pval")
    for sample_id in (
        '205724780068_R04C01',
        '205724780167_R01C01',
        '205724780167_R02C01',
        '205724780167_R03C01',
        '205724780167_R04C01',
    )
]
# Remove the listed columns from the processed matrix (in place).
processed.drop(cols_to_del, axis=1, inplace=True)

In [None]:
# Write the reduced matrix next to the original one; the index column is labelled 'ID_REF'.
processed.to_csv(
    "D:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN/special/026_data_for_GEO/Matrix_processed_245.csv",
    index_label='ID_REF',
)