# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from plotly.subplots import make_subplots
from pytorch_tabular import TabularModel
import plotly.express as px
import torch
import random
import plotly.graph_objects as go
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from scipy.stats import chi2_contingency
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
import datetime
from collections import Counter
from matplotlib.ticker import MaxNLocator
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scipy.stats import mannwhitneyu, variation, levene, zscore
import pyaging as pya
import matplotlib.lines as mlines
from src.models.simage.tabular.widedeep.ft_transformer import WDFTTransformerModel
import statsmodels.formula.api as smf
from itertools import chain
from pingouin import ancova
from sklearn.preprocessing import LabelEncoder 
from functools import reduce
import upsetplot
from src.plot.plotly_layout import add_layout
from docx import Document
from docx.shared import Inches, Cm, Mm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_ORIENT
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import RGBColor
from pathlib import Path
import re
from openai import OpenAI
from matplotlib_venn import venn3, venn3_circles
import functools


def conjunction(conditions):
    """Fold a sequence of boolean masks with an element-wise logical AND.

    Args:
        conditions: Non-empty iterable of boolean arrays / Series.

    Returns:
        ``np.logical_and`` applied left to right over ``conditions``; a
        single-element input is returned as is.

    Raises:
        TypeError: If ``conditions`` is empty (the fold has no initial value).
    """
    return functools.reduce(
        lambda merged, mask: np.logical_and(merged, mask),
        conditions,
    )


def disjunction(conditions):
    """Fold a sequence of boolean masks with an element-wise logical OR.

    Args:
        conditions: Non-empty iterable of boolean arrays / Series.

    Returns:
        ``np.logical_or`` applied left to right over ``conditions``; a
        single-element input is returned as is.

    Raises:
        TypeError: If ``conditions`` is empty (the fold has no initial value).
    """
    return functools.reduce(
        lambda merged, mask: np.logical_or(merged, mask),
        conditions,
    )


def delete_paragraph(paragraph):
    """Detach a paragraph from its document and invalidate the proxy object.

    The paragraph's underlying XML element is removed from its parent, then
    the references held by ``paragraph`` (``_p`` and ``_element``) are cleared
    so that later use of the stale proxy fails instead of silently editing a
    node that is no longer part of the document.

    Args:
        paragraph: Paragraph-like proxy exposing ``_element`` (an element with
            ``getparent()``) and ``_p``.

    Returns:
        None. Calling the function again on an already deleted paragraph is a
        no-op.
    """
    element = paragraph._element
    if element is None:
        # The proxy was already invalidated by a previous call.
        return
    parent = element.getparent()
    if parent is not None:
        parent.remove(element)
    # Clear the references on the proxy itself, not on the XML element.
    paragraph._p = paragraph._element = None
    

def markdown_to_docx(markdown_text, doc):
    """Append a minimal rendering of Markdown text to a document.

    The text is processed line by line:
      * blank lines and horizontal rules (three or more ``-``) are skipped;
      * ``#``-style headers become ``doc.add_heading`` calls, with the level
        capped at 6;
      * ``-``/``*``/``+`` bullet items become ``ListBullet`` paragraphs whose
        left indent grows with the leading whitespace of the item;
      * every other line becomes a plain paragraph.
    Inline bold/italic markup is delegated to ``add_formatted_text``.

    Args:
        markdown_text: Markdown source as a single string.
        doc: Document-like object providing ``add_heading`` and
            ``add_paragraph``.

    Returns:
        The same ``doc`` object that was passed in.
    """
    # Patterns are compiled once per call instead of being looked up per line.
    header_re = re.compile(r'^(#+)\s+(.*)')
    separator_re = re.compile(r'^-{3,}$')
    bullet_re = re.compile(r'^[\s]*[-*+] ')
    bold_re = re.compile(r'(\*\*|__)(.*?)\1')
    italic_re = re.compile(r'(\*|_)(.*?)\1')

    for line in markdown_text.split('\n'):
        stripped_line = line.strip()

        # Skip empty lines and horizontal-rule separators.
        if not stripped_line or separator_re.match(stripped_line):
            continue

        # Headers (matched against the raw line, so indented '#' is plain text).
        header_match = header_re.match(line)
        if header_match:
            level = len(header_match.group(1))
            text = header_match.group(2).strip()
            doc.add_heading(text, level=min(level, 6))
            continue

        # Bullet list items: indentation depth drives the left indent.
        if bullet_re.match(line):
            indent = len(line) - len(line.lstrip())
            p = doc.add_paragraph(style='ListBullet')
            p.paragraph_format.left_indent = Pt(25 + 10 * (indent // 2))
            # Drop the bullet marker and the space that follows it.
            line_content = line.lstrip()[2:]
            add_formatted_text(p, line_content, bold_re, italic_re)
            continue

        # Plain text paragraph.
        p = doc.add_paragraph()
        add_formatted_text(p, stripped_line, bold_re, italic_re)

    return doc

def add_formatted_text(paragraph, text, bold_re, italic_re):
    """Add ``text`` to ``paragraph`` as runs, honouring bold/italic markup.

    Bold spans are located first; italic spans are then searched only in the
    text that is left outside the bold spans (bold content is not re-scanned).

    Args:
        paragraph: Object exposing ``add_run(text)`` whose result accepts
            ``bold`` / ``italic`` attribute assignment.
        text: Source string with inline markup.
        bold_re: Compiled pattern whose group 2 is the bold content.
        italic_re: Compiled pattern whose group 2 is the italic content.
    """

    def split_on(pattern, source, label):
        # Cut ``source`` into ('normal', ...) pieces and (label, group 2) pieces.
        pieces = []
        cursor = 0
        for hit in pattern.finditer(source):
            if hit.start() > cursor:
                pieces.append(('normal', source[cursor:hit.start()]))
            pieces.append((label, hit.group(2)))
            cursor = hit.end()
        if cursor < len(source):
            pieces.append(('normal', source[cursor:]))
        return pieces

    # First pass: bold; second pass: italic inside the non-bold remainder.
    styled_pieces = []
    for kind, chunk in split_on(bold_re, text, 'bold'):
        if kind == 'bold':
            styled_pieces.append((kind, chunk))
        else:
            styled_pieces.extend(split_on(italic_re, chunk, 'italic'))

    # Emit one run per piece and switch on the matching font attribute.
    for kind, chunk in styled_pieces:
        run = paragraph.add_run(chunk)
        if kind == 'bold':
            run.bold = True
        elif kind == 'italic':
            run.italic = True


# DNAm

## Load data

In [None]:
# Root folders: project data, pyaging clock metadata, EpImAge repository.
path = "E:/YandexDisk/Work/bbd/fmba"
path_pyaging = "E:/YandexDisk/Work/pydnameth/datasets/pyaging"
path_epimage = "E:/Git/EpImAge"

# Feature-name accumulators, extended as each data source is loaded below.
epi_ages, epi_metrics, epi_imms, epi_scores = [], [], [], []

# Main phenotype table; sample IDs are cast to strings.
df_pheno = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)
# Recode the physical-hazard column to Yes/No (renamed to 'Radioactive hazards' further down).
df_pheno['Текущая основная вредность - Физические факторы'] = df_pheno['Текущая основная вредность - Физические факторы'].replace(
    {
        'Ионизирующие излученияК, радиоактивные веществаК;': 'Yes',
        'нет': 'No'
    }
)
# Disease column names are taken from the index of diseases.xlsx.
cols_diseases = pd.read_excel(f"{path}/diseases.xlsx", index_col=0).index.to_list()

# Recode the 1/0 disease indicators to 'Yes'/'No'.
for col_dis in cols_diseases:
    df_pheno[col_dis] = df_pheno[col_dis].replace({1: 'Yes', 0: 'No'})
# 'Spinal osteochondrosis' is 'Yes' when either M42 or M42.1 (neurologist columns) is 'Yes'.
df_pheno['Spinal osteochondrosis'] = 'No'
df_pheno.loc[(df_pheno['Невропатолог, M42'] == 'Yes') | (df_pheno['Невропатолог, M42.1'] == 'Yes'), 'Spinal osteochondrosis'] = 'Yes'

# Therapist diagnosis columns that define the 'Special Diseases' flag.
special_diseases = [
    "Терапевт, I10",        # hypertension
    "Терапевт, I10.0",      # hypertension
    "Терапевт, I11",        # hypertension
    "Терапевт, I11.0",      # hypertension
    "Терапевт, I11.9",      # hypertension
    "Терапевт, I20",        # ischemic heart disease
    "Терапевт, I25.0",      # ischemic heart disease
    "Терапевт, I25",        # ischemic heart disease
    "Терапевт, E78",        # cholesterol (hyperlipidemia, hypercholesterolemia)
    "Терапевт, E66",        # obesity
    "Терапевт, E66.0",      # obesity
]
# 1 when at least one of the listed diagnoses is 'Yes', otherwise 0.
df_pheno['Special Diseases'] = np.where(disjunction([df_pheno[m] == 'Yes' for m in special_diseases]), 1, 0)

# 'Special Status' is assigned only for controls without special diseases and
# cases with special diseases; rows matching neither condition get no value.
df_pheno.loc[(df_pheno['Special Diseases'] == 0) & (df_pheno['Status'] == 'Control'), 'Special Status'] = 'Control'
df_pheno.loc[(df_pheno['Special Diseases'] == 1) & (df_pheno['Status'] == 'Case'), 'Special Status'] = 'Case'

# Shorten column names; this happens after the derived columns above were
# built, so the code above must keep using the original names.
df_pheno.rename(columns={
    'Текущая основная вредность - Физические факторы': 'Radioactive hazards',
    'невропатолог - код_заболевания': 'Невропатолог',
    'отоларинголог - код_заболевания': 'Отоларинголог',
    'офтальмолог - код_заболевания': 'Офтальмолог',
    'дерматолог - код_заболевания': 'Дерматолог',
    'хирург - код_заболевания': 'Хирург',
    'терапевт - код_заболевания': 'Терапевт',
    }, inplace=True
)
# Phenotype columns carried into the merged table.
cols_pheno = [
    'Radioactive hazards',
    'Status',
    'Терапевт',
    'Хирург',
    'Невропатолог',
    'Дерматолог',
    'Отоларинголог',
    'Офтальмолог',
    'Special Diseases',
    'Special Status'
] + cols_diseases + ['Spinal osteochondrosis']

# Secondary phenotype table (risk categories); sample IDs as strings.
df_pheno2 = pd.read_excel(f"{path}/origin/Лесной_100_check_14.02_values.xlsx", index_col=0)
df_pheno2.index = df_pheno2.index.astype(str)
df_pheno2 = df_pheno2.rename(
    columns={
        'ССР': 'Сердечно-сосудистый риск',
        'ВТЭО': 'Венозные тромбоэмболические осложнения',
    }
)

# Column -> value recoding; the key order also defines ``cols_pheno2``.
pheno2_recoding = {
    'Сердечно-сосудистый риск': {'Низкий ССР': 'Низкий', 'Средний ССР': 'Средний', 'Высокий ССР': 'Высокий'},
    'Онкопоиск': {0: 'Нет', 'Онкопоиск': 'Да'},
    'Метаболический синдром': {0: 'Нет', 'МС': 'Да'},
    'Патология печени': {0: 'Нет', 'ПП': 'Да'},
    'Венозные тромбоэмболические осложнения': {0: 'Нет', 'ВТЭО': 'Да'},
}
cols_pheno2 = list(pheno2_recoding)
for pheno2_col, pheno2_map in pheno2_recoding.items():
    df_pheno2[pheno2_col] = df_pheno2[pheno2_col].replace(pheno2_map)

# Third phenotype table (extended lipid panel file); only the updated status is used.
pheno3_file = f"{path}/origin/Лесной_100_- расширенный липидный спектр.xlsx"
df_pheno3 = pd.read_excel(pheno3_file, index_col=0)
df_pheno3.index = df_pheno3.index.astype(str)
cols_pheno3 = ['New Status 100']

# pyaging clock outputs; sample IDs as strings.
df_pyaging = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_pyaging.index = df_pyaging.index.astype(str)

# Clock metadata, indexed by clock name (also kept as a regular column).
pyaging_meta = pd.read_excel(f"{path_pyaging}/clocks_meta.xlsx", index_col='Clock Name')
pyaging_meta['Clock Name'] = pyaging_meta.index

# Rename the model-ID columns to clock names.
model_id_to_clock = dict(zip(pyaging_meta['Model ID'].values, pyaging_meta['Clock Name'].values))
df_pyaging = df_pyaging.rename(columns=model_id_to_clock)

# Clocks excluded from the analysis.
excluded_clocks = ['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100']
pyaging_meta = pyaging_meta.drop(index=excluded_clocks)

# Split the remaining clocks into age estimators and all other metrics.
is_age_clock = pyaging_meta['Type'] == 'Age'
pyaging_age_clocks = pyaging_meta[is_age_clock].index.to_list()
pyaging_other_clocks = pyaging_meta[~is_age_clock].index.to_list()
epi_ages.extend(pyaging_age_clocks)
epi_metrics.extend(pyaging_other_clocks)
cols_pyaging = ['Age', 'Sex', 'Tissue'] + pyaging_age_clocks + pyaging_other_clocks

# EpImAge outputs; sample IDs as strings.
df_epimage = pd.read_excel(f"{path}/dnam/processed/EpImAge.xlsx", index_col=0)
df_epimage.index = df_epimage.index.astype(str)

# Marker names come from the 'feature' index of the model description file.
imms_epimage = pd.read_excel(
    f"{path_epimage}/models/InflammatoryMarkers/InflammatoryMarkers.xlsx",
    index_col='feature',
).index.to_list()

# '<marker>_log' columns are relabelled as '<marker> (EpImAge)'.
epimage_imm_cols = [f"{imm} (EpImAge)" for imm in imms_epimage]
df_epimage = df_epimage.rename(
    columns={f"{imm}_log": f"{imm} (EpImAge)" for imm in imms_epimage}
)
epi_ages.append('EpImAge')
epi_imms.extend(epimage_imm_cols)
cols_epimage = ['EpImAge'] + epimage_imm_cols

# EpiScores outputs; sample IDs as strings with every 'X' removed.
df_episcores = pd.read_csv(f"{path}/dnam/processed/episcores_Les.csv", index_col=0)
df_episcores.index = df_episcores.index.astype(str).str.replace('X', '', regex=True)

# Every column except the listed ones is treated as an EpiScore.
non_score_cols = ['Sex', 'True Age', 'Epigenetic Age (Zhang)', 'Epigenetic Age (Bernabeu)']
is_score_col = ~df_episcores.columns.isin(non_score_cols)
selected_cols = df_episcores.columns[is_score_col].to_list()
episcore_cols = [f"{score} (EpiScores)" for score in selected_cols]

# Two renames in the original order: the Bernabeu clock first, then the scores.
df_episcores = df_episcores.rename(
    columns={'Epigenetic Age (Bernabeu)': 'Bernabeu'}
).rename(
    columns=dict(zip(selected_cols, episcore_cols))
)
epi_ages.append('Bernabeu')
epi_scores.extend(episcore_cols)
cols_episcores = ['Bernabeu'] + episcore_cols

# Sample IDs shared by every table of the active configuration.
# Alternative configurations (swap the last table / column list accordingly):
#   base:        pheno + pyaging + epimage + episcores             -> cols_pheno_all = cols_pheno
#   with pheno2: ... + df_pheno2[cols_pheno2]                      -> cols_pheno_all = cols_pheno + cols_pheno2
#   with pheno3: ... + df_pheno3[cols_pheno3]  (active)            -> cols_pheno_all = cols_pheno + cols_pheno3
n_cmn = df_pheno.index
for other_table in (df_pyaging, df_epimage, df_episcores, df_pheno3):
    n_cmn = n_cmn.intersection(other_table.index)

dfs = [
    df_pheno[cols_pheno],
    df_pyaging[cols_pyaging],
    df_epimage[cols_epimage],
    df_episcores[cols_episcores],
    df_pheno3[cols_pheno3],
]
cols_pheno_all = cols_pheno + cols_pheno3

# Successive index-on-index merges, left to right.
df_epi = dfs[0]
for df_part in dfs[1:]:
    df_epi = pd.merge(df_epi, df_part, left_index=True, right_index=True)

# Drop outliers
outlier_ids = ['3278', '4436']
df_epi = df_epi.drop(index=outlier_ids)

# Phenotype contrasts to analyse: column -> (base group, compared group).
# Uncomment an entry to include it in the association analysis.
active_contrasts = {
    # 'Radioactive hazards': ('No', 'Yes'),
    # 'Status': ('Control', 'Case'),
    'New Status 100': ('Control', 'Case'),
    # 'Special Status': ('Control', 'Case'),
    # 'Spinal osteochondrosis': ('No', 'Yes'),
    # 'Офтальмолог, H52.1': ('No', 'Yes'),
    # 'Сердечно-сосудистый риск': ('Низкий', 'Высокий'),
    # 'Epigenetic profile': ('Decelerated aging', 'Accelerated aging'),
}
# To analyse every disease column instead:
# active_contrasts = {col_dis: ('No', 'Yes') for col_dis in cols_diseases}

# Expanded form used by the analysis cells: groups, base group and colors
# (base group in 'dodgerblue', compared group in 'crimson').
pheno_associations = {
    contrast_col: {
        'groups': [base_group, other_group],
        'base': base_group,
        'colors': {base_group: 'dodgerblue', other_group: 'crimson'},
    }
    for contrast_col, (base_group, other_group) in active_contrasts.items()
}

### Special status and Status intersection

In [None]:
# Venn diagram of Control / Case status against the presence of the selected
# ("special") diseases, plus an Excel export of the underlying columns.
pathlib.Path(f"{path}/associations/dnam/Special Status").mkdir(parents=True, exist_ok=True)

# The third set uses the 0/1 'Special Diseases' flag. 'Special Status' only
# holds 'Control'/'Case' labels, so comparing it with 1 never matched and the
# "With diseases" circle was always empty.
venn_sets = (
    set(df_epi.index[df_epi['Status'] == 'Control'].values),
    set(df_epi.index[df_epi['Status'] == 'Case'].values),
    set(df_epi.index[df_epi['Special Diseases'] == 1].values),
)

fig, ax = plt.subplots()
venn = venn3(
    subsets=venn_sets,
    set_labels=('Control', 'Case', 'With diseases'),
    set_colors=('r', 'g', 'b'),
    alpha=0.5,
)
venn3_circles(subsets=venn_sets)
fig.savefig(f"{path}/associations/dnam/Special Status/venn3.png", bbox_inches='tight', dpi=400)
fig.savefig(f"{path}/associations/dnam/Special Status/venn3.pdf", bbox_inches='tight', dpi=400)
# Close the figure (plt.clf() only cleared it and left it open).
plt.close(fig)

df_epi.loc[:, ['Age', 'Status', 'Special Status'] + special_diseases].to_excel(f"{path}/associations/dnam/Special Status/venn3.xlsx")

### Check number of samples in categories

In [None]:
# Print the number of samples in every group of every analysed phenotype.
for an_col, an_settings in pheno_associations.items():
    df_epi_ages_stat = pd.DataFrame(index=epi_ages)
    an_vals = an_settings['groups']
    an_val_base = an_settings['base']
    an_colors = an_settings['colors']
    for group in an_vals:
        group_ids = df_epi.index[df_epi[an_col] == group]
        print(f"{an_col} ({group}): {len(group_ids)}")

### Load epigenetic aging profile (if necessary)

In [None]:
# Attach the precomputed 'Epigenetic profile' label to the merged table.
# The cell is safe to re-run: the column is registered only once and any
# previously merged copy is dropped first, so the merge never produces
# '_x'/'_y' suffixed duplicates.
df_epi_age_pf = pd.read_excel(f"{path}/individual/EpiAgeScore/data.xlsx", index_col=0)
df_epi_age_pf.index = df_epi_age_pf.index.astype(str)
if "Epigenetic profile" not in cols_pheno_all:
    cols_pheno_all += ["Epigenetic profile"]
df_epi = pd.merge(
    df_epi.drop(columns=["Epigenetic profile"], errors="ignore"),
    df_epi_age_pf[["Epigenetic profile"]],
    left_index=True,
    right_index=True,
)

### Categorical features tests

In [None]:
# Chi-square test of each categorical feature against 'Status', with a
# grouped bar chart of the contingency table saved as PNG and PDF.
categorical_dir = f"{path}/associations/dnam/categorical_tests"
pathlib.Path(categorical_dir).mkdir(parents=True, exist_ok=True)

status_palette = {'Case': 'crimson', 'Control': 'dodgerblue'}
for f in cols_pheno2:
    df_cross = pd.crosstab(df_epi[f], df_epi['Status'])
    res = chi2_contingency(df_cross, correction=True)
    ax = df_cross.plot(kind="bar", rot=0, color=status_palette)
    ax.set_title(rf'$\chi^2$ p-value: {res.pvalue:0.2e}')
    fig = ax.get_figure()
    fig.savefig(f"{categorical_dir}/{f}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{categorical_dir}/{f}.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic ages

In [None]:
# Association analysis of epigenetic age acceleration with each phenotype in
# ``pheno_associations``: per-clock statistics (saved to ages.xlsx), an age
# histogram, p-value bar plots and one summary figure with a panel per clock.
df_epi_ages = df_epi[cols_pheno_all + ['Age'] + epi_ages].copy()
for an_col in pheno_associations:
    # Keep only the samples whose phenotype value is one of the analysed groups.
    # NOTE(review): columns are assigned to this filtered frame below; pandas may
    # emit SettingWithCopyWarning here - confirm whether a `.copy()` is wanted.
    df_epi_ages_ass = df_epi_ages.loc[df_epi_ages[an_col].isin(pheno_associations[an_col]['groups'])]
    df_epi_ages_stat = pd.DataFrame(index=epi_ages)
    # Variant of the column name without spaces, commas, dots and hyphens, used
    # as a term in the regression formula below.
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_ages_ass[an_col_str] = df_epi_ages_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    for epiage_id, epiage in enumerate(epi_ages):
        # Age regression fitted on the base group only; acceleration is the
        # residual of every sample relative to that fit.
        # NOTE(review): the clock name is inserted into the formula verbatim -
        # presumably all names in epi_ages are formula-safe; verify.
        # linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ann.loc[:, :]).fit()
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :]).fit()
        df_epi_ages_ass[f"{epiage}_linear_pred"] = linreg.predict(df_epi_ages_ass)
        df_epi_ages_ass[f"{epiage} acceleration"] = df_epi_ages_ass[epiage] - df_epi_ages_ass[f"{epiage}_linear_pred"]
        # Descriptive statistics of the acceleration per group.
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == group, f"{epiage} acceleration"].values
            df_epi_ages_stat.at[epiage, f"Mean {group}"] = np.mean(vals[group])
            df_epi_ages_stat.at[epiage, f"Median {group}"] = np.median(vals[group])
            df_epi_ages_stat.at[epiage, f"Q75 {group}"], df_epi_ages_stat.at[epiage, f"Q25 {group}"] = np.percentile(vals[group], [75 , 25])
            df_epi_ages_stat.at[epiage, f"IQR {group}"] = df_epi_ages_stat.at[epiage, f"Q75 {group}"] - df_epi_ages_stat.at[epiage, f"Q25 {group}"]
            df_epi_ages_stat.at[epiage, f"Variation {group}"] = variation(vals[group])
        # Two-group tests: only the first two entries of 'groups' are compared.
        _, df_epi_ages_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_ages_stat.at[epiage, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        # OLS of the clock on group + Age; keep the p-values of the coefficients
        # whose name contains the phenotype term.
        regcov = smf.ols(formula=f"{epiage} ~ {an_col_str} + Age", data=df_epi_ages_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col_id, pval_col in enumerate(pvals_cols):
            df_epi_ages_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    # Multiple-testing corrections across clocks (FDR-BH, Bonferroni, Simes-Hochberg).
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='fdr_bh')
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='bonferroni')
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='simes-hochberg')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='fdr_bh')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='bonferroni')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='simes-hochberg')
    # The ANCOVA p-value columns are found by name (they contain an_col_str).
    pvals_cols_ancova = df_epi_ages_stat.columns[df_epi_ages_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='fdr_bh')
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='bonferroni')
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='simes-hochberg')
    # Sort clocks by raw Mann-Whitney p-value; this order is reused for the panel figure.
    df_epi_ages_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_epi_ages_stat.to_excel(f"{path}/associations/dnam/{an_col}/ages.xlsx")

    # Age histogram by group (5-year bins between 5 and 115).
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_ages_ass,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=an_col,
        palette=an_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    plt.savefig(f"{path}/associations/dnam/{an_col}/hist_age.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/associations/dnam/{an_col}/hist_age.pdf", bbox_inches='tight')
    plt.close(fig)

    # One bar plot per test: -log10 of the FDR-BH adjusted p-value per clock,
    # red where the adjusted p-value is below 0.05, pink otherwise.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_ages_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/associations/dnam/{an_col}/ages_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/dnam/{an_col}/ages_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    # Summary figure: one subfigure per clock on a fixed 5 x 6 grid.
    # NOTE(review): the grid has 30 slots; more than 30 entries in epi_ages
    # would index past the last row - confirm the number of clocks.
    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(36, 20),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=5,
        ncols=6,
        # wspace=0.001,
        # hspace=0.001,
    )
    for epiage_id, epiage in enumerate(df_epi_ages_stat.index.values):
        row_id, col_id = divmod(epiage_id, 6)

        # Panel layout: '11' metrics table, '12' unused, '21' age scatter,
        # '22' acceleration violins.
        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                # "left": 0.1,
                # "right": 0.5,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Clock quality metrics computed over all analysed samples.
        ds_table = pd.DataFrame(index=['MAE (from diagonal)', 'MAE (from regression)', fr"Pearson $\rho$", "Bias"], columns=[epiage])
        mae_diag = mean_absolute_error(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        mae_regr = np.mean(np.abs(df_epi_ages_ass[f"{epiage} acceleration"].values))
        rho, _ = stats.pearsonr(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        bias = np.mean(df_epi_ages_ass[epiage] - df_epi_ages_ass['Age'])
        ds_table.at['MAE (from diagonal)', epiage] = f"{mae_diag:0.2f}"
        ds_table.at['MAE (from regression)', epiage] = f"{mae_regr:0.2f}"
        ds_table.at[fr"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        ds_table.at["Bias", epiage] = f"{bias:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Common axis range for Age and the clock, padded by 10% on each side.
        xy_min = df_epi_ages_ass[['Age', epiage]].min().min()
        xy_max = df_epi_ages_ass[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        # Dashed identity line (clock value equal to Age).
        bisect = sns.lineplot(
            x=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            y=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line drawn for the base group only.
        regplot = sns.regplot(
            data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :],
            x='Age',
            y=epiage,
            color=an_colors[an_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_epi_ages_ass,
            x='Age',
            y=epiage,
            hue=an_col,
            palette=an_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(an_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)
        axs['21'].set_ylim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)

        # Acceleration distribution per group.
        sns.violinplot(
            data=df_epi_ages_ass,
            x=an_col,
            y=f"{epiage} acceleration",
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        # The panel title reports the FDR-BH adjusted p-values.
        mannwhitneyu_pval = df_epi_ages_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_ages_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_epi_ages_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{path}/associations/dnam/{an_col}/ages_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/dnam/{an_col}/ages_distribution.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic ages (corrected)

In [None]:
df_epi_ages = df_epi[cols_pheno_all + ['Age'] + epi_ages].copy()
for an_col in pheno_associations:
    df_epi_ages_ass = df_epi_ages.loc[df_epi_ages[an_col].isin(pheno_associations[an_col]['groups'])]
    df_epi_ages_stat = pd.DataFrame(index=epi_ages)
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_ages_ass[an_col_str] = df_epi_ages_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    
    epi_ages_mae = {}
    for epiage_id, epiage in enumerate(epi_ages):
        
        linreg_cx = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass).fit()
        df_epi_ages_ass[f"{epiage}_linear_pred_cx"] = linreg_cx.predict(df_epi_ages_ass)
        df_epi_ages_ass[f"{epiage}_acceleration_cx"] = df_epi_ages_ass[epiage] - df_epi_ages_ass[f"{epiage}_linear_pred_cx"]
        df_epi_ages_ass[f"{epiage}"] = df_epi_ages_ass["Age"] + df_epi_ages_ass[f"{epiage}_acceleration_cx"]
        epi_ages_mae[epiage] = np.mean(np.abs(df_epi_ages_ass[f"{epiage}_acceleration_cx"].values))
        df_epi_ages_ass[f"{epiage} acceleration by MAE"] = df_epi_ages_ass[f"{epiage}_acceleration_cx"] / epi_ages_mae[epiage]
        df_epi_ages_ass.loc[:, f"{epiage} acceleration type"] = 0
        df_epi_ages_ass.loc[df_epi_ages_ass[f"{epiage} acceleration by MAE"] > 1.0, f"{epiage} acceleration type"] = 1.0
        df_epi_ages_ass.loc[df_epi_ages_ass[f"{epiage} acceleration by MAE"] < -1.0, f"{epiage} acceleration type"] = -1.0
        
        # linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass.loc[:, :]).fit()
        linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :]).fit()
        df_epi_ages_ass[f"{epiage}_linear_pred"] = linreg.predict(df_epi_ages_ass)
        df_epi_ages_ass[f"{epiage} acceleration"] = df_epi_ages_ass[epiage] - df_epi_ages_ass[f"{epiage}_linear_pred"]
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == group, f"{epiage} acceleration"].values
            df_epi_ages_stat.at[epiage, f"Mean {group}"] = np.mean(vals[group])
            df_epi_ages_stat.at[epiage, f"Median {group}"] = np.median(vals[group])
            df_epi_ages_stat.at[epiage, f"Q75 {group}"], df_epi_ages_stat.at[epiage, f"Q25 {group}"] = np.percentile(vals[group], [75 , 25])
            df_epi_ages_stat.at[epiage, f"IQR {group}"] = df_epi_ages_stat.at[epiage, f"Q75 {group}"] - df_epi_ages_stat.at[epiage, f"Q25 {group}"]
            df_epi_ages_stat.at[epiage, f"Variation {group}"] = variation(vals[group])
        _, df_epi_ages_stat.at[epiage, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_ages_stat.at[epiage, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        regcov = smf.ols(formula=f"{epiage} ~ {an_col_str} + Age", data=df_epi_ages_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col_id, pval_col in enumerate(pvals_cols):
            df_epi_ages_stat.at[epiage, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    
    df_epi_ages_ass[f"Epigenetic Ages Summary"] = df_epi_ages_ass.loc[:, [f"{epiage} acceleration type" for epiage in epi_ages]].sum(axis=1)
    df_epi_ages_ass[f"Epigenetic profile"] = 'Neutral'
    epi_profile_thld = 6
    df_epi_ages_ass.loc[df_epi_ages_ass[f"Epigenetic Ages Summary"] > epi_profile_thld, f"Epigenetic profile"] = 'Accelerated aging'
    df_epi_ages_ass.loc[df_epi_ages_ass[f"Epigenetic Ages Summary"] < -epi_profile_thld, f"Epigenetic profile"] = 'Decelerated aging'
    df_epi_ages_ass.to_excel(f"{path}/associations/dnam/{an_col}/ages_data_corrected.xlsx")
    
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='fdr_bh')
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='bonferroni')
    _, df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "mannwhitneyu_pval"].values, 0.05, method='simes-hochberg')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='fdr_bh')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='bonferroni')
    _, df_epi_ages_stat.loc[epi_ages, "levene_pval_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, "levene_pval"].values, 0.05, method='simes-hochberg')
    pvals_cols_ancova = df_epi_ages_stat.columns[df_epi_ages_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_fdr_bh"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='fdr_bh')
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_bonferroni"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='bonferroni')
        _, df_epi_ages_stat.loc[epi_ages, f"{pval_col}_simes-hochberg"], _, _ = multipletests(df_epi_ages_stat.loc[epi_ages, pval_col].values, 0.05, method='simes-hochberg')
    df_epi_ages_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_epi_ages_stat.to_excel(f"{path}/associations/dnam/{an_col}/ages_corrected.xlsx")
    
    hist_bins = np.linspace(5, 115, 23)
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_epi_ages_ass,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=an_col,
        palette=an_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    plt.savefig(f"{path}/associations/dnam/{an_col}/hist_age_corrected.png", bbox_inches='tight', dpi=200)
    plt.savefig(f"{path}/associations/dnam/{an_col}/hist_age_corrected.pdf", bbox_inches='tight')
    plt.close(fig)
    
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_ages_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/associations/dnam/{an_col}/ages_pvals_{stat_test}_corrected.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/dnam/{an_col}/ages_pvals_{stat_test}_corrected.pdf", bbox_inches='tight')
        plt.close(fig)
        
    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(36, 20),
        layout="constrained"
    )
    subfigs = fig.subfigures(
        nrows=5,
        ncols=6,
        # wspace=0.001,
        # hspace=0.001,
    )
    for epiage_id, epiage in enumerate(df_epi_ages_stat.index.values):
        row_id, col_id = divmod(epiage_id, 6)

        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                # "left": 0.1,
                # "right": 0.5,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )
        
        ds_table = pd.DataFrame(index=['MAE (from diagonal)', 'MAE (from regression)', fr"Pearson $\rho$", "Bias"], columns=[epiage])
        mae_diag = mean_absolute_error(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        mae_regr = np.mean(np.abs(df_epi_ages_ass[f"{epiage} acceleration"].values))
        rho, _ = stats.pearsonr(df_epi_ages_ass['Age'].values, df_epi_ages_ass[epiage].values)
        bias = np.mean(df_epi_ages_ass[epiage] - df_epi_ages_ass['Age'])
        ds_table.at['MAE (from diagonal)', epiage] = f"{mae_diag:0.2f}"
        ds_table.at['MAE (from regression)', epiage] = f"{mae_regr:0.2f}"
        ds_table.at[fr"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        ds_table.at["Bias", epiage] = f"{bias:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])
        
        axs['12'].axis('off')
        
        xy_min = df_epi_ages_ass[['Age', epiage]].min().min()
        xy_max = df_epi_ages_ass[['Age', epiage]].max().max()
        xy_ptp = xy_max - xy_min
        bisect = sns.lineplot(
            x=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            y=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        regplot = sns.regplot(
            data=df_epi_ages_ass.loc[df_epi_ages_ass[an_col] == an_val_base, :],
            x='Age',
            y=epiage,
            color=an_colors[an_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_epi_ages_ass,
            x='Age',
            y=epiage,
            hue=an_col,
            palette=an_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(an_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)
        axs['21'].set_ylim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)
        
        sns.violinplot(
            data=df_epi_ages_ass,
            x=an_col,
            y=f"{epiage} acceleration",
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        mannwhitneyu_pval = df_epi_ages_stat.at[epiage, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_ages_stat.at[epiage, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_epi_ages_stat.at[epiage, pval_col + '_fdr_bh']:.2e}"
        axs['22'].set_title(title)

    fig.savefig(f"{path}/associations/dnam/{an_col}/ages_distribution_corrected.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/dnam/{an_col}/ages_distribution_corrected.pdf", bbox_inches='tight')
    plt.close(fig)

## Epigenetic metrics

In [None]:
# Association analysis of the epigenetic metrics with every phenotype listed in
# `pheno_associations`: per-group descriptive statistics, Mann-Whitney U and
# Levene tests between the first two groups, an OLS model adjusted for Age,
# multiple-testing correction, and export of the tables and figures.
df_epi_metrics = df_epi[cols_pheno_all + ['Age'] + epi_metrics].copy()
for an_col in pheno_associations:
    # Explicit copy: a column is added to this subset below, and assigning into
    # a bare boolean `.loc` selection raises pandas' SettingWithCopyWarning.
    df_epi_metrics_ass = df_epi_metrics.loc[df_epi_metrics[an_col].isin(pheno_associations[an_col]['groups'])].copy()
    df_epi_metrics_stat = pd.DataFrame(index=epi_metrics)
    # Alias of the phenotype column whose name can be used inside a formula.
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_metrics_ass[an_col_str] = df_epi_metrics_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    for epi_metric_id, epi_metric in enumerate(epi_metrics):
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_metrics_ass.loc[df_epi_metrics_ass[an_col] == group, epi_metric].values
            df_epi_metrics_stat.at[epi_metric, f"Mean {group}"] = np.mean(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Median {group}"] = np.median(vals[group])
            df_epi_metrics_stat.at[epi_metric, f"Q75 {group}"], df_epi_metrics_stat.at[epi_metric, f"Q25 {group}"] = np.percentile(vals[group], [75 , 25])
            df_epi_metrics_stat.at[epi_metric, f"IQR {group}"] = df_epi_metrics_stat.at[epi_metric, f"Q75 {group}"] - df_epi_metrics_stat.at[epi_metric, f"Q25 {group}"]
            df_epi_metrics_stat.at[epi_metric, f"Variation {group}"] = variation(vals[group])
        # Two-sample tests compare only the first two entries of `an_vals`.
        _, df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_metrics_stat.at[epi_metric, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        # Group effect adjusted for Age; one p-value per coefficient whose name
        # contains the phenotype alias.
        regcov = smf.ols(formula=f"{epi_metric} ~ {an_col_str} + Age", data=df_epi_metrics_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_metrics_stat.at[epi_metric, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    # Multiple-testing correction across all metrics (same column order as before).
    for test_name in ['mannwhitneyu', 'levene']:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_metrics_stat.loc[epi_metrics, f"{test_name}_pval_{method}"], _, _ = multipletests(df_epi_metrics_stat.loc[epi_metrics, f"{test_name}_pval"].values, 0.05, method=method)
    pvals_cols_ancova = df_epi_metrics_stat.columns[df_epi_metrics_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_metrics_stat.loc[epi_metrics, f"{pval_col}_{method}"], _, _ = multipletests(df_epi_metrics_stat.loc[epi_metrics, pval_col].values, 0.05, method=method)
    df_epi_metrics_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_epi_metrics_stat.to_excel(f"{path}/associations/dnam/{an_col}/metrics.xlsx")

    # One bar chart of -log10(FDR-corrected p-value) per statistical test;
    # bars with corrected p-value < 0.05 are red, the others pink.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_metrics_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/associations/dnam/{an_col}/metrics_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/dnam/{an_col}/metrics_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    n_rows = 2
    n_cols = 3
    fig_width = 12
    fig_height = 9

    # Grid of violin plots, one panel per metric, ordered by Mann-Whitney p-value.
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace':0.15, 'hspace': 0.15}, layout='constrained')
    for epi_metric_id, epi_metric in enumerate(df_epi_metrics_stat.index.values):
        row_id, col_id = divmod(epi_metric_id, n_cols)

        # Values outside the open (2%, 98%) quantile interval are hidden from the plot only.
        ql = df_epi_metrics_ass[epi_metric].quantile(0.02)
        qh = df_epi_metrics_ass[epi_metric].quantile(0.98)

        # `hue` + `density_norm` match the violin call used for epigenetic ages
        # in this notebook and replace the deprecated `scale=` / hue-less `palette`.
        sns.violinplot(
            data=df_epi_metrics_ass.loc[(df_epi_metrics_ass[epi_metric] > ql) & (df_epi_metrics_ass[epi_metric] < qh), :],
            x=an_col,
            y=epi_metric,
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            ax=axs[row_id, col_id],
            legend=False,
            cut=0,
        )
        axs[row_id, col_id].set_ylabel(epi_metric)
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        mannwhitneyu_pval = df_epi_metrics_stat.at[epi_metric, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_metrics_stat.at[epi_metric, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_epi_metrics_stat.at[epi_metric, pval_col + '_fdr_bh']:.2e}"
        axs[row_id, col_id].set_title(title)

    fig.savefig(f"{path}/associations/dnam/{an_col}/metrics_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/dnam/{an_col}/metrics_distribution.pdf", bbox_inches='tight')
    plt.close(fig)
    

## Epigenetic imms

In [None]:
# Association analysis of the `epi_imms` features with every phenotype listed
# in `pheno_associations`: per-group descriptive statistics, Mann-Whitney U and
# Levene tests between the first two groups, an OLS model adjusted for Age,
# multiple-testing correction, and export of the tables and figures.
df_epi_imms = df_epi[cols_pheno_all + ['Age'] + epi_imms].copy()
for an_col in pheno_associations:
    # Explicit copy: columns are added to this subset below, and assigning into
    # a bare boolean `.loc` selection raises pandas' SettingWithCopyWarning.
    df_epi_imms_ass = df_epi_imms.loc[df_epi_imms[an_col].isin(pheno_associations[an_col]['groups'])].copy()
    df_epi_imms_stat = pd.DataFrame(index=epi_imms)
    # Alias of the phenotype column whose name can be used inside a formula.
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_imms_ass[an_col_str] = df_epi_imms_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    for epi_imm_id, epi_imm in enumerate(epi_imms):
        # Formula-safe alias of the feature name (spaces and parentheses removed).
        epi_imm_str = epi_imm.replace(' ', '_')
        epi_imm_str = epi_imm_str.replace('(', '')
        epi_imm_str = epi_imm_str.replace(')', '')
        df_epi_imms_ass[epi_imm_str] = df_epi_imms_ass[epi_imm]
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_imms_ass.loc[df_epi_imms_ass[an_col] == group, epi_imm].values
            df_epi_imms_stat.at[epi_imm, f"Mean {group}"] = np.mean(vals[group])
            df_epi_imms_stat.at[epi_imm, f"Median {group}"] = np.median(vals[group])
            df_epi_imms_stat.at[epi_imm, f"Q75 {group}"], df_epi_imms_stat.at[epi_imm, f"Q25 {group}"] = np.percentile(vals[group], [75 , 25])
            df_epi_imms_stat.at[epi_imm, f"IQR {group}"] = df_epi_imms_stat.at[epi_imm, f"Q75 {group}"] - df_epi_imms_stat.at[epi_imm, f"Q25 {group}"]
            df_epi_imms_stat.at[epi_imm, f"Variation {group}"] = variation(vals[group])
        # Two-sample tests compare only the first two entries of `an_vals`.
        _, df_epi_imms_stat.at[epi_imm, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_imms_stat.at[epi_imm, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        # Group effect adjusted for Age; one p-value per coefficient whose name
        # contains the phenotype alias.
        regcov = smf.ols(formula=f"{epi_imm_str} ~ {an_col_str} + Age", data=df_epi_imms_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_imms_stat.at[epi_imm, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    # Multiple-testing correction across all features (same column order as before).
    for test_name in ['mannwhitneyu', 'levene']:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_imms_stat.loc[epi_imms, f"{test_name}_pval_{method}"], _, _ = multipletests(df_epi_imms_stat.loc[epi_imms, f"{test_name}_pval"].values, 0.05, method=method)
    pvals_cols_ancova = df_epi_imms_stat.columns[df_epi_imms_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_imms_stat.loc[epi_imms, f"{pval_col}_{method}"], _, _ = multipletests(df_epi_imms_stat.loc[epi_imms, pval_col].values, 0.05, method=method)
    df_epi_imms_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_epi_imms_stat.to_excel(f"{path}/associations/dnam/{an_col}/imms.xlsx")

    # One bar chart of -log10(FDR-corrected p-value) per statistical test;
    # bars with corrected p-value < 0.05 are red, the others pink.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_imms_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/associations/dnam/{an_col}/imms_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/dnam/{an_col}/imms_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    n_rows = 4
    n_cols = 6
    fig_width = 27
    fig_height = 16

    # Grid of violin plots, one panel per feature, ordered by Mann-Whitney p-value.
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace':0.15, 'hspace': 0.15}, layout='constrained')
    for epi_imm_id, epi_imm in enumerate(df_epi_imms_stat.index.values):
        row_id, col_id = divmod(epi_imm_id, n_cols)

        # Values outside the open (2%, 98%) quantile interval are hidden from the plot only.
        ql = df_epi_imms_ass[epi_imm].quantile(0.02)
        qh = df_epi_imms_ass[epi_imm].quantile(0.98)

        # `hue` + `density_norm` match the violin call used for epigenetic ages
        # in this notebook and replace the deprecated `scale=` / hue-less `palette`.
        sns.violinplot(
            data=df_epi_imms_ass.loc[(df_epi_imms_ass[epi_imm] > ql) & (df_epi_imms_ass[epi_imm] < qh), :],
            x=an_col,
            y=epi_imm,
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            ax=axs[row_id, col_id],
            legend=False,
            cut=0,
        )
        axs[row_id, col_id].set_ylabel(epi_imm)
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        mannwhitneyu_pval = df_epi_imms_stat.at[epi_imm, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_imms_stat.at[epi_imm, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_epi_imms_stat.at[epi_imm, pval_col + '_fdr_bh']:.2e}"
        axs[row_id, col_id].set_title(title)

    fig.savefig(f"{path}/associations/dnam/{an_col}/imms_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/dnam/{an_col}/imms_distribution.pdf", bbox_inches='tight')
    plt.close(fig)
    

## Epigenetic scores

In [None]:
# Keep the scores whose column is not constant, i.e. at least one value
# differs from the first one; the cell then displays how many remain.
epi_scores_passed = [
    score_name
    for score_name in epi_scores
    if not df_epi[score_name].eq(df_epi[score_name].iloc[0]).all()
]
len(epi_scores_passed)

In [None]:
# Association analysis of the non-constant epigenetic scores with every
# phenotype listed in `pheno_associations`: per-group descriptive statistics,
# Mann-Whitney U and Levene tests between the first two groups, an OLS model
# adjusted for Age, multiple-testing correction, and export of tables/figures.
df_epi_scores = df_epi[cols_pheno_all + ['Age'] + epi_scores_passed].copy()
for an_col in pheno_associations:
    # Explicit copy: columns are added to this subset below, and assigning into
    # a bare boolean `.loc` selection raises pandas' SettingWithCopyWarning.
    df_epi_scores_ass = df_epi_scores.loc[df_epi_scores[an_col].isin(pheno_associations[an_col]['groups'])].copy()
    df_epi_scores_stat = pd.DataFrame(index=epi_scores_passed)
    # Alias of the phenotype column whose name can be used inside a formula.
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_epi_scores_ass[an_col_str] = df_epi_scores_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    for epi_score_id, epi_score in enumerate(epi_scores_passed):
        # Formula-safe alias of the score name.
        epi_score_str = epi_score.replace(' ', '_')
        epi_score_str = epi_score_str.replace(':', '_')
        epi_score_str = epi_score_str.replace('%', 'percent')
        epi_score_str = epi_score_str.replace('.', '_')
        epi_score_str = epi_score_str.replace('-', '_')
        epi_score_str = epi_score_str.replace('(', '')
        epi_score_str = epi_score_str.replace(')', '')
        df_epi_scores_ass[epi_score_str] = df_epi_scores_ass[epi_score]
        vals = {}
        for group in an_vals:
            vals[group] = df_epi_scores_ass.loc[df_epi_scores_ass[an_col] == group, epi_score].values
            df_epi_scores_stat.at[epi_score, f"Mean {group}"] = np.mean(vals[group])
            df_epi_scores_stat.at[epi_score, f"Median {group}"] = np.median(vals[group])
            df_epi_scores_stat.at[epi_score, f"Q75 {group}"], df_epi_scores_stat.at[epi_score, f"Q25 {group}"] = np.percentile(vals[group], [75 , 25])
            df_epi_scores_stat.at[epi_score, f"IQR {group}"] = df_epi_scores_stat.at[epi_score, f"Q75 {group}"] - df_epi_scores_stat.at[epi_score, f"Q25 {group}"]
            df_epi_scores_stat.at[epi_score, f"Variation {group}"] = variation(vals[group])
        # Two-sample tests compare only the first two entries of `an_vals`.
        _, df_epi_scores_stat.at[epi_score, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_epi_scores_stat.at[epi_score, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        # Group effect adjusted for Age; one p-value per coefficient whose name
        # contains the phenotype alias.
        regcov = smf.ols(formula=f"{epi_score_str} ~ {an_col_str} + Age", data=df_epi_scores_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_epi_scores_stat.at[epi_score, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    # Multiple-testing correction across all scores (same column order as before).
    for test_name in ['mannwhitneyu', 'levene']:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_scores_stat.loc[epi_scores_passed, f"{test_name}_pval_{method}"], _, _ = multipletests(df_epi_scores_stat.loc[epi_scores_passed, f"{test_name}_pval"].values, 0.05, method=method)
    pvals_cols_ancova = df_epi_scores_stat.columns[df_epi_scores_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        for method in ['fdr_bh', 'bonferroni', 'simes-hochberg']:
            _, df_epi_scores_stat.loc[epi_scores_passed, f"{pval_col}_{method}"], _, _ = multipletests(df_epi_scores_stat.loc[epi_scores_passed, pval_col].values, 0.05, method=method)
    df_epi_scores_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    df_epi_scores_stat.to_excel(f"{path}/associations/dnam/{an_col}/scores.xlsx")

    # One bar chart of -log10(FDR-corrected p-value) per statistical test;
    # bars with corrected p-value < 0.05 are red, the others pink.
    for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
        df_fig = df_epi_scores_stat.copy()
        df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
        df_fig['Features'] = df_fig.index
        df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
        df_fig['color'] = 'pink'
        df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
        sns.set_theme(style='ticks')
        fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
        barplot = sns.barplot(
            data=df_fig,
            y='Features',
            x=f'{stat_test}_pval_fdr_bh_log',
            edgecolor='black',
            palette=df_fig['color'].values,
            ax=ax,
        )
        ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
        ax.xaxis.tick_top()
        ax.xaxis.set_label_position('top')
        ax.set_ylabel('')
        plt.savefig(f"{path}/associations/dnam/{an_col}/scores_pvals_{stat_test}.png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/dnam/{an_col}/scores_pvals_{stat_test}.pdf", bbox_inches='tight')
        plt.close(fig)

    n_cols = 14
    # The number of scores depends on the constant-column filter, so the grid
    # keeps its original 8 rows but grows when more than 8 * 14 scores passed
    # (a fixed grid would raise IndexError on `axs[row_id, col_id]`).
    n_rows = max(8, -(-len(epi_scores_passed) // n_cols))
    fig_width = 60
    fig_height = 40

    # Grid of violin plots, one panel per score, ordered by Mann-Whitney p-value.
    sns.set_theme(style='ticks')
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace':0.15, 'hspace': 0.15}, layout='constrained')
    for epi_score_id, epi_score in enumerate(df_epi_scores_stat.index.values):
        row_id, col_id = divmod(epi_score_id, n_cols)

        # Values outside the open (2%, 98%) quantile interval are hidden from the plot only.
        ql = df_epi_scores_ass[epi_score].quantile(0.02)
        qh = df_epi_scores_ass[epi_score].quantile(0.98)

        # `hue` + `density_norm` match the violin call used for epigenetic ages
        # in this notebook and replace the deprecated `scale=` / hue-less `palette`.
        sns.violinplot(
            data=df_epi_scores_ass.loc[(df_epi_scores_ass[epi_score] > ql) & (df_epi_scores_ass[epi_score] < qh), :],
            x=an_col,
            y=epi_score,
            hue=an_col,
            palette=an_colors,
            density_norm='width',
            order=an_vals,
            saturation=0.75,
            ax=axs[row_id, col_id],
            legend=False,
            cut=0,
        )
        axs[row_id, col_id].set_ylabel(epi_score)
        axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
        mannwhitneyu_pval = df_epi_scores_stat.at[epi_score, "mannwhitneyu_pval_fdr_bh"]
        levene_pval = df_epi_scores_stat.at[epi_score, "levene_pval_fdr_bh"]
        title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
        for pval_col in pvals_cols_ancova:
            title += f"\nANCOVA: {df_epi_scores_stat.at[epi_score, pval_col + '_fdr_bh']:.2e}"
        axs[row_id, col_id].set_title(title)

    fig.savefig(f"{path}/associations/dnam/{an_col}/scores_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/dnam/{an_col}/scores_distribution.pdf", bbox_inches='tight')
    plt.close(fig)

## EWAS

### Load CpGs

In [None]:
# Load the GPL33022 manifest and the processed beta values; the betas index is
# cast to str.
# NOTE: pd.read_pickle unpickles arbitrary objects - only load trusted files.
df_mnfst = pd.read_pickle("E:/YandexDisk/Work/pydnameth/datasets/GPL33022/manifest.pkl")
betas_pickle = f"{path}/dnam/processed/betas.pkl"
df_cpgs = pd.read_pickle(betas_pickle)
df_cpgs.index = df_cpgs.index.astype(str)

### Probe aggregation with pyaging

In [None]:
# Join the covariates with the beta values on the sample index, add a binary
# 'Female' flag, run pyaging's EPICv2 probe aggregation, and finally remove the
# covariate helper columns again.
cols_for_pya = ['Age', 'Sex', 'Tissue']
df_for_pya = df_epi[cols_for_pya].merge(df_cpgs, left_index=True, right_index=True)
df_for_pya['Female'] = df_for_pya['Sex'].eq('F').astype(int)
df_for_pya = pya.pp.epicv2_probe_aggregation(df_for_pya, verbose=True)
df_for_pya = df_for_pya.drop(cols_for_pya + ['Female'], axis=1)

### Renaming some CpGs for use in formulas

In [None]:
# Columns whose names contain '.' or '-' get a formula-safe alias with both
# characters replaced by '_'. The inverse mapping is kept so that the original
# names can be restored. Raw string: '\.' in a plain literal is an invalid
# escape sequence and triggers a warning.
cpgs_to_str = df_for_pya.filter(regex=r'\.|-', axis=1).columns.values
cpgs_to_str_dict = {
    cpg: cpg.replace('.', '_').replace('-', '_')
    for cpg in tqdm(cpgs_to_str)
}
cpgs_to_str_inv_dict = {v: k for k, v in cpgs_to_str_dict.items()}
df_for_pya.rename(columns=cpgs_to_str_dict, inplace=True)
# Column list captured before the phenotype table is joined back in.
cpgs = df_for_pya.columns.values
df_for_pya = pd.concat([df_epi, df_for_pya], axis=1, join="inner")

In [None]:
# EWAS: for every phenotype in `pheno_associations` and every column in `cpgs`,
# run Mann-Whitney U and Levene tests between the first two groups plus an OLS
# model adjusted for Age, correct the p-values (FDR-BH and Bonferroni) and save
# the table with the original CpG names restored.
for an_col in pheno_associations:
    # NOTE(review): the column assignment below writes into a boolean `.loc`
    # selection and therefore emits SettingWithCopyWarning; an explicit
    # `.copy()` would silence it but duplicates this (presumably large) frame.
    df_for_pya_ass = df_for_pya.loc[df_for_pya[an_col].isin(pheno_associations[an_col]['groups'])]
    df_cpgs_stat = pd.DataFrame(index=cpgs)
    # Same formula-safe sanitisation as in the other association cells
    # (',' and '.' are stripped as well).
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')
    df_for_pya_ass[an_col_str] = df_for_pya_ass[an_col]
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']
    pathlib.Path(f"{path}/associations/dnam/{an_col}").mkdir(parents=True, exist_ok=True)
    # Group membership does not depend on the CpG, so the boolean masks are
    # built once here instead of once per CpG inside the loop.
    group_masks = {group: (df_for_pya_ass[an_col] == group).values for group in an_vals}
    for cpg in (pbar := tqdm(cpgs)):
        pbar.set_description(f"{cpg}")
        cpg_values = df_for_pya_ass[cpg].values
        vals = {group: cpg_values[mask] for group, mask in group_masks.items()}
        _, df_cpgs_stat.at[cpg, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
        _, df_cpgs_stat.at[cpg, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
        regcov = smf.ols(formula=f"{cpg} ~ {an_col_str} + Age", data=df_for_pya_ass).fit()
        reg_sum = regcov.summary2().tables[1]
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
        for pval_col in pvals_cols:
            df_cpgs_stat.at[cpg, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']
    # Multiple-testing correction across all CpGs (same column order as before).
    for test_name in ['mannwhitneyu', 'levene']:
        for method in ['fdr_bh', 'bonferroni']:
            _, df_cpgs_stat.loc[cpgs, f"{test_name}_pval_{method}"], _, _ = multipletests(df_cpgs_stat.loc[cpgs, f"{test_name}_pval"].values, 0.05, method=method)
    pvals_cols_ancova = df_cpgs_stat.columns[df_cpgs_stat.columns.str.contains(an_col_str)].values
    for pval_col in pvals_cols_ancova:
        for method in ['fdr_bh', 'bonferroni']:
            _, df_cpgs_stat.loc[cpgs, f"{pval_col}_{method}"], _, _ = multipletests(df_cpgs_stat.loc[cpgs, pval_col].values, 0.05, method=method)
    df_cpgs_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
    # Undo the formula-safe renaming before export.
    df_cpgs_stat.rename(index=cpgs_to_str_inv_dict, inplace=True)
    df_cpgs_stat.to_excel(f"{path}/associations/dnam/{an_col}/cpgs.xlsx")

### Checking only the passed CpGs

In [None]:
# Restrict the saved EWAS table to the CpGs listed in cpgs_passed.csv and redo
# the FDR-BH / Bonferroni corrections on that subset only.
an_col = 'Radioactive hazards'
pvals_cols = ['mannwhitneyu_pval', 'levene_pval', 'ancova_Radioactive_hazards[T.Yes]_pval']
df_cpgs_stat = pd.read_excel(f"{path}/associations/dnam/{an_col}/cpgs.xlsx", index_col=0)
cpgs_passed = pd.read_csv(f"{path}/dnam/processed/cpgs_passed.csv", index_col=0)
cpgs_kept = cpgs_passed.index.intersection(df_cpgs_stat.index).values
df_cpgs_stat = df_cpgs_stat.loc[cpgs_kept, :]
for pvals_col in pvals_cols:
    for correction in ('fdr_bh', 'bonferroni'):
        raw_pvals = df_cpgs_stat.loc[df_cpgs_stat.index, pvals_col].values
        corrected = multipletests(raw_pvals, 0.05, method=correction)[1]
        df_cpgs_stat.loc[df_cpgs_stat.index, f"{pvals_col}_{correction}"] = corrected
df_cpgs_stat.to_excel(f"{path}/associations/dnam/{an_col}/cpgs_passed.xlsx")

## Certain genes analysis

In [None]:
def process_str_elem(x, delimiter: str = ';', missed = ''):
    """Remove duplicated items from a ';'-separated annotation string.

    Parameters:
        x: value to clean. Only ``str`` inputs are processed; any other
            value (e.g. ``None`` or NaN) is treated as missing.
        delimiter: separator used to join the unique items in the result.
            The input itself is always split on ';'.
        missed: value returned when ``x`` is not a string.

    Returns:
        The unique items of ``x`` in first-occurrence order joined with
        ``delimiter``, or ``missed`` for non-string input.
    """
    if not isinstance(x, str):
        return missed
    # dict.fromkeys drops duplicates while keeping first-occurrence order, so the
    # result is reproducible between runs (a set would give an arbitrary order).
    return delimiter.join(dict.fromkeys(x.split(';')))

# Load the array manifest and collapse duplicated entries in its gene annotation fields.
# CpGs without a gene name are labelled 'non-genic'.
df_mnfst = pd.read_pickle("E:/YandexDisk/Work/pydnameth/datasets/GPL33022/manifest.pkl")
df_mnfst['UCSC_RefGene_Name'] = df_mnfst['UCSC_RefGene_Name'].apply(process_str_elem, missed='non-genic')
df_mnfst['UCSC_RefGene_Group'] = df_mnfst['UCSC_RefGene_Group'].apply(process_str_elem)
df_mnfst['UCSC_RefGene_Accession'] = df_mnfst['UCSC_RefGene_Accession'].apply(process_str_elem)

# Methylation table; the index is cast to str so it can be matched against other string-indexed tables.
df_cpgs = pd.read_pickle(f"{path}/dnam/processed/betas.pkl")
df_cpgs.index = df_cpgs.index.astype(str)

# Column names containing '.' or '-' cannot be used as plain identifiers in a
# regression formula, so a mapping to '_'-sanitised names (and its inverse) is kept.
# The pattern is a raw string: '\.' in a normal literal is an invalid escape sequence.
cpgs_to_str = df_cpgs.filter(regex=r'\.|-', axis=1).columns.values
cpgs_to_str_dict = {}
for cpg in tqdm(cpgs_to_str):
    cpgs_to_str_dict[cpg] = cpg.replace('.', '_').replace('-', '_')
cpgs_to_str_inv_dict = {v: k for k, v in cpgs_to_str_dict.items()}

# Phenotype columns plus Age joined with the methylation values on the shared index.
df_betas = pd.merge(df_epi[cols_pheno_all + ['Age']], df_cpgs, left_index=True, right_index=True)

# Colours of the 'Relation_to_Island' annotation track.
colors_island = {
    'Island': px.colors.qualitative.Set1[0],
    'Shore': px.colors.qualitative.Set1[1],
    'Shelf': px.colors.qualitative.Set1[3],
    'OpenSea': px.colors.qualitative.Set1[8],
}

# Colours of the 'UCSC_RefGene_Group' annotation track; the key order also
# defines the vertical order of the track rows. '' stands for a missing group.
colors_gene_groups = {
    'TSS1500': px.colors.qualitative.Vivid[0],
    'TSS200': px.colors.qualitative.Vivid[1],
    '5UTR': px.colors.qualitative.Vivid[2],
    'Exons': px.colors.qualitative.Vivid[3],
    '3UTR': px.colors.qualitative.Vivid[5],
    '': px.colors.qualitative.Vivid[8],
}

In [None]:
# Every distinct UCSC_RefGene_Group label that occurs in the manifest, sorted.
groups_split = df_mnfst['UCSC_RefGene_Group'].str.split(';', expand=True)
groups_options = sorted(set(groups_split.stack().values))

### Lists of Genes

In [None]:
# Pick the first configured phenotype association and read its plotting settings.
an_col = list(pheno_associations)[0]
an_settings = pheno_associations[an_col]
an_vals = an_settings['groups']
an_val_base = an_settings['base']
an_colors = an_settings['colors']

# Formula-friendly alias of the phenotype column (spaces replaced by underscores).
an_col_str = an_col.replace(' ', '_')
df_betas[an_col_str] = df_betas[an_col]

genes_set = 'genes_radioactive'
pathlib.Path(f"{path}/associations/dnam/{an_col}/{genes_set}").mkdir(parents=True, exist_ok=True)

# The gene names are the index of the spreadsheet.
genes = pd.read_excel(f"{path}/dnam/processed/{genes_set}.xlsx", index_col=0).index.values

# One distinct colour per gene; black and white are passed to distinctipy as colours to avoid.
exclude_colors = [mcolors.hex2color(mcolors.CSS4_COLORS[color_name]) for color_name in ('black', 'white')]
colors = distinctipy.get_colors(len(genes), exclude_colors, rng=1337, pastel_factor=0.1)
colors_genes = {gene: mcolors.to_hex(color, keep_alpha=False) for gene, color in zip(genes, colors)}
colors_genes['non-genic'] = 'black'

# For every gene in the list: test each of its CpGs against the two phenotype groups,
# save the per-CpG statistics to Excel, and render a split-violin figure with
# annotation tracks (island relation, gene region, gene name) below the violins.
for gene in genes:
    print(gene)
    # Only CpGs whose gene annotation is exactly this gene are analysed.
    cpgs_gene = df_mnfst.index[df_mnfst['UCSC_RefGene_Name'] == gene].values

    # sort_values without inplace returns a new frame ordered by the manifest 'pos'
    # column, so nothing is written in place on a slice of the manifest.
    df_mnfst_gene = df_mnfst.loc[cpgs_gene, :].sort_values(['pos'], ascending=[True])
    gene_cpgs = df_mnfst_gene.index.values
    print(len(gene_cpgs))

    if len(gene_cpgs) > 0:

        # ---- Per-CpG statistics -------------------------------------------------
        df_gene_cpgs_stat = pd.DataFrame(index=gene_cpgs)
        for cpg in tqdm(gene_cpgs):
            vals = {group: df_betas.loc[df_betas[an_col] == group, cpg].values for group in an_vals}
            _, df_gene_cpgs_stat.at[cpg, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
            _, df_gene_cpgs_stat.at[cpg, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
            # rename returns a new frame; the CpG column gets its formula-safe name if it has one
            df_for_reg = df_betas.loc[:, [cpg, an_col_str, 'Age']].rename(columns=cpgs_to_str_dict)
            regcov = smf.ols(formula=f"{cpgs_to_str_dict.get(cpg, cpg)} ~ {an_col_str} + Age", data=df_for_reg).fit()
            reg_sum = regcov.summary2().tables[1]
            # Coefficient rows that belong to the phenotype term (e.g. 'Name[T.level]')
            pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
            for pval_col in pvals_cols:
                df_gene_cpgs_stat.at[cpg, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Multiple-testing correction across the CpGs of this gene for every raw p-value column.
        pvals_cols_ancova = df_gene_cpgs_stat.columns[df_gene_cpgs_stat.columns.str.contains(an_col_str)].values
        for pval_col in ["mannwhitneyu_pval", "levene_pval", *pvals_cols_ancova]:
            for method in ('fdr_bh', 'bonferroni'):
                _, df_gene_cpgs_stat.loc[gene_cpgs, f"{pval_col}_{method}"], _, _ = multipletests(df_gene_cpgs_stat.loc[gene_cpgs, pval_col].values, 0.05, method=method)
        # df_gene_cpgs_stat.sort_values([f"mannwhitneyu_pval"], ascending=[True], inplace=True)
        df_gene_cpgs_stat.rename(index=cpgs_to_str_inv_dict, inplace=True)
        df_gene_cpgs_stat.to_excel(f"{path}/associations/dnam/{an_col}/{genes_set}/{gene}.xlsx")

        # ---- Annotation values present among this gene's CpGs -------------------
        gene_groups = set()
        gene_names = set()
        for cpg in gene_cpgs:
            gene_groups_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Group']
            gene_names_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Name']
            if isinstance(gene_groups_raw, str):
                # every 'exon...' label is pooled into the single 'Exons' category
                gene_groups.update('Exons' if gg.startswith('exon') else gg for gg in gene_groups_raw.split(';'))
            if isinstance(gene_names_raw, str):
                gene_names.update(gene_names_raw.split(';'))

        # Row index of each gene-region track, following the key order of colors_gene_groups.
        gene_groups_pos = {}
        curr_pos = 0
        for gg in colors_gene_groups:
            if gg in gene_groups:
                gene_groups_pos[gg] = curr_pos
                curr_pos += 1

        # Row index of each gene-name track; sorted so the layout is the same on every run.
        gene_names_pos = {gn: gn_id for gn_id, gn in enumerate(sorted(gene_names))}

        y_neg_means = []
        y_pos_means = []

        # Counters of how often each annotation value was drawn; the first occurrence gets the legend entry.
        gene_islands_presence = {}
        gene_groups_presence = {}
        gene_names_presence = {}

        # Value range of all plotted betas; used to scale the vertical offsets of the tracks.
        ptp_betas = np.ptp(df_betas.loc[:, gene_cpgs].values.flatten())

        fig = go.Figure()

        for cpg_id, cpg in enumerate(gene_cpgs):

            # Split violin: first group on the negative side, second group on the positive side.
            for group, side, pointpos, group_means in (
                (an_vals[0], 'negative', -1.5, y_neg_means),
                (an_vals[1], 'positive', 1.5, y_pos_means),
            ):
                vals_group = df_betas.loc[df_betas[an_col] == group, cpg].values
                group_means.append(np.mean(vals_group))
                fig.add_trace(
                    go.Violin(
                        x=[cpg_id] * len(vals_group),
                        y=vals_group,
                        name=cpg,
                        box_visible=True,
                        meanline_visible=True,
                        showlegend=False,
                        line_color='black',
                        fillcolor=an_colors[group],
                        marker=dict(color=an_colors[group], line=dict(color='black', width=0.3), opacity=0.8),
                        points='all',
                        bandwidth=np.ptp(vals_group) / 16,
                        opacity=0.8,
                        legendgroup=cpg,
                        scalegroup=cpg,
                        side=side,
                        scalemode="width",
                        pointpos=pointpos,
                    )
                )

            # Track 1: relation to CpG island.
            island_pos = df_mnfst_gene.at[cpg, 'Relation_to_Island']
            show_legend = island_pos not in gene_islands_presence
            gene_islands_presence[island_pos] = gene_islands_presence.get(island_pos, 0) + 1
            y_track = -0.15 * ptp_betas
            fig.add_trace(
                go.Scatter(
                    x=[cpg_id-0.505, cpg_id+0.505],
                    y=[y_track, y_track],
                    showlegend=show_legend,
                    legendgroup="Relation to Island    ",
                    legendgrouptitle=dict(text="Relation to Island    ", font=dict(size=25)),
                    name=island_pos,
                    mode='lines',
                    line=dict(color=colors_island[island_pos], width=10)
                )
            )

            # Track 2: one row per gene region this CpG belongs to.
            cpg_gene_groups_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Group']
            for gg in cpg_gene_groups_raw.split(';'):
                gg_check = 'Exons' if gg.startswith('exon') else gg
                show_legend = gg_check not in gene_groups_presence
                gene_groups_presence[gg_check] = gene_groups_presence.get(gg_check, 0) + 1
                y_track = (-0.25 - 0.03 * gene_groups_pos[gg_check]) * ptp_betas
                fig.add_trace(
                    go.Scatter(
                        x=[cpg_id-0.505, cpg_id+0.505],
                        y=[y_track, y_track],
                        showlegend=show_legend,
                        legendgroup="UCSC RefGene Group    ",
                        legendgrouptitle=dict(text="UCSC RefGene Group    ", font=dict(size=25)),
                        name=gg_check,
                        mode='lines',
                        line=dict(color=colors_gene_groups[gg_check], width=10)
                    )
                )

            # Track 3: one row per gene name. The names are taken from this CpG's own
            # annotation, not from the variable left over by the set-building loop above.
            cpg_gene_names_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Name']
            for gn in cpg_gene_names_raw.split(';'):
                show_legend = gn not in gene_names_presence
                gene_names_presence[gn] = gene_names_presence.get(gn, 0) + 1
                y_track = (-0.32 - 0.03 * len(gene_groups_pos) - 0.03 * gene_names_pos[gn]) * ptp_betas
                fig.add_trace(
                    go.Scatter(
                        x=[cpg_id-0.505, cpg_id+0.505],
                        y=[y_track, y_track],
                        showlegend=show_legend,
                        legendgroup="UCSC_RefGene_Name    ",
                        legendgrouptitle=dict(text="UCSC_RefGene_Name    ", font=dict(size=25)),
                        name=gn,
                        mode='lines',
                        line=dict(color=colors_genes[gn], width=10)
                    )
                )

        # Group means across CpGs: each curve is drawn twice, a wider black line first
        # and a narrower group-coloured line on top of it.
        x_positions = list(range(df_mnfst_gene.shape[0]))
        for y_means, line_color, line_width in (
            (y_neg_means, 'black', 6),
            (y_pos_means, 'black', 6),
            (y_neg_means, an_colors[an_vals[0]], 5),
            (y_pos_means, an_colors[an_vals[1]], 5),
        ):
            fig.add_trace(
                go.Scatter(
                    x=x_positions,
                    y=y_means,
                    showlegend=False,
                    mode='lines+markers',
                    line=dict(color=line_color, width=line_width, shape='spline'),
                    marker=dict(color=line_color, line=dict(color='black', width=0.3), opacity=0.8),
                )
            )

        title = ""
        add_layout(fig, "", "Methylation level", title)

        # Tick labels: CpG name plus the three FDR-corrected p-values; values below 0.05 are highlighted.
        x_labels = []
        for cpg in gene_cpgs:
            x_label = f"{cpg}<br>"
            for test_name, stat_col in (
                ("Mann-Whitney", 'mannwhitneyu_pval_fdr_bh'),
                ("Levene", 'levene_pval_fdr_bh'),
                ("ANCOVA", f'{pvals_cols_ancova[0]}_fdr_bh'),
            ):
                pval = df_gene_cpgs_stat.loc[cpg, stat_col]
                pval_text = f"{test_name}: {pval:0.2e}<br>"
                if pval < 0.05:
                    pval_text = "<span style='color:crimson'><em>" + pval_text + "</em></span>"
                x_label += pval_text
            x_labels.append(x_label)

        fig.update_layout(
            title=dict(xref='paper', x=1.0),
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=1.01,
                xanchor="left",
                x=0.0001,
                itemsizing='constant',
                font_size=22
            ),
            xaxis=dict(
                tickmode='array',
                tickvals=x_positions,
                ticktext=x_labels,
                tickfont=dict(size=14)
            ),
            yaxis=dict(
                tickmode='array',
                tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
                ticktext=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
            )
        )
        fig.update_layout(
            violingap=0.39,
            violingroupgap=0.39,
            # the figure width grows with the number of CpGs
            width=120 * df_mnfst_gene.shape[0],
            height=1200,
            margin=go.layout.Margin(
                l=100,
                r=50,
                b=250,
                t=150,
                pad=0,
            )
        )
        fig.update_xaxes(autorange=False, range=[-0.5, df_mnfst_gene.shape[0] - 0.5], automargin=True)
        fig.update_yaxes(autorange=True, automargin=True,)
        fig.update_xaxes(tickangle=270)
        fig.write_image(f"{path}/associations/dnam/{an_col}/{genes_set}/{gene}.png")
        # fig.write_image(f"{path}/associations/dnam/{an_col}/{genes_set}/{gene}.pdf", format="pdf")


### Very special gene analysis

In [None]:
# Hand-picked CpGs of one gene: per-sample summary statistics over their methylation values.
gene_name = 'BRCA1'
gene_cpgs = [
    'cg26276233_TC21',
    'cg06001716_TC21',
    'cg02286533_BC11',
    'cg14947218_BC11',
    'cg16006004_BC11',
    'cg18372208_BC21',
    'cg14687474_BC11',
    'cg25288140_TC11',
]
# Explicit copy: columns are added below, which must not target a slice of df_betas.
df_betas_gene = df_betas.loc[:, gene_cpgs].copy()
for cpg in gene_cpgs:
    # z-score of each CpG across all samples
    df_betas_gene[f"{cpg} zscore"] = zscore(df_betas_gene.loc[:, cpg])
# Order the CpGs by the manifest 'pos' column; sort_values returns a new frame.
df_mnfst_gene = df_mnfst.loc[gene_cpgs, :].sort_values(['pos'], ascending=[True])
gene_cpgs = df_mnfst_gene.index.values
df_gene_stats = pd.DataFrame(index=df_betas_gene.index)
df_gene_stats['Mean'] = df_betas_gene.loc[:, gene_cpgs].mean(axis=1)
df_gene_stats['Mean Z-score'] = df_betas_gene.loc[:, [f"{cpg} zscore" for cpg in gene_cpgs]].mean(axis=1)
# Absolute z-score difference between each pair of position-adjacent CpGs,
# reusing the z-score columns computed above.
df_gene_diffs = pd.DataFrame(index=df_betas_gene.index, columns=[f'Diff {x}' for x in range(len(gene_cpgs) - 1)])
for cpg_id, (cpg_curr, cpg_next) in enumerate(zip(gene_cpgs[:-1], gene_cpgs[1:])):
    df_gene_diffs[f'Diff {cpg_id}'] = np.abs(df_betas_gene[f"{cpg_next} zscore"] - df_betas_gene[f"{cpg_curr} zscore"])
df_gene_stats['MeanDiff'] = df_gene_diffs.mean(axis=1)

In [None]:
# Draw, for one sample, its values across the selected CpGs on top of the cohort
# distributions: once for raw methylation and once for z-scores.
sample_id = '12257'

# Plot title -> suffix of the df_betas_gene columns to plot.
plots_dict = {
    'Methylation': '',
    'Z-score': ' zscore'
}

for plot_type, col_suffix in plots_dict.items():

    # Annotation values present among the selected CpGs.
    gene_groups = set()
    gene_names = set()
    for cpg in gene_cpgs:
        gene_groups_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Group']
        gene_names_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Name']
        if isinstance(gene_groups_raw, str):
            # every 'exon...' label is pooled into the single 'Exons' category
            gene_groups.update('Exons' if gg.startswith('exon') else gg for gg in gene_groups_raw.split(';'))
        if isinstance(gene_names_raw, str):
            gene_names.update(gene_names_raw.split(';'))

    # Row index of each gene-region track, following the key order of colors_gene_groups.
    gene_groups_pos = {}
    curr_pos = 0
    for gg in colors_gene_groups:
        if gg in gene_groups:
            gene_groups_pos[gg] = curr_pos
            curr_pos += 1

    # Row index of each gene-name track; sorted so the layout is the same on every run.
    gene_names_pos = {gn: gn_id for gn_id, gn in enumerate(sorted(gene_names))}

    # Counters of how often each annotation value was drawn; the first occurrence gets the legend entry.
    gene_islands_presence = {}
    gene_groups_presence = {}
    gene_names_presence = {}

    y_sample = []

    # Range and minimum of all plotted values; the tracks are placed below y_min, scaled by the range.
    plot_cols = [f"{cpg}{col_suffix}" for cpg in gene_cpgs]
    plot_vals = df_betas_gene.loc[:, plot_cols].values.flatten()
    ptp_betas = np.ptp(plot_vals)
    y_min = np.min(plot_vals)

    fig = go.Figure()

    for cpg_id, cpg in enumerate(gene_cpgs):

        vals_cpg = df_betas_gene.loc[:, f"{cpg}{col_suffix}"].values
        y_sample.append(df_betas_gene.at[sample_id, f"{cpg}{col_suffix}"])
        fig.add_trace(
            go.Violin(
                x=[cpg_id] * len(vals_cpg),
                y=vals_cpg,
                name=cpg,
                box_visible=True,
                meanline_visible=True,
                showlegend=False,
                line_color='black',
                fillcolor='lavender',
                marker=dict(color='lavender', line=dict(color='black', width=0.3), opacity=0.8),
                points='outliers',
                bandwidth=np.ptp(vals_cpg) / 16,
                opacity=0.8,
                legendgroup=cpg,
                scalegroup=cpg,
                scalemode="width",
                pointpos=0.0
            )
        )

        # Track 1: relation to CpG island.
        island_pos = df_mnfst_gene.at[cpg, 'Relation_to_Island']
        show_legend = island_pos not in gene_islands_presence
        gene_islands_presence[island_pos] = gene_islands_presence.get(island_pos, 0) + 1
        y_track = y_min - 0.15 * ptp_betas
        fig.add_trace(
            go.Scatter(
                x=[cpg_id-0.505, cpg_id+0.505],
                y=[y_track, y_track],
                showlegend=show_legend,
                legendgroup="Relation to Island    ",
                legendgrouptitle=dict(text="Relation to Island    ", font=dict(size=25)),
                name=island_pos,
                mode='lines',
                line=dict(color=colors_island[island_pos], width=10)
            )
        )

        # Track 2: one row per gene region this CpG belongs to.
        cpg_gene_groups_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Group']
        for gg in cpg_gene_groups_raw.split(';'):
            gg_check = 'Exons' if gg.startswith('exon') else gg
            show_legend = gg_check not in gene_groups_presence
            gene_groups_presence[gg_check] = gene_groups_presence.get(gg_check, 0) + 1
            y_track = y_min + (-0.25 - 0.03 * gene_groups_pos[gg_check]) * ptp_betas
            fig.add_trace(
                go.Scatter(
                    x=[cpg_id-0.505, cpg_id+0.505],
                    y=[y_track, y_track],
                    showlegend=show_legend,
                    legendgroup="UCSC RefGene Group    ",
                    legendgrouptitle=dict(text="UCSC RefGene Group    ", font=dict(size=25)),
                    name=gg_check,
                    mode='lines',
                    line=dict(color=colors_gene_groups[gg_check], width=10)
                )
            )

        # Track 3: one row per gene name. The names are taken from this CpG's own
        # annotation, not from the variable left over by the set-building loop above.
        cpg_gene_names_raw = df_mnfst_gene.at[cpg, 'UCSC_RefGene_Name']
        for gn in cpg_gene_names_raw.split(';'):
            show_legend = gn not in gene_names_presence
            gene_names_presence[gn] = gene_names_presence.get(gn, 0) + 1
            y_track = y_min + (-0.32 - 0.03 * len(gene_groups_pos) - 0.03 * gene_names_pos[gn]) * ptp_betas
            fig.add_trace(
                go.Scatter(
                    x=[cpg_id-0.505, cpg_id+0.505],
                    y=[y_track, y_track],
                    showlegend=show_legend,
                    legendgroup="UCSC_RefGene_Name    ",
                    legendgrouptitle=dict(text="UCSC_RefGene_Name    ", font=dict(size=25)),
                    name=gn,
                    mode='lines',
                    line=dict(color='dodgerblue', width=10)
                )
            )

    # The sample's profile is drawn twice: a wider black line first and a narrower red line on top.
    x_positions = list(range(df_mnfst_gene.shape[0]))
    for line_color, line_width in (('black', 6), ('red', 5)):
        fig.add_trace(
            go.Scatter(
                x=x_positions,
                y=y_sample,
                showlegend=False,
                mode='lines+markers',
                line=dict(color=line_color, width=line_width, shape='spline'),
                marker=dict(color=line_color, line=dict(color='black', width=0.3), opacity=0.8),
            )
        )

    title = ""
    add_layout(fig, "", plot_type, title)

    x_labels = [f"{cpg}" for cpg in gene_cpgs]

    fig.update_layout(
        title=dict(text=f"Sample {sample_id}", xref='paper', x=1.0, font_size=30),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.01,
            xanchor="left",
            x=0.0001,
            itemsizing='constant',
            font_size=22
        ),
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=x_labels,
            tickfont=dict(size=14)
        ),
        # NOTE(review): tickmode='array' is set without tickvals/ticktext here
        # (fixed 0.0-1.0 ticks were disabled) - confirm this is the intended y-axis setup.
        yaxis=dict(
            tickmode='array',
        )
    )
    fig.update_layout(
        violingap=0.39,
        violingroupgap=0.39,
        # the figure width grows with the number of CpGs
        width=120 * df_mnfst_gene.shape[0],
        height=1200,
        margin=go.layout.Margin(
            l=100,
            r=50,
            b=250,
            t=150,
            pad=0,
        )
    )
    fig.update_xaxes(autorange=False, range=[-0.5, df_mnfst_gene.shape[0] - 0.5], automargin=True)
    fig.update_yaxes(autorange=True, automargin=True,)
    fig.update_xaxes(tickangle=270)
    fig.write_image(f"{path}/associations/dnam/{sample_id}{col_suffix}.png")

## Individual EpiAge score

In [None]:
# For every epigenetic clock: regress the clock on Age, take the residual as the
# age acceleration, scale it by the clock's mean absolute acceleration and flag
# samples beyond +/-1. The flags are then summed into a per-sample profile.
df_epi_ages = df_epi[cols_pheno + ['Age'] + epi_ages].copy()
pathlib.Path(f"{path}/individual/EpiAgeScore").mkdir(parents=True, exist_ok=True)
epi_ages_mae = {}
for epiage in epi_ages:
    col_pred = f"{epiage}_linear_pred"
    col_acc = f"{epiage} acceleration"
    col_acc_by_mae = f"{epiage} acceleration by MAE"
    col_acc_type = f"{epiage} acceleration type"

    linreg = smf.ols(formula=f"{epiage} ~ Age", data=df_epi_ages).fit()
    df_epi_ages[col_pred] = linreg.predict(df_epi_ages)
    # residual of the linear fit
    df_epi_ages[col_acc] = df_epi_ages[epiage] - df_epi_ages[col_pred]
    # chronological age shifted by the residual
    df_epi_ages[f"{epiage} corrected"] = df_epi_ages["Age"] + df_epi_ages[col_acc]
    epi_ages_mae[epiage] = np.mean(np.abs(df_epi_ages[col_acc].values))
    df_epi_ages[col_acc_by_mae] = df_epi_ages[col_acc] / epi_ages_mae[epiage]
    # 0 by default, +1 / -1 when the acceleration exceeds one MAE in either direction
    df_epi_ages.loc[:, col_acc_type] = 0
    df_epi_ages.loc[df_epi_ages[col_acc_by_mae] > 1.0, col_acc_type] = 1.0
    df_epi_ages.loc[df_epi_ages[col_acc_by_mae] < -1.0, col_acc_type] = -1.0

acc_type_cols = [f"{epiage} acceleration type" for epiage in epi_ages]
df_epi_ages["Epigenetic Ages Summary"] = df_epi_ages.loc[:, acc_type_cols].sum(axis=1)
df_epi_ages["Epigenetic profile"] = 'Neutral'
# A sample is labelled accelerated/decelerated when the summed flags exceed this threshold in absolute value.
epi_profile_thld = 6
is_accelerated = df_epi_ages["Epigenetic Ages Summary"] > epi_profile_thld
is_decelerated = df_epi_ages["Epigenetic Ages Summary"] < -epi_profile_thld
df_epi_ages.loc[is_accelerated, "Epigenetic profile"] = 'Accelerated aging'
df_epi_ages.loc[is_decelerated, "Epigenetic profile"] = 'Decelerated aging'
df_epi_ages.to_excel(f"{path}/individual/EpiAgeScore/data_with_onehot_diseases.xlsx")

### Stat tests

In [None]:
# Chi-square test of independence between categorical features and the epigenetic
# profile, with a bar chart of the contingency table saved per feature.
pathlib.Path(f"{path}/individual/EpiAgeScore/categorical_tests").mkdir(parents=True, exist_ok=True)

# NOTE(review): "Epigenetic profile" is read from df_epi here; the visible code only
# creates that column on df_epi_ages - confirm df_epi carries it. The disabled
# variant below merges it in explicitly.
# df_chi_test = pd.merge(df_epi, df_epi_ages[["Epigenetic profile"]], left_index=True, right_index=True)
# for f in cols_pheno2 + ['Status']:

df_chi_test = df_epi

profile_colors = {'Accelerated aging': 'crimson', 'Decelerated aging': 'dodgerblue', 'Neutral': 'gray'}
for f in ['Status']:
    df_cross = pd.crosstab(df_chi_test[f], df_chi_test["Epigenetic profile"])
    res = chi2_contingency(df_cross, correction=True)
    ax = df_cross.plot(kind="bar", rot=0, color=profile_colors)
    ax.set_title(r'$\chi^2$' + f' p-value: {res.pvalue:0.3e}')
    fig = ax.get_figure()
    out_stem = f"{path}/individual/EpiAgeScore/categorical_tests/{f}"
    fig.savefig(f"{out_stem}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{out_stem}.pdf", bbox_inches='tight')
    plt.close(fig)

print(df_chi_test["Epigenetic profile"].value_counts())

### Plot ages

In [None]:
# One summary figure per variant (raw clocks and ' corrected' clocks): each clock
# gets a sub-figure with a metrics table, an Age-vs-clock scatter with a regression
# line and the identity line, and a violin of the clock's acceleration.
for suffix in ['', ' corrected']:

    sns.set_theme(style='ticks')
    fig = plt.figure(figsize=(16.5, 25.7), layout="constrained")
    # Grid of sub-figures; clocks are placed row by row.
    nrows = 10
    ncols = 3
    subfigs = fig.subfigures(nrows=nrows, ncols=ncols, wspace=0.3)
    for epiage_id, epiage in enumerate(epi_ages):
        row_id, col_id = divmod(epiage_id, ncols)

        epiage_str = f"{epiage}{suffix}"

        # '11' table, '12' unused, '21' scatter, '22' violin
        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.1,
                "top": 0.95,
                "wspace": 0.03,
                "hspace": 0.01,
            },
        )

        # Agreement metrics between chronological age and the clock, formatted for the table.
        mae = mean_absolute_error(df_epi_ages['Age'].values, df_epi_ages[epiage_str].values)
        rho, _ = stats.pearsonr(df_epi_ages['Age'].values, df_epi_ages[epiage_str].values)
        bias = np.mean(df_epi_ages[epiage_str] - df_epi_ages['Age'])
        ds_table = pd.DataFrame(
            {epiage: [f"{mae:0.2f}", f"{rho:0.2f}", f"{bias:0.2f}"]},
            index=['MAE', r"Pearson $\rho$", "Bias"],
        )
        col_defs = [
            ColumnDefinition(name="index", title=epiage, textprops={"ha": "left"}, width=4.5),
            ColumnDefinition(name=epiage, title='', textprops={"ha": "center"}, width=2.0),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Common axis limits for x and y: the joint data range padded by 10 % on both sides.
        xy_min = df_epi_ages[['Age', epiage_str]].min().min()
        xy_max = df_epi_ages[['Age', epiage_str]].max().max()
        xy_ptp = xy_max - xy_min
        xy_lims = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        # identity line y = x
        sns.lineplot(
            x=xy_lims,
            y=xy_lims,
            linestyle='--',
            color='black',
            linewidth=1.0,
            zorder=0,
            ax=axs['21']
        )
        sns.regplot(
            data=df_epi_ages,
            x='Age',
            y=epiage_str,
            color='crimson',
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        sns.scatterplot(
            data=df_epi_ages,
            x='Age',
            y=epiage_str,
            color='crimson',
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(xy_lims[0], xy_lims[1])
        axs['21'].set_ylim(xy_lims[0], xy_lims[1])

        # The violin always shows the clock's acceleration column, for both variants.
        acc_col = f"{epiage} acceleration"
        sns.violinplot(
            data=df_epi_ages,
            y=acc_col,
            density_norm='width',
            saturation=0.75,
            color='crimson',
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(acc_col)

    fig.savefig(f"{path}/individual/EpiAgeScore/ages{suffix}.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/individual/EpiAgeScore/ages{suffix}.pdf", bbox_inches='tight')
    plt.close(fig)

### Generate prompts and figures

In [None]:
# Per-sample EpiAgeScore report assets. For every sample this cell writes:
#   * {sample_id}_barplot.png/.pdf - age acceleration per clock, coloured by significance label;
#   * {sample_id}_prompt.txt       - the shared prompt template followed by this sample's clock ages;
#   * {sample_id}.xlsx             - the per-clock table behind the bar plot;
#   * {sample_id}_ages.pdf         - one panel per clock with the sample highlighted against the cohort.

# Loop invariants: the label -> colour map and the prompt template are identical for all samples.
colors_clocks = {
    'Значительное\nускорение': 'crimson',
    'Значительное\nзамедление': 'dodgerblue',
    'Незначительная\nакселерация': 'gainsboro'
}
prompt_template = Path(f"{path}/individual/EpiAgeScore/prompt.txt").read_text()

for sample_id in df_epi_ages.index.values:
    # --- Per-clock table -------------------------------------------------------------------
    # A clock is labelled "significant" when its acceleration divided by epi_ages_mae[clock]
    # (presumably the clock's cohort MAE - verify where epi_ages_mae is built) leaves [-1, 1].
    df_sample_epi_ages = pd.DataFrame(index=epi_ages, columns=['acc/mae', 'Возрастная акселерация', 'Часы'])
    df_sample_epi_ages['EpiAges'] = df_sample_epi_ages.index.values
    for epiage in epi_ages:
        acceleration = df_epi_ages.at[sample_id, f"{epiage} acceleration"]
        acc_to_mae = acceleration / epi_ages_mae[epiage]
        if acc_to_mae > 1.0:
            clock_label = 'Значительное\nускорение'
        elif acc_to_mae < -1.0:
            clock_label = 'Значительное\nзамедление'
        else:
            clock_label = 'Незначительная\nакселерация'
        df_sample_epi_ages.at[epiage, 'acc/mae'] = acc_to_mae
        df_sample_epi_ages.at[epiage, 'Возрастная акселерация'] = acceleration
        df_sample_epi_ages.at[epiage, 'Часы'] = clock_label

    # Largest absolute acceleration first; this order drives both the bar plot and the panel grid.
    df_sample_epi_ages.sort_values(by='Возрастная акселерация', key=abs, ascending=False, inplace=True)

    # --- Bar plot of accelerations ---------------------------------------------------------
    sns.set_theme(style='ticks')
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.barplot(
        data=df_sample_epi_ages,
        y='EpiAges',
        x='Возрастная акселерация',
        edgecolor='black',
        hue='Часы',
        palette=colors_clocks,
        ax=ax,
    )
    # Symmetric x-range so that zero acceleration sits in the middle of the plot.
    max_x = df_sample_epi_ages['Возрастная акселерация'].abs().max()
    ax.set_xlim([-max_x * 1.2, max_x * 1.2])
    ax.set_ylabel('')
    legend = ax.get_legend()
    plt.setp(legend.get_texts(), fontsize='10')  # legend entries
    plt.setp(legend.get_title(), fontsize='12')  # legend title
    fig.savefig(f"{path}/individual/EpiAgeScore/{sample_id}_barplot.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/individual/EpiAgeScore/{sample_id}_barplot.pdf", bbox_inches='tight')
    plt.close(fig)

    # --- LLM prompt ------------------------------------------------------------------------
    # Template, blank line, chronological age, then one line per clock in the original
    # `epi_ages` order (not the sorted order used for the plots).
    prompt_lines = [prompt_template, '', f"Age {df_epi_ages.at[sample_id, 'Age']:0.2f}"]
    for epiage in epi_ages:
        sample_signicance = df_sample_epi_ages.at[epiage, 'Часы'].replace('\n', ' ')
        if sample_signicance == 'Незначительная акселерация':
            sample_signicance = 'Нет статистической значимости'
        prompt_lines.append(
            f"{epiage} {df_epi_ages.at[sample_id, f'{epiage} corrected']:0.2f}" + f" ({sample_signicance})"
        )
    sample_prompt = '\n'.join(prompt_lines)
    Path(f"{path}/individual/EpiAgeScore/{sample_id}_prompt.txt").write_text(sample_prompt)

    df_sample_epi_ages.to_excel(f"{path}/individual/EpiAgeScore/{sample_id}.xlsx")

    # --- Panel grid: sample vs. cohort for every clock ---------------------------------------
    # The cohort/sample split does not depend on the clock, so it is computed once per sample.
    is_sample = df_epi_ages.index == sample_id
    df_cohort = df_epi_ages.loc[~is_sample, :]
    df_sample = df_epi_ages.loc[is_sample, :]

    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(16.5, 25.7),
        layout="constrained"
    )
    # Keep the original 10x3 page layout, but grow the grid instead of raising IndexError
    # when there are more than 30 clocks.
    ncols = 3
    nrows = max(10, int(np.ceil(len(df_sample_epi_ages.index) / ncols)))
    subfigs = fig.subfigures(
        nrows=nrows,
        ncols=ncols,
        wspace=0.3
    )
    for epiage_id, epiage in enumerate(df_sample_epi_ages.index.values):
        row_id, col_id = divmod(epiage_id, ncols)

        epiage_str = f"{epiage} corrected"

        # 'gainsboro' is reused for the cohort points, so a non-significant sample gets a darker grey.
        epiage_color = colors_clocks[df_sample_epi_ages.at[epiage, 'Часы']]
        if epiage_color == 'gainsboro':
            epiage_color = 'lightslategray'

        # Mosaic: '11' metrics table, '12' unused, '21' age scatter, '22' acceleration violin.
        axs = subfigs[row_id, col_id].subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.1,
                "top": 0.95,
                "wspace": 0.03,
                "hspace": 0.01,
            },
        )

        # Cohort-level metrics of the clock (computed on all samples, including the current one).
        ds_table = pd.DataFrame(index=['MAE', r"Pearson $\rho$"], columns=[epiage])
        mae = mean_absolute_error(df_epi_ages['Age'].values, df_epi_ages[epiage_str].values)
        rho, _ = stats.pearsonr(df_epi_ages['Age'].values, df_epi_ages[epiage_str].values)
        ds_table.at['MAE', epiage] = f"{mae:0.2f}"
        ds_table.at[r"Pearson $\rho$", epiage] = f"{rho:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=epiage,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=epiage,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 5},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[epiage])

        axs['12'].axis('off')

        # Shared square limits (10% padding) so the dashed line is the age == clock-age diagonal.
        xy_min = df_epi_ages[['Age', epiage_str]].min().min()
        xy_max = df_epi_ages[['Age', epiage_str]].max().max()
        xy_ptp = xy_max - xy_min
        axis_limits = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
        sns.lineplot(
            x=axis_limits,
            y=axis_limits,
            linestyle='--',
            color='black',
            linewidth=1.0,
            zorder=0,
            ax=axs['21']
        )
        sns.scatterplot(
            data=df_cohort,
            x='Age',
            y=epiage_str,
            color='gainsboro',
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=15,
            legend=True,
            ax=axs['21'],
        )
        sns.scatterplot(
            data=df_sample,
            x='Age',
            y=epiage_str,
            color=epiage_color,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=40,
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlim(axis_limits[0], axis_limits[1])
        axs['21'].set_ylim(axis_limits[0], axis_limits[1])

        # Constant x puts the cohort violin and the sample marker on the same category.
        sns.violinplot(
            data=df_cohort,
            x=[0] * df_cohort.shape[0],
            y=f"{epiage} acceleration",
            density_norm='width',
            saturation=0.75,
            color='gainsboro',
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        sns.swarmplot(
            data=df_sample,
            x=[0] * df_sample.shape[0],
            y=f"{epiage} acceleration",
            color=epiage_color,
            linewidth=0.5,
            ax=axs['22'],
            size=10,
            legend=False,
        )
        axs['22'].set_ylabel(f"{epiage} acceleration")
        axs['22'].set_xlabel('')
        axs['22'].set(xticklabels=[])
        axs['22'].set(xticks=[])

    fig.savefig(f"{path}/individual/EpiAgeScore/{sample_id}_ages.pdf", bbox_inches='tight')
    plt.close(fig)


In [None]:
# Rebuild the per-clock acceleration table and the LLM prompt for one hand-picked sample.
sample_id = '4436'

df_sample_epi_ages = pd.DataFrame(index=epi_ages, columns=['acc/mae', 'Возрастная акселерация', 'Часы'])
df_sample_epi_ages['EpiAges'] = df_sample_epi_ages.index.values
for epiage in epi_ages:
    clock_acc = df_epi_ages.at[sample_id, f"{epiage} acceleration"]
    clock_ratio = clock_acc / epi_ages_mae[epiage]
    df_sample_epi_ages.at[epiage, 'acc/mae'] = clock_ratio
    df_sample_epi_ages.at[epiage, 'Возрастная акселерация'] = clock_acc
    # Ratios outside [-1, 1] are labelled as significant acceleration / deceleration.
    df_sample_epi_ages.at[epiage, 'Часы'] = (
        'Значительное\nускорение' if clock_ratio > 1.0
        else 'Значительное\nзамедление' if clock_ratio < -1.0
        else 'Незначительная\nакселерация'
    )

# Order clocks by absolute acceleration, largest first, before exporting.
df_sample_epi_ages = df_sample_epi_ages.sort_values(by='Возрастная акселерация', key=abs, ascending=False)
df_sample_epi_ages.to_excel(f"{path}/individual/EpiAgeScore/{sample_id}.xlsx")

# Prompt = template, blank line, chronological age, then one line per clock (in `epi_ages` order).
prompt_parts = [
    Path(f"{path}/individual/EpiAgeScore/prompt.txt").read_text(),
    '',
    f"Age {df_epi_ages.at[sample_id, 'Age']:0.2f}",
]
for epiage in epi_ages:
    significance_label = df_sample_epi_ages.at[epiage, 'Часы'].replace('\n', ' ')
    if significance_label == 'Незначительная акселерация':
        significance_label = 'Нет статистической значимости'
    clock_age = df_epi_ages.at[sample_id, f'{epiage} corrected']
    prompt_parts.append(f"{epiage} {clock_age:0.2f}" + f" ({significance_label})")
sample_prompt = '\n'.join(prompt_parts)
Path(f"{path}/individual/EpiAgeScore/{sample_id}_prompt.txt").write_text(sample_prompt)

In [None]:
# Send `sample_prompt` to the DeepSeek reasoning model and store, for the current `sample_id`,
# the token accounting (xlsx), the final answer and the reasoning trace (txt).
import os

# The credential is taken from the environment so it is never committed with the notebook.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY", "")
if not deepseek_api_key:
    raise RuntimeError("Set the DEEPSEEK_API_KEY environment variable before querying DeepSeek.")
client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")

response = client.chat.completions.create(
    model="deepseek-reasoner",
    messages=[
        {"role": "user", "content": sample_prompt},
    ],
)

# One row per usage counter reported by the API.
llm_usage = response.usage
df_llm_stat = pd.DataFrame(
    {
        'completion_tokens': llm_usage.completion_tokens,
        'prompt_tokens': llm_usage.prompt_tokens,
        'reasoning_tokens': llm_usage.completion_tokens_details.reasoning_tokens,
        'total_tokens': llm_usage.total_tokens,
        'prompt_cache_hit_tokens': llm_usage.prompt_cache_hit_tokens,
        'prompt_cache_miss_tokens': llm_usage.prompt_cache_miss_tokens
    }.items(), columns=['Feature', 'Value']
)
df_llm_stat.to_excel(f"{path}/individual/EpiAgeScore/{sample_id}_df_llm_stat.xlsx", index=False)

llm_message = response.choices[0].message
Path(f"{path}/individual/EpiAgeScore/{sample_id}_llm_content.txt").write_text(llm_message.content)
# `write_text` rejects None, so a missing reasoning trace is stored as an empty file.
Path(f"{path}/individual/EpiAgeScore/{sample_id}_llm_reasoning.txt").write_text(llm_message.reasoning_content or '')

print(llm_message.content)

In [None]:
# Show the reasoning trace that accompanied the answer of the last DeepSeek `response`.
print(response.choices[0].message.reasoning_content)

### Generate doc files 

In [None]:
# Build one A4 .docx report per sample: title, age/sex line, table of specialists' diagnosis
# codes, the acceleration bar plot and the LLM answer (markdown rendered by `markdown_to_docx`).

def _add_black_run(paragraph, text, size_pt, bold=False):
    """Append `text` to `paragraph` as a black run of `size_pt` points; bold only on request."""
    run = paragraph.add_run(text)
    run.font.color.rgb = RGBColor(0, 0, 0)
    run.font.size = Pt(size_pt)
    if bold:
        run.font.bold = True
    return run


def _fresh_cell_paragraph(cell, centered=True):
    """Swap the cell's first paragraph for a new 'Normal' one, centred unless told otherwise."""
    delete_paragraph(cell.paragraphs[0])
    paragraph = cell.add_paragraph(style='Normal')
    if centered:
        paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    return paragraph


# Specialists shown in the diagnosis table; each name is also a column of `df_epi`.
doctors = [
    'Терапевт',
    'Хирург',
    'Невропатолог',
    'Дерматолог',
    'Отоларинголог',
    'Офтальмолог',
]

for sample_id in df_epi_ages.index.values:
    doc = Document()
    doc.styles['Title'].font.size = Pt(16)
    # A4 page with a wider left margin.
    for section in doc.sections:
        section.page_width = Mm(210)
        section.page_height = Mm(297)
        section.top_margin = Mm(20)
        section.bottom_margin = Mm(20)
        section.left_margin = Mm(30)
        section.right_margin = Mm(15)

    doc.add_heading(f"Анализ эпигенетического возрастного ускорения для {sample_id}", level=0)

    # Header row: "age" on the left, "sex" on the right, values in bold.
    table = doc.add_table(rows=1, cols=2)
    paragraph = _fresh_cell_paragraph(table.cell(0, 0))
    _add_black_run(paragraph, 'Возраст: ', 14)
    _add_black_run(paragraph, f"{df_epi_ages.at[sample_id, 'Age']:0.2f}", 14, bold=True)

    paragraph = _fresh_cell_paragraph(table.cell(0, 1))
    _add_black_run(paragraph, 'Пол: ', 14)
    # NOTE(review): every value other than 'M' (including a missing one) is reported as female.
    sex_str = df_epi.at[sample_id, 'Sex']
    _add_black_run(paragraph, 'Мужской' if sex_str == 'M' else 'Женский', 14, bold=True)

    # Diagnosis table: specialist names in the first row, their codes in the second.
    table = doc.add_table(rows=2, cols=len(doctors))
    table.style = 'TableGrid'
    for doctor_id, doctor in enumerate(doctors):
        paragraph = _fresh_cell_paragraph(table.cell(0, doctor_id))
        _add_black_run(paragraph, f'{doctor}', 10)

        paragraph = _fresh_cell_paragraph(table.cell(1, doctor_id), centered=False)
        # `add_run` needs text: an empty spreadsheet cell arrives as NaN and becomes an empty string.
        diagnosis = df_epi.at[sample_id, doctor]
        _add_black_run(paragraph, '' if pd.isna(diagnosis) else str(diagnosis), 8)

    # Empty spacer paragraph, then the centred bar plot.
    doc.add_paragraph()
    paragraph = doc.add_paragraph()
    paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
    paragraph.add_run().add_picture(f"{path}/individual/EpiAgeScore/{sample_id}_barplot.png", width=Mm(160))

    # The LLM answer is expected to already exist as {sample_id}_llm.txt.
    llm_answer = Path(f"{path}/individual/EpiAgeScore/{sample_id}_llm.txt").read_text()
    doc = markdown_to_docx(llm_answer, doc)

    doc.save(f"{path}/individual/EpiAgeScore/{sample_id}.docx")

# Immuno

## Load data

In [None]:
# Load the phenotype and immunology tables and join them on the (string) sample ID.
path = "E:/YandexDisk/Work/bbd/fmba"
path_imm_old = "E:/YandexDisk/Work/pydnameth/datasets/GPL21145/GSEUNN"


def _read_index_as_list(xlsx_path):
    """Return the first-column index of an Excel sheet as a plain list."""
    return pd.read_excel(xlsx_path, index_col=0).index.to_list()


# --- Phenotype ---------------------------------------------------------------------------
df_pheno = pd.read_excel(f"{path}/pheno.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)

# Recode the physical-hazard column to Yes/No before giving it an English name.
hazard_col = 'Текущая основная вредность - Физические факторы'
hazard_recode = {
    'Ионизирующие излученияК, радиоактивные веществаК;': 'Yes',
    'нет': 'No'
}
df_pheno[hazard_col] = df_pheno[hazard_col].replace(hazard_recode)

# Short column names: hazard flag plus one column per specialist's diagnosis code.
df_pheno = df_pheno.rename(
    columns={
        'Текущая основная вредность - Физические факторы': 'Radioactive hazards',
        'невропатолог - код_заболевания': 'Невропатолог',
        'отоларинголог - код_заболевания': 'Отоларинголог',
        'офтальмолог - код_заболевания': 'Офтальмолог',
        'дерматолог - код_заболевания': 'Дерматолог',
        'хирург - код_заболевания': 'Хирург',
        'терапевт - код_заболевания': 'Терапевт',
    }
)
cols_pheno = [
    'Radioactive hazards',
    'Status',
    'Терапевт',
    'Хирург',
    'Невропатолог',
    'Дерматолог',
    'Отоларинголог',
    'Офтальмолог',
]
# Independent copy: later cells extend it without touching `cols_pheno`.
cols_pheno_all = list(cols_pheno)

# --- Immunology --------------------------------------------------------------------------
feats_imm = _read_index_as_list(f"{path_imm_old}/data/immuno/feats_con.xlsx")
feats_imm_fimmu = _read_index_as_list(f"{path_imm_old}/data/immuno/models/SImAge/feats_con_top10.xlsx")
feats_imm_slctd = _read_index_as_list(f"{path_imm_old}/special/059_imm_data_selection/feats_selected.xlsx")

df_imm = pd.read_excel(f"{path}/immuno/data_full.xlsx", index_col=0)
df_imm.index = df_imm.index.astype(str)
# Keep only rows whose 'Region' is empty.
df_imm = df_imm.loc[df_imm['Region'].isna(), :]
cols_imm = ['Age', 'Sex', *feats_imm_slctd, *(f"{x}_log" for x in feats_imm_slctd), 'SImAge']

# Sample IDs present in both tables.
n_cmn = df_pheno.index.intersection(df_imm.index)

# Inner join on the index keeps exactly the shared samples.
dfs = [df_pheno[cols_pheno], df_imm[cols_imm]]
df = dfs[0].merge(dfs[1], left_index=True, right_index=True)

# Grouping variables analysed below: group order, reference group and plot colours.
pheno_associations = {
    # 'Status': {
    #     'groups': ['Control', 'Case'],
    #     'base': 'Control',
    #     'colors': {'Control': 'dodgerblue', 'Case': 'crimson'}
    # },
    'Epigenetic profile': {
        'groups': ['Decelerated aging', 'Accelerated aging'],
        'base': 'Decelerated aging',
        'colors': {'Decelerated aging': 'dodgerblue', 'Accelerated aging': 'crimson'}
    },
}

### Load epigenetic aging profile (if necessary)

In [None]:
# Attach the "Epigenetic profile" label (from the EpiAgeScore export) to the merged table.
df_epi_age_pf = pd.read_excel(f"{path}/individual/EpiAgeScore/data.xlsx", index_col=0)
df_epi_age_pf.index = df_epi_age_pf.index.astype(str)

# Safe to re-run: register the column once, and drop a previously merged copy so the
# join cannot produce "Epigenetic profile_x" / "_y" duplicates.
if "Epigenetic profile" not in cols_pheno_all:
    cols_pheno_all.append("Epigenetic profile")
df = pd.merge(
    df.drop(columns=["Epigenetic profile"], errors='ignore'),
    df_epi_age_pf[["Epigenetic profile"]],
    left_index=True,
    right_index=True,
)

### Check number of samples in categories

In [None]:
# Print how many samples of `df` fall into every group of every analysed phenotype column.
for an_col, an_spec in pheno_associations.items():
    an_vals, an_val_base, an_colors = (an_spec[key] for key in ('groups', 'base', 'colors'))
    for group in an_vals:
        n_in_group = int((df[an_col] == group).sum())
        print(f"{an_col} ({group}): {n_in_group}")

## SImAge

In [None]:
# SImAge vs. chronological age for every grouping in `pheno_associations`.
# Per grouping: fit SImAge ~ Age on the base group, define acceleration as the residual from
# that fit, then draw a metrics table, an age scatter and per-group acceleration violins.

def _simage_metrics(frame):
    """Return the formatted table rows (MAE, Pearson rho, bias) of SImAge vs. Age for `frame`."""
    age = frame['Age'].values
    simage = frame['SImAge'].values
    rho, _ = stats.pearsonr(age, simage)
    return {
        'MAE (from diagonal)': f"{mean_absolute_error(age, simage):0.2f}",
        'MAE (from regression)': f"{np.mean(np.abs(frame['SImAge acceleration'].values)):0.2f}",
        r"Pearson $\rho$": f"{rho:0.2f}",
        "Bias": f"{np.mean(frame['SImAge'] - frame['Age']):0.2f}",
    }


df_ages = df[cols_pheno_all + ['Age'] + ['SImAge']].copy()
for an_col in pheno_associations:
    pathlib.Path(f"{path}/associations/imm/{an_col}").mkdir(parents=True, exist_ok=True)
    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Explicit copy: new columns are added below, which must not target a slice of `df_ages`.
    df_ages_ass = df_ages.loc[df_ages[an_col].isin(an_vals)].copy()

    # Acceleration = residual from the regression fitted on the base group only.
    linreg = smf.ols(formula="SImAge ~ Age", data=df_ages_ass.loc[df_ages_ass[an_col] == an_val_base, :]).fit()
    df_ages_ass["SImAge_linear_pred"] = linreg.predict(df_ages_ass)
    df_ages_ass["SImAge acceleration"] = df_ages_ass['SImAge'] - df_ages_ass["SImAge_linear_pred"]

    # Per-group frames, built once after the acceleration columns exist.
    df_by_group = {an_val: df_ages_ass.loc[df_ages_ass[an_col] == an_val, :] for an_val in an_vals}

    sns.set_theme(style='ticks')
    fig = plt.figure(
        figsize=(6, 5),
        layout="constrained"
    )
    # Mosaic: '11' metrics table, '12' unused, '21' age scatter, '22' acceleration violins.
    axs = fig.subplot_mosaic(
        [
            ['11', '12'],
            ['21', '22'],
        ],
        height_ratios=[2, 5],
        width_ratios=[3, 1.5],
        gridspec_kw={
            "bottom": 0.14,
            "top": 0.95,
        },
    )

    # Metrics for all selected samples and for each group separately.
    ds_table = pd.DataFrame(
        index=['MAE (from diagonal)', 'MAE (from regression)', r"Pearson $\rho$", "Bias"],
        columns=['All'] + an_vals
    )
    for column, frame in [('All', df_ages_ass)] + list(df_by_group.items()):
        for row, value in _simage_metrics(frame).items():
            ds_table.at[row, column] = value

    col_defs = [
        ColumnDefinition(
            name="index",
            title='SImAge',
            textprops={"ha": "left"},
            width=4.5,
        ),
        ColumnDefinition(
            name='All',
            title='All',
            textprops={"ha": "center"},
            width=2.0,
        ),
    ]
    col_defs.extend(
        ColumnDefinition(
            name=an_val,
            title=an_val,
            textprops={"ha": "center"},
            width=2.0,
        )
        for an_val in an_vals
    )
    Table(
        ds_table,
        column_definitions=col_defs,
        row_dividers=True,
        footer_divider=False,
        ax=axs['11'],
        textprops={"fontsize": 7},
        row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
        col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
        column_border_kw={"linewidth": 1, "linestyle": "-"},
    )

    axs['12'].axis('off')

    # Shared square limits (10% padding) so the dashed line is the SImAge == Age diagonal.
    xy_min = df_ages_ass[['Age', 'SImAge']].min().min()
    xy_max = df_ages_ass[['Age', 'SImAge']].max().max()
    xy_ptp = xy_max - xy_min
    axis_limits = [xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp]
    sns.lineplot(
        x=axis_limits,
        y=axis_limits,
        linestyle='--',
        color='black',
        linewidth=1.0,
        ax=axs['21']
    )
    # Regression line of the base group only, matching the fit used for acceleration.
    sns.regplot(
        data=df_by_group[an_val_base] if an_val_base in df_by_group else df_ages_ass.loc[df_ages_ass[an_col] == an_val_base, :],
        x='Age',
        y='SImAge',
        color=an_colors[an_val_base],
        scatter=False,
        truncate=False,
        ax=axs['21']
    )
    sns.scatterplot(
        data=df_ages_ass,
        x='Age',
        y='SImAge',
        hue=an_col,
        palette=an_colors,
        linewidth=0.5,
        alpha=0.75,
        edgecolor="k",
        s=20,
        hue_order=list(an_colors.keys()),
        legend=True,
        ax=axs['21'],
    )
    axs['21'].set_xlim(axis_limits[0], axis_limits[1])
    axs['21'].set_ylim(axis_limits[0], axis_limits[1])

    sns.violinplot(
        data=df_ages_ass,
        x=an_col,
        y="SImAge acceleration",
        hue=an_col,
        palette=an_colors,
        density_norm='width',
        order=an_vals,
        saturation=0.75,
        linewidth=1.0,
        ax=axs['22'],
        legend=False,
        cut=0,
    )
    axs['22'].set_ylabel("SImAge acceleration")

    # Two-group comparison of accelerations: location (Mann-Whitney U) and spread (Levene).
    acc_first = df_by_group[an_vals[0]]["SImAge acceleration"].values
    acc_second = df_by_group[an_vals[1]]["SImAge acceleration"].values
    _, mannwhitneyu_pval = mannwhitneyu(acc_first, acc_second, alternative='two-sided')
    _, levene_pval = levene(acc_first, acc_second)

    axs['22'].set_title(f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}')

    fig.savefig(f"{path}/associations/imm/{an_col}/ages_distribution.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/imm/{an_col}/ages_distribution.pdf", bbox_inches='tight')
    plt.close(fig)

## Immunomarkers

In [None]:
# Group differences of individual immunomarkers, on the linear and on the log scale.
# Per grouping and feature set: descriptive statistics, Mann-Whitney U, Levene and an
# age-adjusted OLS ("ANCOVA") p-value per feature, multiple-testing corrections, an Excel
# export, -log10(FDR p-value) bar plots and a grid of per-feature violin plots.
df_imms = df[cols_pheno_all + ['Age'] + feats_imm_slctd + [f"{x}_log" for x in feats_imm_slctd]].copy()

feats_sets = {
    'lin': feats_imm_slctd,
    'log': [f"{x}_log" for x in feats_imm_slctd]
}
# Corrections applied to every raw p-value column, in export order.
correction_methods = ['fdr_bh', 'bonferroni', 'simes-hochberg']

for fs in feats_sets:

    feats_curr = feats_sets[fs]

    for an_col in pheno_associations:
        pathlib.Path(f"{path}/associations/imm/{an_col}").mkdir(parents=True, exist_ok=True)
        an_vals = pheno_associations[an_col]['groups']
        an_val_base = pheno_associations[an_col]['base']
        an_colors = pheno_associations[an_col]['colors']

        # Explicit copy: formula-safe helper columns are added below, which must not target a slice.
        df_imms_ass = df_imms.loc[df_imms[an_col].isin(an_vals)].copy()
        df_imms_stat = pd.DataFrame(index=feats_curr)
        # Formula-safe alias of the grouping column (no spaces or dashes).
        an_col_str = an_col.replace(' ', '_').replace('-', '_')
        df_imms_ass[an_col_str] = df_imms_ass[an_col]

        for f in feats_curr:
            # Formula-safe alias of the feature column (no spaces or parentheses).
            f_str = f.replace(' ', '_').replace('(', '').replace(')', '')
            df_imms_ass[f_str] = df_imms_ass[f]
            vals = {}
            for group in an_vals:
                vals[group] = df_imms_ass.loc[df_imms_ass[an_col] == group, f].values
                df_imms_stat.at[f, f"Mean {group}"] = np.mean(vals[group])
                df_imms_stat.at[f, f"Median {group}"] = np.median(vals[group])
                df_imms_stat.at[f, f"Q75 {group}"], df_imms_stat.at[f, f"Q25 {group}"] = np.percentile(vals[group], [75, 25])
                df_imms_stat.at[f, f"IQR {group}"] = df_imms_stat.at[f, f"Q75 {group}"] - df_imms_stat.at[f, f"Q25 {group}"]
                df_imms_stat.at[f, f"Variation {group}"] = variation(vals[group])
            _, df_imms_stat.at[f, "mannwhitneyu_pval"] = mannwhitneyu(vals[an_vals[0]], vals[an_vals[1]], alternative='two-sided')
            _, df_imms_stat.at[f, "levene_pval"] = levene(vals[an_vals[0]], vals[an_vals[1]])
            # Age-adjusted group effect: p-value(s) of the group term(s) in feature ~ group + Age.
            regcov = smf.ols(formula=f"{f_str} ~ {an_col_str} + Age", data=df_imms_ass).fit()
            reg_sum = regcov.summary2().tables[1]
            pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str)].values
            for pval_col in pvals_cols:
                df_imms_stat.at[f, f"ancova_{pval_col}_pval"] = reg_sum.at[pval_col, 'P>|t|']

        # Raw ANCOVA p-value columns are the ones whose name carries the grouping alias.
        pvals_cols_ancova = df_imms_stat.columns[df_imms_stat.columns.str.contains(an_col_str)].values
        # Correct every raw p-value column across the features of the current set.
        for pval_col in ['mannwhitneyu_pval', 'levene_pval', *pvals_cols_ancova]:
            for method in correction_methods:
                _, df_imms_stat.loc[feats_curr, f"{pval_col}_{method}"], _, _ = multipletests(
                    df_imms_stat.loc[feats_curr, pval_col].values, 0.05, method=method
                )
        df_imms_stat.sort_values(["mannwhitneyu_pval"], ascending=[True], inplace=True)
        df_imms_stat.to_excel(f"{path}/associations/imm/{an_col}/imms_{fs}.xlsx")

        # One bar plot of -log10(FDR-corrected p-value) per statistical test.
        for stat_test in [x.replace('_pval', '') for x in pvals_cols_ancova] + ['mannwhitneyu', 'levene']:
            df_fig = df_imms_stat.copy()
            df_fig.sort_values([f"{stat_test}_pval"], ascending=[True], inplace=True)
            df_fig['Features'] = df_fig.index
            df_fig[f'{stat_test}_pval_fdr_bh_log'] = -np.log10(df_fig[f'{stat_test}_pval_fdr_bh'])
            # Red marks features that pass FDR < 0.05, pink the rest.
            df_fig['color'] = 'pink'
            df_fig.loc[df_fig[f'{stat_test}_pval_fdr_bh'] < 0.05, 'color'] = 'red'
            sns.set_theme(style='ticks')
            fig, ax = plt.subplots(figsize=(3, df_fig.shape[0] * 0.5))
            # Per-bar colours go through `hue`; a palette without `hue` is deprecated in seaborn.
            sns.barplot(
                data=df_fig,
                y='Features',
                x=f'{stat_test}_pval_fdr_bh_log',
                hue='Features',
                edgecolor='black',
                palette=df_fig['color'].tolist(),
                legend=False,
                ax=ax,
            )
            ax.set_xlabel(r"$-\log_{10}(\mathrm{p-value})$")
            ax.xaxis.tick_top()
            ax.xaxis.set_label_position('top')
            ax.set_ylabel('')
            fig.savefig(f"{path}/associations/imm/{an_col}/imms_pvals_{stat_test}_{fs}.png", bbox_inches='tight', dpi=200)
            fig.savefig(f"{path}/associations/imm/{an_col}/imms_pvals_{stat_test}_{fs}.pdf", bbox_inches='tight')
            plt.close(fig)

        # Violin grid: the original 4x8 layout (35x16 in), extended by rows of the same height
        # when there are more than 32 features instead of indexing past the grid.
        n_cols = 8
        n_rows = max(4, int(np.ceil(len(df_imms_stat.index) / n_cols)))
        fig_width = 35
        fig_height = 4 * n_rows

        sns.set_theme(style='ticks')
        fig, axs = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), gridspec_kw={'wspace': 0.15, 'hspace': 0.15}, layout='constrained')
        for f_id, f in enumerate(df_imms_stat.index.values):
            row_id, col_id = divmod(f_id, n_cols)

            # Only values strictly inside the 1st-99th percentile range are drawn.
            ql = df_imms_ass[f].quantile(0.01)
            qh = df_imms_ass[f].quantile(0.99)

            sns.violinplot(
                data=df_imms_ass.loc[(df_imms_ass[f] > ql) & (df_imms_ass[f] < qh), :],
                x=an_col,
                y=f,
                hue=an_col,
                palette=an_colors,
                density_norm='width',
                order=an_vals,
                saturation=0.75,
                ax=axs[row_id, col_id],
                legend=False,
                cut=0,
            )
            axs[row_id, col_id].set_ylabel(f)
            axs[row_id, col_id].ticklabel_format(style='scientific', scilimits=(-1, 1), axis='y', useOffset=True, useMathText=True)
            # Titles report the FDR-corrected p-values.
            mannwhitneyu_pval = df_imms_stat.at[f, "mannwhitneyu_pval_fdr_bh"]
            levene_pval = df_imms_stat.at[f, "levene_pval_fdr_bh"]
            title = f'Mann-Whitney: {mannwhitneyu_pval:.2e}\nLevene: {levene_pval:.2e}'
            for pval_col in pvals_cols_ancova:
                title += f"\nANCOVA: {df_imms_stat.at[f, pval_col + '_fdr_bh']:.2e}"
            axs[row_id, col_id].set_title(title)

        fig.savefig(f"{path}/associations/imm/{an_col}/imms_distribution_{fs}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/associations/imm/{an_col}/imms_distribution_{fs}.pdf", bbox_inches='tight')
        plt.close(fig)

# Phenotype

## Load data

In [None]:
# Root folder of the dataset; every input and output below is relative to it.
path = "E:/YandexDisk/Work/bbd/fmba"

# Phenotype table indexed by its first column; ids are handled as strings.
df_pheno = pd.read_excel(f"{path}/pheno.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)

# Keep only samples with a known Status. The explicit copy guarantees that the
# column assignments below write to an independent frame and not to a slice of
# the table that was just read (no SettingWithCopyWarning).
df_pheno = df_pheno[df_pheno['Status'].notna()].copy()

# Age in years at the fixed reference date 2024-11-11 (365.25 days per year).
df_pheno['дата рождения'] = pd.to_datetime(df_pheno['дата рождения'])
df_pheno['date_now'] = pd.to_datetime("2024-11-11")
df_pheno['Age'] = (df_pheno['date_now'] - df_pheno['дата рождения']) / np.timedelta64(1, 'D') / 365.25
df_pheno = df_pheno[df_pheno['Age'].notna()].copy()

# Recode the physical-hazard column to Yes/No.
df_pheno['Текущая основная вредность - Физические факторы'] = df_pheno['Текущая основная вредность - Физические факторы'].replace(
    {
        'Ионизирующие излученияК, радиоактивные веществаК;': 'Yes',
        'нет': 'No'
    }
)

# Shorter column names for the hazard flag and the per-specialist disease codes.
df_pheno.rename(columns={
    'Текущая основная вредность - Физические факторы': 'Radioactive hazards',
    'невропатолог - код_заболевания': 'Невропатолог',
    'отоларинголог - код_заболевания': 'Отоларинголог',
    'офтальмолог - код_заболевания': 'Офтальмолог',
    'дерматолог - код_заболевания': 'Дерматолог',
    'хирург - код_заболевания': 'Хирург',
    'терапевт - код_заболевания': 'Терапевт',
    }, inplace=True
)

# Grouping columns to analyse: the compared groups, the reference ("base")
# group and the plotting colour of each group.
pheno_associations = {
    'Status': {
        'groups': ['Control', 'Case'],
        'base': 'Control',
        'colors': {'Control': 'dodgerblue', 'Case': 'crimson'}
    },
}



### Check number of samples in categories

In [None]:
# Report how many samples fall into every group of every association column.
for an_col, an_cfg in pheno_associations.items():
    an_vals = an_cfg['groups']
    for group in an_vals:
        n_in_group = int((df_pheno[an_col] == group).sum())
        print(f"{an_col} ({group}): {n_in_group}")

## Samples distribution

In [None]:
# Bin edges for the age histograms: 22 five-year-wide bins spanning 5..115.
hist_bins = np.linspace(5, 115, 23)

for an_col in pheno_associations:
    # The output folder depends on the association column, so it has to be
    # created per iteration. Creating it once before the loop relied on an
    # `an_col` value left over from a previously executed cell.
    pathlib.Path(f"{path}/associations/pheno/{an_col}").mkdir(parents=True, exist_ok=True)

    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    # Age distribution, coloured by the groups of the current column.
    fig, ax = plt.subplots(figsize=(6, 4))
    histplot = sns.histplot(
        data=df_pheno,
        bins=hist_bins,
        edgecolor='k',
        linewidth=1,
        x="Age",
        hue=an_col,
        palette=an_colors,
        ax=ax
    )
    histplot.set(xlim=(0, 120))
    fig.savefig(f"{path}/associations/pheno/{an_col}/hist_age.png", bbox_inches='tight', dpi=200)
    fig.savefig(f"{path}/associations/pheno/{an_col}/hist_age.pdf", bbox_inches='tight')
    plt.close(fig)

# PhenoAge and CognitiveAge

## Load data

In [None]:
# Root folder of the dataset; every input and output below is relative to it.
path = "E:/YandexDisk/Work/bbd/fmba"


def _add_aging_profile(data, age):
    """Add '<age> acceleration' and '<age> profile' columns to *data* in place.

    Acceleration is the estimated age minus the matching chronological age.
    Samples whose acceleration exceeds the mean absolute error (MAE) of the
    estimate are labelled 'Accelerated aging', those below -MAE are labelled
    'Decelerated aging', all other samples with a defined acceleration are
    'Neutral'; samples without an acceleration keep a missing profile.

    Returns the subset of *data* with a non-missing estimate and the MAE
    computed on that subset.
    """
    chrono_col = f'Chronological Age ({age})'
    accel_col = f'{age} acceleration'
    profile_col = f'{age} profile'

    data_age = data[data[age].notna()]
    mae = mean_absolute_error(data_age[chrono_col].values, data_age[age].values)

    data[accel_col] = data[age] - data[chrono_col]
    data.loc[data[accel_col].notna(), profile_col] = 'Neutral'
    data.loc[data[accel_col] > mae, profile_col] = 'Accelerated aging'
    data.loc[data[accel_col] < -mae, profile_col] = 'Decelerated aging'
    return data_age, mae


# Phenotype table indexed by its first column; ids are handled as strings.
df_pheno = pd.read_excel(f"{path}/pheno.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)
df_pheno['Текущая основная вредность - Физические факторы'] = df_pheno['Текущая основная вредность - Физические факторы'].replace(
    {
        'Ионизирующие излученияК, радиоактивные веществаК;': 'Yes',
        'нет': 'No'
    }
)
df_pheno.rename(columns={
    'Текущая основная вредность - Физические факторы': 'Radioactive hazards',
    'невропатолог - код_заболевания': 'Невропатолог',
    'отоларинголог - код_заболевания': 'Отоларинголог',
    'офтальмолог - код_заболевания': 'Офтальмолог',
    'дерматолог - код_заболевания': 'Дерматолог',
    'хирург - код_заболевания': 'Хирург',
    'терапевт - код_заболевания': 'Терапевт',
    }, inplace=True
)
cols_pheno = ['Status']

# PhenoAge / CognitiveAge export, indexed by person id.
df_ages = pd.read_excel(f"{path}/PhenoAge_CognitiveAge/Lesnoy_phenoage_v2_export.xlsx", index_col='id_person_org')
df_ages.index = df_ages.index.astype(str)
df_ages.rename(columns={
    'AgeBloodSample': 'Chronological Age (PhenoAge)',
    'UNNPhenoAgeR2': 'PhenoAge',
    'AgeTests': 'Chronological Age (CognitiveAge)',
    'CognitiveAgeModel5V': 'CognitiveAge',
    }, inplace=True
)
cols_ages = [
    'Chronological Age (PhenoAge)',
    'PhenoAge',
    'Chronological Age (CognitiveAge)',
    'CognitiveAge',
]

# Per-sample epigenetic profile.
df_epi_age_pf = pd.read_excel(f"{path}/individual/EpiAgeScore/data.xlsx", index_col=0)
df_epi_age_pf.index = df_epi_age_pf.index.astype(str)
cols_epi_age_pf = ["Epigenetic profile"]

# Ids present in all three tables (only their number is reported).
n_cmn = df_pheno.index.intersection(df_ages.index).intersection(df_epi_age_pf.index)
print(len(n_cmn))

dfs = [df_pheno[cols_pheno], df_ages[cols_ages], df_epi_age_pf[cols_epi_age_pf]]

# Column-wise concatenation keeps the union of ids; rows without a Status are
# dropped. The explicit copy makes the column assignments below operate on an
# independent frame instead of a filtered slice (no SettingWithCopyWarning).
df = pd.concat(dfs, axis=1)
df = df[df['Status'].notna()].copy()

df_phenoage, mae_phenoage = _add_aging_profile(df, 'PhenoAge')
print(f"mae_phenoage: {mae_phenoage}")
print(df["PhenoAge profile"].value_counts())

df_cognage, mae_cognage = _add_aging_profile(df, 'CognitiveAge')
print(f"mae_cognage: {mae_cognage}")
print(df["CognitiveAge profile"].value_counts())

# Grouping columns to analyse: the compared groups, the reference ("base")
# group and the plotting colour of each group.
pheno_associations = {
    'Status': {
        'groups': ['Control', 'Case'],
        'base': 'Control',
        'colors': {'Control': 'dodgerblue', 'Case': 'crimson'}
    },
    'Epigenetic profile': {
        'groups': ['Decelerated aging', 'Accelerated aging'],
        'base': 'Decelerated aging',
        'colors': {'Decelerated aging': 'dodgerblue', 'Accelerated aging': 'crimson'}
    },
}

# Group sizes for every association column.
for an_col in pheno_associations:
    an_vals = pheno_associations[an_col]['groups']
    for group in an_vals:
        print(f"{an_col} ({group}): {len(df.index[df[an_col] == group])}")

In [None]:
# Write the current `df` to an Excel workbook in the PhenoAge_CognitiveAge folder.
df.to_excel(f"{path}/PhenoAge_CognitiveAge/df.xlsx")

## Age acceleration

In [None]:
# For every association column and every age estimate this cell saves three
# figures: a chi-square bar plot of aging profiles per group, a histogram of
# chronological age per group, and a composite figure (metrics table,
# estimate-vs-chronological-age scatter, acceleration violins with p-values).
ages = ['PhenoAge', 'CognitiveAge']

for an_col in pheno_associations:

    pathlib.Path(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}").mkdir(parents=True, exist_ok=True)

    # Alias of the column name without spaces and punctuation, so that it can
    # be used as a term inside the statsmodels formulas below.
    an_col_str = an_col.replace(' ', '_')
    an_col_str = an_col_str.replace(',', '')
    an_col_str = an_col_str.replace('.', '')
    an_col_str = an_col_str.replace('-', '_')

    an_vals = pheno_associations[an_col]['groups']
    an_val_base = pheno_associations[an_col]['base']
    an_colors = pheno_associations[an_col]['colors']

    for age in ages:

        # Samples of the compared groups with both the estimate and the
        # chronological age available. The explicit copy makes every column
        # assignment below operate on an independent frame.
        df_fig = df.loc[df[an_col].isin(an_vals)]
        df_fig = df_fig[(df_fig[age].notna()) & (df_fig[f"Chronological Age ({age})"].notna())].copy()

        # Group labels carrying the sample count, e.g. "<group> (<n>)". All
        # counts are taken before the column is relabelled.
        group_labels = {an_val: f"{an_val} ({df_fig.loc[df_fig[an_col] == an_val, :].shape[0]})" for an_val in an_vals}
        fig_an_colors = {group_labels[an_val]: an_colors[an_val] for an_val in an_vals}
        fig_an_val_base = f"{an_val_base} ({df_fig.loc[df_fig[an_col] == an_val_base, :].shape[0]})"
        fig_an_vals = [group_labels[an_val] for an_val in an_vals]
        # Assign the result back: an in-place replace on a selected column is
        # a chained assignment and is not guaranteed to modify `df_fig`.
        df_fig[an_col] = df_fig[an_col].replace(group_labels)

        # Formula-friendly duplicates of the grouping and chronological-age columns.
        df_fig[an_col_str] = df_fig[an_col]
        df_fig[f"ChronologicalAge{age}"] = df_fig[f"Chronological Age ({age})"]

        # Chi-square test of independence between group and aging profile.
        df_cross = pd.crosstab(df_fig[an_col], df_fig[f"{age} profile"])
        res = chi2_contingency(df_cross, correction=True)
        ax = df_cross.plot(kind="bar", rot=0, color={'Accelerated aging': 'crimson', 'Decelerated aging': 'dodgerblue', 'Neutral': 'gray'})
        ax.set_title(r'$\chi^2$' + f' p-value: {res.pvalue:0.3e}')
        fig = ax.get_figure()
        fig.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/chisquare_{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/chisquare_{age}.pdf", bbox_inches='tight')
        plt.close(fig)

        # Chronological-age histogram per group (22 five-year bins, 5..115).
        hist_bins = np.linspace(5, 115, 23)
        fig, ax = plt.subplots(figsize=(6, 4))
        histplot = sns.histplot(
            data=df_fig,
            bins=hist_bins,
            edgecolor='k',
            linewidth=1,
            x=f"Chronological Age ({age})",
            hue=an_col,
            palette=fig_an_colors,
            ax=ax
        )
        histplot.set_xlabel("Age")
        histplot.set(xlim=(0, 120))
        plt.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/hist_chronological_age_({age}).png", bbox_inches='tight', dpi=200)
        plt.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/hist_chronological_age_({age}).pdf", bbox_inches='tight')
        plt.close(fig)

        # Acceleration is the residual from a linear fit of the estimate on
        # chronological age, fitted on the base group only.
        # Disabled alternatives: fit on all samples, or measure acceleration
        # as the plain difference from chronological age.
        # linreg = smf.ols(formula=f"{age} ~ ChronologicalAge{age}", data=df_fig.loc[:, :]).fit()
        linreg = smf.ols(formula=f"{age} ~ ChronologicalAge{age}", data=df_fig.loc[df_fig[an_col] == fig_an_val_base, :]).fit()
        df_fig[f"{age}_linear_pred"] = linreg.predict(df_fig)
        df_fig[f"{age} acceleration"] = df_fig[age] - df_fig[f"{age}_linear_pred"]
        # df_fig[f"{age} acceleration"] = df_fig[age] - df_fig[f"ChronologicalAge{age}"]

        # Between-group tests on acceleration: location (Mann-Whitney U) and
        # spread (Levene); the first two entries of the group list are compared.
        vals = {}
        for group in fig_an_vals:
            vals[group] = df_fig.loc[df_fig[an_col] == group, f"{age} acceleration"].values
        mw_stat, mw_pval = mannwhitneyu(vals[fig_an_vals[0]], vals[fig_an_vals[1]], alternative='two-sided')
        ln_stat, ln_pval = levene(vals[fig_an_vals[0]], vals[fig_an_vals[1]])
        # Group effect on the estimate with chronological age as covariate;
        # the p-value of the first group coefficient is reported as "ANCOVA".
        regcov = smf.ols(formula=f"{age} ~ {an_col_str} + ChronologicalAge{age}", data=df_fig).fit()
        reg_sum = regcov.summary2().tables[1]
        # Literal match: the alias is a plain name, not a regular expression.
        pvals_cols = reg_sum.index[reg_sum.index.str.contains(an_col_str, regex=False)].values
        ancova_pval = reg_sum.at[pvals_cols[0], 'P>|t|']

        # Layout: metrics table (11) over the scatter (21); the cell right of
        # the table (12) is left empty, violins go to (22).
        fig, axs = plt.subplot_mosaic(
            [
                ['11', '12'],
                ['21', '22'],
            ],
            figsize=(8, 6),
            height_ratios=[1, 4],
            width_ratios=[3, 1.5],
            gridspec_kw={
                "bottom": 0.14,
                "top": 0.95,
                # "left": 0.1,
                # "right": 0.5,
                "wspace": 0.33,
                "hspace": 0.01,
            },
        )

        # Summary metrics of the estimate against chronological age.
        ds_table = pd.DataFrame(index=['MAE (from diagonal)', 'MAE (from regression)', fr"Pearson $\rho$", "Bias"], columns=[age])
        mae_diag = mean_absolute_error(df_fig[f'ChronologicalAge{age}'].values, df_fig[age].values)
        mae_regr = np.mean(np.abs(df_fig[f"{age} acceleration"].values))
        rho, _ = stats.pearsonr(df_fig[f'ChronologicalAge{age}'].values, df_fig[age].values)
        bias = np.mean(df_fig[age] - df_fig[f'ChronologicalAge{age}'])
        ds_table.at['MAE (from diagonal)', age] = f"{mae_diag:0.2f}"
        ds_table.at['MAE (from regression)', age] = f"{mae_regr:0.2f}"
        ds_table.at[fr"Pearson $\rho$", age] = f"{rho:0.2f}"
        ds_table.at["Bias", age] = f"{bias:0.2f}"
        col_defs = [
            ColumnDefinition(
                name="index",
                title=age,
                textprops={"ha": "left"},
                width=4.5,
            ),
            ColumnDefinition(
                name=age,
                title='',
                textprops={"ha": "center"},
                width=2.0,
            ),
        ]
        table = Table(
            ds_table,
            column_definitions=col_defs,
            row_dividers=True,
            footer_divider=False,
            ax=axs['11'],
            textprops={"fontsize": 7},
            row_divider_kw={"linewidth": 1, "linestyle": (0, (1, 1))},
            col_label_divider_kw={"linewidth": 1, "linestyle": "-"},
            column_border_kw={"linewidth": 1, "linestyle": "-"},
        ).autoset_fontcolors(colnames=[age])

        axs['12'].axis('off')

        # Common x/y range (with a 10% margin) so that the dashed identity
        # line runs along the diagonal of the scatter panel.
        xy_min = df_fig[[f'ChronologicalAge{age}', age]].min().min()
        xy_max = df_fig[[f'ChronologicalAge{age}', age]].max().max()
        xy_ptp = xy_max - xy_min
        bisect = sns.lineplot(
            x=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            y=[xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp],
            linestyle='--',
            color='black',
            linewidth=1.0,
            ax=axs['21']
        )
        # Regression line of the base group only, matching the fit used for
        # the acceleration above.
        regplot = sns.regplot(
            data=df_fig.loc[df_fig[an_col] == fig_an_val_base, :],
            x=f'ChronologicalAge{age}',
            y=age,
            color=fig_an_colors[fig_an_val_base],
            scatter=False,
            truncate=False,
            ax=axs['21']
        )
        scatter = sns.scatterplot(
            data=df_fig,
            x=f'ChronologicalAge{age}',
            y=age,
            hue=an_col,
            palette=fig_an_colors,
            linewidth=0.5,
            alpha=0.75,
            edgecolor="k",
            s=20,
            hue_order=list(fig_an_colors.keys()),
            legend=True,
            ax=axs['21'],
        )
        axs['21'].set_xlabel(f"Age")
        axs['21'].set_xlim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)
        axs['21'].set_ylim(xy_min - 0.1 * xy_ptp, xy_max + 0.1 * xy_ptp)

        # Acceleration distribution per group, titled with the test p-values.
        sns.violinplot(
            data=df_fig,
            x=an_col,
            y=f"{age} acceleration",
            hue=an_col,
            palette=fig_an_colors,
            density_norm='width',
            order=fig_an_vals,
            saturation=0.75,
            linewidth=1.0,
            ax=axs['22'],
            legend=False,
            cut=0,
        )
        axs['22'].set_ylabel(f"{age} acceleration")
        axs['22'].set_xticklabels([])
        axs['22'].set_xlabel('')
        title = f'Mann-Whitney: {mw_pval:.2e}\nLevene: {ln_pval:.2e}'
        title += f"\nANCOVA: {ancova_pval:.2e}"
        axs['22'].set_title(title)

        fig.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/{age}.png", bbox_inches='tight', dpi=200)
        fig.savefig(f"{path}/associations/PhenoAge_CognitiveAge/{an_col}/{age}.pdf", bbox_inches='tight')
        plt.close(fig)

# Add columns to initial table

In [None]:
# Extend the initial workbook with the PhenoAge / CognitiveAge values, their
# accelerations and the epigenetic clock columns, then save the result.
path = "E:/YandexDisk/Work/bbd/fmba"

# Initial table, indexed by worker id (ids are handled as strings).
df_init = pd.read_excel(f"{path}/ПриложениеГ (КОНФИДЕНЦИАЛЬНО).xlsx", index_col='работник_ID')
df_init.index = df_init.index.astype(str)

# PhenoAge / CognitiveAge export with readable column names.
df_ages = pd.read_excel(f"{path}/PhenoAge_CognitiveAge/Lesnoy_phenoage_v2_export.xlsx", index_col='id_person_org')
df_ages.index = df_ages.index.astype(str)
df_ages = df_ages.rename(
    columns={
        'AgeBloodSample': 'Chronological Age (PhenoAge)',
        'UNNPhenoAgeR2': 'PhenoAge',
        'AgeTests': 'Chronological Age (CognitiveAge)',
        'CognitiveAgeModel5V': 'CognitiveAge',
    }
)
# Acceleration = estimated age minus the matching chronological age.
for _age_name in ('PhenoAge', 'CognitiveAge'):
    df_ages[f'{_age_name} acceleration'] = df_ages[_age_name] - df_ages[f'Chronological Age ({_age_name})']
cols_ages = [
    'Chronological Age (PhenoAge)',
    'PhenoAge',
    'Chronological Age (CognitiveAge)',
    'CognitiveAge',
    'PhenoAge acceleration',
    'CognitiveAge acceleration'
]

# Epigenetic clocks: use the clocks of type 'Age' from the pyaging metadata,
# after dropping an explicit list of excluded clocks.
df_epi_age_pf = pd.read_excel(f"{path}/individual/EpiAgeScore/data.xlsx", index_col=0)
df_epi_age_pf.index = df_epi_age_pf.index.astype(str)
path_pyaging = "E:/YandexDisk/Work/pydnameth/datasets/pyaging"
pyaging_meta = pd.read_excel(f"{path_pyaging}/clocks_meta.xlsx", index_col='Clock Name')
pyaging_meta['Clock Name'] = pyaging_meta.index
pyaging_meta = pyaging_meta.drop(
    index=['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100']
)
epi_ages = pyaging_meta.index[pyaging_meta['Type'] == 'Age'].to_list()
cols_epi_age_pf = [
    "Epigenetic profile",
    *[f"{x} acceleration" for x in epi_ages],
    *[f"{x} corrected" for x in epi_ages],
]

# Column-wise concatenation; the "<clock> corrected" columns are exported
# under the bare clock name.
dfs = [df_init, df_ages[cols_ages], df_epi_age_pf[cols_epi_age_pf]]
df = pd.concat(dfs, axis=1)
df = df.rename(columns={f"{x} corrected": x for x in epi_ages})

df.to_excel(f"{path}/data_with_epi_pheno_cognitive_ages.xlsx")

# Testing stuff

In [None]:
import re
from docx import Document

def markdown_to_docx(doc, markdown_text, indent_width=4):
    """Append a small subset of Markdown to a python-docx document.

    Supported syntax, processed line by line:

    * ``#``-prefixed headings: a run of N ``#`` characters followed by a space
      and text becomes a heading of level ``min(N - 1, 5)`` (a single ``#``
      therefore maps to level 0);
    * ``-`` bullet items, indented according to their leading spaces;
    * ``**bold**`` and ``*italic*`` spans inside list items and paragraphs.

    Blank lines and ``---`` separators are skipped. Every other line becomes
    an ordinary paragraph.

    Args:
        doc: Target document; must provide ``add_paragraph`` and
            ``add_heading`` (e.g. the object returned by ``docx.Document()``).
        markdown_text: Markdown source.
        indent_width: Number of leading spaces per list nesting level.
    """
    def add_formatted_text(paragraph, text):
        """Add *text* to *paragraph* as runs, applying bold/italic markers."""
        # Pass 1: cut out the **bold** spans.
        parts = []
        current_pos = 0
        for match in re.finditer(r'\*\*(.*?)\*\*', text):
            start, end = match.start(), match.end()
            if start > current_pos:
                parts.append(('text', text[current_pos:start]))
            parts.append(('bold', match.group(1)))
            current_pos = end
        if current_pos < len(text):
            parts.append(('text', text[current_pos:]))

        # Pass 2: cut the *italic* spans out of the remaining plain text.
        processed_parts = []
        for part_type, content in parts:
            if part_type == 'text':
                sub_current = 0
                for it_match in re.finditer(r'\*(.*?)\*', content):
                    sub_start, sub_end = it_match.start(), it_match.end()
                    if sub_start > sub_current:
                        processed_parts.append(('text', content[sub_current:sub_start]))
                    processed_parts.append(('italic', it_match.group(1)))
                    sub_current = sub_end
                if sub_current < len(content):
                    processed_parts.append(('text', content[sub_current:]))
            else:
                processed_parts.append((part_type, content))

        # Emit one run per fragment.
        for p_type, p_content in processed_parts:
            run = paragraph.add_run(p_content)
            if p_type == 'bold':
                run.bold = True
            elif p_type == 'italic':
                run.italic = True

    for line in markdown_text.split('\n'):
        stripped = line.strip()

        # Skip blank lines and horizontal rules.
        if not stripped or stripped == '---':
            continue

        # Headings
        if stripped.startswith('#'):
            marker, _, header_text = stripped.partition(' ')
            header_text = header_text.strip()
            # A heading needs a marker made only of '#' plus some text;
            # anything else (e.g. "#hashtag") is kept as a plain paragraph.
            if marker.strip('#') or not header_text:
                p = doc.add_paragraph()
                add_formatted_text(p, stripped)
                continue

            doc.add_heading(header_text, level=min(len(marker) - 1, 5))

        # Bullet lists
        elif stripped.startswith('-'):
            # Local import: `Pt` is not imported at module level, and doing it
            # here keeps python-docx optional for input without list items.
            from docx.shared import Pt

            # Nesting level from the number of leading spaces.
            indent = len(line) - len(line.lstrip(' '))
            level = indent // indent_width

            list_text = stripped.lstrip('-').strip()

            # Styles are looked up by name ('List Bullet'), not by style id.
            p = doc.add_paragraph(style='List Bullet')
            p.paragraph_format.left_indent = Pt(30 * (level + 1))
            add_formatted_text(p, list_text)

        # Plain text
        else:
            p = doc.add_paragraph()
            add_formatted_text(p, stripped)

# Usage example: convert a short Markdown sample and save it as a .docx file.
doc = Document()
markdown_text = """
# Заголовок 1
## Заголовок 2

- Пункт **жирного** списка
  - Вложенный *курсивный* пункт
  - Еще один пункт
- **Полностью жирный элемент**

Текст с *курсивом* и **жирным** форматированием
---
"""

markdown_to_docx(doc, markdown_text)
# NOTE(review): `path` is not defined in this cell; it is presumably the
# dataset root set by an earlier cell - confirm before running in isolation.
doc.save(f"{path}/individual/EpiAgeScore/wtf.docx")

In [None]:
import os

from openai import OpenAI

# The API key is taken from the DEEPSEEK_API_KEY environment variable so that
# it never has to be stored in the notebook; when the variable is not set the
# previous value (an empty string) is used.
client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY", ""), base_url="https://api.deepseek.com")

# Single non-streaming chat completion; the reply text is printed below.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Привет! Ты сможешь пройти тест Тьюринга?"},
    ],
    stream=False
)

print(response.choices[0].message.content)

In [None]:
import os

from gigachat import GigaChat

# Use the authorization key obtained in the personal account of the GigaChat
# API project. It is read from the GIGACHAT_CREDENTIALS environment variable
# so that it never has to be stored in the notebook; when the variable is not
# set the previous value (an empty string) is used.
with GigaChat(
    credentials=os.environ.get("GIGACHAT_CREDENTIALS", ""),
    ca_bundle_file="C:/Users/user/Downloads/russian_trusted_root_ca.cer",
) as giga:
    response = giga.chat("Какие факторы влияют на стоимость страховки на дом?")
    print(response.choices[0].message.content)