# Debugging autoreload

In [None]:
# IPython magics (valid only inside IPython/Jupyter, not plain Python):
# load the autoreload extension and select mode 2, so edited project modules
# are re-imported automatically before code is executed.
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
from plotly.subplots import make_subplots
from pytorch_tabular import TabularModel
import plotly.express as px
import torch
import random
import plotly.graph_objects as go
from scipy import stats
import shap
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.impute import KNNImputer
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import itertools
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.pt.hyper_opt import train_hyper_opt
from src.utils.hash import dict_hash
import pathlib
from tqdm import tqdm
import distinctipy
import matplotlib.patheffects as pe
import matplotlib.colors as mcolors
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from scipy.stats import chi2_contingency
from plottable.plots import bar
from plottable.cmap import normed_cmap, centered_cmap
import optuna
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm
import matplotlib as mpl
from statsmodels.stats.multitest import multipletests
import re
import datetime
from collections import Counter
from matplotlib.ticker import MaxNLocator
from itertools import chain
from sklearn.metrics import mean_absolute_error
from scipy.stats import mannwhitneyu, variation, levene, zscore
import pyaging as pya
import matplotlib.lines as mlines
from src.models.simage.tabular.widedeep.ft_transformer import WDFTTransformerModel
import statsmodels.formula.api as smf
from itertools import chain
from pingouin import ancova
from sklearn.preprocessing import LabelEncoder 
from functools import reduce
import upsetplot
from src.plot.plotly_layout import add_layout
from docx import Document
from docx.shared import Inches, Cm, Mm, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_ORIENT
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import RGBColor
from pathlib import Path
import re
from openai import OpenAI
from matplotlib_venn import venn3, venn3_circles
import functools


def conjunction(conditions):
    """Combine boolean masks with an element-wise logical AND.

    Args:
        conditions: Non-empty iterable of boolean scalars or array-likes
            (lists, NumPy arrays, pandas Series, ...) that broadcast together.

    Returns:
        The left-to-right fold of ``np.logical_and`` over ``conditions``.
        A single-element input is returned unchanged.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """

    def _both(accumulated, condition):
        # One fold step: keep only positions true in both operands.
        return np.logical_and(accumulated, condition)

    return functools.reduce(_both, conditions)


def disjunction(conditions):
    """Combine boolean masks with an element-wise logical OR.

    Args:
        conditions: Non-empty iterable of boolean scalars or array-likes
            (lists, NumPy arrays, pandas Series, ...) that broadcast together.

    Returns:
        The left-to-right fold of ``np.logical_or`` over ``conditions``.
        A single-element input is returned unchanged.

    Raises:
        TypeError: If ``conditions`` is empty (raised by ``functools.reduce``).
    """

    def _either(accumulated, condition):
        # One fold step: keep positions true in at least one operand.
        return np.logical_or(accumulated, condition)

    return functools.reduce(_either, conditions)

# June 2025

In [None]:
# Root folder of the cohort data (machine-specific absolute path).
path = "E:/YandexDisk/Work/bbd/fmba"

# Phenotype table: first column becomes the index, which is cast to str.
df = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df.index = df.index.astype(str)

# Age in years relative to the fixed reference date 2024-11-11.
df['дата рождения'] = pd.to_datetime(df['дата рождения'])
df['date_now'] = pd.to_datetime("2024-11-11")
age_in_days = (df['date_now'] - df['дата рождения']) / np.timedelta64(1, 'D')
df['Age'] = age_in_days / 365.25
# Rows without an age or a status cannot be assigned to a group.
df.dropna(subset=['Age', 'Status'], inplace=True)

# Diagnosis columns (ICD-10 codes) that define the "special" disease set.
special_diseases = [
    "Терапевт, I10",        # hypertension
    "Терапевт, I10.0",      # hypertension
    "Терапевт, I11",        # hypertension
    "Терапевт, I11.0",      # hypertension
    "Терапевт, I11.9",      # hypertension
    "Терапевт, I20",        # ischemic heart disease
    "Терапевт, I25.0",      # ischemic heart disease
    "Терапевт, I25",        # ischemic heart disease
    "Терапевт, E78",        # cholesterol (hyperlipidemia, hypercholesterolemia)
    "Терапевт, E66",        # obesity
    "Terапевт, E66.0".replace("Ter", "Тер"),      # obesity
]

# Flag = 1 when at least one of the listed diagnosis columns equals 1.
has_special_disease = disjunction([df[column] == 1 for column in special_diseases])
df['Special Diseases'] = np.where(has_special_disease, 1, 0)

# Only "clean" combinations get a Special Status: controls without any listed
# disease and cases with at least one; every other row stays missing.
healthy_controls = (df['Special Diseases'] == 0) & (df['Status'] == 'Control')
diseased_cases = (df['Special Diseases'] == 1) & (df['Status'] == 'Case')
df.loc[healthy_controls, 'Special Status'] = 'Control'
df.loc[diseased_cases, 'Special Status'] = 'Case'

# Mark samples whose ID also appears in the processed DNAm phenotype table.
df_epi = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_epi.index = df_epi.index.astype(str)
samples_with_dnam = df.index.intersection(df_epi.index)
df['Has DNAm?'] = 0
df.loc[samples_with_dnam, 'Has DNAm?'] = 1

# Export the summary columns followed by the individual diagnosis columns.
export_columns = ['Age', 'Status', 'Special Diseases', 'Special Status', 'Has DNAm?'] + special_diseases
df.loc[:, export_columns].to_excel(f"{path}/04_select_96_for_DNAm/df.xlsx")

In [None]:
# Table read from selected.xlsx, indexed the same way as df (string IDs).
df_selected = pd.read_excel(f"{path}/04_select_96_for_DNAm/selected.xlsx", index_col=0)
df_selected.index = df_selected.index.astype(str)


def _ids_without_dnam(frame, status):
    """Return index values of rows with the given Special Status and no DNAm data."""
    keep = (frame['Special Status'] == status) & (frame['Has DNAm?'] == 0)
    return frame.index[keep].values


# Sample IDs per plotted group: full cohort vs. the selected subset.
groups = {
    'All Controls': _ids_without_dnam(df, 'Control'),
    'All Cases': _ids_without_dnam(df, 'Case'),
    'Selected Controls': _ids_without_dnam(df_selected, 'Control'),
    'Selected Cases': _ids_without_dnam(df_selected, 'Case'),
}

# Histogram colour for each group (keys match `groups`).
groups_colors = dict(
    [
        ('All Controls', 'firebrick'),
        ('All Cases', 'blue'),
        ('Selected Controls', 'crimson'),
        ('Selected Cases', 'dodgerblue'),
    ]
)

In [None]:
# Panel grid and figure size (inches); age bins are 5 years wide from 5 to 115.
n_rows, n_cols = 2, 2
fig_width, fig_height = 10, 8
hist_bins = np.linspace(5, 115, 23)

sns.set_theme(style='ticks')
fig, axs = plt.subplots(
    n_rows,
    n_cols,
    figsize=(fig_width, fig_height),
    gridspec_kw={},
    sharey=True,
    sharex=True,
)
for g_id, (g, g_ids) in enumerate(groups.items()):
    # Fill the grid row by row, one panel per group.
    row_id, col_id = divmod(g_id, n_cols)
    ax = axs[row_id, col_id]

    # Ages are always taken from df, also for the "Selected" groups.
    histplot = sns.histplot(
        data=df.loc[g_ids],
        x="Age",
        bins=hist_bins,
        color=groups_colors[g],
        edgecolor='k',
        linewidth=1,
        ax=ax,
    )
    ax.set_xlim(15, 80)
    # Title shows the group name and its sample count.
    ax.set_title(f"{g} ({len(g_ids)})")
fig.tight_layout()

# Save a raster and a vector version of the same figure, then free it.
out_stem = f"{path}/04_select_96_for_DNAm/hist_age"
fig.savefig(f"{out_stem}.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{out_stem}.pdf", bbox_inches='tight')
plt.close(fig)

# September 2025

In [None]:
# Root folder of the cohort data (machine-specific absolute path).
path = "E:/YandexDisk/Work/bbd/fmba"

# Phenotype table: first column becomes the index, which is cast to str.
df = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df.index = df.index.astype(str)

# Age in years relative to the fixed reference date 2024-11-11.
df['дата рождения'] = pd.to_datetime(df['дата рождения'])
df['date_now'] = pd.to_datetime("2024-11-11")
age_in_days = (df['date_now'] - df['дата рождения']) / np.timedelta64(1, 'D')
df['Age'] = age_in_days / 365.25
# Rows without an age or a status cannot be assigned to a group.
df.dropna(subset=['Age', 'Status'], inplace=True)

# Diagnosis columns with ICD-10 "I" codes.
i_diseases = [
    "Терапевт, I10",        # hypertension
    "Терапевт, I10.0",      # hypertension
    "Терапевт, I11",        # hypertension
    "Терапевт, I11.0",      # hypertension
    "Терапевт, I11.9",      # hypertension
    "Терапевт, I20",        # ischemic heart disease
    "Терапевт, I25.0",      # ischemic heart disease
    "Терапевт, I25",        # ischemic heart disease
    "Терапевт, I42",        # Cardiomyopathy
    "Терапевт, I42.0",      # Cardiomyopathy
    "Терапевт, I49",        # Other cardiac arrhythmias
]
# Full "special" set: the I-code columns followed by the E-code columns.
special_diseases = i_diseases + [
    "Терапевт, E78",        # cholesterol (hyperlipidemia, hypercholesterolemia)
    "Терапевт, E66",        # obesity
    "Терапевт, E66.0",      # obesity
]

# Flag = 1 when at least one of the listed diagnosis columns equals 1.
has_special_disease = disjunction([df[column] == 1 for column in special_diseases])
df['Special Diseases'] = np.where(has_special_disease, 1, 0)

# Only "clean" combinations get a Special Status: controls without any listed
# disease and cases with at least one; every other row stays missing.
healthy_controls = (df['Special Diseases'] == 0) & (df['Status'] == 'Control')
diseased_cases = (df['Special Diseases'] == 1) & (df['Status'] == 'Case')
df.loc[healthy_controls, 'Special Status'] = 'Control'
df.loc[diseased_cases, 'Special Status'] = 'Case'

# Per-sample row sums over the diagnosis columns (all listed / I-codes only).
df["Number of Diseases"] = df[special_diseases].sum(axis=1)
df["Number of I Diseases"] = df[i_diseases].sum(axis=1)

# Mark samples whose ID also appears in the processed DNAm phenotype table.
df_epi = pd.read_excel(f"{path}/dnam/processed/pheno.xlsx", index_col=0)
df_epi.index = df_epi.index.astype(str)
samples_with_dnam = df.index.intersection(df_epi.index)
df['Has DNAm?'] = 0
df.loc[samples_with_dnam, 'Has DNAm?'] = 1

# Mark samples whose ID also appears in the exome sample sheet.
df_gen = pd.read_excel(f"{path}/exome/Образцы экзомы Лесной секвенирование 100 шт.xlsx", index_col=0)
df_gen.index = df_gen.index.astype(str)
samples_with_exome = df.index.intersection(df_gen.index)
df['Has exome?'] = 0
df.loc[samples_with_exome, 'Has exome?'] = 1

# Export the summary columns followed by the individual diagnosis columns.
export_columns = [
    'Age',
    'Status',
    'Special Diseases',
    'Special Status',
    'Has DNAm?',
    'Has exome?',
    'Number of Diseases',
    'Number of I Diseases',
] + special_diseases
df.loc[:, export_columns].to_excel(f"{path}/04_select_96_for_DNAm/df.xlsx")

In [None]:
# Table read from selected.xlsx, indexed the same way as df (string IDs).
df_selected = pd.read_excel(f"{path}/04_select_96_for_DNAm/selected.xlsx", index_col=0)
df_selected.index = df_selected.index.astype(str)


def _ids_without_omics(frame, status):
    """Return index values of rows with the given Special Status and neither DNAm nor exome data."""
    keep = (frame['Special Status'] == status) & (frame['Has DNAm?'] == 0) & (frame['Has exome?'] == 0)
    return frame.index[keep].values


# Sample IDs per plotted group: full cohort vs. the selected subset.
groups = {
    'All Controls': _ids_without_omics(df, 'Control'),
    'All Cases': _ids_without_omics(df, 'Case'),
    'Selected Controls': _ids_without_omics(df_selected, 'Control'),
    'Selected Cases': _ids_without_omics(df_selected, 'Case'),
}

# Histogram colour for each group (keys match `groups`).
groups_colors = dict(
    [
        ('All Controls', 'firebrick'),
        ('All Cases', 'blue'),
        ('Selected Controls', 'crimson'),
        ('Selected Cases', 'dodgerblue'),
    ]
)

# Panel grid and figure size (inches); age bins are 5 years wide from 5 to 115.
n_rows, n_cols = 2, 2
fig_width, fig_height = 10, 8
hist_bins = np.linspace(5, 115, 23)

sns.set_theme(style='ticks')
fig, axs = plt.subplots(
    n_rows,
    n_cols,
    figsize=(fig_width, fig_height),
    gridspec_kw={},
    sharey=True,
    sharex=True,
)
for g_id, (g, g_ids) in enumerate(groups.items()):
    # Fill the grid row by row, one panel per group.
    row_id, col_id = divmod(g_id, n_cols)
    ax = axs[row_id, col_id]

    # Ages are always taken from df, also for the "Selected" groups.
    histplot = sns.histplot(
        data=df.loc[g_ids],
        x="Age",
        bins=hist_bins,
        color=groups_colors[g],
        edgecolor='k',
        linewidth=1,
        ax=ax,
    )
    ax.set_xlim(15, 80)
    # Title shows the group name and its sample count.
    ax.set_title(f"{g} ({len(g_ids)})")
fig.tight_layout()

# Save a raster and a vector version of the same figure, then free it.
out_stem = f"{path}/04_select_96_for_DNAm/hist_age"
fig.savefig(f"{out_stem}.png", bbox_inches='tight', dpi=200)
fig.savefig(f"{out_stem}.pdf", bbox_inches='tight')
plt.close(fig)