# Debugging autoreload

In [None]:
%load_ext autoreload
%autoreload 2

# Load packages

In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy.stats import pearsonr, mannwhitneyu, levene

# Load data

In [2]:
path = f"E:/YandexDisk/bbd/fmba"
path_pyaging = "E:/YandexDisk/pydnameth/datasets/pyaging"

epi_ages = []

df_pheno = pd.read_excel(f"{path}/pheno_fixed.xlsx", index_col=0)
df_pheno.index = df_pheno.index.astype(str)

df_ages = pd.read_excel(f"{path}/PhenoAge_CognitiveAge/df.xlsx", index_col=0)
df_ages.index = df_ages.index.astype(str)
linreg_pheno = smf.ols(formula="PhenoAge ~ Q('Chronological Age (PhenoAge)')", data=df_ages).fit()
df_ages["PhenoAge linear pred"] = linreg_pheno.predict(df_ages)
df_ages["PhenoAge acceleration lin"] = df_ages["PhenoAge"] - df_ages["PhenoAge linear pred"]
linreg_cogn = smf.ols(formula="CognitiveAge ~ Q('Chronological Age (CognitiveAge)')", data=df_ages).fit()
df_ages["CognitiveAge linear pred"] = linreg_cogn.predict(df_ages)
df_ages["CognitiveAge acceleration lin"] = df_ages["CognitiveAge"] - df_ages["CognitiveAge linear pred"]
df_ages["CognitiveAge acceleration lin"] = df_ages["CognitiveAge"] - linreg_cogn.predict(df_ages)
cols_ages = [
    'Chronological Age (PhenoAge)',
    'PhenoAge',
    'Chronological Age (CognitiveAge)',
    'CognitiveAge',
    'PhenoAge acceleration',
    'CognitiveAge acceleration',
    'PhenoAge linear pred',
    'CognitiveAge linear pred',
    'PhenoAge acceleration lin',
    'CognitiveAge acceleration lin'
]

df_phenoage_nmk = pd.read_excel(f"{path}/03_pheno_age/data_PhenoAge.xlsx", index_col=0)
df_phenoage_nmk.index = df_phenoage_nmk.index.astype(str)
df_phenoage_nmk.rename(columns={
    'PhenoAge': 'PhenoAge nmk',
    'PhenoAge acceleration': 'PhenoAge acceleration nmk',
    'PhenoAge Linear Pred': 'PhenoAge linear pred nmk',
    'PhenoAge acceleration corrected': 'PhenoAge acceleration lin nmk',
    }, inplace=True
)
cols_phenoage_nmk = [
    'PhenoAge nmk',
    'PhenoAge acceleration nmk',
    'PhenoAge linear pred nmk',
    'PhenoAge acceleration lin nmk'
]

df_epi_age_pf = pd.read_excel(f"{path}/individual/EpiAgeScore/data.xlsx", index_col=0)
df_epi_age_pf.index = df_epi_age_pf.index.astype(str)

pyaging_meta = pd.read_excel(f"{path_pyaging}/clocks_meta.xlsx", index_col='Clock Name')
pyaging_meta['Clock Name'] = pyaging_meta.index
pyaging_meta.drop(index=['Knight', 'LeeControl', 'LeeRefinedRobust', 'LeeRobust', 'PedBE', 'RepliTali', 'ENCen100'], inplace=True)
epi_ages = pyaging_meta[pyaging_meta['Type'] == 'Age'].index.to_list()
cols_epi_age_pf = [f"{x} acceleration" for x in epi_ages] + [f"{x} corrected" for x in epi_ages]

dfs = [df_pheno, df_ages[cols_ages], df_phenoage_nmk[cols_phenoage_nmk], df_epi_age_pf[cols_epi_age_pf]]
df = pd.concat(dfs, axis=1)
df.rename(columns={f"{x} corrected": x for x in epi_ages}, inplace=True)

df['дата рождения'] = pd.to_datetime(df['дата рождения'])
df['Age'] = (pd.to_datetime("2024-11-11") - df['дата рождения']) / np.timedelta64(1, 'D') / 365.25

df['smoking'] = df['терапевт - фактор_риска'].str.contains('Курение табака')
df['smoking'] = df['smoking'].replace({True: 1, False: 0})

result_indexes = [f"{x} acceleration" for x in epi_ages] + ['PhenoAge acceleration', 'PhenoAge acceleration lin', 'PhenoAge acceleration nmk', 'PhenoAge acceleration lin nmk'] + ['CognitiveAge acceleration', 'CognitiveAge acceleration lin']
df_result = pd.DataFrame(index=result_indexes)

  df['smoking'] = df['smoking'].replace({True: 1, False: 0})


# SCORE

In [3]:
cols_for_score = ["терапевт - артериальное давление верхнее",
                  "биохимический анализ крови - холестерин",
                  "smoking",
                  "Age"
]
for col in cols_for_score:
    df[col] = df[col].replace('нет', np.nan)
    df[col] = df[col].replace('норма', np.nan)
    df[col] = pd.to_numeric(df[col])

chd_s0_age = np.exp(-np.exp(-21.0) * np.power((df['Age'] - 20.0), 4.62))
chd_s0_age_10 = np.exp(-np.exp(-21.0) * np.power((df['Age'] - 10.0), 4.62))
chd_w = 0.24 * (df['биохимический анализ крови - холестерин'] - 6.0) + 0.018 * (df['терапевт - артериальное давление верхнее'] - 120.0) + 0.71 * df['smoking']
chd_s_age = np.power(chd_s0_age, np.exp(chd_w))
chd_s_age_10 = np.power(chd_s0_age_10, np.exp(chd_w))
chd_s10_age = chd_s_age_10 / chd_s_age
chd_risk_10 = 1.0 - chd_s10_age

nchd_s0_age = np.exp(-np.exp(-25.7) * np.power((df['Age'] - 20.0), 5.47))
nchd_s0_age_10 = np.exp(-np.exp(-25.7) * np.power((df['Age'] - 10.0), 5.47))
nchd_w = 0.02 * (df['биохимический анализ крови - холестерин'] - 6.0) + 0.022 * (df['терапевт - артериальное давление верхнее'] - 120.0) + 0.63 * df['smoking']
nchd_s_age = np.power(nchd_s0_age, np.exp(nchd_w))
nchd_s_age_10 = np.power(nchd_s0_age_10, np.exp(nchd_w))
nchd_s10_age = nchd_s_age_10 / nchd_s_age
nchd_risk_10 = 1.0 - nchd_s10_age

risk_10_age = chd_risk_10 + nchd_risk_10

df['SCORE'] = risk_10_age
df['SCORE group'] = np.where(df['SCORE']>=0.02, 'High', 'Low')

  df[col] = df[col].replace('норма', np.nan)


In [4]:
for clock in result_indexes:
    filtered_df = df.dropna(subset=[clock, 'SCORE'])
    rho, pval = pearsonr(filtered_df[clock].values, filtered_df['SCORE'].values)
    df_result.at[clock, 'SCORE Pearson Rho'] = rho
    df_result.at[clock, 'SCORE Pearson pval'] = pval
    vals = {}
    for group in ['High', 'Low']:
        vals[group] = filtered_df.loc[filtered_df['SCORE group'] == group, clock].values
    _, df_result.at[clock, "SCORE Mann-Whitney pval"] = mannwhitneyu(vals['High'], vals['Low'], alternative='two-sided')
    _, df_result.at[clock, "SCORE Levene pval"] = levene(vals['High'], vals['Low'])