In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
from scipy.stats import mannwhitneyu
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
import plotly.io as pio
pio.kaleido.scope.mathjax = None

In [None]:
# --- Configuration: dataset, storage locations and phenotype table ---
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform)

# Every artefact produced by this notebook is written below this directory.
path_save = f"{path}/{platform}/{dataset}/special/025_subjects_processing"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Phenotype column names with spaces replaced by underscores.
status_col, age_col, sex_col = (
    get_column_name(dataset, column_key).replace(' ', '_')
    for column_key in ('Status', 'Age', 'Sex')
)
status_dict = get_status_dict(dataset)
status_passed_fields = status_dict['Control'] + status_dict['Case']
sex_dict = get_sex_dict(dataset)

# Variables handed to filter_pheno: no continuous ones, status and sex as categorical.
continuous_vars = {}
categorical_vars = {
    status_col: [x.column for x in status_passed_fields],
    sex_col: list(sex_dict.values()),
}

# Load the phenotype table, filter it and index it by subject ID.
pheno = filter_pheno(
    dataset,
    pd.read_pickle(f"{path}/{platform}/{dataset}/pheno_xtd.pkl"),
    continuous_vars,
    categorical_vars,
).set_index('ID')
df = pheno.copy()

In [None]:
# --- Load the immunology tables and derive the comparison subsets ---
path_immuno = f"{path}/{platform}/{dataset}/special/021_ml_data/immuno"

df_base = pd.read_excel(f"{path_immuno}/data_paper.xlsx", index_col='index')
df_base_train = pd.read_excel(f"{path_immuno}/data_paper_train.xlsx", index_col='index')
df_full = pd.read_excel(f"{path_immuno}/data_all.xlsx", index_col='index')
df_trgt = pd.read_excel(f"{path_immuno}/data_thld_25.xlsx", index_col='index')

# Unique feature names in sorted order.
features = sorted(set(pd.read_excel(f"{path_immuno}/features.xlsx", index_col='features').index.values))

# Subsets of subjects that are absent from the reference tables.
# They are explicit copies because later cells add columns to them
# ("<feature>_outlier", "ipAGE", ...); assigning into a bare .loc selection
# raises SettingWithCopyWarning.
df_trgt_only = df_trgt.loc[~df_trgt.index.isin(df_base.index), :].copy()
df_new_only = df_full.loc[~df_full.index.isin(df_base.index), :].copy()
df_base_test_only = df_full.loc[~df_full.index.isin(df_base_train.index), :].copy()
df_base_test_trgt_only = df_trgt.loc[~df_trgt.index.isin(df_base_train.index), :].copy()

# Pairs (reference frame, compared frame) keyed by "<n_reference>_<n_compared>".
# NOTE(review): keys are built from row counts only, so two pairs with equal
# sizes would silently collapse into a single entry - verify the counts differ.
parts = {
    f'{df_base.shape[0]}_{df_trgt_only.shape[0]}': (df_base, df_trgt_only),
    f'{df_base.shape[0]}_{df_new_only.shape[0]}': (df_base, df_new_only),
    f'{df_base_train.shape[0]}_{df_base_test_only.shape[0]}': (df_base_train, df_base_test_only)
}

In [None]:
# --- Per-feature distribution comparison for every pair in `parts` ---
# For each pair this cell writes Mann-Whitney p-values (raw and FDR-adjusted),
# one overlay histogram per feature and for Age, and two grid figures
# (histograms and violins) covering all features.
num_cols = 5
num_rows = int(np.ceil(len(features) / num_cols))

# Axis styling shared by every panel of both grid figures.
grid_axis_style = dict(
    autorange=True,
    showgrid=True,
    zeroline=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    titlefont=dict(color='black', size=20),
    showticklabels=True,
    tickangle=0,
    tickfont=dict(color='black', size=20),
    exponentformat='e',
    showexponent='all',
)


def _hist_trace(values, label, legend, opacity, edge_width, step):
    """Build one histogram trace with the given marker opacity, edge width and bin size."""
    return go.Histogram(
        x=values,
        name=label,
        showlegend=legend,
        marker=dict(opacity=opacity, line=dict(width=edge_width)),
        xbins=dict(size=step),
    )


def _violin_trace(values, label, legend, color):
    """Build one violin trace filled with `color`, showing box, mean line and all points."""
    return go.Violin(
        y=values,
        name=label,
        showlegend=legend,
        box_visible=True,
        meanline_visible=True,
        line_color='black',
        fillcolor=color,
        marker=dict(color=color, line=dict(color='black', width=0.1), opacity=0.8),
        points='all',
        opacity=0.8,
        scalemode='width',
        width=0.5,
    )


def _finish_grid(grid_fig):
    """Apply the common size, legend, margin and colour settings to a grid figure."""
    grid_fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1.01, xanchor="center", x=0.5),
        title=dict(text="", font=dict(size=25)),
        template="none",
        autosize=False,
        width=3000,
        height=4000,
        margin=go.layout.Margin(l=100, r=40, b=100, t=100, pad=0),
    )
    grid_fig.update_layout(legend_font_size=50)
    grid_fig.update_layout(legend={'itemsizing': 'constant'})
    grid_fig.update_layout({'colorway': ["lime", "fuchsia"]}, barmode='overlay')


for name, part in parts.items():
    base_df, new_df = part
    pathlib.Path(f"{path_save}/{name}/biomarkers").mkdir(parents=True, exist_ok=True)

    # Two-sided Mann-Whitney U test per feature, then Benjamini-Hochberg correction.
    raw_pvals = [
        mannwhitneyu(base_df[feat].values, new_df[feat].values, alternative='two-sided')[1]
        for feat in features
    ]
    test_mw = {'features': features, 'pvals': raw_pvals}
    test_mw['pvals_fdr'] = multipletests(test_mw['pvals'], 0.05, method='fdr_bh')[1]
    titles = [
        f"{feat} Mann-Whitney p-value: {fdr:0.2e}"
        for feat, fdr in zip(test_mw['features'], test_mw['pvals_fdr'])
    ]
    pd.DataFrame(test_mw).to_excel(f"{path_save}/{name}/features_mw.xlsx", index=False)

    fig_features_hist = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False)
    fig_features_vio = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False, subplot_titles=titles)

    for feat_pos, feat in enumerate(features + ["Age"]):
        is_age = feat == "Age"
        row_no = feat_pos // num_cols + 1
        col_no = feat_pos % num_cols + 1

        # Histogram bin width: one eighth of the IQR of the full data set.
        bin_size = scipy.stats.iqr(df_full[feat].values) / 8

        # Keep-range: quartiles of the full data set widened by 2.5 * IQR.
        q1 = df_full[feat].quantile(0.25)
        q3 = df_full[feat].quantile(0.75)
        whisker = 2.5 * (q3 - q1)
        lower, upper = q1 - whisker, q3 + whisker
        # NB: despite the column name, True marks values INSIDE the keep-range.
        flag_col = f"{feat}_outlier"
        base_df[flag_col] = base_df[feat].between(lower, upper)
        new_df[flag_col] = new_df[feat].between(lower, upper)

        # Age is plotted unfiltered; biomarkers only within the keep-range.
        if is_age:
            x_base = base_df[feat].values
            x_part = new_df[feat].values
        else:
            x_base = base_df.loc[base_df[flag_col], feat].values
            x_part = new_df.loc[new_df[flag_col], feat].values

        # Legend labels report the unfiltered sample sizes.
        label_base = f"ipAGE data ({len(base_df[feat].values)})"
        label_part = f"New data ({len(new_df[feat].values)})"

        # Stand-alone overlay histogram for this variable.
        fig = go.Figure()
        fig.add_trace(_hist_trace(x_base, label_base, True, 0.6, 0.1, bin_size))
        fig.add_trace(_hist_trace(x_part, label_part, True, 0.6, 0.1, bin_size))
        add_layout(fig, f"{feat}", "Count", "")
        fig.update_layout(margin=go.layout.Margin(l=90, r=20, b=75, t=50, pad=0))
        fig.update_layout(legend={'itemsizing': 'constant'})
        fig.update_layout(legend_font_size=20)
        fig.update_layout({'colorway': ["lime", "fuchsia", "green"]}, barmode='overlay')
        if is_age:
            save_figure(fig, f"{path_save}/{name}/{feat}_hist")
            continue
        save_figure(fig, f"{path_save}/{name}/biomarkers/{feat}_hist")

        # Only the panel at position 2 contributes legend entries to the grids.
        show_legend = feat_pos == 2

        fig_features_hist.add_trace(
            _hist_trace(x_base, label_base, show_legend, 0.7, 0.01, bin_size),
            row=row_no,
            col=col_no
        )
        fig_features_hist.add_trace(
            _hist_trace(x_part, label_part, show_legend, 0.7, 0.01, bin_size),
            row=row_no,
            col=col_no
        )
        fig_features_hist.update_xaxes(title_text=feat, row=row_no, col=col_no, **grid_axis_style)
        fig_features_hist.update_yaxes(title_text="Count", row=row_no, col=col_no, **grid_axis_style)

        fig_features_vio.add_trace(
            _violin_trace(x_base, label_base, show_legend, "lime"),
            row=row_no,
            col=col_no
        )
        fig_features_vio.add_trace(
            _violin_trace(x_part, label_part, show_legend, "fuchsia"),
            row=row_no,
            col=col_no
        )
        fig_features_vio.update_xaxes(title_text="", row=row_no, col=col_no, **grid_axis_style)
        fig_features_vio.update_yaxes(title_text=feat, row=row_no, col=col_no, **grid_axis_style)

    _finish_grid(fig_features_hist)
    save_figure(fig_features_hist, f"{path_save}/{name}/features_hist")

    # Violins are created with scalemode='width' and switched to 'count' here.
    fig_features_vio.update_traces(scalemode='count')
    _finish_grid(fig_features_vio)
    save_figure(fig_features_vio, f"{path_save}/{name}/features_vio")

In [None]:
# --- Apply the stored legacy ipAGE clock to every held-out subset ---
# The clock table has a 'feature' and a 'coef' column; row 0 is used as the
# intercept and the remaining rows as per-feature weights.
ipAGE_df = pd.read_excel(f"{path}/{platform}/{dataset}/special/011_immuno_part3_and_part4_check_clocks/legacy/Control/v2/clock.xlsx")
ipAGE_feats = ipAGE_df['feature'].to_list()
ipAGE_coefs = ipAGE_df['coef'].to_list()


def _apply_ipage_clock(frame):
    """Return intercept + sum(coef_i * feature_i) for every row of `frame`.

    The accumulator is created as float so the in-place additions below also
    work when the intercept happens to be read from Excel as an integer.
    """
    predicted = np.full(frame.shape[0], ipAGE_coefs[0], dtype=float)
    for feat_name, feat_coef in zip(ipAGE_feats[1:], ipAGE_coefs[1:]):
        predicted += frame.loc[:, feat_name].values * feat_coef
    return predicted


ipAGE_trgt_only = _apply_ipage_clock(df_trgt_only)
ipAGE_new_only = _apply_ipage_clock(df_new_only)
ipAGE_base_test_only = _apply_ipage_clock(df_base_test_only)
ipAGE_base_test_trgt_only = _apply_ipage_clock(df_base_test_trgt_only)

df_trgt_only['ipAGE'] = ipAGE_trgt_only
df_new_only['ipAGE'] = ipAGE_new_only
df_base_test_only['ipAGE'] = ipAGE_base_test_only
df_base_test_trgt_only['ipAGE'] = ipAGE_base_test_trgt_only

# Mean absolute error between chronological Age and the clock prediction.
mae_trgt_only = mean_absolute_error(df_trgt_only['Age'].values, df_trgt_only['ipAGE'].values)
mae_new_only = mean_absolute_error(df_new_only['Age'].values, df_new_only['ipAGE'].values)
mae_base_test_only = mean_absolute_error(df_base_test_only['Age'].values, df_base_test_only['ipAGE'].values)
mae_base_test_trgt_only = mean_absolute_error(df_base_test_trgt_only['Age'].values, df_base_test_trgt_only['ipAGE'].values)

# The first label previously read "mae_new_only" although it prints mae_trgt_only.
print(f"mae_trgt_only: {mae_trgt_only}")
print(f"mae_new_only: {mae_new_only}")
print(f"mae_base_test_only: {mae_base_test_only}")
print(f"mae_base_test_trgt_only: {mae_base_test_trgt_only}")

In [None]:
# --- Build a second training set out of the held-out subjects ---
# Absolute ipAGE error per subject, and its distance to the mean absolute error.
df_base_test_only['ipAGE_diff'] = np.abs(df_base_test_only['ipAGE'].values - df_base_test_only['Age'].values)
mean_ipAge_diff = df_base_test_only['ipAGE_diff'].mean()
df_base_test_only['ipAGE_diff_diff'] = np.abs(df_base_test_only['ipAGE_diff'].values - mean_ipAge_diff)

# Despite the "_random" suffix this frame is ordered deterministically:
# subjects whose error is closest to the mean error come first.
df_base_test_only_random = df_base_test_only.sort_values(by=['ipAGE_diff_diff'], ascending=[True])

# Take as many subjects as the original training set has (fewer if not enough exist).
selected_indexes = df_base_test_only_random.index.values[0:df_base_train.shape[0]]

# Explicit copy: the next cell adds prediction columns to this frame, and
# writing into a bare .loc selection of df_full raises SettingWithCopyWarning.
df_base_train_2 = df_full.loc[selected_indexes, :].copy()

In [None]:
# --- Two ElasticNet clocks trained on swapped subsets, then compared ---
# model_1 is fitted on df_base_train and evaluated on df_base_train_2;
# model_2 is fitted on df_base_train_2 and evaluated on df_base_train.
alpha_1 = 3.16227766016838
alpha_2 = 15.0
l1_ratio = 0.5

# Create the output directory BEFORE anything is written into it; it used to be
# created near the end of the cell, after the first figures, tables and pickles
# had already been saved, which fails on a fresh run.
path_models = f"{path_save}/elastic_net_strange_versus"
pathlib.Path(path_models).mkdir(parents=True, exist_ok=True)

# ----- model 1: train on df_base_train, test on df_base_train_2 -----
model_1 = ElasticNet(
    alpha=alpha_1,
    l1_ratio=l1_ratio,
    max_iter=10000,
    tol=1e-2,
).fit(df_base_train.loc[:, features].values, df_base_train.loc[:, 'Age'].values)
df_base_train['ipAGE_1'] = model_1.predict(df_base_train.loc[:, features].values)
df_base_train_2['ipAGE_1'] = model_1.predict(df_base_train_2.loc[:, features].values)
print(f"model_1 train mae: {mean_absolute_error(df_base_train['Age'].values, df_base_train['ipAGE_1'].values)}")
print(f"model_1 test mae: {mean_absolute_error(df_base_train_2['Age'].values, df_base_train_2['ipAGE_1'].values)}")
# Acceleration = prediction minus the OLS fit of prediction on Age,
# where the OLS line is estimated on the training frame only.
formula = f"ipAGE_1 ~ Age"
model_linear = smf.ols(formula=formula, data=df_base_train).fit()
df_base_train.loc[:, "ipAGE_1 acceleration"] = df_base_train.loc[:, 'ipAGE_1'].values - model_linear.predict(df_base_train)
df_base_train_2.loc[:, "ipAGE_1 acceleration"] = df_base_train_2.loc[:, 'ipAGE_1'].values - model_linear.predict(df_base_train_2)
fig = go.Figure()
add_scatter_trace(fig, df_base_train.loc[:, 'Age'].values, df_base_train.loc[:, 'ipAGE_1'].values, f"Train")
add_scatter_trace(fig, df_base_train.loc[:, 'Age'].values, model_linear.fittedvalues.values, "", "lines")
add_scatter_trace(fig, df_base_train_2.loc[:, 'Age'].values, df_base_train_2.loc[:, 'ipAGE_1'].values, f"Test")
add_layout(fig, 'Age', f"ipAGE", f"")
fig.update_layout({'colorway': ['blue', 'blue', 'red', 'green']})
fig.update_layout(legend_font_size=20)
fig.update_layout(margin=go.layout.Margin(l=90, r=20, b=80, t=65, pad=0))
save_figure(fig, f"{path_models}/ipAGE_1_scatter")
# Coefficient table: intercept first, then one row per feature.
model_dict_1 = {'feature': ['Intercept'], 'coef': [model_1.intercept_]}
for f_id, f in enumerate(features):
    model_dict_1['feature'].append(f)
    model_dict_1['coef'].append(model_1.coef_[f_id])
model_df_1 = pd.DataFrame(model_dict_1)
model_df_1.to_excel(f'{path_models}/model_1.xlsx', index=False)
with open(f'{path_models}/model_1.pkl', 'wb') as handle:
    pickle.dump(model_1, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ----- model 2: train on df_base_train_2, test on df_base_train -----
model_2 = ElasticNet(
    alpha=alpha_2,
    l1_ratio=l1_ratio,
    max_iter=10000,
    tol=1e-2,
).fit(df_base_train_2.loc[:, features].values, df_base_train_2.loc[:, 'Age'].values)
df_base_train['ipAGE_2'] = model_2.predict(df_base_train.loc[:, features].values)
df_base_train_2['ipAGE_2'] = model_2.predict(df_base_train_2.loc[:, features].values)
print(f"model_2 train mae: {mean_absolute_error(df_base_train_2['Age'].values, df_base_train_2['ipAGE_2'].values)}")
print(f"model_2 test mae: {mean_absolute_error(df_base_train['Age'].values, df_base_train['ipAGE_2'].values)}")
formula = f"ipAGE_2 ~ Age"
model_linear = smf.ols(formula=formula, data=df_base_train_2).fit()
df_base_train_2.loc[:, "ipAGE_2 acceleration"] = df_base_train_2.loc[:, 'ipAGE_2'].values - model_linear.predict(df_base_train_2)
df_base_train.loc[:, "ipAGE_2 acceleration"] = df_base_train.loc[:, 'ipAGE_2'].values - model_linear.predict(df_base_train)
fig = go.Figure()
add_scatter_trace(fig, df_base_train_2.loc[:, 'Age'].values, df_base_train_2.loc[:, 'ipAGE_2'].values, f"Train")
add_scatter_trace(fig, df_base_train_2.loc[:, 'Age'].values, model_linear.fittedvalues.values, "", "lines")
add_scatter_trace(fig, df_base_train.loc[:, 'Age'].values, df_base_train.loc[:, 'ipAGE_2'].values, f"Test")
add_layout(fig, 'Age', f"ipAGE_on_test", f"")
fig.update_layout({'colorway': ['blue', 'blue', 'red', 'green']})
fig.update_layout(legend_font_size=20)
fig.update_layout(margin=go.layout.Margin(l=90, r=20, b=80, t=65, pad=0))
save_figure(fig, f"{path_models}/ipAGE_2_scatter")
model_dict_2 = {'feature': ['Intercept'], 'coef': [model_2.intercept_]}
for f_id, f in enumerate(features):
    model_dict_2['feature'].append(f)
    model_dict_2['coef'].append(model_2.coef_[f_id])
model_df_2 = pd.DataFrame(model_dict_2)
model_df_2.to_excel(f'{path_models}/model_2.xlsx', index=False)
with open(f'{path_models}/model_2.pkl', 'wb') as handle:
    pickle.dump(model_2, handle, protocol=pickle.HIGHEST_PROTOCOL)

# ----- side-by-side coefficient table plus feature/Age correlations -----
models_df = pd.merge(model_df_1.set_index('feature').rename(columns={'coef': 'ipAGE_1'}), model_df_2.set_index('feature').rename(columns={'coef': 'ipAGE_2'}), left_index=True, right_index=True)
# NOTE(review): the denominator is the model_1 coefficient, so a coefficient of
# exactly zero yields inf/NaN in this column - confirm that is acceptable.
models_df['rel_diff_percent'] = np.abs(models_df['ipAGE_1'] - models_df['ipAGE_2']) / np.abs(models_df['ipAGE_1']) * 100
# Pearson correlation of every feature with Age on df_base_train
# (the 'Intercept' row keeps NaN in these columns).
for f_id, f in enumerate(features):
    corr, pval = stats.pearsonr(df_base_train.loc[:, 'Age'].values, df_base_train.loc[:, f].values)
    models_df.loc[f, 'pearson_corr'] = corr
    models_df.loc[f, 'pearson_pval'] = pval
_, models_df.loc[features, 'pearson_pval_fdr'], _, _ = multipletests(models_df.loc[features, 'pearson_pval'].values, 0.05, method='fdr_bh')
models_df.sort_values(by=['pearson_pval_fdr'], ascending=[True], inplace=True)
models_df.to_excel(f'{path_models}/models.xlsx', index=True)

# ----- Age distribution of the two training sets -----
iqr = scipy.stats.iqr(df_full['Age'].values)
bin_size = iqr / 8

fig = go.Figure()
fig.add_trace(
    go.Histogram(
        x=df_base_train.loc[:, 'Age'].values,
        name=f"ipAGE data ({len(df_base_train.loc[:, 'Age'].values)})",
        showlegend=True,
        marker=dict(
            opacity=0.6,
            line=dict(
                width=0.1
            ),
        ),
        xbins=dict(size=bin_size)
    )
)
fig.add_trace(
    go.Histogram(
        x=df_base_train_2['Age'].values,
        name=f"New data ({len(df_base_train_2['Age'].values)})",
        showlegend=True,
        marker=dict(
            opacity=0.6,
            line=dict(
                width=0.1
            ),
        ),
        xbins=dict(size=bin_size)
    )
)
# The x-axis shows Age; the title used to be taken from the stale loop variable
# `f`, i.e. the name of the last feature iterated above.
add_layout(fig, "Age", "Count", "")
fig.update_layout(margin=go.layout.Margin(l=90, r=20, b=75, t=50, pad=0))
fig.update_layout(legend= {'itemsizing': 'constant'})
fig.update_layout(legend_font_size=20)
fig.update_layout({'colorway': ["lime", "fuchsia", "green"]}, barmode='overlay')
save_figure(fig, f"{path_models}/Age_hist")

# Persist both frames together with the prediction columns added in this cell.
df_base_train.to_excel(f"{path_models}/df_1.xlsx", index=True)
df_base_train_2.to_excel(f"{path_models}/df_2.xlsx", index=True)