In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from scipy import stats
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
from functools import reduce
from scipy.stats import kruskal, mannwhitneyu

# Init data

In [None]:
# Dataset identifier and root folder of the datasets tree.
dataset = "GSEUNN"
path = f"D:/YandexDisk/Work/pydnameth/datasets"

# The platform is looked up in the datasets registry and selects the manifest.
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform, path=path)

# Folder that receives every table and figure saved by this notebook.
path_save = f"{path}/{platform}/{dataset}/special/027_immuno_checking"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# Phenotype column names, with spaces replaced by underscores.
status_col, age_col, sex_col = (
    get_column_name(dataset, key).replace(' ','_')
    for key in ('Status', 'Age', 'Sex')
)
status_dict = get_status_dict(dataset)
sex_dict = get_sex_dict(dataset)
status_passed_fields = status_dict['Control'] + status_dict['Case']
continuous_vars = {}
categorical_vars = {
    status_col: [field.column for field in status_passed_fields],
    sex_col: list(sex_dict.values()),
}
pheno = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col="index")

# Immunology marker -> gene table; both columns are kept as feature lists.
df_immuno_genes = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/immuno_markers_genes.xlsx")
feats_immuno = df_immuno_genes['immuno_marker']
feats_gene = df_immuno_genes['gene']
dict_immuno_genes = dict(zip(feats_immuno, feats_gene))

# Age/Sex table used later as the starting point of the merged age/sex frame.
df_age_sex_base = pd.read_excel(f"{path}/{platform}/{dataset}/data/parsed_L_Q.xlsx", index_col="Code")

# Names (without extension) of the processed immunology workbooks to combine.
files = [
    "Aging L, Q, H, I",
    "Aging-Covid_05.01.2022",
    "Aging-Covid-05.05.22",
    "Covid_results_02_2021",
    "Covid-25.11.20",
    "MULTIPLEX_20_11_2020_ AGING",
    "Yakutiya + TR",
    "Мультиплекс_Agind&Covid",
]

# Yakutia and TR analysis: immunology and methylation checking

In [None]:
# Split sample IDs by where they occur: the combined immunology workbook,
# the DNAm phenotype table (`pheno`), or both.
df_file = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/files/Immunology data.xlsx", index_col="index")
indexes_dnam_only = list(pheno.index.difference(df_file.index))      # only in pheno
indexes_immuno_only = list(df_file.index.difference(pheno.index))    # only in the immunology file
indexes_dnam_immuno = list(df_file.index.intersection(pheno.index))  # present in both

# Yakutia and TR analysis: filling Age and Sex from the Yakutia file

In [None]:
# Fill the "Missed" Age / Sex entries of the Yakutia + TR workbook from the
# dedicated Yakutsk age/sex table and save the result under a new name.
df_file = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/files/processed/Yakutiya + TR.xlsx", index_col="Sample")
df_yak = pd.read_excel(f"{path}/{platform}/{dataset}/data/age_sex_Якутск.xlsx", index_col="index")
index_age_missed = list(df_file.index[df_file["Age"] == "Missed"])
index_sex_missed = list(df_file.index[df_file["Sex"] == "Missed"])
# Each column is filled for its own set of missing rows. Previously Age was
# filled using the rows where Sex was missing, so a sample with only Age
# missing was never filled.
df_file.loc[index_age_missed, "Age"] = df_yak.loc[index_age_missed, "Age"]
df_file.loc[index_sex_missed, "Sex"] = df_yak.loc[index_sex_missed, "Sex"]
# NOTE(review): the output name ("YakutiyaxxxTR1") differs from both the input
# above and the "Yakutiya_TR" file read by the next cell - confirm it is intended.
df_file.to_excel(f"{path}/{platform}/{dataset}/data/immuno/files/processed/YakutiyaxxxTR1.xlsx", index_label="Sample")

# Yakutia and TR analysis: filling Age, Sex, Status from DNAm

In [None]:
# Copy Age / Sex / Status from the DNAm phenotype table for samples present
# there; samples found only in the immunology file are marked "Missed".
df_file = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/files/processed/Yakutiya_TR.xlsx", index_col="Sample")
indexes_dnam_immuno = list(df_file.index.intersection(pheno.index))
indexes_immuno_only = list(df_file.index.difference(pheno.index))
for column in ("Age", "Sex", "Status"):
    df_file.loc[indexes_dnam_immuno, column] = pheno.loc[indexes_dnam_immuno, column]
    df_file.loc[indexes_immuno_only, column] = "Missed"
df_file.to_excel(f"{path}/{platform}/{dataset}/data/immuno/files/processed/Yakutiya_TR1.xlsx", index_label="Sample")

# Data processing: adding columns, removing duplicates, filling nans

In [None]:
# Replacement strategy for below-threshold readings (cells matching '^<.*$'):
#   'half'    -> half of the number parsed from the cell itself;
#   'quarter' -> per feature, 0.25 * min(smallest numeric value, smallest threshold).
replace_method = 'quarter'
df_data_files = {}     # file -> numeric immunology table (feats_immuno columns)
df_age_sex_files = {}  # file -> Sex / Age / Region columns (only when all three exist)
df_status_files = {}   # file -> Status and source-file columns
df_nans = {}           # file -> percentage of non-numeric cells
for file in files:
    print(file)
    df_file = pd.read_excel(f"{path}/{platform}/{dataset}/data/immuno/files/processed/{file}.xlsx", index_col="Sample")
    # Unify Russian / English column names and sex labels.
    df_file.rename(columns={'Пол': 'Sex', 'Gender': 'Sex', 'Возраст': 'Age'}, inplace=True)
    df_file.replace({'ж': 'F', 'м': 'M', 'Ж': 'F', 'М': 'M'}, inplace=True)

    df_file.index = df_file.index.map(str)

    # case control setup: Status is derived from the sample-ID pattern of each file
    # (an earlier version labelled every non-control sample simply as "Case").
    if file == "Aging L, Q, H, I":
        df_file["Status"] = "Control"
        df_file["Region"] = "Central"
        df_file.loc[df_file.index.str.startswith(('H')), "Status"] = "ESRD"
        df_file.loc[df_file.index.str.startswith(('Q')), "Status"] = "DownSyndrome"
    elif file == "Aging-Covid_05.01.2022":
        df_file["Region"] = "Central"
        df_file["Status"] = "Control"
    elif file == "Aging-Covid-05.05.22":
        df_file["Region"] = "Central"
        df_file["Status"] = "TR"
        df_file.loc[df_file.index.str.startswith(('C')), "Status"] = "COVID"
        df_file.loc[df_file.index.str.contains(r'^TD.*before.*$'), "Status"] = "Control"
        df_file.loc[df_file.index.str.contains(r'^L\d*$'), "Status"] = "Control"
    elif file == "Covid_results_02_2021":
        df_file["Region"] = "Central"
        df_file["Status"] = "COVID"
        df_file.loc[df_file.index.str.startswith(('I')), "Status"] = "Control"
        df_file.loc[df_file.index.str.startswith(('H')), "Status"] = "ESRD"
    elif file == "Covid-25.11.20":
        df_file["Region"] = "Central"
        df_file["Status"] = "COVID"
        df_file.loc[df_file.index.str.startswith(('I')), "Status"] = "Control"
    elif file == "MULTIPLEX_20_11_2020_ AGING":
        df_file["Region"] = "Central"
        df_file["Status"] = "ESRD"
        df_file.loc[df_file.index.str.startswith(('I')), "Status"] = "Control"
    elif file == "Yakutiya + TR":
        # Nothing is assigned here; "Status" must already be a column of the
        # workbook because it is selected unconditionally at the end of the loop.
        pass
    elif file == "Мультиплекс_Agind&Covid":
        df_file["Region"] = "Central"
        df_file["Status"] = "COVID"
        df_file.loc[df_file.index.str.startswith(('F', 'MQ', 'L', 'FQ', 'S', 'TD', 'I')), "Status"] = "Control"
        df_file.loc[df_file.index.str.startswith(('H')), "Status"] = "ESRD"

    df_file['file'] = file

    # duplicates processing
    if file == "MULTIPLEX_20_11_2020_ AGING":
        df_file_doubled_unique = df_file.loc[~df_file.index.duplicated(keep=False), :]
        # duplicated(keep='first') flags every occurrence EXCEPT the first one,
        # duplicated(keep='last') flags every occurrence EXCEPT the last one.
        # Hence "_1" holds the later rows and "_2" the earlier rows of each ID;
        # the earlier rows ("_2") are the ones kept.
        df_file_doubled_1 = df_file.loc[df_file.index.duplicated(keep='first'), :]
        df_file_doubled_2 = df_file.loc[df_file.index.duplicated(keep='last'), :]
        df_file_duplicates_final = pd.concat([df_file_doubled_2, df_file_doubled_unique], axis=0)
        df_file = df_file_duplicates_final
    df_file_duplicates = df_file.loc[df_file.index.duplicated(keep=False), :]
    if df_file_duplicates.shape[0] > 0:
        print(df_file_duplicates.index)

    # NOTE(review): `x[2::]` below skips two leading characters of a "<..." cell,
    # i.e. it presumably expects the form "< 1.23" - confirm against the workbooks.
    if replace_method == 'half':
        thld_low_values = set()
        for feat in feats_immuno:
            df_file[feat] = df_file[feat].astype(str)
            # set().union(...) also works when the file has no rows
            # (set.union(*[]) raises TypeError).
            thld_low_values.update(set().union(*df_file[feat].str.findall(r'^(<.*)$').apply(set).to_list()))
        for val in thld_low_values:
            target_val = float(val[2::]) * 0.5
            df_file.replace({val: target_val}, inplace=True)
    elif replace_method == "quarter":
        for feat in feats_immuno:
            thld_low_values = set().union(*df_file[feat].astype(str).str.findall(r'^(<.*)$').apply(set).to_list())
            if len(thld_low_values) > 0:
                values = pd.to_numeric(df_file[feat], errors='coerce').values
                values = values[~np.isnan(values)]
                thld_val = min([float(x[2::]) for x in thld_low_values])
                # If the feature has no numeric reading at all, fall back to the
                # threshold alone instead of failing on np.min of an empty array.
                min_val = np.min(values) if values.size > 0 else thld_val
                replace_val = 0.25 * min(min_val, thld_val)
                replace_dict = {x: replace_val for x in thld_low_values}
                # Assign the result back: an inplace replace on the `df_file[feat]`
                # selection is a chained operation and is not guaranteed to modify
                # `df_file` (it never does under pandas Copy-on-Write).
                df_file[feat] = df_file[feat].replace(replace_dict)

    # Any remaining "<..." / ">..." cell becomes NaN; everything else is parsed as a number.
    df_data_file = (
        df_file.loc[:, feats_immuno]
        .replace(r'^([<>].*)$', 'NaN', regex=True)
        .apply(pd.to_numeric, errors='coerce')
    )
    df_data_files[file] = df_data_file

    df_nans[file] = df_data_file.isna().sum().sum() / (df_data_file.shape[0] * len(feats_immuno)) * 100

    if {'Sex', 'Age', 'Region'}.issubset(df_file.columns):
        df_age_sex_files[file] = df_file.loc[:, ['Sex', 'Age', 'Region']]

    df_status_files[file] = df_file.loc[:, ['Status', 'file']]

# Get union dataframes, check integrity

In [None]:
# Each per-file table is stacked onto a running result. When a sample ID is
# already present in the result, the older rows are dropped first, so files
# later in the list take precedence.

# Age / Sex / Region: start from the base table; rows with any NaN are skipped.
df_age_sex_res = df_age_sex_base.copy()
for file, df_age_sex_file in df_age_sex_files.items():
    print(file)
    tmp = df_age_sex_file.dropna(axis=0)
    intersection = set(tmp.index) & set(df_age_sex_res.index)
    if intersection:
        print(f"Overwrite intersection in age_sex: {intersection}")
        df_age_sex_res = df_age_sex_res[~df_age_sex_res.index.isin(intersection)]
    df_age_sex_res = pd.concat([df_age_sex_res, tmp], verify_integrity=True)

# Immunology measurements.
df_data_res = pd.DataFrame()
for file, df_data_file in df_data_files.items():
    print(file)
    intersection = set(df_data_file.index) & set(df_data_res.index)
    if intersection:
        print(f"Overwrite intersection in data: {intersection}")
        df_data_res = df_data_res[~df_data_res.index.isin(intersection)]
    df_data_res = pd.concat([df_data_res, df_data_file], verify_integrity=False)

# Status and source file.
df_status_res = pd.DataFrame()
for file, df_status_file in df_status_files.items():
    print(file)
    intersection = set(df_status_file.index) & set(df_status_res.index)
    if intersection:
        print(f"Overwrite intersection in status: {intersection}")
        df_status_res = df_status_res[~df_status_res.index.isin(intersection)]
    df_status_res = pd.concat([df_status_res, df_status_file], verify_integrity=False)

# Get result dataframe

In [None]:
# Measurements joined with Status / file (inner join on the sample ID).
df_all = pd.merge(df_data_res, df_status_res, left_index=True, right_index=True)

# Samples without an Age/Sex/Region record are kept aside and removed from df_all.
only_data_ids = list(set(df_all.index) - set(df_age_sex_res.index))
if only_data_ids:
    print(f"only_data_ids: {only_data_ids}")
    df_missed_age_sex = df_all.loc[only_data_ids, :]
    df_all = df_all.drop(only_data_ids)
df_all = pd.merge(df_all, df_age_sex_res.loc[:, ['Sex', 'Age', 'Region']], left_index=True, right_index=True)
df_all.index.name = 'ID'
# Columns switch from immunology marker names to gene names here.
df_all = df_all.rename(columns=dict_immuno_genes)

# Earlier sample sets; the legacy tables get three sample IDs renamed to the
# IDs used in the current files.
legacy_id_map = {'I64_1': 'I64', 'TD2': 'TD2 before', 'TD4': 'TD4 before'}
df_paper_trn = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/legacy/data_paper_train.xlsx", index_col='index').rename(index=legacy_id_map)
df_paper_all = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/legacy/data_paper.xlsx", index_col='index').rename(index=legacy_id_map)
df_previous = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/legacy/data_all.xlsx", index_col='index').rename(index=legacy_id_map)
df_thld_25 = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/legacy/data_thld_25.xlsx", index_col='index').rename(index=legacy_id_map)
df_260ai = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/260_imp(fast_knn)_replace(quarter).xlsx", index_col='index')
df_345ai = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/345_imp(fast_knn)_replace(quarter).xlsx", index_col='index')
df_is_dnam = pd.read_excel(f"{path}/{platform}/{dataset}/pheno.xlsx", index_col='index').drop("I64_old")

# One boolean membership column per reference set. The .loc assignment
# requires every reference ID to exist in df_all.
membership_flags = (
    ('ipAGE_trn_set', df_paper_trn),
    ('ipAGE_all_set', df_paper_all),
    ('previous_set', df_previous),
    ('thld_25_set', df_thld_25),
    ('260ai', df_260ai),
    ('345ai', df_345ai),
    ('is_dnam', df_is_dnam),
)
for flag_name, df_reference in membership_flags:
    df_all[flag_name] = False
    df_all.loc[df_reference.index.values, flag_name] = True

df_all.index.name = 'index'
df_all.to_excel(f"{path_save}/df_all_replace({replace_method}).xlsx")

# Control-only subset.
df_ctrl = df_all[df_all['Status'] == 'Control']
df_ctrl.index.name = 'index'
df_ctrl.to_excel(f"{path_save}/df_ctrl_replace({replace_method}).xlsx")

In [None]:
# Impute the remaining NaNs of the gene columns with the selected impyute method.
df_all_imp = df_all.copy()
imputation = 'fast_knn'
fast_knn_k = 1
is_nans = df_all_imp.loc[:, feats_gene].isnull().values.any()
if is_nans:
    n_nans = df_all_imp.loc[:, feats_gene].isna().sum().sum()
    print(f"Perform imputation for {n_nans} missed values")
    df_all_imp.loc[:, feats_gene] = df_all_imp.loc[:, feats_gene].astype('float')
    # Method name -> callable taking the raw value matrix. `random`, `mean`,
    # `median` and `mode` are the impyute functions imported at the top.
    imputers = {
        "median": median,
        "mean": mean,
        "fast_knn": lambda values: fast_knn(values, k=fast_knn_k),
        "random": random,
        "mice": mice,
        "em": em,
        "mode": mode,
    }
    if imputation not in imputers:
        raise ValueError(f"Unsupported imputation: {imputation}")
    imputed_training = imputers[imputation](df_all_imp.loc[:, feats_gene].values)
    df_all_imp.loc[:, feats_gene] = imputed_training
n_nans_after = df_all_imp.loc[:, feats_gene].isna().sum().sum()
print(f"Number of nans after imputation: {n_nans_after}")

df_all_imp.index.name = 'index'
df_all_imp.to_excel(f"{path_save}/df_all_imp({imputation})_replace({replace_method}).xlsx")

# Control-only subset of the imputed table.
df_ctrl_imp = df_all_imp[df_all_imp['Status'] == 'Control']
df_ctrl_imp.index.name = 'index'
df_ctrl_imp.to_excel(f"{path_save}/df_ctrl_imp({imputation})_replace({replace_method}).xlsx")

# Plot the number of nans in each file

In [None]:
# One bar per source file: percentage of non-numeric cells (from df_nans).
fig = go.Figure()
for file in df_nans:
    bar = go.Bar(
        name=f'{file} ({df_data_files[file].shape[0]})',
        x=[file],
        y=[df_nans[file]],
        text=f'{df_nans[file]:0.3f}%',
        textposition='auto',
        orientation='v',
    )
    fig.add_trace(bar)
add_layout(fig, f"", "% of non-numeric values", f"")
fig.update_layout(
    colorway=px.colors.qualitative.Set1,
    title_xref='paper',
    autosize=False,
    margin=go.layout.Margin(l=100, r=20, b=20, t=90, pad=0),
    legend_font_size=12,
    showlegend=True,
)
# File names are shown in the legend instead of on the x axis.
fig.update_xaxes(showticklabels=False)
fig.update_traces(textposition='auto')
save_figure(fig, f"{path_save}/nan_by_file")


# Plot the number of samples shared with the ipAGE train data, per file

In [None]:

# ipAGE paper tables, with three sample IDs renamed to the IDs used in the
# current files.
paper_id_map = {'I64_1': 'I64', 'TD2': 'TD2 before', 'TD4': 'TD4 before'}
df_paper_train = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_paper_train.xlsx", index_col='index').rename(index=paper_id_map)
df_paper = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_paper.xlsx", index_col='index').rename(index=paper_id_map)

# One bar per source file: number of its samples that are in the ipAGE train set.
fig = go.Figure()
for file in df_nans:
    y = len(set(df_paper_train.index) & set(df_data_files[file].index))
    bar = go.Bar(
        name=f'{file} ({df_data_files[file].shape[0]})',
        x=[file],
        y=[y],
        text=f'{y:d}',
        textposition='auto',
        orientation='v',
    )
    fig.add_trace(bar)
add_layout(fig, f"", f"Intersection with {df_paper_train.shape[0]} ipAGE train", f"")
fig.update_layout(
    colorway=px.colors.qualitative.Set1,
    title_xref='paper',
    autosize=False,
    margin=go.layout.Margin(l=100, r=20, b=20, t=90, pad=0),
    legend_font_size=12,
    showlegend=True,
)
fig.update_xaxes(showticklabels=False)
fig.update_traces(textposition='auto')
save_figure(fig, f"{path_save}/ipAGE_intersection_by_file")

# Plot the nans distribution by person in files

In [None]:
# Grid of histograms (one subplot per file): distribution over samples of the
# percentage of non-numeric values.
num_cols = 2
num_rows = int(np.ceil(len(df_data_files) / num_cols))
bin_size = 3

# Styling shared by every axis of the grid. `title_font` replaces the
# deprecated `titlefont` attribute, which newer plotly releases reject.
axis_style = dict(
    autorange=True,
    showgrid=True,
    zeroline=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    title_font=dict(color='black', size=20),
    showticklabels=True,
    tickangle=0,
    tickfont=dict(color='black', size=20),
    exponentformat='e',
    showexponent='all',
)

fig = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False, shared_xaxes=True)
for file_id, file in enumerate(df_data_files):
    # Fill the grid row by row.
    r_id, c_id = divmod(file_id, num_cols)

    # Per-sample percentage of features without a numeric value.
    x_base = df_data_files[file].isna().sum(axis=1).values / len(feats_immuno) * 100

    show_legend = True
    fig.add_trace(
        go.Histogram(
            x=x_base,
            name=f'{file} ({df_data_files[file].shape[0]})',
            showlegend=show_legend,
            marker=dict(opacity=0.9, line=dict(width=0.01)),
            xbins=dict(size=bin_size)
        ),
        row=r_id + 1,
        col=c_id + 1
    )
    fig.update_xaxes(title_text=f"% of non-numeric values", row=r_id + 1, col=c_id + 1, **axis_style)
    fig.update_yaxes(title_text="Number of subjects", row=r_id + 1, col=c_id + 1, **axis_style)

fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="center",
        x=0.5
    ),
    title=dict(
        text="",
        font=dict(size=25)
    ),
    template="none",
    autosize=False,
    width=1200,
    height=1800,
    margin=go.layout.Margin(
        l=100,
        r=40,
        b=100,
        t=100,
        pad=0
    )
)
fig.update_layout(legend_font_size=20)
fig.update_layout(legend={'itemsizing': 'constant'})
fig.update_layout({'colorway': px.colors.qualitative.Set1}, barmode='overlay')
save_figure(fig, f"{path_save}/nan_hist_by_file")

# Plot the percentage of nans by feature in files

In [None]:
# Grid of bar charts (one subplot per immunology feature): percentage of
# non-numeric values of that feature in every source file.
num_cols = 5
num_rows = int(np.ceil(len(feats_immuno) / num_cols))
bin_size = 2

# Styling shared by every axis of the grid. `title_font` replaces the
# deprecated `titlefont` attribute, which newer plotly releases reject.
axis_style = dict(
    autorange=True,
    showgrid=True,
    zeroline=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    title_font=dict(color='black', size=20),
    tickangle=0,
    tickfont=dict(color='black', size=20),
    exponentformat='e',
    showexponent='all',
)

fig = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False)
for f_id, f in enumerate(feats_immuno):
    r_id, c_id = divmod(f_id, num_cols)

    # Legend entries are emitted by a single subplot only, so that each file
    # appears once in the legend.
    show_legend = False
    if f_id == 1:
        show_legend = True

    for file_id, file in enumerate(df_data_files):
        y = df_data_files[file][f].isna().sum() / df_data_files[file].shape[0] * 100
        fig.add_trace(
            go.Bar(
                x=[f'{file} ({df_data_files[file].shape[0]})'],
                y=[y],
                name=f'{file} ({df_data_files[file].shape[0]})',
                text=f'{y:0.2f}',
                textposition='auto',
                orientation='v',
                showlegend=show_legend,
            ),
            row=r_id + 1,
            col=c_id + 1
        )
    # The x axis carries the feature name as its title; file names are in the legend.
    fig.update_xaxes(title_text=f, row=r_id + 1, col=c_id + 1, showticklabels=False, **axis_style)
    fig.update_yaxes(title_text=f"% of non-numeric values", row=r_id + 1, col=c_id + 1, showticklabels=True, **axis_style)
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="center",
        x=0.5
    ),
    title=dict(
        text="",
        font=dict(size=25)
    ),
    template="none",
    autosize=False,
    width=3000,
    height=4000,
    margin=go.layout.Margin(
        l=100,
        r=40,
        b=100,
        t=100,
        pad=0
    )
)
fig.update_layout(legend_font_size=50)
fig.update_layout(legend={'itemsizing': 'constant'})
# Restrict the colorway to one colour per file so colours repeat identically in every subplot.
fig.update_layout({'colorway': px.colors.qualitative.Set1[0:len(df_data_files)]})
save_figure(fig, f"{path_save}/nan_bar_by_feature")

In [None]:
# NOTE(review): this cell repeats the preceding cell and writes to the same
# output path ("nan_bar_by_feature") - consider removing one of the two.
#
# Grid of bar charts (one subplot per immunology feature): percentage of
# non-numeric values of that feature in every source file.
num_cols = 5
num_rows = int(np.ceil(len(feats_immuno) / num_cols))
bin_size = 2

# Styling shared by every axis of the grid. `title_font` replaces the
# deprecated `titlefont` attribute, which newer plotly releases reject.
axis_style = dict(
    autorange=True,
    showgrid=True,
    zeroline=False,
    linecolor='black',
    showline=True,
    gridcolor='gainsboro',
    gridwidth=0.05,
    mirror=True,
    ticks='outside',
    title_font=dict(color='black', size=20),
    tickangle=0,
    tickfont=dict(color='black', size=20),
    exponentformat='e',
    showexponent='all',
)

fig = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False)
for f_id, f in enumerate(feats_immuno):
    r_id, c_id = divmod(f_id, num_cols)

    # Legend entries are emitted by a single subplot only, so that each file
    # appears once in the legend.
    show_legend = False
    if f_id == 1:
        show_legend = True

    for file_id, file in enumerate(df_data_files):
        y = df_data_files[file][f].isna().sum() / df_data_files[file].shape[0] * 100
        fig.add_trace(
            go.Bar(
                x=[f'{file} ({df_data_files[file].shape[0]})'],
                y=[y],
                name=f'{file} ({df_data_files[file].shape[0]})',
                text=f'{y:0.2f}',
                textposition='auto',
                orientation='v',
                showlegend=show_legend,
            ),
            row=r_id + 1,
            col=c_id + 1
        )
    # The x axis carries the feature name as its title; file names are in the legend.
    fig.update_xaxes(title_text=f, row=r_id + 1, col=c_id + 1, showticklabels=False, **axis_style)
    fig.update_yaxes(title_text=f"% of non-numeric values", row=r_id + 1, col=c_id + 1, showticklabels=True, **axis_style)
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="center",
        x=0.5
    ),
    title=dict(
        text="",
        font=dict(size=25)
    ),
    template="none",
    autosize=False,
    width=3000,
    height=4000,
    margin=go.layout.Margin(
        l=100,
        r=40,
        b=100,
        t=100,
        pad=0
    )
)
fig.update_layout(legend_font_size=50)
fig.update_layout(legend={'itemsizing': 'constant'})
# Restrict the colorway to one colour per file so colours repeat identically in every subplot.
fig.update_layout({'colorway': px.colors.qualitative.Set1[0:len(df_data_files)]})
save_figure(fig, f"{path_save}/nan_bar_by_feature")

# Check intersection with subset from ipAGE paper

In [None]:
# Every sample of the ipAGE paper table is expected to be among the controls
# of the freshly assembled table (df_ctrl); the cell aborts otherwise.
paper_id_map = {'I64_1': 'I64', 'TD2': 'TD2 before', 'TD4': 'TD4 before'}
df_paper = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_paper.xlsx", index_col='index').rename(index=paper_id_map)
df_paper_train = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_paper_train.xlsx", index_col='index').rename(index=paper_id_map)
only_paper_ids = list(set(df_paper.index) - set(df_ctrl.index))
if only_paper_ids:
    # NOTE(review): `raise Warning` stops execution like any other exception;
    # if a non-fatal notice was intended, warnings.warn would be the tool.
    raise Warning(f"only_paper_ids: {only_paper_ids}")

# Plot the number of nans by features in controls

In [None]:
# Horizontal bar chart: percentage of non-numeric values per feature among controls.
#
# df_ctrl is a subset of df_all, whose columns were renamed from immunology
# marker names to gene names, so the columns are selected with feats_gene
# (selecting with feats_immuno raises KeyError whenever the two names differ).
# feats_gene and feats_immuno are columns of the same table, so values and
# marker labels stay aligned by position.
nans_features = df_ctrl.loc[:, feats_gene].isna().sum(axis=0).values / df_ctrl.shape[0] * 100
df_nan_feature = pd.DataFrame({'feature': feats_immuno, 'nan': nans_features})
df_nan_feature.sort_values(['nan'], ascending=[False], inplace=True)
# Bars are drawn at integer positions and labelled with feature names via tick text.
bar_positions = list(range(len(df_nan_feature['nan'])))
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=df_nan_feature['nan'],
        y=bar_positions,
        orientation='h',
        marker=dict(color='red', opacity=0.9)
    )
)
add_layout(fig, "% of non-numeric values", "", f"")
fig.update_layout({'colorway': ['red', 'black']})
fig.update_layout(legend_font_size=20)
fig.update_layout(showlegend=False)
fig.update_layout(
    yaxis = dict(
        tickmode = 'array',
        tickvals = bar_positions,
        ticktext = df_nan_feature['feature']
    )
)
# Fixed y range with one unit of padding on each side of the bars.
fig.update_yaxes(autorange=False)
fig.update_layout(yaxis_range=[-1, len(df_nan_feature['feature'])])
fig.update_yaxes(tickfont_size=24)
fig.update_xaxes(tickfont_size=30)
fig.update_layout(
    autosize=False,
    width=800,
    height=1400,
    margin=go.layout.Margin(
        l=175,
        r=20,
        b=100,
        t=40,
        pad=0
    )
)
save_figure(fig, f"{path_save}/nan_by_feature")

# DONE: duplicates choosing from file

In [None]:
df_file_doubled = df_data_files["MULTIPLEX_20_11_2020_ AGING"].copy()
df_file_doubled_unique = df_file_doubled.loc[~df_file_doubled.index.duplicated(keep=False), :]
df_file_doubled_1 = df_file_doubled.loc[df_file_doubled.index.duplicated(keep='first'), :]
df_file_doubled_2 = df_file_doubled.loc[df_file_doubled.index.duplicated(keep='last'), :]
df_file_doubled_2 = df_file_doubled_2.loc[df_file_doubled_1.index, :]
df_file_doubled_sub = df_file_doubled_2.sub(df_file_doubled_1)

na_columns_doubled = df_file_doubled.columns[df_file_doubled.isna().any()].tolist()
na_columns_doubled_1 = df_file_doubled_1.columns[df_file_doubled_1.isna().any()].tolist()
na_columns_doubled_2 = df_file_doubled_2.columns[df_file_doubled_2.isna().any()].tolist()

na_count_doubled = df_file_doubled.isna().sum(axis=0)
na_count_doubled_1 = df_file_doubled_1.isna().sum(axis=0)
na_count_doubled_2 = df_file_doubled_2.isna().sum(axis=0)

num_cols = 5
num_rows = int(np.ceil(len(feats_immuno) / num_cols))
pathlib.Path(f"{path_save}/MULTIPLEX_20_11_2020_ AGING").mkdir(parents=True, exist_ok=True)
test_mw = {'features': feats_immuno, 'pvals': []}
for f in feats_immuno:
    x_1 = df_file_doubled_1[f].dropna().values
    x_2 = df_file_doubled_2[f].dropna().values
    stat, pval = mannwhitneyu(x_1, x_2, alternative='two-sided')
    test_mw['pvals'].append(pval)
_, test_mw['pvals_fdr'], _, _ = multipletests(test_mw['pvals'], 0.05, method='fdr_bh')
titles = [f"{test_mw['features'][x]} ({dict_immuno_genes[test_mw['features'][x]]}) Mann-Whitney p-value: {test_mw['pvals_fdr'][x]:0.2e}" for x in range(len(test_mw['pvals_fdr']))]
test_mw_df = pd.DataFrame(test_mw)
test_mw_df.to_excel(f"{path_save}/MULTIPLEX_20_11_2020_ AGING/features_mw.xlsx", index=False)

fig_features_vio = make_subplots(rows=num_rows, cols=num_cols, shared_yaxes=False, subplot_titles=titles)
for f_id, f in enumerate(feats_immuno):
    r_id, c_id = divmod(f_id, num_cols)
    iqr = scipy.stats.iqr(df_file_doubled[f].dropna().values)
    bin_size = iqr / 8

    q1 = df_file_doubled[f].dropna().quantile(0.25)
    q3 = df_file_doubled[f].dropna().quantile(0.75)
    iqr = q3 - q1
    filter_1 = (df_file_doubled_1[f] >= q1 - 2.5 * iqr) & (df_file_doubled_1[f] <= q3 + 2.5 * iqr)
    df_file_doubled_1[f"{f}_outlier"] = filter_1
    filter_2 = (df_file_doubled_2[f] >= q1 - 2.5 * iqr) & (df_file_doubled_2[f] <= q3 + 2.5 * iqr)
    df_file_doubled_2[f"{f}_outlier"] = filter_2

    x_1 = df_file_doubled_1[f].dropna().values
    x_2 = df_file_doubled_2[f].dropna().values
    x_diff = df_file_doubled_sub[f].dropna().values

    show_legend = False
    if f_id == 2:
        show_legend = True

    fig_features_vio.add_trace(
        go.Violin(
            y=x_1,
            name=f"First",
            showlegend=show_legend,
            box_visible=True,
            meanline_visible=True,
            line_color='black',
            fillcolor="lime",
            marker=dict(color="lime", line=dict(color='black', width=0.1), opacity=0.8),
            points='all',
            opacity=0.8,
            scalemode='width',
            width=0.5
        ),
        row=r_id + 1,
        col=c_id + 1
    )
    fig_features_vio.add_trace(
        go.Violin(
            y=x_2,
            name=f"Second",
            showlegend=show_legend,
            box_visible=True,
            meanline_visible=True,
            line_color='black',
            fillcolor="fuchsia",
            marker=dict(color="fuchsia", line=dict(color='black', width=0.1), opacity=0.8),
            points='all',
            opacity=0.8,
            scalemode='width',
            width=0.5
        ),
        row=r_id + 1,
        col=c_id + 1
    )
    fig_features_vio.add_trace(
        go.Violin(
            y=x_diff,
            name=f"Diff",
            showlegend=show_legend,
            box_visible=True,
            meanline_visible=True,
            line_color='black',
            fillcolor="red",
            marker=dict(color="red", line=dict(color='black', width=0.1), opacity=0.8),
            points='all',
            opacity=0.8,
            scalemode='width',
            width=0.5
        ),
        row=r_id + 1,
        col=c_id + 1
    )
    fig_features_vio.update_xaxes(
        autorange=True,
        title_text="",
        row=r_id + 1,
        col=c_id + 1,
        showgrid=True,
        zeroline=False,
        linecolor='black',
        showline=True,
        gridcolor='gainsboro',
        gridwidth=0.05,
        mirror=True,
        ticks='outside',
        titlefont=dict(
            color='black',
            size=20
        ),
        showticklabels=True,
        tickangle=0,
        tickfont=dict(
            color='black',
            size=20
        ),
        exponentformat='e',
        showexponent='all'
    )
    fig_features_vio.update_yaxes(
        autorange=True,
        title_text=f"{f} ({dict_immuno_genes[f]})",
        row=r_id + 1,
        col=c_id + 1,
        showgrid=True,
        zeroline=False,
        linecolor='black',
        showline=True,
        gridcolor='gainsboro',
        gridwidth=0.05,
        mirror=True,
        ticks='outside',
        titlefont=dict(
            color='black',
            size=20
        ),
        showticklabels=True,
        tickangle=0,
        tickfont=dict(
            color='black',
            size=20
        ),
        exponentformat='e',
        showexponent='all'
    )

# Finalize the violin grid: apply the figure-wide layout in a single call,
# switch every violin to count-based scaling, then export the figure.
fig_features_vio.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="center",
        x=0.5,
        font=dict(size=50),
        itemsizing='constant',
    ),
    title=dict(text="", font=dict(size=25)),
    template="none",
    autosize=False,
    width=3000,
    height=4000,
    margin=go.layout.Margin(l=100, r=40, b=100, t=100, pad=0),
    colorway=["lime", "fuchsia"],
    barmode='overlay',
)
# Overrides the per-trace scalemode='width' that was set when the violins were added.
fig_features_vio.update_traces(scalemode='count')
save_figure(fig_features_vio, f"{path_save}/MULTIPLEX_20_11_2020_ AGING/features_vio")

# Grouped bar chart: percentage of NaN cells in the first vs. second measurement.
# The NaN count is taken over every column of the frame and normalised by
# (number of rows) x (number of immuno features).
fig = go.Figure()
for bar_name, bar_color, df_measure in (
    ('First', 'lime', df_file_doubled_1),
    ('Second', 'fuchsia', df_file_doubled_2),
):
    nan_pct = df_measure.isna().sum().sum() / (df_measure.shape[0] * len(feats_immuno)) * 100
    fig.add_trace(
        go.Bar(
            name=bar_name,
            x=['Below/above threshold or NAN'],
            y=[nan_pct],
            text=f'{nan_pct:0.3f}%',
            textposition='auto',
            orientation='v',
            marker=dict(color=bar_color, opacity=0.9)
        )
    )
fig.update_layout(barmode='group')
add_layout(fig, f"", "Percentage of all cells", f"")
# Tweaks applied after the project-wide layout helper so that they take precedence.
fig.update_layout(
    title_xref='paper',
    autosize=False,
    margin=go.layout.Margin(l=100, r=20, b=100, t=90, pad=0),
    legend_font_size=20,
    showlegend=True,
)
fig.update_traces(textposition='auto')
save_figure(fig, f"{path_save}/MULTIPLEX_20_11_2020_ AGING/bad_cells")

# Age correlation and percentage of missing values

In [None]:
def _finalize_feature_bar_fig(fig, x_title, colorway, tick_labels, save_path):
    """Apply the shared layout of the horizontal per-feature bar charts and save the figure.

    Bars are expected at integer y positions 0..len(tick_labels)-1; each position is
    labelled with the matching entry of ``tick_labels``. Tick positions and the y-range
    are derived from ``tick_labels`` itself, so every figure is sized from its own data.
    """
    n_bars = len(tick_labels)
    add_layout(fig, x_title, "", f"")
    fig.update_layout(
        colorway=colorway,
        legend_font_size=20,
        showlegend=False,
        yaxis=dict(
            tickmode='array',
            tickvals=list(range(n_bars)),
            ticktext=tick_labels
        ),
        autosize=False,
        width=800,
        height=1400,
        margin=go.layout.Margin(l=175, r=20, b=100, t=40, pad=0),
    )
    # Fixed range with half a bar of padding on each side instead of autoranging.
    fig.update_yaxes(autorange=False, range=[-1, n_bars], tickfont_size=24)
    fig.update_xaxes(tickfont_size=30)
    save_figure(fig, save_path)


# For every gene feature: Pearson correlation with age (p-values corrected with
# Benjamini-Hochberg FDR) and the fraction of samples whose value is exactly 0.
df_base_train = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_paper_train.xlsx", index_col='index')
ages_train = df_base_train.loc[:, 'Age'].values  # identical for every feature, so read once
dict_corr_miss = {'feature': feats_gene, 'pvals': [], 'missed': []}
for f in feats_gene:
    _, pval = stats.pearsonr(ages_train, df_base_train.loc[:, f].values)
    dict_corr_miss['pvals'].append(pval)
    # Exact zeros are what is counted as "missed" (presumably how out-of-range/NaN
    # measurements were encoded upstream — TODO confirm).
    dict_corr_miss['missed'].append((df_base_train[f] == 0).sum() / df_base_train.shape[0])
_, dict_corr_miss['pvals_fdr'], _, _ = multipletests(dict_corr_miss['pvals'], 0.05, method='fdr_bh')
df_corr_miss = (
    pd.DataFrame(dict_corr_miss)
    .set_index('feature')
    .sort_values(['pvals_fdr'], ascending=[True])
)

# Reverse the sorted order so that the smallest adjusted p-value ends up at the
# highest y position, i.e. at the top of the horizontal bar charts.
xs_1 = -np.log10(df_corr_miss.loc[:, 'pvals_fdr'].values)[::-1]
xs_2 = df_corr_miss.loc[:, 'missed'].values[::-1]
ys = df_corr_miss.index.values[::-1]

pathlib.Path(f"{path_save}/corr_miss").mkdir(parents=True, exist_ok=True)

# Figure 1: -log10 of the FDR-adjusted p-values, with a dashed line at p = 0.05.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=xs_1,
        y=list(range(len(xs_1))),
        orientation='h',
        marker=dict(color='red', opacity=0.9)
    )
)
fig.add_trace(
    go.Scatter(
        x=[-np.log10(0.05), -np.log10(0.05)],
        y=[-1, len(xs_1)],
        showlegend=False,
        mode='lines',
        line=dict(color='black', width=2, dash='dash')
    )
)
# Plain-text axis title: MathJax is switched off for kaleido export in the setup cell.
_finalize_feature_bar_fig(fig, "-log10(p-value)", ['red', 'black'], ys, f"{path_save}/corr_miss/corr")

# Figure 2: fraction of zero-valued samples per feature, in the same feature order.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=xs_2,
        y=list(range(len(xs_2))),
        orientation='h',
        marker=dict(color='blue', opacity=0.9)
    )
)
_finalize_feature_bar_fig(fig, "Below/above threshold or NAN", ['blue'], ys, f"{path_save}/corr_miss/missed")

# Checking ipAGE on duplicate:

In [None]:
# Apply the saved ipAGE linear clock to the duplicate measurements and report its
# mean absolute error against chronological age, for all samples and for the subset
# that is also present in data_all.xlsx.
ipAGE_df = pd.read_excel(f"{path}/{platform}/{dataset}/special/011_immuno_part3_and_part4_check_clocks/legacy/Control/v2/clock.xlsx")
df_full = pd.read_excel(f"{path}/{platform}/{dataset}/special/021_ml_data/immuno/data_all.xlsx", index_col='index')
ipAGE_feats = ipAGE_df['feature'].to_list()
ipAGE_coefs = ipAGE_df['coef'].to_list()

# NOTE(review): the frame is named "dupl_2" but is built from df_file_doubled_1 —
# confirm which of the two measurements is intended here.
df_dupl_2_for_ipAGE = df_file_doubled_1.copy()
df_dupl_2_for_ipAGE.rename(columns=dict_immuno_genes, inplace=True)
# Missing measurements contribute nothing to the linear combination below.
df_dupl_2_for_ipAGE.fillna(0, inplace=True)

# The first clock row supplies the additive starting value (presumably the intercept —
# verify against clock.xlsx); the remaining rows are per-feature weights.
# dtype=float keeps the in-place accumulation valid even if that first coefficient is
# an integer: np.full would otherwise create an int array and `+=` with float terms
# would raise a casting error.
ipAGE_trgt = np.full(df_dupl_2_for_ipAGE.shape[0], ipAGE_coefs[0], dtype=float)
for feat_name, feat_coef in zip(ipAGE_feats[1:], ipAGE_coefs[1:]):
    ipAGE_trgt += df_dupl_2_for_ipAGE.loc[:, feat_name].values * feat_coef

df_dupl_2_for_ipAGE['ipAGE'] = ipAGE_trgt
df_dupl_2_for_ipAGE['Age'] = df_age_sex_base.loc[df_dupl_2_for_ipAGE.index, 'Age']
mae_trgt = mean_absolute_error(df_dupl_2_for_ipAGE['Age'].values, df_dupl_2_for_ipAGE['ipAGE'].values)
print(f"mae_trgt: {mae_trgt}")
# Restrict to the samples whose index also appears in data_all.xlsx.
df_dupl_2_for_ipAGE_ctrl = df_dupl_2_for_ipAGE.loc[df_dupl_2_for_ipAGE.index.isin(df_full.index)]
mae_ctrl = mean_absolute_error(df_dupl_2_for_ipAGE_ctrl['Age'].values, df_dupl_2_for_ipAGE_ctrl['ipAGE'].values)
print(f"mae_ctrl: {mae_ctrl}")