In [None]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scripts.python.routines.betas import betas_drop_na
from plotly.subplots import make_subplots
from numpy.ma import masked_array
from scipy import stats
from mpl_toolkits.axes_grid1 import make_axes_locatable
import pickle
import random
import plotly.express as px
import copy
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scripts.python.pheno.datasets.filter import filter_pheno
from scripts.python.pheno.datasets.features import get_column_name, get_status_dict, get_sex_dict
from scripts.python.routines.plot.scatter import add_scatter_trace
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import pathlib
from scripts.python.routines.manifest import get_manifest
from scripts.python.routines.plot.save import save_figure
from scripts.python.routines.plot.layout import add_layout, get_axis
from scripts.python.routines.plot.p_value import add_p_value_annotation
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
import plotly.io as pio
pio.kaleido.scope.mathjax = None
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=False)
from functools import reduce
from scipy.stats import chi2_contingency
from scipy.stats import kruskal, mannwhitneyu
from impyute.imputation.cs import fast_knn, mean, median, random, mice, mode, em

# Prepare data

In [None]:
# Load the dataset description, the merged SNP / sphygmography table and the
# three feature lists, then impute missing values in the continuous features.
dataset = "GSEUNN"
path = "E:/YandexDisk/Work/pydnameth/datasets"
datasets_info = pd.read_excel(f"{path}/datasets.xlsx", index_col='dataset')
platform = datasets_info.loc[dataset, 'platform']
manifest = get_manifest(platform)

# Output folder for every figure produced by this notebook.
path_save = f"{path}/{platform}/{dataset}/special/028_sphy_snp"
pathlib.Path(path_save).mkdir(parents=True, exist_ok=True)

# All input tables are read from the same folder.
path_load = f"{path}/{platform}/{dataset}/special/022_ml_data_cardio/snp_sphy"
df_data = pd.read_excel(f"{path_load}/data.xlsx", index_col='index')

# Imputation settings: method name and neighbour count used by `fast_knn`.
imputation = 'fast_knn'
fast_knn_k = 1

# Feature lists are stored as the index of one-sheet Excel tables.
df_features_sphy_cat = pd.read_excel(f"{path_load}/features_sphy_cat.xlsx", index_col='features')
features_sphy_cat = df_features_sphy_cat.index.values
df_features_snp_cat = pd.read_excel(f"{path_load}/features_snp_cat.xlsx", index_col='features')
features_snp_cat = df_features_snp_cat.index.values
df_features_sphy_cont = pd.read_excel(f"{path_load}/features_sphy_cont.xlsx", index_col='features')
features_sphy_cont = df_features_sphy_cont.index.values

is_nans = df_data.loc[:, features_sphy_cont].isnull().values.any()
if is_nans:
    n_nans = df_data.loc[:, features_sphy_cont].isna().sum().sum()
    print(f"Perform imputation for {n_nans} missed values")

    # Method name -> callable taking a 2D float array and returning the
    # imputed array. `random` here is the impyute imputer imported at the top
    # of the notebook (it shadows the stdlib `random` module).
    imputers = {
        "median": median,
        "mean": mean,
        "fast_knn": lambda values: fast_knn(values, k=fast_knn_k),
        "random": random,
        "mice": mice,
        "em": em,
        "mode": mode,
    }
    if imputation not in imputers:
        raise ValueError(f"Unsupported imputation: {imputation}")

    # Cast explicitly and hand the float array straight to the imputer:
    # assigning the cast back through `.loc[:, cols]` sets values in place on
    # recent pandas and therefore may leave an object dtype untouched.
    cont_values = df_data.loc[:, features_sphy_cont].astype('float').values
    imputed_training = imputers[imputation](cont_values)

    # Column-wise assignment replaces the columns, so they end up with the
    # dtype of the imputed array.
    df_data[list(features_sphy_cont)] = imputed_training

# Categorical - categorical tests and plots

In [None]:
# Chi-square test of independence for every (SNP feature, categorical
# sphygmography feature) pair, followed by Benjamini-Hochberg correction
# over all pairs at once.
df_pval_cat_cat = pd.DataFrame(
    data=np.zeros(shape=(len(features_snp_cat), len(features_sphy_cat))),
    index=features_snp_cat,
    columns=features_sphy_cat,
)
for f_snp in features_snp_cat:
    for f_shpy in features_sphy_cat:
        df_cross = pd.crosstab(df_data[f_snp], df_data[f_shpy])
        chi2, pval, dof, ex = chi2_contingency(df_cross, correction=True)
        df_pval_cat_cat.at[f_snp, f_shpy] = pval

df_pval_cat_cat_fdr = df_pval_cat_cat.copy()
# Every cell takes part in the correction. The builtin `bool` is used because
# the `np.bool` alias no longer exists in current NumPy releases.
selection = np.ones(df_pval_cat_cat.shape, dtype=bool)
df_fdr = df_pval_cat_cat.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, pvals_corr, _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
df_fdr['pval_fdr_bh'] = pvals_corr
# Write the adjusted p-values back into the (SNP x sphygmography) layout.
for row, col, pval_fdr in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_pval_cat_cat_fdr.loc[row, col] = pval_fdr

# Continuous - categorical tests and plots

In [None]:
# Compare each continuous feature across the categories of each SNP feature:
# Kruskal-Wallis for more than two categories, two-sided Mann-Whitney U for
# exactly two. P-values are then Benjamini-Hochberg corrected over all pairs.
df_data['PhenoAgeAcc'] = df_data['PhenoAge'] - df_data['Age']
features_cont = list(features_sphy_cont) + ['PhenoAgeAcc']
df_pval_cat_cont = pd.DataFrame(
    data=np.zeros(shape=(len(features_snp_cat), len(features_cont))),
    index=features_snp_cat,
    columns=features_cont,
)
for f_snp in features_snp_cat:
    # The category set depends only on the SNP feature, so compute it once
    # per SNP instead of once per (SNP, continuous feature) pair.
    cats = df_data[f_snp].unique()
    for f_shpy in features_cont:
        vals_dict = {
            cat: df_data.loc[df_data[f_snp] == cat, f_shpy].values
            for cat in cats
        }
        if len(cats) > 2:
            stat, pval = kruskal(*vals_dict.values())
        elif len(cats) == 2:
            stat, pval = mannwhitneyu(*vals_dict.values(), alternative='two-sided')
        else:
            raise ValueError("Number of datasets less than 2")
        df_pval_cat_cont.at[f_snp, f_shpy] = pval

df_pval_cat_cont_fdr = df_pval_cat_cont.copy()
# Every cell takes part in the correction. The builtin `bool` is used because
# the `np.bool` alias no longer exists in current NumPy releases.
selection = np.ones(df_pval_cat_cont.shape, dtype=bool)
df_fdr = df_pval_cat_cont.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, pvals_corr, _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')
df_fdr['pval_fdr_bh'] = pvals_corr
for row, col, pval_fdr in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_pval_cat_cont_fdr.loc[row, col] = pval_fdr
# Reverse the row order of the adjusted table.
df_pval_cat_cont_fdr = df_pval_cat_cont_fdr.iloc[::-1]

In [None]:
# Heatmap of -log10(FDR-adjusted p-value) for the SNP x continuous-feature
# tests. Cells below the 0.05 significance level are drawn in black.
df_to_plot = df_pval_cat_cont_fdr.copy()
df_to_plot = -np.log10(df_to_plot)
mtx_to_plot = df_to_plot.to_numpy()

cmap = plt.get_cmap("viridis").copy()
cmap.set_under('black')  # values under vmin, i.e. adjusted p > 0.05

fig, ax = plt.subplots()
im = ax.imshow(mtx_to_plot, cmap=cmap, vmin=-np.log10(0.05), vmax=10)
cbar = ax.figure.colorbar(im, ax=ax, location='top', fraction=0.0875, pad=0.04)
cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center', fontsize=15)

ax.set_aspect("equal")
ax.set_xticks(np.arange(df_to_plot.shape[1]))
ax.set_yticks(np.arange(df_to_plot.shape[0]))
ax.set_xticklabels(df_to_plot.columns.values)
ax.set_yticklabels(df_to_plot.index.values)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis='both', which='minor', labelsize=12)

# `im.norm` maps vmin -> 0 and vmax -> 1, so the switch point between light
# and dark label text has to be expressed on that normalized scale. The
# previous data-unit threshold (half the value range) was not comparable with
# the normalized value and became infinite when an adjusted p-value was 0.
threshold = 0.5
textcolors = ("black", "white")
for i in range(df_to_plot.shape[0]):
    for j in range(df_to_plot.shape[1]):
        value = mtx_to_plot[i, j]
        # White text on the dark (low) end of the colormap, black otherwise.
        color = textcolors[int(im.norm(value) < threshold)]
        # An adjusted p-value of 0 gives an infinite value: leave the cell unlabeled.
        label = "" if np.isinf(value) else f"{value:0.2f}"
        ax.text(j, i, label, ha="center", va="center", color=color, fontsize=7)

fig.tight_layout()
plt.savefig(f"{path_save}/df_pval_cat_cont_fdr.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/df_pval_cat_cont_fdr.pdf", bbox_inches='tight', dpi=400)
plt.clf()

# Continuous - continuous tests and plots

In [None]:
# Pairwise Pearson correlation between continuous features (plus Age).
# Layout of `df_res_cont_cont`:
#   upper triangle - correlation coefficient,
#   lower triangle - p-value,
#   diagonal       - NaN.
# In `df_res_cont_cont_fdr` the lower triangle is replaced by
# -log10(Benjamini-Hochberg adjusted p-value).
features_cont = list(features_sphy_cont) + ['Age']
df_res_cont_cont = pd.DataFrame(
    data=np.zeros(shape=(len(features_cont), len(features_cont))),
    index=features_cont,
    columns=features_cont,
)
for f_id_1, f_1 in enumerate(features_cont):
    df_res_cont_cont.at[f_1, f_1] = np.nan
    vals_1 = df_data.loc[:, f_1].values
    # Visit each unordered pair once; f_2 always comes after f_1 in the list.
    for f_2 in features_cont[f_id_1 + 1:]:
        vals_2 = df_data.loc[:, f_2].values
        corr, pval = stats.pearsonr(vals_1, vals_2)
        df_res_cont_cont.at[f_2, f_1] = pval
        df_res_cont_cont.at[f_1, f_2] = corr

# Only the strictly lower triangle (the p-values) enters the correction. The
# builtin `bool` is used because the `np.bool` alias no longer exists in
# current NumPy releases.
selection = np.tri(df_res_cont_cont.shape[0], df_res_cont_cont.shape[1], -1, dtype=bool)
df_fdr = df_res_cont_cont.where(selection).stack().reset_index()
df_fdr.columns = ['row', 'col', 'pval']
_, df_fdr['pval_fdr_bh'], _, _ = multipletests(df_fdr.loc[:, 'pval'].values, 0.05, method='fdr_bh')

df_res_cont_cont_fdr = df_res_cont_cont.copy()
for row, col, pval_fdr in zip(df_fdr['row'], df_fdr['col'], df_fdr['pval_fdr_bh']):
    df_res_cont_cont_fdr.loc[row, col] = -np.log10(pval_fdr)

In [None]:
# Combined heatmap for the continuous-continuous analysis: the upper triangle
# shows the correlation coefficient ("bwr" colormap), the lower triangle
# shows -log10(FDR-adjusted p-value) ("viridis", black below the 0.05 level).
df_to_plot = df_res_cont_cont_fdr.copy()
mtx_to_plot = df_to_plot.to_numpy()

# Select each triangle by position rather than by `value == 0`, so that a
# legitimate zero (correlation of exactly 0, or an adjusted p-value of 1
# giving -log10 = 0) is still drawn instead of being masked out.
n_rows, n_cols = mtx_to_plot.shape
is_lower = np.tri(n_rows, n_cols, -1, dtype=bool)
is_upper = ~np.tri(n_rows, n_cols, 0, dtype=bool)

mtx_triu = np.triu(mtx_to_plot, +1)
# Not used in this cell; retained because cells outside this view may read them.
max_corr = np.max(mtx_triu)
min_corr = np.min(mtx_triu)
mtx_triu_mask = masked_array(mtx_triu, ~is_upper)
cmap_triu = plt.get_cmap("bwr").copy()

mtx_tril = np.tril(mtx_to_plot, -1)
mtx_tril_mask = masked_array(mtx_tril, ~is_lower)
cmap_tril = plt.get_cmap("viridis").copy()
cmap_tril.set_under('black')  # values under vmin, i.e. adjusted p > 0.05

fig, ax = plt.subplots()

im_triu = ax.imshow(mtx_triu_mask, cmap=cmap_triu, vmin=-1, vmax=1)
cbar_triu = ax.figure.colorbar(im_triu, ax=ax, location='right')
cbar_triu.set_label(r"$\mathrm{Correlation\:coefficient}$", horizontalalignment='center', fontsize=13)

im_tril = ax.imshow(mtx_tril_mask, cmap=cmap_tril, vmin=-np.log10(0.05))
cbar_tril = ax.figure.colorbar(im_tril, ax=ax, location='right')
cbar_tril.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center', fontsize=13)

ax.set_aspect("equal")
ax.set_xticks(np.arange(df_to_plot.shape[1]))
ax.set_yticks(np.arange(df_to_plot.shape[0]))
ax.set_xticklabels(df_to_plot.columns.values)
ax.set_yticklabels(df_to_plot.index.values)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis='both', which='minor', labelsize=12)

# The norm of an image maps vmin -> 0 and vmax -> 1, so the switch point for
# the label colour is expressed on that normalized scale.
threshold = 0.5
textcolors = ("black", "white")
for i in range(df_to_plot.shape[0]):
    for j in range(df_to_plot.shape[1]):
        value = mtx_to_plot[i, j]
        color = "black"
        if i > j:
            # Use the norm of the lower-triangle image drawn in this cell; the
            # previous code referenced `im`, an image from another cell.
            color = textcolors[int(im_tril.norm(mtx_tril[i, j]) < threshold)]
        # Diagonal (NaN) and infinite cells stay unlabeled.
        label = "" if (np.isinf(value) or np.isnan(value)) else f"{value:0.2f}"
        ax.text(j, i, label, ha="center", va="center", color=color, fontsize=5)

fig.tight_layout()
plt.savefig(f"{path_save}/df_pval_cont_cont_fdr.png", bbox_inches='tight', dpi=400)
plt.savefig(f"{path_save}/df_pval_cont_cont_fdr.pdf", bbox_inches='tight', dpi=400)
plt.clf()