# Debugging autoreload

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules before
# each cell is executed, so edits to local packages apply without a kernel restart.
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
import plotly.graph_objects as go
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import pathlib
from scipy.stats import mannwhitneyu
from plottable import ColumnDefinition, Table
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
from scipy.stats import mannwhitneyu, variation, levene
import statsmodels.formula.api as smf
from functools import reduce
import pyaging as pya
from tqdm import tqdm
import functools
import matplotlib


# Load data

In [None]:
# Root folder of this analysis: the limma result table is read from here and the
# enrichment tables/figures are written below it.
path = "E:/YandexDisk/Work/pydnameth/draft/13_fmba_cvd_dnam/data/120_1"

# Load and process manifest
# NOTE(review): unpickling executes code embedded in the file - only load trusted manifests.
df_mnfst = pd.read_pickle("E:/YandexDisk/Work/pydnameth/datasets/GPL33022/manifest.pkl")
# Drop the first three characters of 'chr' (presumably a "chr" prefix, e.g. "chr7" -> "7";
# confirm against the manifest).
df_mnfst['CHR'] = df_mnfst['chr'].str[3:]

# One boolean indicator column per gene region. 'UCSC_RefGene_Group' is searched by
# substring; the 'Exons' group is matched through the 'exon_' substring instead of its
# own name. Missing annotations count as "not in the region" (na=False).
cpg_groups = ['TSS1500', 'TSS200', '5UTR', 'Exons', '3UTR']
for cpg_group in cpg_groups:
    group_pattern = 'exon_' if cpg_group == 'Exons' else cpg_group
    df_mnfst[f'UCSC_RefGene_Group {cpg_group}'] = (
        df_mnfst['UCSC_RefGene_Group'].str.contains(group_pattern, na=False)
    )

# One boolean indicator column per chromosome. Autosomes are 1..22; the previous
# range(1, 24) also generated a label "23", which produced an all-False column and an
# empty test that still took part in the multiple-testing correction downstream.
cpg_chrs = [str(x) for x in range(1, 23)] + ['X', 'Y']
for cpg_chr in cpg_chrs:
    df_mnfst[f'CHR {cpg_chr}'] = df_mnfst['CHR'] == cpg_chr

# One boolean indicator column per CpG-island context (exact label match).
cpg_islands = ['Island', 'Shore', 'Shelf', 'OpenSea']
for cpg_island in cpg_islands:
    df_mnfst[f'Relation_to_Island {cpg_island}'] = df_mnfst['Relation_to_Island'] == cpg_island

# Load DMPs
df_dmps = pd.read_csv(f"{path}/GSEA(ebayes)_group_orgn_limma.csv", index_col=0)
df_dmps.sort_values(["adj.P.Val"], ascending=[True], inplace=True)

# Index labels of the rows whose adjusted p-value is below 0.05.
dmps = df_dmps.index[df_dmps["adj.P.Val"] < 0.05].values


# Enrichment

In [None]:
# Categories tested for each annotation type, in the order they appear on the x axis.
reg_enr_orders = {
    'Chromosomes': cpg_chrs,
    'Relation to Island': cpg_islands,
    'UCSC RefGene Group': cpg_groups
}
# Prefix of the boolean indicator columns in the manifest ("<prefix> <category>").
reg_enr_col_names = {
    'Chromosomes': "CHR",
    'Relation to Island': "Relation_to_Island",
    'UCSC RefGene Group': "UCSC_RefGene_Group"
}
# Figure size handed to plt.figure for each annotation type.
reg_enr_fig_sizes = {
    'Chromosomes': (17, 10),
    'Relation to Island': (5, 10),
    'UCSC RefGene Group': (5, 10)
}

# Column labels of the derived quantities; they double as axis labels and as column
# headers in the exported tables, so the strings must stay exactly as they are.
log_odds_col = r'$ \log_{10}(\mathrm{Odds\ ratio})$'
log_pval_col = r'$ -\log_{10}(\mathrm{p-value})$'


def _fisher_enrichment_table(df_target, df_padding, col_prefix, categories, alpha=0.05):
    """Run a two-sided Fisher exact test per category, target rows vs. padding rows.

    For every category a 2x2 table is built from the boolean indicator column
    ``f"{col_prefix} {category}"``: rows are target / padding, columns are
    flag True / flag False.

    Args:
        df_target: Rows of interest; must contain the indicator columns.
        df_padding: Background rows that are not part of ``df_target``.
        col_prefix: Prefix of the indicator column names.
        categories: Category labels; they become the index of the result.
        alpha: Level passed to ``multipletests`` (the Benjamini-Hochberg
            adjusted p-values themselves do not depend on it).

    Returns:
        DataFrame indexed by category with the four cell counts ("11", "12",
        "21", "22"), their total ("sum"), the raw p-value ("pval"), the odds
        ratio ("odds_ratio") and the BH-adjusted p-value ("pval_fdr_bh").
    """
    columns = ["11", "12", "21", "22", "sum", "pval", "odds_ratio"]
    categories = list(categories)
    df_res = pd.DataFrame(
        index=categories,
        columns=columns,
        data=np.zeros((len(categories), len(columns))),
    )
    for category in categories:
        flag_col = f"{col_prefix} {category}"
        target_flags = df_target[flag_col]
        padding_flags = df_padding[flag_col]
        table = np.array(
            [
                [target_flags.eq(True).sum(), target_flags.eq(False).sum()],
                [padding_flags.eq(True).sum(), padding_flags.eq(False).sum()],
            ],
            dtype=np.int64,
        )
        odds_ratio, pval = stats.fisher_exact(table, alternative='two-sided')
        # A degenerate table gives a NaN or infinite odds ratio; report it as
        # neutral (1.0) so that its log10 is 0.
        if not np.isfinite(odds_ratio):
            odds_ratio = 1.0
        row_values = [*table.ravel(), table.sum(), pval, odds_ratio]
        for column, value in zip(columns, row_values):
            df_res.at[category, column] = value
    df_res['pval_fdr_bh'] = multipletests(df_res['pval'].values, alpha, method='fdr_bh')[1]
    return df_res


# Target = significant DMPs, global = every tested CpG, padding = tested but not significant.
df_dmps_fisher_target = df_mnfst.loc[dmps, :]
df_dmps_fisher_global = df_mnfst.loc[df_dmps.index.values, :]
df_dmps_fisher_padding = df_dmps_fisher_global.loc[~df_dmps_fisher_global.index.isin(dmps), :]

# The tables and figures are written here; create the folder so a fresh data
# directory does not make to_excel/savefig fail.
pathlib.Path(f"{path}/enrichment").mkdir(parents=True, exist_ok=True)

# The theme does not change between figures, so set it once before any figure is created.
sns.set_theme(style='whitegrid', font_scale=2)

for var in reg_enr_orders:
    df_var = _fisher_enrichment_table(
        df_dmps_fisher_target,
        df_dmps_fisher_padding,
        reg_enr_col_names[var],
        reg_enr_orders[var],
    )
    df_var.index.name = var
    # NOTE: an odds ratio of exactly 0 is finite, so it is not replaced in the helper
    # and becomes -inf here.
    df_var[log_odds_col] = np.log10(df_var.loc[:, 'odds_ratio'].values)
    df_var[log_pval_col] = -np.log10(df_var.loc[:, 'pval_fdr_bh'].values)

    # Every category is plotted and exported, including those without target CpGs.
    # df_plot is an alias, so 'bar_color' below is added to df_var as well.
    df_plot = df_var

    plt.figure(figsize=reg_enr_fig_sizes[var])
    cmap = plt.get_cmap("viridis").copy()
    # Values below the colour range (adjusted p-value above 0.05) are shown in black.
    cmap.set_under('black')

    # The scatter is drawn only to obtain a mappable whose colour range runs from
    # -log10(0.05) to the largest observed value; plt.clf() then removes it so that
    # only the colorbar and the bar plot end up in the figure.
    # NOTE(review): this relies on plt.colorbar accepting a mappable whose axes was
    # cleared - confirm against the installed matplotlib version before upgrading.
    plot = plt.scatter(
        df_plot.index,
        df_plot.loc[:, log_pval_col].values,
        c=df_plot.loc[:, log_pval_col].values,
        cmap=cmap,
        vmin=-np.log10(0.05)
    )
    plt.clf()
    cbar = plt.colorbar(plot, extend='min')

    # Significant bars take the colorbar colour of their -log10(p-value); the rest stay black.
    df_plot['bar_color'] = 'black'
    for category in df_plot.index[df_plot['pval_fdr_bh'] < 0.05]:
        color_position = (df_plot.at[category, log_pval_col] - cbar.vmin) / (cbar.vmax - cbar.vmin)
        df_plot.at[category, 'bar_color'] = matplotlib.colors.rgb2hex(cbar.cmap(color_position))
    df_plot.to_excel(f"{path}/enrichment/fisher_{var}.xlsx")

    plt.xticks(rotation=90)
    cbar.set_label(r"$-\log_{10}(\mathrm{p-value})$", horizontalalignment='center')
    ax = sns.barplot(
        data=df_plot,
        x=df_plot.index,
        y=log_odds_col,
        palette=df_plot.loc[:, 'bar_color'].values,
        dodge=False,
        edgecolor='black',
    )
    plt.savefig(f"{path}/enrichment/fisher_{var}.png", bbox_inches='tight')
    plt.savefig(f"{path}/enrichment/fisher_{var}.pdf", bbox_inches='tight')
    plt.close()
