In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import sci_notation, plot_errorbar_grouped, plot_errorbar_grouped_transposed

from matplotlib import font_manager
import matplotlib
from matplotlib.cm import get_cmap

In [None]:
# Base seaborn theme: 'whitegrid' style with thin plot lines.
sns.set(style='whitegrid', rc={"lines.linewidth": 0.7})


# Register every font file found in the project font directory with matplotlib,
# so that the font family requested in the rc settings can be resolved.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    print("Added:", font_file)

In [None]:
# Font sizes shared by all panels of the figure.
SMALL_SIZE = 5
MEDIUM_SIZE = 6
BIGGER_SIZE = 7

# Default text size and font family for every text element.
plt.rc('font', size=MEDIUM_SIZE, family='Arimo')
# Per-element overrides, currently disabled:
# plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
# plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
# plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
# plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Render text with matplotlib's own engine rather than LaTeX.
plt.rc('text', usetex=False)

# Font type 42 (TrueType) for both PDF and PostScript output.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

In [None]:
def configure_axis(ax, ytick_size=MEDIUM_SIZE, xtick_size=MEDIUM_SIZE, xlabel_size=MEDIUM_SIZE, ylabel_size=MEDIUM_SIZE, x_label=None, y_label=None):
    """Apply the shared panel styling to a matplotlib Axes.

    Disables the grid, keeps only black left/bottom spines, sets tick label
    sizes and tick mark geometry, and sets both axis labels.

    Parameters
    ----------
    ax : matplotlib Axes to style (modified in place).
    ytick_size, xtick_size : font size of the y / x tick labels.
    xlabel_size, ylabel_size : font size of the x / y axis labels.
    x_label, y_label : axis label text; the value (including the default
        None) is passed straight to ``set_xlabel`` / ``set_ylabel``.
    """
    # No background grid
    ax.grid(False)

    # Visible black spines on the left and bottom only
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('black')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Tick label font sizes
    ax.tick_params(axis='y', labelsize=ytick_size)
    ax.tick_params(axis='x', labelsize=xtick_size)

    # Tick marks on the left/bottom edges, with fixed major/minor geometry
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    for which, width, length in (('major', 1.00, 2.5), ('minor', 0.75, 1.25)):
        ax.tick_params(which=which, width=width, length=length)

    # Axis labels
    ax.set_xlabel(x_label, size=xlabel_size)
    ax.set_ylabel(y_label, size=ylabel_size)

# Load data

In [None]:
# read raw table; the context manager closes the workbook handle once the sheet is parsed
with pd.ExcelFile('../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx') as reader:
    # the first two rows form a two-level column header; file row 2 (right below them) is skipped
    all_results_df_recessive = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column (presumably the saved row index — TODO confirm)
all_results_df_recessive = all_results_df_recessive.drop(all_results_df_recessive.columns[0], axis=1)

# prettify p-values: add a 'p_value_pretty' column under every top-level column group
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df_recessive.columns.get_level_values(level=0).unique()]
all_results_df_recessive[new_columns] = all_results_df_recessive.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

# leave s_het effects only: keep rows whose first remaining column contains 's_het'
all_results_df_recessive = all_results_df_recessive[all_results_df_recessive[all_results_df_recessive.columns[0]].str.contains('s_het')]

In [None]:
# read raw table; the context manager closes the workbook handle once the sheet is parsed
with pd.ExcelFile('../../../../data/tables/table_panel_regressions_on_[s_het_recessive_AR_without_ID,_s_het_recessive_ID_total].xlsx') as reader:
    # the first two rows form a two-level column header; file row 2 (right below them) is skipped
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column (presumably the saved row index — TODO confirm)
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: add a 'p_value_pretty' column under every top-level column group
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

# leave s_het effects only: keep rows whose first remaining column contains 's_het'
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het')]

In [None]:
# For each top-level column group, stack the panel-level and all-recessive
# results and tag the rows with the group they came from.
plot_results_df = []

for dataset_type in ['Weghorn', 'Cassa', 'pLI']:
    total_results_dataset = pd.concat(
        [all_results_df[dataset_type], all_results_df_recessive[dataset_type]]
    ).assign(type=dataset_type)

    plot_results_df.append(total_results_dataset)

# One long frame with a 'type' column identifying the source group
plot_results_df = pd.concat(plot_results_df)

In [None]:
# Raw target name -> label shown in the figure; targets not listed here are dropped below.
renaming_dict_target = {
    'any_education_including_none': "Any education",
    'higher_education_including_none': "Higher education",
    # 'is_blond': 'Hair color', 
    'childlessness': 'Childlessness'
}

# Raw feature name -> gene panel label shown in the figure.
renaming_dict_panel = {
    's_het_recessive_AR_without_ID': "Other recessive genes",
    's_het_recessive_ID_total': 'Recessive ID genes', 
    's_het_recessive_all': "All recessive genes"
}


# Keep only the targets of interest. The explicit .copy() makes the column
# assignments below operate on an independent frame instead of a slice of the
# previous one (avoids pandas' SettingWithCopyWarning).
plot_results_df = plot_results_df[plot_results_df['target'].isin(renaming_dict_target.keys())].copy()

# Replace the absolute interval bounds with their distances from the point
# estimate (presumably the form the error-bar plotting helper expects — TODO confirm).
plot_results_df['odds_ratio_lower'] = plot_results_df['odds_ratio'] - plot_results_df['odds_ratio_lower']
plot_results_df['odds_ratio_upper'] = plot_results_df['odds_ratio_upper'] - plot_results_df['odds_ratio']

# NOTE(review): this rebinds `all_results_df` (the table loaded earlier) to the
# gender == 'all' rows of the combined frame, and the name is not read again in
# the visible cells. Confirm whether the filter was meant for `plot_results_df`.
all_results_df = plot_results_df[plot_results_df['gender'] == 'all']

# Human-readable labels; values missing from the dictionaries are kept as-is.
plot_results_df['target'] = plot_results_df['target'].apply(lambda x: renaming_dict_target.get(x, x))
plot_results_df['feature'] = plot_results_df['feature'].apply(lambda x: renaming_dict_panel.get(x, x))

plot_results_df = plot_results_df.sort_values(by='feature')

## Childlessness ratio data

In [None]:
def s_het_bin(s_het):
    """Map a scalar s-het value to its bin label.

    Parameters
    ----------
    s_het : scalar number, or a missing value (None / NaN).

    Returns
    -------
    None for a missing value, "0" for values <= 0, "0-0.15" for values in
    (0, 0.15], and '> 0.15' otherwise.
    """
    # pd.isna covers both None and NaN. Without the NaN check, a NaN input
    # fails every comparison below and would be labelled '> 0.15'.
    if pd.isna(s_het):
        return None
    if s_het <= 0:
        return "0"
    elif s_het <= 0.15:
        return "0-0.15"
    else:
        return '> 0.15'

In [None]:
datasets_path = '../datasets'

# Tab-separated table with the s-het columns and the phenotype columns used below
dataset = pd.read_csv(f"{datasets_path}/Weghorn-drift.csv", sep='\t')

# (source s-het column, derived bin column) for each gene panel
for source_col, bin_col in [
    ('s_het_recessive_ID-total', 's_het_recessive_ID_total_bin'),
    ('s_het_recessive_AR_without_ID', 's_het_recessive_AR_without_ID_bin'),
]:
    dataset[bin_col] = dataset[source_col].apply(s_het_bin)
    # Rows with a missing source value must not carry a bin label
    dataset.loc[dataset[source_col].isnull(), bin_col] = None

In [None]:
# Bin columns and the gene panel label each one is plotted under (paired by position)
s_hets=['s_het_recessive_ID_total_bin', 's_het_recessive_AR_without_ID_bin']
tags=[ 'Recessive ID genes', 'Other recessive genes']

# Long format: one copy of the phenotype columns per gene panel, with the
# panel-specific bin column renamed to a common 's_het' column.
dataset_plot_data = []
for s_het, tag in zip(s_hets, tags):
    d1 = (
        dataset[['childlessness', 'any_education_including_none', s_het]]
        .rename(columns={s_het: 's_het'})
        .assign(tag=tag)
    )
    dataset_plot_data.append(d1)

# Sorting by the bin label puts the rows in "0", "0-0.15", '> 0.15' order
dataset_plot_data = pd.concat(dataset_plot_data).sort_values(by='s_het')

## S-het data

In [None]:
# Per-gene score files, keyed by score name
s_het_paths = {
    'Weghorn-drift': "../weghorn_drift_gencode-v34.txt",
    'Cassa': "../gene_s_het_cassa_all_genes_gencode-v34.txt",
    'PLI': "../gnomad.v2.1.1.PLI_gencode-v34.txt", 
}

# Gene panel: headerless file read as two columns (gene name, panel)
gene_panel = pd.read_csv("../gene-panel-gencode-v34.txt", header=None)
gene_panel.columns = ['Gene name', 'Gene panel']

# Collapse the panels into two display labels: 'ID-total' vs. everything else
is_id_panel = gene_panel['Gene panel'] == 'ID-total'
gene_panel.loc[~is_id_panel, 'Gene panel'] = 'Other \nrecessive'
gene_panel.loc[is_id_panel, 'Gene panel'] = 'Recessive ID'

gene_panel.tail(3)

In [None]:


# Per-gene s-het values restricted to genes present in the gene panel (inner join on gene name)
s_het_df = (
    pd.read_csv(s_het_paths['Weghorn-drift'], sep='\t')
    .rename(columns={'gene_symbol': 'Gene name'})
    .merge(gene_panel, on='Gene name', how='inner')
)

# NOTE(review): with how='inner' every merged row already has a panel label, so
# this fill looks like a no-op. Confirm whether a left join was intended.
s_het_df['Gene panel'] = s_het_df['Gene panel'].fillna('Non-recessive')

s_het_df = s_het_df.sort_values(by=['Gene panel', 's_het'])

s_het_df

# Plot

In [None]:
def plot_3a(ax):
    """Draw panel 3a: grouped odds-ratio error bars for the 'Weghorn' results.

    Plots the rows of ``plot_results_df`` with type 'Weghorn' through
    ``plot_errorbar_grouped``, writes the odds ratio and formatted p-value
    next to each plotted row, styles the axis and hides the in-axes legend.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.

    Returns
    -------
    The ``(handles, labels)`` tuple from ``ax.get_legend_handles_labels()``,
    so the legend can be redrawn on a separate axis.
    """
    weghorn_results = plot_results_df[plot_results_df['type'] == 'Weghorn']

    printed_results_df = plot_errorbar_grouped(
        df=weghorn_results, axis=ax, y_column='target', group_column='feature',
        title='',
        legend_loc='center right',
        group_scale=0.2,
        y_order=['Childlessness', 'Any education', 'Higher education'],
        group_order=['All recessive genes', 'Recessive ID genes', 'Other recessive genes'],
        ymargin=0.1,
        colors=['#006D5B', 'BlueViolet', '#F89F5B'],
    )

    # Side annotations: "<odds ratio>   <p-value>" at a fixed x position,
    # slightly offset from each row's y coordinate
    annotation_rows = zip(
        printed_results_df['p_value_pretty'],
        printed_results_df['odds_ratio'],
        printed_results_df['y'],
    )
    for p_pretty, odds_ratio, y_pos in annotation_rows:
        ax.text(2.5, y_pos+0.08, f'{odds_ratio:.2f}   {p_pretty}', va='center', size=MEDIUM_SIZE)

    configure_axis(ax, x_label="OR (99% CI)", y_label=None, ytick_size=SMALL_SIZE)

    # Capture the legend entries before hiding the legend drawn on this axis
    handles_and_labels = ax.get_legend_handles_labels()
    ax.get_legend().set_visible(False)

    return handles_and_labels


def plot_legend(ax, legend_handles_labels):
    """Draw a standalone legend on an otherwise empty axis.

    Parameters
    ----------
    ax : matplotlib Axes used only as a legend container; its axis is turned off.
    legend_handles_labels : sequence whose first item holds the legend handles
        and whose second item holds the matching labels.
    """
    handles = legend_handles_labels[0]
    labels = legend_handles_labels[1]

    # Frameless single-column legend, markers placed after the text
    ax.legend(
        handles, labels, loc='upper left',
        frameon=False, ncol=1, labelspacing=0.1,
        markerfirst=False, fontsize=MEDIUM_SIZE,
    )
    ax.axis('off')

def plot_3b(ax):
    """Draw panel 3b: median s-het per gene panel with a 95% interval.

    Reads ``s_het_df`` and draws one square marker per gene panel, with no
    connecting line, then applies the shared axis styling.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    """
    panel_order = ['Recessive ID', 'Other \nrecessive']
    panel_colors = ['BlueViolet','#F89F5B']

    pointplot_kwargs = dict(
        estimator='median',
        errorbar=("ci", 95), capsize=0,
        color=".5", order=panel_order, linestyles='',
        palette=panel_colors, markers='s',
    )
    sns.pointplot(data=s_het_df, x="s_het", y='Gene panel', ax=ax, **pointplot_kwargs)

    configure_axis(ax, x_label="Median s-het (95% CI)")


def plot_3cd(ax, target, ylabel):
    """Draw panel 3c/3d: bar plot of a phenotype column per s-het bin and gene panel.

    Reads ``dataset_plot_data`` and plots ``target`` against the 's_het' bin,
    with one bar colour per 'tag', then applies the shared axis styling.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    target : name of the column of ``dataset_plot_data`` shown on the y axis.
    ylabel : y-axis label text.
    """
    colors=['BlueViolet','#F89F5B']

    sns.barplot(data=dataset_plot_data, y=target, x='s_het', hue='tag', 
                palette=colors, ax=ax, errorbar=("ci", 95), saturation=1.)

    # Place the legend on the axis that was passed in. plt.legend() acts on
    # pyplot's current axes, which is not guaranteed to be `ax`.
    ax.legend(loc='upper left', frameon=False, fontsize=MEDIUM_SIZE)

    configure_axis(ax, y_label=ylabel)

In [None]:
# Assemble figure 3: a 2x2 arrangement of panels built from nested subfigures,
# saved as a PDF.
# NOTE(review): `ticker` is not used in the visible cells — confirm it is still needed.
import matplotlib.ticker as ticker

cm = 1/2.54  # centimeters in inches
k = 1.  # overall scale factor applied to the figure size

# 12 x 10 cm figure (times k)
fig = plt.figure(constrained_layout=True, figsize=(12*cm*k, 10*cm*k))

# Two stacked rows; the top row is 1.5x as tall as the bottom one.
# NOTE(review): wspace is given for a single-column layout — presumably hspace was intended; confirm.
subfigs = fig.subfigures(nrows=2, ncols=1, height_ratios=[1.5,1], wspace=0.07)

# top row
top_subfigs = subfigs[0].subfigures(nrows=1, ncols=2, width_ratios=[1.5,1], wspace=0.07)

# top left: panel 3a
ax = top_subfigs[0].subplots(1, 1)

legend_handles_labels = plot_3a(ax)

# top right: legend of panel 3a above panel 3b (here `ax` is an array of two Axes)
ax = top_subfigs[1].subplots(2, 1, gridspec_kw={'height_ratios': [0.5, 1]})

plot_legend(ax[0], legend_handles_labels)
plot_3b(ax[1])
# ax[1].set_xlim([0.022, 0.05])

# bottom row
bottom_subfigs = subfigs[1].subfigures(nrows=1, ncols=2, width_ratios=[1,1], wspace=0.07)

# bottom left: childlessness per s-het bin
ax = bottom_subfigs[0].subplots(1, 1)

plot_3cd(ax, target = 'childlessness', ylabel="Proportion of \n childless individuals",)
# the y axis is restricted to this range, so it does not start at zero
ax.set_ylim(0.17, 0.25)

# bottom right: any education per s-het bin
ax = bottom_subfigs[1].subplots(1, 1)

plot_3cd(ax, target = 'any_education_including_none', ylabel="Proportion of individuals \n having any education")
# the y axis is restricted to this range, so it does not start at zero
ax.set_ylim(0.75, 0.85)

plt.savefig("../../../../data/plots/figure_3.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Constant helper column: it is never null, so its count is the size of each group
dataset_plot_data['counter'] = 1

# Non-null entries per column for every (gene panel, s-het bin) combination
group_counts = dataset_plot_data.groupby(['tag', 's_het']).count()
group_counts

In [None]:
from scipy.stats import chi2_contingency

# Non-missing childlessness values of individuals in the '> 0.15' bin of each gene panel
non_id_data = dataset.loc[dataset['s_het_recessive_AR_without_ID_bin'] == '> 0.15', 'childlessness'].dropna().values
id_data = dataset.loc[dataset['s_het_recessive_ID_total_bin'] == '> 0.15', 'childlessness'].dropna().values

# success_* is the sum of the values in a group, nobs_* the group size
success_non_id, nobs_non_id = non_id_data.sum(), len(non_id_data)
success_id, nobs_id = id_data.sum(), len(id_data)

# 2x2 table: rows are the two groups, columns are [successes, remaining observations]
contingency_table = np.array([
    [success_non_id, nobs_non_id - success_non_id],
    [success_id, nobs_id - success_id],
])

chi2, p_value, _, _ = chi2_contingency(contingency_table)
print("Chi-square statistic:", chi2)
print("P-value:", p_value)

In [None]:

from scipy.stats import chi2_contingency

# Non-missing any-education values of individuals in the '> 0.15' bin of each gene panel
non_id_data = dataset.loc[dataset['s_het_recessive_AR_without_ID_bin'] == '> 0.15', 'any_education_including_none'].dropna().values
id_data = dataset.loc[dataset['s_het_recessive_ID_total_bin'] == '> 0.15', 'any_education_including_none'].dropna().values

# success_* is the sum of the values in a group, nobs_* the group size
success_non_id, nobs_non_id = non_id_data.sum(), len(non_id_data)
success_id, nobs_id = id_data.sum(), len(id_data)

# 2x2 table: rows are the two groups, columns are [successes, remaining observations]
contingency_table = np.array([
    [success_non_id, nobs_non_id - success_non_id],
    [success_id, nobs_id - success_id],
])

chi2, p_value, _, _ = chi2_contingency(contingency_table)
print("Chi-square statistic:", chi2)
print("P-value:", p_value)

In [None]:
from scipy.stats import mannwhitneyu

# Per-gene s-het values of the two gene panels
other_recessive_s_het = s_het_df.loc[s_het_df['Gene panel'] == 'Other \nrecessive', 's_het']
recessive_id_s_het = s_het_df.loc[s_het_df['Gene panel'] == 'Recessive ID', 's_het']

# Compare the two s-het distributions (scipy's default test settings)
stat, p_value = mannwhitneyu(other_recessive_s_het, recessive_id_s_het)

print("Mann-Whitney U test statistic:", stat)
print("P-value:", p_value)
