In [None]:
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import read_results_excel
import ukbb_recessive.regression.plotting as plotting


sns.set_style("whitegrid")

import matplotlib

from matplotlib import font_manager
import numpy as np
from matplotlib.cm import get_cmap

import glob

import matplotlib.ticker as ticker


# Output locations, relative to the notebook's working directory
PLOTS_OUTPUT_DIR = '../../../data/plots/supp_figures'
tables_folder = '../../../data/tables'

# Make the fonts stored under data/fonts available, then apply the
# matplotlib configuration provided by the project's plotting module
plotting.add_fonts(['../../../data/fonts'])
plotting.configure_matplotlib()

# Sup figure 1 (Frequency distribution of PLPs)

In [None]:
def hets_freq_bin(hets_freq):
    """Map a heterozygous-carrier frequency (in percent) to its bin label.

    Bins are upper-inclusive: ``<= 0`` -> ``'0%'``, ``(0, 0.5]`` -> ``'0-0.5%'``,
    ``(0.5, 1]`` -> ``'0.5-1%'``, ``(1, 2]`` -> ``'1-2%'``, ``(2, 5]`` -> ``'2-5%'``.
    Every other value (including NaN, which fails all comparisons) is
    labelled ``'>5%'``.
    """
    # (inclusive upper bound, label), checked in ascending order
    upper_bounds = (
        (0, '0%'),
        (0.5, '0-0.5%'),
        (1, '0.5-1%'),
        (2, '1-2%'),
        (5, '2-5%'),
    )
    for upper, label in upper_bounds:
        if hets_freq <= upper:
            return label
    return '>5%'

# Samples of interest: European & non-related.
# The file has no header row; its first column is collected into a set.
european_non_rel_samples = set(
    pd.read_csv(
        ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt",
        header=None,
    )[0].values.tolist()
)
print ("Number of samples:", len(european_non_rel_samples))

# Gene panel: header-less file whose two columns are named on load
gene_panel = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None).set_axis(
    ['gene', 'panel'], axis=1
)


In [None]:
# Load the list of PLPs (tab-separated)
plps = pd.read_csv(
    ".../450k/plp_selection/basic/new_gene_names/new_freq/new_relatedness/all_chr_total_presumable_plps_HFE_final_sorted.txt",
    sep='\t',
)

# Heterozygous count as a percentage of the sample set, plus its frequency bin
plps['hets_freq'] = 100. * plps['hets'] / len(european_non_rel_samples)
plps['hets_freq_bin'] = plps['hets_freq'].map(hets_freq_bin)

# Number of variants falling into each frequency bin
plps_freq_count = (
    plps
    .groupby('hets_freq_bin')
    .count()[['chr']]
    .reset_index()
    .rename(columns={'chr': 'plp_cnt'})
)

In [None]:
# Sum het / hom counts per gene and attach the gene panel;
# values left missing by the left merge are filled with 0
plps_gene = (
    plps
    .groupby('gene')
    .agg({'hets': 'sum', 'homs': 'sum'})
    .reset_index()
    .merge(gene_panel, how='left')
    .fillna(0)
)

# Per-gene heterozygous count as a percentage of the sample set, plus its bin
plps_gene['hets_freq'] = 100. * plps_gene['hets'] / len(european_non_rel_samples)
plps_gene['hets_freq_bin'] = plps_gene['hets_freq'].map(hets_freq_bin)

# Number of genes falling into each frequency bin
plps_freq_gene_count = (
    plps_gene
    .groupby('hets_freq_bin')
    .count()[['gene']]
    .reset_index()
    .rename(columns={'gene': 'plp_cnt'})
)
plps_freq_gene_count['plp_cnt'] = plps_freq_gene_count['plp_cnt'].astype(int)


In [None]:
def plot_frequency_annotated(plps, ax):
    """Draw a bar chart of counts per frequency bin and label every bar.

    Args:
        plps: table with a ``hets_freq_bin`` column (x axis) and a ``plp_cnt``
            column (bar height).
        ax: matplotlib axis to draw on.

    Each bar is annotated with its integer height and, in brackets, its share
    of the summed bar heights as a percentage rounded to 3 decimals.
    """
    sns.barplot(
        data=plps, x='hets_freq_bin', y='plp_cnt', ax=ax,
        color='#74C0D1', edgecolor='k', linewidth=0.5,
    )

    # the bars just drawn are the last container registered on the axis
    bars = ax.containers[-1]
    heights = [bar.get_height() for bar in bars]
    total = sum(heights)
    annotations = [f"{int(height)} \n({round(height*100/total, 3)}%)" for height in heights]

    ax.bar_label(bars, labels=annotations, label_type='edge', size=plotting.SMALL_SIZE, padding=3, color='k')


# Figure sizes are written in centimetres and converted to inches
cm = 1/2.54
k = 1.  # overall scale factor applied to the figure size

fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
axes = fig.subplots(1, 2)

# (axis, counts table, x label, y label, y limits):
# left panel shows variants per frequency bin, right panel genes per bin
panels = [
    (axes[0], plps_freq_count, "Variant frequency", "Number of PLPs", (0.1, 100000)),
    (axes[1], plps_freq_gene_count, "Carrier frequency", "Number of genes", (0.1, 10000)),
]
for ax, freq_counts, x_label, y_label, ylim in panels:
    plot_frequency_annotated(freq_counts, ax)
    plotting.configure_axis(ax, x_label=x_label, y_label=y_label, ylim=ylim)
    ax.set_yscale("log")

plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_1.pdf", format="pdf", bbox_inches="tight")


# Sup figure 2 (PLPs count, s-het burden distribution for childlessness)

In [None]:
datasets_path = '.../450k/datasets'

# Samples of interest: European & non-related without hom and comp_het.
# One entry per line, kept as whitespace-stripped strings.
with open(
    ".../450k/samples/european_non_related_no_withdrawal_to_include_450k.no_hom_comp_het.txt", 'r'
) as f:
    european_non_rel_samples = [line.strip() for line in f]

# Load the Roulette dataset (tab-separated) and keep the samples of interest;
# `eid` is cast to str so it can be matched against the strings read above
dataset = pd.read_csv(f"{datasets_path}/Roulette.csv", sep='\t')
dataset = dataset.loc[dataset['eid'].astype(str).isin(european_non_rel_samples)].copy()

In [None]:
# NOTE: `matplotlib.ticker` is already imported in the notebook's first cell and
# is not used here, so this cell no longer re-imports it.

cm = 1/2.54  # centimeters in inches
k = 1.

fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
axes = fig.subplots(1, 3, width_ratios=[0.6, 1, 1])

# Settings shared by the two PLP-count histograms (panels 1 and 2):
# bars are normalised within each childlessness group (common_norm=False)
plp_count_hist_kwargs = dict(
    x='mutations_cnt_recessive_all_label', hue='childlessness',
    multiple='dodge', palette='bright', shrink=0.9,
    stat='probability', common_norm=False, cumulative=False,
    log_scale=False, fill=True,
)

# Panel 1: share of samples with 0 PLPs vs. more than 0 PLPs (all samples)
dataset['mutations_cnt_recessive_all_label'] = dataset['mutations_cnt_recessive_all'].apply(
    lambda x: f"{int(x)} PLPs" if x <= 0 else ">0 PLPs"
)
sns.histplot(
    data=dataset.sort_values('mutations_cnt_recessive_all'),
    ax=axes[0], **plp_count_hist_kwargs,
)
plotting.configure_axis(axes[0], x_label="Number of PLPs")

# Panel 2: PLP count (capped at ">2 PLPs") among samples with a positive s-het burden.
# The label column is overwritten here; panel 1 has already been drawn from it.
dataset['mutations_cnt_recessive_all_label'] = dataset['mutations_cnt_recessive_all'].apply(
    lambda x: f"{int(x)} PLPs" if x <= 2 else ">2 PLPs"
)
sns.histplot(
    data=dataset[dataset['s_het_recessive_all'] > 0].sort_values('mutations_cnt_recessive_all'),
    ax=axes[1], **plp_count_hist_kwargs,
)
plotting.configure_axis(axes[1], x_label="Number of PLPs")

# Panel 3: distribution of the s-het burden on a log-scaled x axis
sns.histplot(
    data=dataset,
    x='s_het_recessive_all', hue='childlessness',
    ax=axes[2], palette='bright', stat='probability',
    common_norm=False, cumulative=False, bins=35,
    log_scale=True, element="step", fill=False,
)
plotting.configure_axis(axes[2], x_label="Genetic burden", ylim=(0, 0.12))

plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_2.pdf", format="pdf", bbox_inches="tight")

# Sup figure 3 (covariates analysis)

In [None]:
def parse_covariate(analysis):
    """Translate a regression's `analysis` string into a readable covariate label.

    A string with more than two comma-separated parts is labelled
    'With all covariates'. Otherwise the first covariate name found in the
    string (checked in the order listed below) decides the label; a string
    containing none of them is labelled 'Original regression'.
    """
    # more than two comma-separated parts <=> at least two commas
    if analysis.count(',') >= 2:
        return 'With all covariates'

    # (covariate name searched for, label), in priority order
    labels_by_covariate = (
        ('years_of_edu', 'With years of education'),
        ('diagnosis_total_ICD10_cnt_log', 'With diagnoses count (log)'),
        ('ICD_infertility', 'With infertility status'),
        ('living_with_a_partner', 'With partner status'),
        ('fluid_intelligence_score', 'With fluid intelligence'),
    )
    return next(
        (label for covariate_name, label in labels_by_covariate if covariate_name in analysis),
        'Original regression',
    )

In [None]:
results_path = f"{tables_folder}/table_covariate_analysis.xlsx"

# Load the flattened results table and keep only the rows used in the figure,
# filtering step by step: s_het features, the Roulette dataset, the
# childlessness target, and analyses that do not involve years of education
# or fluid intelligence
covariates = (
    read_results_excel(results_path, flatten_multiindex=True)
    .loc[lambda df: df['feature'].str.contains("s_het")]
    .loc[lambda df: df['dataset'] == 'Roulette']
    .loc[lambda df: df['target'] == 'childlessness']
    .loc[lambda df: ~df['analysis'].str.contains('years_of_edu|fluid_intelligence_score')]
)

# Run the table through the plotting module's odds-ratio and effect-size
# interval helpers (odds ratio first, then effect size)
covariates = plotting.add_effect_size_intervals(plotting.add_odds_ratio_intervals(covariates))

# Readable covariate label for every analysis
covariates['covariate'] = covariates['analysis'].apply(parse_covariate)


In [None]:
def plot_covariates(ax, text_pos=2.5, plot_data=covariates):
    """Draw grouped error bars for the covariate analysis and annotate each point.

    Args:
        ax: matplotlib axis to draw on.
        text_pos: not used by the function body; kept so existing calls keep working.
        plot_data: results table to plot. The default is the `covariates`
            object as it existed when this function was defined.

    Every plotted point is labelled with its odds ratio (two decimals) and its
    `p_value_corrected_pretty` value, written 0.1 to the right of the point.
    """
    legend_kwargs = dict(
        bbox_to_anchor=(0.0, 1.5, 1, .102),
        frameon=False,
        mode="expand",
        ncol=2,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=plotting.MEDIUM_SIZE,
    )

    # groups are drawn in this order; the education and fluid-intelligence
    # variants are not listed
    plotted_data = plotting.plot_errorbar_grouped(
        df=plot_data,
        axis=ax,
        y_column='target',
        group_column='covariate',
        legend_loc='upper left',
        legend_kwargs=legend_kwargs,
        group_scale=0.01,
        y_scale=0.13,
        y_order=['childlessness', 'is_blond'],
        group_order=[
            'Original regression',
            'With diagnoses count (log)',
            'With infertility status',
            'With partner status',
        ],
    )

    plotting.configure_axis(ax, x_label='OR (99% CI)', y_label=None)

    # write "<odds ratio>   <corrected p-value>" next to every plotted point
    annotation_rows = zip(
        plotted_data['odds_ratio'],
        plotted_data['p_value_corrected_pretty'],
        plotted_data['y'],
    )
    for odds_ratio, p_value, y_pos in annotation_rows:
        ax.text(odds_ratio+0.1, y_pos+0.001, f'{odds_ratio:.2f}   {p_value}', va='center', size=plotting.MEDIUM_SIZE)

In [None]:
# Figure sizes are written in centimetres and converted to inches
cm = 1/2.54
k = 1.  # overall scale factor applied to the figure size

fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 6*cm*k))
axes = fig.subplots(1, 2)

# one panel per burden feature: (value of the `feature` column, panel title)
panel_specs = [
    ('s_het_recessive_all', 'PLPs in recessive genes'),
    ('s_het_lof_without_AR', 'Singleton LoFs in non-recessive genes'),
]
for panel_ax, (feature_name, panel_title) in zip(axes, panel_specs):
    feature_rows = covariates[covariates['feature'] == feature_name].copy()
    plot_covariates(panel_ax, text_pos=2.5, plot_data=feature_rows)
    panel_ax.set_title(panel_title)

plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_3.pdf", format="pdf", bbox_inches="tight")

# Sup figure 4 (deprivation analysis)

In [None]:
def parse_covariate(analysis):
    """Translate a deprivation regression's `analysis` string into a readable label.

    NOTE: this reuses the name of the `parse_covariate` defined earlier in the
    notebook for the covariates analysis and replaces it from this cell on.

    The first deprivation covariate found in the string (checked in the order
    listed below) decides the label; a string containing none of them is
    labelled 'Original regression'.

    Args:
        analysis: text describing the regression; only substring membership
            is tested.

    Returns:
        The label string for the matching covariate, or 'Original regression'.
    """
    # (accepted covariate spellings, label), in priority order
    labels_by_covariate = (
        # 'engand' is the spelling the original check used. The correctly
        # spelled name is accepted as well, so a fixed column name is not
        # silently labelled 'Original regression'.
        (('multiple_deprivation_engand', 'multiple_deprivation_england'), 'With multiple deprivation'),
        (('edu_deprivation_england',), 'With educational deprivation'),
        (('housing_deprivation_england',), 'With housing deprivation'),
        (('income_deprivation_england',), 'With income deprivation'),
        (('health_deprivation_england',), 'With health deprivation'),
    )
    for covariate_names, label in labels_by_covariate:
        if any(name in analysis for name in covariate_names):
            return label
    return 'Original regression'
    
# Display names for the regression targets
# (a '\n' splits a longer name over two lines)
renaming_dict_target = dict(
    is_blond='Hair color',
    years_of_edu='Years of\neducation',
    diagnosis_total_ICD10_cnt_log='Diagnoses\ncount (log)',
    ICD_infertility='Infertility',
    living_with_a_partner='Has partner',
    fluid_intelligence_score='Fluid\nintelligence',
    childlessness='Childlessness',
)

In [None]:
results_path_1 = f"{tables_folder}/table_covariate_deprivation_analysis.xlsx"
results_path_2 = f"{tables_folder}/table_id_vs_rest_covariate_deprivation_analysis.xlsx"

# Load both flattened result tables and stack them
deprivation = pd.concat(
    [read_results_excel(path, flatten_multiindex=True) for path in (results_path_1, results_path_2)]
)

# Keep s_het features from the Roulette dataset and drop the `is_blond` target
deprivation = (
    deprivation
    .loc[lambda df: df['feature'].str.contains("s_het")]
    .loc[lambda df: df['dataset'] == 'Roulette']
    .loc[lambda df: df['target'] != 'is_blond']
)

# Replace target names with their display names (unknown names are kept as they are)
deprivation['target'] = deprivation['target'].map(lambda name: renaming_dict_target.get(name, name))

# Run the table through the plotting module's odds-ratio and effect-size
# interval helpers (odds ratio first, then effect size)
deprivation = plotting.add_effect_size_intervals(plotting.add_odds_ratio_intervals(deprivation))

# Readable covariate label for every analysis
deprivation['covariate'] = deprivation['analysis'].apply(parse_covariate)

# Look only at recessive features: drop `s_het_lof_without_AR`
deprivation = deprivation.loc[deprivation['feature'] != 's_het_lof_without_AR'].copy()

In [None]:
def plot_deprivation(ax, targets, plot_data=deprivation, legend_loc='upper left', plot_entity='odds_ratio'):
    """Draw grouped error bars for the deprivation analysis and annotate each point.

    Args:
        ax: matplotlib axis to draw on.
        targets: target display names to plot; also used as the y order.
        plot_data: results table to plot. The default is the `deprivation`
            object as it existed when this function was defined.
        legend_loc: passed through to `plotting.plot_errorbar_grouped`.
        plot_entity: column plotted on the x axis. 'odds_ratio' puts the
            reference line at 1 and labels the axis 'OR (99% CI)'; any other
            value puts it at 0 and labels the axis 'Effect (99% CI)'.
    """
    is_odds_ratio = plot_entity == 'odds_ratio'
    selected_rows = plot_data[plot_data['target'].isin(targets)].copy()

    legend_kwargs = dict(
        bbox_to_anchor=(0.0, 1.5, 1, .102),
        frameon=False,
        mode="expand",
        ncol=1,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=plotting.MEDIUM_SIZE,
    )

    plotted_data = plotting.plot_errorbar_grouped(
        df=selected_rows,
        axis=ax,
        y_column='target',
        group_column='covariate',
        legend_loc=legend_loc,
        legend_kwargs=legend_kwargs,
        group_scale=0.01,
        y_scale=0.13,
        plot_entity=plot_entity,
        vertical_loc=1 if is_odds_ratio else 0,
        y_order=targets,
        group_order=[
            'Original regression',
            'With educational deprivation',
            'With health deprivation',
            'With housing deprivation',
            'With income deprivation',
            'With multiple deprivation',
        ],
    )

    plotting.configure_axis(ax, x_label='OR (99% CI)' if is_odds_ratio else 'Effect (99% CI)', y_label=None)

    # Write "<pretty value>, P=<corrected p-value>" 0.1 to the right of every point.
    # NOTE(review): the text always comes from `odds_ratio_pretty`, even when
    # plot_entity is 'effect' - confirm this is intended.
    annotation_rows = zip(
        plotted_data['p_value_corrected_pretty'],
        plotted_data[plot_entity],
        plotted_data["odds_ratio_pretty"],
        plotted_data['y'],
    )
    for p_value, x_value, pretty_value, y_pos in annotation_rows:
        ax.text(x_value+0.1, y_pos+0.001, f'{pretty_value}, P={p_value}', va='center', size=plotting.MEDIUM_SIZE)

In [None]:
# Figure sizes are written in centimetres and converted to inches
cm = 1/2.54
k = 1.  # overall scale factor applied to the figure size

fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 18*cm*k))
all_axes = fig.subplots(3, 3)

# grid layout: one row per burden feature, one column per target
row_features = ['s_het_recessive_all', 's_het_recessive_ID_total', 's_het_recessive_AR_without_ID']
column_targets = ['Childlessness', 'Years of\neducation', 'Diagnoses\ncount (log)']

for axes, feature in zip(all_axes, row_features):
    plot_data = deprivation[deprivation['feature'] == feature].copy()

    # only the first row gets a legend
    legend_loc = 'upper left' if feature == 's_het_recessive_all' else None

    for panel_ax, target in zip(axes, column_targets):
        # childlessness is plotted as an odds ratio, the other targets as effects
        plot_entity = 'odds_ratio' if target == 'Childlessness' else 'effect'
        plot_deprivation(panel_ax, targets=[target], plot_data=plot_data, legend_loc=legend_loc, plot_entity=plot_entity)
        panel_ax.set_title(feature)

plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_4.pdf", format="pdf", bbox_inches="tight")

# Sup figure 5 (s-het distribution over panels)

In [None]:
# Original gene panel: header-less file whose two columns are named on load
gene_panels = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None).set_axis(
    ['gene_symbol', 'panel'], axis=1
)

print ("Number of genes:", len(gene_panels))

In [None]:
# Order in which the gene sets are shown in the figure
show_order = [
    'Non-Recessive', 'All recessive', 'ID', 'Metabolic', 'Metabolic-ID',
    'Dermatologic', 'Blindness', 'Multi-system', 'Neuromuscular', 'Cilia + Kidney',
    'Immune system', 'Endocrine', 'Skeletal', 'Hematologic', 'Deafness',
]

# Panel names as stored in the gene-panel file -> names shown in the figure
renaming_dict_gene_sets = {
    'Cilia+Kidney': 'Cilia + Kidney',
    'Derm': 'Dermatologic',
    'ID-total': 'ID',
    'Immune_system': 'Immune system',
    'Overlaps': 'Multi-system',
    'Skeletal+Craniofacial': 'Skeletal',
}

# Panels left out of the figure
delete_list = ['No_panel', 'Tumor', 'Cardiovascular']

# s-het table (tab-separated) annotated with each gene's panel
s_het = (
    pd.read_csv(".../450k/selection_roulette/s_het_roulette_gencode-v34.csv", sep='\t')
    .merge(gene_panels, how='left')
    .rename(columns={'panel': 'Gene set'})
)

# Rows without a panel after the left merge count as 'Non-Recessive';
# the remaining panel names are translated to their display names
s_het['Gene set'] = (
    s_het['Gene set']
    .fillna('Non-Recessive')
    .map(lambda name: renaming_dict_gene_sets.get(name, name))
)
s_het = s_het[~s_het['Gene set'].isin(delete_list)]

# Every remaining non-'Non-Recessive' row is duplicated under 'All recessive'
s_het_recessive = s_het[s_het['Gene set'] != 'Non-Recessive'].copy()
s_het_recessive['Gene set'] = 'All recessive'
s_het = pd.concat([s_het, s_het_recessive])

# Ordered categorical so that sorting follows `show_order`
# (values not listed in `show_order` become missing)
s_het['Gene set'] = s_het['Gene set'].astype('category').cat.set_categories(show_order, ordered=True)
s_het = s_het.sort_values(by='Gene set')

In [None]:
# Figure sizes are written in centimetres and converted to inches
cm = 1/2.54
k = 1.  # overall scale factor applied to the figure size

fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 9*cm*k))
ax = fig.subplots(1, 1)

# small, thinly outlined diamond markers with vertical jitter
strip_style = dict(jitter=0.2, edgecolor='black', size=1, marker="D", linewidth=0.2, alpha=.8)
sns.stripplot(data=s_het, x='s_het', y='Gene set', hue='Gene set', ax=ax, **strip_style)

plotting.configure_axis(ax, x_label='S-het', y_label=None, ymargin=0)

plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_5.pdf", format="pdf", bbox_inches="tight")

# Sup figure 6 (distribution non-ID gene panel)

In [None]:
# Panel names in the sampled gene-panel file -> names shown in the figure:
# "AR_without_ID_<i>_sampled" (i = 0..19) becomes "Other recessive genes (sample <i+1>)"
renaming_dict_gene_sets = {
    f"AR_without_ID_{sample_idx}_sampled": f"Other recessive genes (sample {sample_idx+1})"
    for sample_idx in range(20)
}
renaming_dict_gene_sets.update(ID_sampled='ID')

# s-het table (tab-separated), with the gene column renamed to 'Gene name'
s_het = (
    pd.read_csv(".../450k/selection_roulette/s_het_roulette_gencode-v34.csv", sep='\t')
    .rename(columns={'gene_symbol': 'Gene name'})
)

# Sampled gene panels, left-merged with the s-het table on their shared columns
gene_panel_sampled = (
    pd.read_csv(".../450k/regions/gene-panel-gencode-v34.sampled.txt")
    .merge(s_het, how='left')
)
gene_panel_sampled['Gene panel'] = gene_panel_sampled['Gene panel'].map(
    lambda name: renaming_dict_gene_sets.get(name, name)
)

gene_panel_sampled

In [None]:
# NOTE: `matplotlib.ticker` is already imported in the notebook's first cell and
# is not used here, so this cell no longer re-imports it.

cm = 1/2.54  # centimeters in inches
k = 1.
fig = plt.figure(constrained_layout=True, figsize=(12*cm*k, 8*cm*k))
ax = fig.subplots(1, 1)

# s-het values per sampled gene panel, one strip per panel
sns.stripplot(data=gene_panel_sampled, x='s_het', y='Gene panel', hue='Gene panel',
              jitter=0.2, edgecolor='black', size=1,
              marker="D", linewidth=0.2, alpha=.8, ax=ax)
plotting.configure_axis(ax, x_label='S-het', xlim=None, ymargin=0.08, format_x=True)

# save figure
plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_6.pdf", format="pdf", bbox_inches="tight")

# Sup figure 7 (resampling non-ID gene panel)

In [None]:
# Display names for the regression targets
# (a '\n' splits a longer name over two lines)
renaming_dict_target = dict(
    is_blond='Hair color',
    years_of_edu='Years of\neducation',
    diagnosis_total_ICD10_cnt_log='Diagnoses\ncount (log)',
    ICD_infertility='Infertility',
    living_with_a_partner='Has partner',
    fluid_intelligence_score='Fluid\nintelligence',
    childlessness='Childlessness',
)

tables_folder = '../../../data/tables'

# Load the flattened sampling results and keep only the s_het features
sampling_df = read_results_excel(
    f"{tables_folder}/sampling/genes/table_id_vs_other_sampling_analysis.xlsx",
    flatten_multiindex=True,
).loc[lambda df: df['feature'].str.contains("s_het")]

# Run the table through the plotting module's odds-ratio and effect-size
# interval helpers (odds ratio first, then effect size)
sampling_df = plotting.add_effect_size_intervals(plotting.add_odds_ratio_intervals(sampling_df))

# Keep only targets that have a display name, then switch to the display names
sampling_df = sampling_df.loc[sampling_df['target'].isin(list(renaming_dict_target))].copy()
sampling_df['target'] = sampling_df['target'].map(lambda name: renaming_dict_target.get(name, name))


In [None]:
def plot_sampling(ax, targets, plot_entity='odds_ratio', vline_loc=1):
    """Draw sampled-panel estimates as violins with the ID-panel estimate on top.

    Reads the notebook-level `sampling_df` table.

    Args:
        ax: matplotlib axis to draw on.
        targets: target display names to show, in plotting order.
        plot_entity: column of `sampling_df` plotted on the x axis.
        vline_loc: x position of the dashed grey reference line.
    """
    in_targets = sampling_df['target'].isin(targets)
    is_id_feature = sampling_df['feature'] == 's_het_recessive_ID_sampled'

    def _ordered_by_target(rows):
        # copy the selection and sort it by the order given in `targets`
        rows = rows.copy()
        rows['target'] = pd.Categorical(rows['target'], categories=targets)
        return rows.sort_values('target')

    data_non_id = _ordered_by_target(sampling_df[in_targets & ~is_id_feature])
    data_id = _ordered_by_target(sampling_df[in_targets & is_id_feature])

    # dashed reference line
    ax.vlines(x=vline_loc, ymin=-3, ymax=10, color='grey', linestyle='--', alpha=0.5, linewidth=0.5)

    # violins: every feature except `s_het_recessive_ID_sampled`, rows with gender == 'all'
    sns.violinplot(
        data=data_non_id[data_non_id['gender'] == 'all'],
        y='target', x=plot_entity, hue='target',
        ax=ax, linewidth=0.5, linecolor='k', palette=['#a0b4ff'],
    )

    # points: the `s_het_recessive_ID_sampled` feature, rows with gender == 'all'
    sns.pointplot(
        data=data_id[data_id['gender'] == 'all'],
        y='target', x=plot_entity,
        linestyle='none', markers='o', ax=ax, color='#ffa260',
        markersize=3, markeredgewidth=0.5, markeredgecolor='k',
    )

In [None]:
# NOTE: `matplotlib.ticker` is already imported in the notebook's first cell and
# is not used here, so this cell no longer re-imports it.

cm = 1/2.54  # centimeters in inches
k = 1.
fig = plt.figure(constrained_layout=True, figsize=(12*cm*k, 6*cm*k))

# divide the figure into a left and a right half
subfigs = fig.subfigures(nrows=1, ncols=2,  width_ratios=[1, 1], wspace=0.03)

# left half: targets plotted as odds ratios, reference line at 1
ax_left = subfigs[0].subplots(2, 1)
for panel_ax, target in zip(ax_left, ['Childlessness', 'Hair color']):
    plot_sampling(panel_ax, targets=[target], plot_entity='odds_ratio', vline_loc=1)
    plotting.configure_axis(panel_ax, x_label='Odds ratio', xlim=None, ymargin=0.08, format_x=True)

# right half: targets plotted as effect sizes, reference line at 0
ax_right = subfigs[1].subplots(3, 1)
for panel_ax, target in zip(ax_right, ['Years of\neducation', 'Fluid\nintelligence', 'Diagnoses\ncount (log)']):
    plot_sampling(panel_ax, targets=[target], plot_entity='effect', vline_loc=0)
    plotting.configure_axis(panel_ax, x_label='Effect size', xlim=None, ymargin=0.08, format_x=True)

# fixed x ticks for the last right-hand panel ('Diagnoses\ncount (log)')
ax_right[2].set_xticks([0, 0.2, 0.4, 0.6])
ax_right[2].set_xticklabels([0, 0.2, 0.4, 0.6])

# save figure
plt.savefig(f"{PLOTS_OUTPUT_DIR}/sup_figure_7.pdf", format="pdf", bbox_inches="tight")