In [None]:
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import sci_notation, get_plot_data, plot_errorbar_grouped
from ukbb_recessive.data_collection.variants import VariantFeatures


sns.set_style("whitegrid")

import matplotlib

from matplotlib import font_manager
import numpy as np
from matplotlib.cm import get_cmap

import glob

import matplotlib.ticker as ticker


In [None]:
# Register every font file found in the project's font folder with matplotlib,
# so that the family requested in the rc settings can be resolved by name.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

register_font = font_manager.fontManager.addfont
for font_file in font_files:
    register_font(font_file)
    print("Added:", font_file)

In [None]:
# Font sizes shared by all figures in this notebook
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 5, 6, 7

# Default text size and font family
plt.rc('font', size=BIGGER_SIZE, family='Arimo')

matplotlib.rcParams.update({
    # Render text with matplotlib's own engine, not LaTeX
    'text.usetex': False,
    # Font type 42 = TrueType in the PDF / PostScript backends
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

In [None]:
def configure_axis(ax, ytick_size=MEDIUM_SIZE, xtick_size=MEDIUM_SIZE, xlabel_size=MEDIUM_SIZE, ylabel_size=MEDIUM_SIZE, x_label=None, y_label=None):
    """Apply the notebook's common axis styling to ``ax`` in place.

    Removes the grid, keeps only black left/bottom spines, sets tick label
    sizes and tick mark dimensions, and sets both axis labels.

    Parameters
    ----------
    ax : matplotlib Axes to style.
    ytick_size, xtick_size : font size of the y / x tick labels.
    xlabel_size, ylabel_size : font size of the x / y axis labels.
    x_label, y_label : axis label texts, passed as-is to ``set_xlabel`` / ``set_ylabel``.
    """
    # No background grid
    ax.grid(False)

    # Only the left and bottom spines stay visible, drawn in black
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('black')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Tick label sizes
    for axis_name, label_size in (('y', ytick_size), ('x', xtick_size)):
        ax.tick_params(axis=axis_name, labelsize=label_size)

    # Tick marks on the visible spines only, with fixed major/minor dimensions
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    for which, width, length in (('major', 1.00, 2.5), ('minor', 0.75, 1.25)):
        ax.tick_params(which=which, width=width, length=length)

    # Axis labels
    ax.set_xlabel(x_label, size=xlabel_size)
    ax.set_ylabel(y_label, size=ylabel_size)

# Extended figure 1

In [None]:
def hets_freq_bin(hets_freq):
    """Return the label of the frequency bin that ``hets_freq`` falls into.

    ``hets_freq`` is a frequency expressed in percent. Upper bounds are
    inclusive: values <= 0 map to '0%', values in (0, 0.5] to '0-0.5%', and
    so on; anything that matches no bound (values above 5) maps to '>5%'.
    """
    if hets_freq <= 0:
        return '0%'

    # (inclusive upper bound, label) pairs in ascending order
    upper_bounds = (
        (0.5, '0-0.5%'),
        (1, '0.5-1%'),
        (2, '1-2%'),
        (5, '2-5%'),
    )
    for upper, label in upper_bounds:
        if hets_freq <= upper:
            return label

    return '>5%'

# Samples of interest: European & non-related.
# The file has no header row; its first column holds the sample identifiers.
samples_path = "../450k/samples/european_non_related_no_withdrawal_to_include.txt"
samples_table = pd.read_csv(samples_path, header=None)

european_non_rel_samples = set(samples_table[0].values.tolist())
print("Number of samples:", len(european_non_rel_samples))


In [None]:
# Read the list of PLPs (tab-separated)
plps_path = "../450k/plp_selection/basic/new_gene_names/new_freq/all_chr_total_presumable_plps_HFE_final_sorted.txt"
plps = pd.read_csv(plps_path, sep='\t')

# Heterozygote count as a percentage of the cohort size, then its frequency bin
n_samples = len(european_non_rel_samples)
plps['hets_freq'] = plps['hets'] * 100. / n_samples
plps['hets_freq_bin'] = plps['hets_freq'].apply(hets_freq_bin)

# Number of variants per frequency bin (rows with a non-null 'chr' are counted).
# groupby sorts the bin labels as strings, which yields
# '0%', '0-0.5%', '0.5-1%', '1-2%', '2-5%', '>5%'.
plps_freq_count = plps.groupby('hets_freq_bin')['chr'].count().reset_index(name='plp_cnt')

# Drop the 0% bin
is_zero_bin = plps_freq_count['hets_freq_bin'] == '0%'
plps_freq_count = plps_freq_count[~is_zero_bin]

In [None]:
import matplotlib.ticker as ticker


cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with a single panel
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
ax = fig.subplots(1, 1)

# One bar per heterozygote-frequency bin
sns.barplot(data=plps_freq_count, x='hets_freq_bin', y='plp_cnt', ax=ax, color='#61D4D4')

# Annotate each bar with its count and its share of all plotted variants
bars = ax.containers[-1]
labels = [bar.get_height() for bar in bars]
total_count = sum(labels)
labels_percentage = [f"{int(num)} \n({round(num*100/total_count, 4)}%)" for num in labels]
ax.bar_label(bars, labels=labels_percentage, label_type='edge', size=SMALL_SIZE)

configure_axis(ax, x_label="Variant frequency", y_label="Number of PLPs")
ax.set_yscale("log")

plt.savefig("../../../../data/plots/ext_figure_1.pdf", format="pdf", bbox_inches="tight")


# Extended figure 2

In [None]:
# Gene-to-panel assignment; the file has no header row, so name the columns here
gene_panel_path = "../gene-panel-gencode-v34.txt"
gene_panel = pd.read_csv(gene_panel_path, header=None)
gene_panel.columns = ['gene', 'panel']

In [None]:
# Total number of het carriers per gene
hets_per_gene = plps.groupby('gene').agg({'hets': 'sum'}).reset_index()

# Add gene panel information. The outer merge keeps genes present on only one
# side; fillna(0) then sets every missing value (carrier count or panel) to 0.
plps_gene = hets_per_gene.merge(gene_panel, how='outer').fillna(0)

# Carrier frequency in percent of the cohort, and its frequency bin
plps_gene['hets_freq'] = plps_gene['hets'] * 100. / len(european_non_rel_samples)
plps_gene['hets_freq_bin'] = plps_gene['hets_freq'].apply(hets_freq_bin)

# Number of genes per frequency bin (the count column keeps the name 'plp_cnt')
genes_per_bin = plps_gene.groupby('hets_freq_bin').count()[['gene']].reset_index()
plps_freq_gene_count = genes_per_bin.rename(columns={'gene': 'plp_cnt'})
plps_freq_gene_count['plp_cnt'] = plps_freq_gene_count['plp_cnt'].astype(int)


In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with a single panel
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
ax = fig.subplots(1, 1)

# One bar per carrier-frequency bin; 'plp_cnt' holds the number of genes here
sns.barplot(data=plps_freq_gene_count, x='hets_freq_bin', y='plp_cnt', ax=ax, color='#61D4D4')

# Annotate each bar with its count and its share of all genes
bars = ax.containers[-1]
labels = [bar.get_height() for bar in bars]
total_count = sum(labels)
labels_percentage = [f"{int(num)} \n({round(num*100/total_count, 4)}%)" for num in labels]
ax.bar_label(bars, labels=labels_percentage, label_type='edge', size=SMALL_SIZE)

configure_axis(ax, x_label="Carrier frequency", y_label="Number of genes")
ax.set_yscale("log")

plt.savefig("../../../../data/plots/ext_figure_2.pdf", format="pdf", bbox_inches="tight")



# Extended figure 3

In [None]:
# Regression result workbooks, keyed by the label each variant set gets in the figure
results_files = {
    'PLPs in recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx', 
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_lof_without_AR].xlsx',
    'PLPs in recessive genes w/out LoF carriers': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all]_without_LoF_carriers.xlsx'
}

# Value of the first table column whose rows are kept for each variant set
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all', 
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR',
    'PLPs in recessive genes w/out LoF carriers': 's_het_recessive_all'
}

# Load data
data = {}
for key, path in results_files.items():
    # Read the raw table: two header rows, third sheet row skipped, first column dropped
    reader = pd.ExcelFile(path)
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])
    all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

    # Add a prettified copy of every p-value column, one per top-level column group
    top_level_names = all_results_df.columns.get_level_values(level=0).unique()
    new_columns = [(level0, 'p_value_pretty') for level0 in top_level_names]
    raw_p_values = all_results_df.loc[:, (slice(None), 'p_value')]
    all_results_df[new_columns] = raw_p_values.applymap(sci_notation)

    # Keep only the rows describing the s_het effect of this variant set
    first_column = all_results_df[all_results_df.columns[0]]
    data[key] = all_results_df[first_column == s_het_effects[key]]

# Display names of the regression targets
renaming_dict_target = {
    'is_blond': 'Hair color', 
    'childlessness': 'Childlessness'
}

# Targets collected for every variant set, in this order
targets = ('childlessness', 'is_blond')

# One frame per genetic-burden score is gathered here
plot_data = []

for s_het_type in ['Cassa', 'pLI']:
    # Plot data of every variant set and target for this score
    phenotypic_data = pd.concat([
        get_plot_data(df=df[s_het_type], target=target, tag=key)
        for key, df in data.items()
        for target in targets
    ])

    # Only results with gender == 'all' are plotted
    phenotypic_all = phenotypic_data[phenotypic_data['gender'] == 'all']
    phenotypic_all = phenotypic_all.sort_values(by='tag', ascending=False)
    phenotypic_all['type'] = s_het_type
    plot_data.append(phenotypic_all)

# Combine the data of all scores
plot_data = pd.concat(plot_data)

# Replace target identifiers by their display names (unknown ones stay unchanged)
plot_data['target'] = plot_data['target'].map(lambda name: renaming_dict_target.get(name, name))

plot_data.head(3)

In [None]:
def plot_3ab_ext(ax, data_type, text_pos=2.5):
    """Draw one panel of extended figure 3 on ``ax``.

    Plots the rows of the module-level ``plot_data`` whose 'type' equals
    ``data_type`` as grouped error bars (one group per variant-set tag) and
    writes the odds ratio and prettified p-value next to every plotted point.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    data_type : value of the 'type' column to select (e.g. 'Cassa', 'pLI').
    text_pos : x position at which the annotation text is placed.
    """
    y_order = ['Childlessness', 'Hair color']
    group_order = [
        'PLPs in recessive genes',
        'PLPs in recessive genes w/out LoF carriers',
        'Singleton LoFs in non-recessive genes',
    ]

    legend_kwargs = dict(
        bbox_to_anchor=(0.0, 1., 1, .102),
        frameon=False,
        mode="expand",
        ncol=1,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=MEDIUM_SIZE,
    )

    colors = ['BlueViolet', '#006D5B', '#F89F5B']

    selected_rows = plot_data[plot_data['type'] == data_type]
    plotted_data = plot_errorbar_grouped(
        df=selected_rows,
        axis=ax,
        y_column='target',
        group_column='tag',
        title='',
        ymargin=0.1,
        legend_loc='upper left',
        legend_kwargs=legend_kwargs,
        group_scale=0.03,
        y_scale=0.13,
        colors=colors,
        y_order=y_order,
        group_order=group_order,
    )

    configure_axis(ax, x_label='OR (99% CI)', y_label=None)

    # Side annotation: odds ratio and p-value at the 'y' position reported for
    # each row of the table returned by plot_errorbar_grouped
    annotations = zip(plotted_data['p_value_pretty'], plotted_data['odds_ratio'], plotted_data['y'])
    for p, oddsr, pos in annotations:
        ax.text(text_pos, pos+0.01, f'{oddsr:.2f}   {p}', va='center', size=MEDIUM_SIZE)


In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with two side-by-side panels
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
axes = fig.subplots(1, 2)

# (score, x position of the annotation text) for the left and right panel
panel_settings = [('Cassa', 2.5), ('pLI', 1.2)]
for panel_ax, (score_name, annotation_x) in zip(axes, panel_settings):
    plot_3ab_ext(panel_ax, score_name, text_pos=annotation_x)

plt.savefig("../../../../data/plots/ext_figure_3.pdf", format="pdf", bbox_inches="tight")

# Extended figure 4

In [None]:
# Read the raw table: two header rows, third sheet row skipped, first column dropped
covariates_table_path = '../../../../data/tables/table_basic_regressions_with_covariates.xlsx'
reader = pd.ExcelFile(covariates_table_path)

all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# Add a prettified copy of every p-value column, one per top-level column group
top_level_names = all_results_df.columns.get_level_values(level=0).unique()
new_columns = [(level0, 'p_value_pretty') for level0 in top_level_names]
raw_p_values = all_results_df.loc[:, (slice(None), 'p_value')]
all_results_df[new_columns] = raw_p_values.applymap(sci_notation)

# Keep only the rows whose first column mentions 's_het'
first_column = all_results_df[all_results_df.columns[0]]
all_results_df = all_results_df[first_column.str.contains('s_het')]

In [None]:
# Display names of the regression variants, keyed by the extra covariate used
renaming_covariates = {
    'diagnosis_main_ICD10_cnt': 'With number of main ICD-10 diagnoses',
    'diagnosis_secondary_ICD10_cnt': 'With number of secondary ICD-10 diagnoses',
    'diagnosis_total_ICD10_cnt': 'With total number of ICD-10 diagnoses',
    'ICD_infertility': 'With infertility status',
    'basic': 'Original regression',
}


def _covariate_from_analysis(analysis):
    """Extract the covariate key from an 'analysis' string.

    Strings without a comma map to 'basic'; otherwise the text after the last
    comma is returned with its final character dropped and whitespace stripped.
    """
    if ',' not in analysis:
        return 'basic'
    return analysis.split(',')[-1][:-1].strip()


# Childlessness results for every score, tagged with the score name
per_score_frames = [
    get_plot_data(df=all_results_df[s_het_type], target='childlessness', tag=s_het_type)
    for s_het_type in ['Weghorn', 'Cassa', 'pLI']
]
plot_data = pd.concat(per_score_frames)

# Covariate key first, then its display name (unknown keys stay unchanged)
plot_data['covariate'] = plot_data['analysis'].apply(_covariate_from_analysis)
plot_data['covariate'] = plot_data['covariate'].apply(lambda key: renaming_covariates.get(key, key))

plot_data.head(3)

In [None]:
def plot_4_ext(ax, text_pos=2.5):
    """Draw extended figure 4 on ``ax``.

    Plots the module-level ``plot_data`` as grouped error bars: one y entry per
    score tag and one group per covariate setting. The odds ratio and the
    prettified p-value are written next to every plotted point.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    text_pos : x position at which the annotation text is placed.
    """
    y_order = ['Weghorn', 'Cassa', 'pLI']
    group_order = [
        'Original regression',
        'With infertility status',
        'With number of main ICD-10 diagnoses',
        'With number of secondary ICD-10 diagnoses',
        'With total number of ICD-10 diagnoses',
    ]

    legend_kwargs = dict(
        bbox_to_anchor=(0.0, 1.1, 1, .102),
        frameon=False,
        mode="expand",
        ncol=1,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=MEDIUM_SIZE,
    )

    colors = ['BlueViolet', '#9c0079', '#df0057', '#ff5731', '#ffa600']

    plotted_data = plot_errorbar_grouped(
        df=plot_data,
        axis=ax,
        y_column='tag',
        group_column='covariate',
        title='',
        ymargin=0.1,
        legend_loc='upper left',
        legend_kwargs=legend_kwargs,
        group_scale=0.02,
        y_scale=0.13,
        colors=colors,
        y_order=y_order,
        group_order=group_order,
    )

    configure_axis(ax, x_label='OR (99% CI)', y_label=None)

    # Side annotation: odds ratio and p-value at the 'y' position reported for
    # each row of the table returned by plot_errorbar_grouped
    annotations = zip(plotted_data['p_value_pretty'], plotted_data['odds_ratio'], plotted_data['y'])
    for p, oddsr, pos in annotations:
        ax.text(text_pos, pos+0.01, f'{oddsr:.2f}   {p}', va='center', size=MEDIUM_SIZE)


In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 9 x 8 cm canvas with a single panel
figure_size = (9*cm*k, 8*cm*k)
fig = plt.figure(constrained_layout=True, figsize=figure_size)
ax = fig.subplots(1, 1)

plot_4_ext(ax)

plt.savefig("../../../../data/plots/ext_figure_4.pdf", format="pdf", bbox_inches="tight")

# Extended figure 5

In [None]:
# Reduced-sample regression workbooks, keyed by the label of each variant set
results_files = {
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_lof_without_AR].xlsx', 
    'PLPs in recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_recessive_all].xlsx'
}

# Value of the first table column whose rows are kept for each variant set
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all', 
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR'
}

# Load data
p_values_data = {}
for key, path in results_files.items():
    # Read the raw table: two header rows, third sheet row skipped, first column dropped
    reader = pd.ExcelFile(path)
    p_values_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])
    p_values_df = p_values_df.drop(p_values_df.columns[0], axis=1)

    top_level_names = p_values_df.columns.get_level_values(level=0).unique()

    # Add a prettified copy of every p-value column, one per top-level column group
    new_columns = [(level0, 'p_value_pretty') for level0 in top_level_names]
    raw_p_values = p_values_df.loc[:, (slice(None), 'p_value')]
    p_values_df[new_columns] = raw_p_values.applymap(sci_notation)

    # Parse the fraction: the number after the last '=' of each 'analysis' string
    new_columns = [(level0, 'fraction') for level0 in top_level_names]
    analysis_labels = p_values_df.loc[:, (slice(None), 'analysis')]
    p_values_df[new_columns] = analysis_labels.applymap(lambda text: float(text.split('=')[-1]))

    # Keep only the rows describing the s_het effect of this variant set
    first_column = p_values_df[p_values_df.columns[0]]
    p_values_data[key] = p_values_df[first_column == s_het_effects[key]]

In [None]:
def plot_5_ext(axes):
    """Draw extended figure 5: p-value versus cohort fraction, one panel per variant set.

    The variant-set tags are the sorted keys of the module-level
    ``s_het_effects``; the panel for each tag shows the 'Weghorn' results from
    ``p_values_data[tag]`` as a line with a 99% percentile interval, on a
    logarithmic y axis, together with a dotted reference line at p = 0.005.

    Every line carries its tag as label and every panel gets its own legend.
    Previously the legend was requested once, after the loop, while no artist
    had a label, so it was empty and only attached to the last panel.

    Parameters
    ----------
    axes : sequence of matplotlib Axes; panels are paired with the sorted tags
        in order (two tags and two colors are defined).
    """
    data_tags = sorted(s_het_effects.keys())

    legend_kwargs = {
        "bbox_to_anchor": (-0.1, 1, 1.1, .102), 
        "frameon": False,
        "mode": "expand", 
        "ncol": 2, 
        "labelspacing": 0.1, 
        "markerfirst": False, 
        'fontsize': MEDIUM_SIZE
    }

    colors = ['BlueViolet', '#F89F5B']

    for ax, tag, color in zip(axes, data_tags, colors):
        # Reference line at p = 0.005
        ax.axhline(y=0.005, color='Gray', linestyle='dotted', linewidth=1, alpha=0.5)

        # label=tag gives the legend below an entry to show
        sns.lineplot(data=p_values_data[tag]['Weghorn'], x='fraction', y='p_value', marker="D",
                     color=color, ax=ax, errorbar=('pi', 99), linewidth=1, markersize=2,
                     label=tag)

        ax.set_yscale('log')

        configure_axis(ax, x_label='Cohort size', y_label='P-value')

        # Replaces any legend seaborn created automatically for the labelled line
        ax.legend(**legend_kwargs)

In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with two side-by-side panels
figure_size = (18*cm*k, 8*cm*k)
fig = plt.figure(constrained_layout=True, figsize=figure_size)
axes = fig.subplots(1, 2)

plot_5_ext(axes)

plt.savefig("../../../../data/plots/ext_figure_5.pdf", format="pdf", bbox_inches="tight")

# Extended figure 6

In [None]:
# Read the raw table: two header rows, third sheet row skipped, first column dropped
recessive_table_path = '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx'
reader = pd.ExcelFile(recessive_table_path)

all_results_df_recessive = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])
all_results_df_recessive = all_results_df_recessive.drop(all_results_df_recessive.columns[0], axis=1)

# Add a prettified copy of every p-value column, one per top-level column group
top_level_names = all_results_df_recessive.columns.get_level_values(level=0).unique()
new_columns = [(level0, 'p_value_pretty') for level0 in top_level_names]
raw_p_values = all_results_df_recessive.loc[:, (slice(None), 'p_value')]
all_results_df_recessive[new_columns] = raw_p_values.applymap(sci_notation)

# Keep only the rows whose first column mentions 's_het'
first_column = all_results_df_recessive[all_results_df_recessive.columns[0]]
all_results_df_recessive = all_results_df_recessive[first_column.str.contains('s_het')]

In [None]:
# Read the raw table: two header rows, third sheet row skipped, first column dropped
panel_table_path = '../../../../data/tables/table_panel_regressions_on_[s_het_recessive_AR_without_ID,_s_het_recessive_ID_total].xlsx'
reader = pd.ExcelFile(panel_table_path)

all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# Add a prettified copy of every p-value column, one per top-level column group
top_level_names = all_results_df.columns.get_level_values(level=0).unique()
new_columns = [(level0, 'p_value_pretty') for level0 in top_level_names]
raw_p_values = all_results_df.loc[:, (slice(None), 'p_value')]
all_results_df[new_columns] = raw_p_values.applymap(sci_notation)

# Keep only the rows whose first column mentions 's_het'
first_column = all_results_df[all_results_df.columns[0]]
all_results_df = all_results_df[first_column.str.contains('s_het')]

In [None]:
# Stack the panel-specific results and the all-recessive-genes results for
# every score, and record in 'type' which score each row belongs to.
plot_results_df = []

for dataset_type in ['Weghorn', 'Cassa', 'pLI']:
    total_results_dataset = pd.concat([all_results_df[dataset_type], all_results_df_recessive[dataset_type]])
    total_results_dataset['type'] = dataset_type

    plot_results_df.append(total_results_dataset)

plot_results_df = pd.concat(plot_results_df)

# Display names of the regression targets shown in the figure
renaming_dict_target = {
    'any_education_including_none': "Any education",
    'higher_education_including_none': "Higher education",
    # 'is_blond': 'Hair color', 
    'childlessness': 'Childlessness'
}

# Display names of the gene sets
renaming_dict_panel = {
    's_het_recessive_AR_without_ID': "Other recessive genes",
    's_het_recessive_ID_total': 'Recessive ID genes', 
    's_het_recessive_all': "All recessive genes"
}

# Keep only the plotted targets and only the results with gender == 'all'.
# The gender filter used to be assigned to `all_results_df`, so it never
# reached `plot_results_df` and it overwrote the source table, which made this
# cell fail when re-run. The copy avoids writing into a slice of the
# concatenated frame below.
is_plotted_target = plot_results_df['target'].isin(renaming_dict_target.keys())
is_all_genders = plot_results_df['gender'] == 'all'
plot_results_df = plot_results_df[is_plotted_target & is_all_genders].copy()

# Replace the interval bounds by their distances from the point estimate
# (presumably the form plot_errorbar_grouped expects - TODO confirm)
plot_results_df['odds_ratio_lower'] = plot_results_df['odds_ratio'] - plot_results_df['odds_ratio_lower']
plot_results_df['odds_ratio_upper'] = plot_results_df['odds_ratio_upper'] - plot_results_df['odds_ratio']

# Replace identifiers by display names (unknown ones stay unchanged)
plot_results_df['target'] = plot_results_df['target'].apply(lambda x: renaming_dict_target.get(x, x))
plot_results_df['feature'] = plot_results_df['feature'].apply(lambda x: renaming_dict_panel.get(x, x))

plot_results_df = plot_results_df.sort_values(by='feature')

In [None]:
def plot_6_ext(ax, dataset_type, text_pos=2.5):
    """Draw one panel of extended figure 6 on ``ax``.

    Plots the rows of the module-level ``plot_results_df`` whose 'type' equals
    ``dataset_type`` as grouped error bars (one group per gene set), writes the
    odds ratio and prettified p-value next to every point and hides the legend.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    dataset_type : value of the 'type' column to select; also used as panel title.
    text_pos : x position at which the annotation text is placed.

    Returns
    -------
    The ``(handles, labels)`` pair from ``ax.get_legend_handles_labels()``, so
    that a caller can build a legend elsewhere.
    """
    y_order = ['Childlessness', 'Any education', 'Higher education']
    group_order = ['All recessive genes', 'Recessive ID genes', 'Other recessive genes']
    colors = ['#006D5B', 'BlueViolet', '#F89F5B']

    selected_rows = plot_results_df[plot_results_df['type'] == dataset_type]
    printed_results_df = plot_errorbar_grouped(
        df=selected_rows,
        axis=ax,
        y_column='target',
        group_column='feature',
        title=dataset_type,
        legend_loc='center right',
        group_scale=0.2,
        y_order=y_order,
        group_order=group_order,
        ymargin=0.1,
        colors=colors,
    )

    # Side annotation: odds ratio and p-value near the 'y' position reported
    # for each row of the table returned by plot_errorbar_grouped
    annotations = zip(
        printed_results_df['p_value_pretty'],
        printed_results_df['odds_ratio'],
        printed_results_df['y'],
    )
    for p, oddsr, pos in annotations:
        ax.text(text_pos, pos+0.08, f'{oddsr:.2f}   {p}', va='center', size=MEDIUM_SIZE)

    configure_axis(ax, x_label="OR (99% CI)", y_label=None, ytick_size=SMALL_SIZE)

    # Hand the legend entries back to the caller and hide the in-panel legend
    legend_handles_labels = ax.get_legend_handles_labels()
    ax.get_legend().set_visible(False)

    return legend_handles_labels

In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with two side-by-side panels
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
axes = fig.subplots(1, 2)

# (score, x position of the annotation text) for the left and right panel
panel_settings = [('Cassa', 3), ('pLI', 1.2)]
for panel_ax, (score_name, annotation_x) in zip(axes, panel_settings):
    plot_6_ext(panel_ax, score_name, text_pos=annotation_x)

plt.savefig("../../../../data/plots/ext_figure_6.pdf", format="pdf", bbox_inches="tight")

# Extended figure 7

In [None]:
# Wide table: one row per panel, one column per cohort
cr_wide = pd.read_csv("../../../../data/tables/CR_panel.csv")

# Long format: one row per (panel, cohort) pair.
# The value column name is reused verbatim by the plotting cell, spelling included.
cr_df = cr_wide.melt(id_vars='Panel', value_vars=['UKB', 'Dutch', 'Estonian'])
cr_df = cr_df.rename(columns={'variable': 'Cohort', 'value': 'Consaguinity ratio'})

# The 'Severe genes' panel is not shown
cr_df = cr_df[cr_df['Panel'] != 'Severe genes']

cr_df.head(3)

In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 8 cm canvas with a single panel
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))
ax = fig.subplots(1, 1)

# One color per cohort
colors = ['#00008b', '#df0057', '#ffa600']

sns.barplot(data=cr_df, x='Panel', y='Consaguinity ratio', hue='Cohort', ax=ax, palette=colors, saturation=0.8)

# Print the bar value on top of every bar, one container per cohort
for cohort_bars in ax.containers:
    ax.bar_label(cohort_bars, size=SMALL_SIZE)

configure_axis(ax, x_label=None, y_label='Consanguinity ratio', ytick_size=SMALL_SIZE)

# Vertical panel names, no x-axis title
plt.xticks(rotation=90)
plt.xlabel(None)

legend_kwargs = dict(
    frameon=False,
    labelspacing=0.1,
    markerfirst=False,
    fontsize=MEDIUM_SIZE,
)

plt.legend(**legend_kwargs)

plt.savefig("../../../../data/plots/ext_figure_7.pdf", format="pdf", bbox_inches="tight")

# Extended figure 8

In [None]:
# Read the original gene panel; the file has no header row, so name the columns here
gene_panels_path = "../gene-panel-gencode-v34.txt"
gene_panels = pd.read_csv(gene_panels_path, header=None)
gene_panels.columns = ['gene_symbol', 'panel']

print("Number of genes:", len(gene_panels))

In [None]:
def s_het_bracket(s_het):
    """Return the label of the smallest bracket whose upper bound covers ``s_het``.

    Bounds are inclusive and checked in ascending order (0.003, 0.01, 0.03,
    0.1); a value matching none of them is labelled '>0.1'.
    """
    # (inclusive upper bound, label) pairs in ascending order
    brackets = (
        (0.003, '<=0.003'),
        (0.01, '<=0.01'),
        (0.03, '<=0.03'),
        (0.1, '<=0.1'),
    )
    for upper, label in brackets:
        if s_het <= upper:
            return label
    return '>0.1'

# Order in which the gene sets are displayed
show_order = [
    'Non-Recessive',
    'All recessive',
    'ID',
    'Metabolic',
    'Metabolic-ID',
    'Dermatologic',
    'Blindness',
    'Multi-system',
    'Neuromuscular',
    'Cilia + Kidney',
    'Immune system',
    'Endocrine',
    'Skeletal',
    'Hematologic',
    'Deafness'
]


# Per-gene s_het values (tab-separated)
s_het = pd.read_csv("../weghorn_drift_gencode-v34.txt", sep='\t')

# Attach the panel of each gene; the merge uses the columns the two tables
# share (presumably 'gene_symbol' - verify against the s_het file)
s_het = s_het.merge(gene_panels, how='left').rename(columns={'panel': 'Gene set'})

# Genes without a panel entry form the 'Non-Recessive' set
s_het['Gene set'] = s_het['Gene set'].fillna('Non-Recessive')

# Display names for panels whose raw name differs from the one shown
renaming_dict_gene_sets = {
    'Cilia+Kidney': 'Cilia + Kidney',
    'Derm': 'Dermatologic',
    'ID-total': 'ID',
    'Immune_system': 'Immune system',
    'Overlaps': 'Multi-system',
    'Skeletal+Craniofacial': 'Skeletal'
}

# Panels that are not shown
delete_list = ['No_panel', 'Tumor', 'Cardiovascular']

s_het['Gene set'] = s_het['Gene set'].apply(lambda x: renaming_dict_gene_sets.get(x, x))
s_het = s_het[~s_het['Gene set'].isin(delete_list)]

# Duplicate every gene that belongs to a panel into an aggregate 'All recessive' set
s_het_recessive = s_het[s_het['Gene set'] != 'Non-Recessive'].copy()
s_het_recessive['Gene set'] = 'All recessive'

s_het = pd.concat([s_het, s_het_recessive])

# Ordered categorical so that sorting follows show_order. The result is
# assigned back because the `inplace` argument of set_categories was removed
# in pandas 2.0; values missing from show_order become NaN.
s_het['Gene set'] = s_het['Gene set'].astype('category').cat.set_categories(show_order, ordered=True)

s_het = s_het.sort_values(by='Gene set')

In [None]:
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions

# 18 x 9 cm canvas with a single panel
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 9*cm*k))

ax = fig.subplots(1, 1)

# One jittered row of points per gene set, colored by gene set
sns.stripplot(data=s_het, x='s_het', y='Gene set', hue='Gene set', 
              jitter=0.2, edgecolor='black', size=1,
              marker="D", linewidth=0.2, alpha=.8, ax=ax)

# Dashed vertical reference line at s_het = 0.15
ax.axvline(x=0.15, linestyle='--', color='gray', linewidth=1)

configure_axis(ax, x_label='S-het', y_label=None)

# The legend would only repeat the y-axis categories. Depending on the seaborn
# version it may not have been created at all, in which case get_legend()
# returns None and there is nothing to remove.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

plt.savefig("../../../../data/plots/ext_figure_8.pdf", format="pdf", bbox_inches="tight")