In [None]:
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import sci_notation, get_plot_data, plot_errorbar_grouped

sns.set_style("whitegrid")

import matplotlib

from matplotlib import font_manager
import numpy as np
from matplotlib.cm import get_cmap

In [None]:
# Register every font file found under the project font directory with
# matplotlib's font manager, echoing each path as it is added.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_path in font_files:
    font_manager.fontManager.addfont(font_path)
    print("Added:", font_path)

In [None]:
# Result workbook for each analysis, keyed by the label used for it in the plots.
results_files = {
    'PLPs in recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx',
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_lof_without_AR].xlsx',
    'PLPs in recessive genes w/out LoF carriers': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all]_without_LoF_carriers.xlsx'
}

# Value of the first column that identifies the rows to keep for each analysis.
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all',
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR',
    'PLPs in recessive genes w/out LoF carriers': 's_het_recessive_all'
}

# Load data

data = {}
for key, path in results_files.items():
    # Read the raw table. The context manager closes the workbook as soon as
    # the sheet has been parsed; a bare pd.ExcelFile(path) would leave the
    # file handle open for every workbook in the loop.
    with pd.ExcelFile(path) as reader:
        all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

    # the first column is dropped before any further processing
    all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

    # prettify p-values: add a 'p_value_pretty' column under every top-level
    # column group, formatted with sci_notation
    new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
    all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

    # leave s_het effects only
    data[key] = all_results_df[all_results_df[all_results_df.columns[0]] == s_het_effects[key]]

In [None]:
SMALL_SIZE = 16
MEDIUM_SIZE = 22
BIGGER_SIZE = 24

# All matplotlib defaults for this notebook, applied in one go.
plt.rcParams.update({
    'font.size': SMALL_SIZE,           # default text size
    'font.family': 'Arimo',
    'axes.titlesize': BIGGER_SIZE,     # axes title
    'axes.labelsize': BIGGER_SIZE,     # x and y labels
    'xtick.labelsize': MEDIUM_SIZE,    # tick labels
    'ytick.labelsize': MEDIUM_SIZE,    # tick labels
    'legend.fontsize': SMALL_SIZE,     # legend
    'figure.titlesize': BIGGER_SIZE,   # figure title
    'text.usetex': False,
    # font type 42 embeds fonts as TrueType in PDF/PS output
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Figure 1A

In [None]:
# For every s_het estimate, gather the childlessness rows from each loaded
# analysis and keep those whose gender is 'all'.
plot_data = []

for s_het_type in ['Weghorn', 'Cassa', 'pLI']:

    per_analysis = pd.concat([
        get_plot_data(df=df[s_het_type], target='childlessness', tag=key)
        for key, df in data.items()
    ])

    childlessness_all = (
        per_analysis[per_analysis['gender'] == 'all']
        .sort_values(by='tag', ascending=False)
        .assign(type=s_het_type)
    )

    plot_data.append(childlessness_all)

plot_data = pd.concat(plot_data)

In [None]:
# Orders handed to plot_errorbar_grouped as y_order / group_order.
y_order = ['Weghorn', 'Cassa', 'pLI']
group_order = [
    'PLPs in recessive genes',
    'PLPs in recessive genes w/out LoF carriers',
    'Singleton LoFs in non-recessive genes',
]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 5))

errorbar_options = dict(
    y_column='type',
    group_column='tag',
    title='Association with childlessness',
    text_margin_ratio=0.53,
    ymargin=0.2,
    legend_loc='lower right',
    group_scale=0.25,
    y_order=y_order,
    group_order=group_order,
)
plotted_data = plot_errorbar_grouped(df=plot_data, axis=ax, **errorbar_options)

plt.savefig("../../../../data/plots/figure_1a.pdf", format="pdf", bbox_inches="tight")

# table of the formatted p-values, shown as the cell output
plotted_data[['type', 'tag', 'p_value_pretty']]

# Figure 3A

In [None]:
# generate plot data
df = data['PLPs in recessive genes']

plot_data = []

for s_het_type in ['Weghorn', 'Cassa', 'pLI']:

    edu_any = get_plot_data(df=df[s_het_type], target='any_education_including_none', tag='Any education')
    edu_higher = get_plot_data(df=df[s_het_type], target='higher_education_including_none', tag='Higher education')

    edu = pd.concat([edu_any, edu_higher])

    # Explicit copy: adding a column to a boolean-filtered slice would
    # otherwise trigger pandas' SettingWithCopyWarning.
    edu_all = edu[edu['gender'] == 'all'].copy()
    edu_all['type'] = s_het_type

    plot_data += [edu_all]

plot_data = pd.concat(plot_data)

# plot
group_order = ['Weghorn', 'Cassa', 'pLI']

fig, ax = plt.subplots(1, 1, figsize=(12, 4))

# Use the group_order defined in this cell rather than `y_order`, which only
# exists if the Figure 1A cell has been run first (the values are the same).
plot_errorbar_grouped(df=plot_data, axis=ax, y_column='tag', group_column='type', title='Association with education',
                      text_margin_ratio=0.53, ymargin=0.2, legend_loc='upper left', group_scale=0.2, group_order=group_order,
                      colors=get_cmap("Accent").colors[::-1])

plt.savefig("../../../../data/plots/figure_3a.pdf", format="pdf", bbox_inches="tight")

# table of the formatted p-values, shown as the cell output
plot_data.sort_values(by='target')[['target', 'type', 'p_value_pretty']]

# S-het distribution per gene

In [None]:
# Original gene panel: the file is read without a header row and its two
# columns are named here.
gene_panels = pd.read_csv(".../gene-panel.txt", header=None)
gene_panels.columns = ['gene_symbol', 'panel']

n_genes = len(gene_panels)
print("Number of genes:", n_genes)

In [None]:
# Gene-set labels in the order used for the ordered 'Gene set' category.
show_order = [
    'Non-Recessive',
    'All recessive',
    'ID',
    'Metabolic',
    'Metabolic-ID',
    'Dermatologic',
    'Blindness',
    'Multi-system',
    'Neuromuscular',
    'Cilia + Kidney',
    'Immune system',
    'Endocrine',
    'Skeletal',
    'Hematologic',
    'Deafness',
]

def s_het_bracket(s_het):
    """Return the label of the first bracket whose upper bound is >= `s_het`.

    Brackets are checked in increasing order (0.003, 0.01, 0.03, 0.1);
    any value not covered by them is labelled '>0.1'.
    """
    upper_bounds = (
        (0.003, '<=0.003'),
        (0.01, '<=0.01'),
        (0.03, '<=0.03'),
        (0.1, '<=0.1'),
    )
    for bound, label in upper_bounds:
        if s_het <= bound:
            return label
    return '>0.1'

# Per-gene s_het table (tab-separated).
s_het = pd.read_csv("/ifs/data/research/projects/UKB/hila/450k/selection_weghorn/weghorn_drift_gencode-v34.txt", sep='\t')

# Left-join the gene panels (on the columns the two frames share) so every
# gene is kept; genes without a panel are labelled 'Non-Recessive'.
s_het = s_het.merge(gene_panels, how='left').rename(columns={'panel': 'Gene set'})

s_het['Gene set'] = s_het['Gene set'].fillna('Non-Recessive')

# Raw panel names -> labels shown in the figure.
renaming_dict_gene_sets = {
    'Cilia+Kidney': 'Cilia + Kidney',
    'Derm': 'Dermatologic',
    'ID-total': 'ID',
    'Immune_system': 'Immune system',
    'Overlaps': 'Multi-system',
    'Skeletal+Craniofacial': 'Skeletal'
}

# Panels removed from the table entirely.
delete_list = ['No_panel', 'Tumor', 'Cardiovascular']

s_het['Gene set'] = s_het['Gene set'].apply(lambda x: renaming_dict_gene_sets.get(x, x))
s_het = s_het[~s_het['Gene set'].isin(delete_list)]

# Duplicate every gene that has a panel under a pooled 'All recessive' set.
s_het_recessive = s_het[s_het['Gene set'] != 'Non-Recessive'].copy()
s_het_recessive['Gene set'] = 'All recessive'

s_het = pd.concat([s_het, s_het_recessive])

# Ordered categorical so that sorting follows show_order. The result is
# assigned back because the `inplace=True` form of set_categories was
# deprecated in pandas 1.3 and removed in 2.0.
s_het['Gene set'] = s_het['Gene set'].astype('category').cat.set_categories(show_order, ordered=True)

s_het = s_het.sort_values(by='Gene set')

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# One diamond marker per row of s_het, jittered within its gene-set row.
sns.stripplot(
    data=s_het, x='s_het', y='Gene set', hue='Gene set',
    jitter=0.2, edgecolor='black',
    marker="D", linewidth=1, alpha=.8,
    ax=ax,
)

# dashed vertical reference line at s_het = 0.15
ax.axvline(x=0.15, linestyle='--', color='gray', linewidth=1.2)

ax.set_ylabel('Gene set')
ax.set_xlabel('S-het')

plt.savefig("../../../../data/plots/supp_figure_5.pdf", format="pdf", bbox_inches="tight")


In [None]:
# fig = plt.figure(figsize=(12, 8))
# # , element='step', fill=False, log_scale=True, stat='percent', bins=5, cumulative=False, common_norm=False,

# data = sns.histplot(data=s_het, x='s_het', y='panel', hue='panel', log_scale=(False, False), bins=30, 
#              legend=False, common_norm=False, cbar=False, stat='percent')

# plt.show()

# P-values plot

In [None]:
# Result workbook for each analysis, keyed by the label used for it in the plots.
results_files = {
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_lof_without_AR].xlsx',
    'PLPs in recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_recessive_all].xlsx'
}

# Value of the first column that identifies the rows to keep for each analysis.
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all',
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR'
}

# Load data

data = {}
for key, path in results_files.items():
    # Read the raw table. The context manager closes the workbook as soon as
    # the sheet has been parsed; a bare pd.ExcelFile(path) would leave the
    # file handle open for every workbook in the loop.
    with pd.ExcelFile(path) as reader:
        all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

    # the first column is dropped before any further processing
    all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

    # prettify p-values: add a 'p_value_pretty' column under every top-level
    # column group, formatted with sci_notation
    new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
    all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

    # parse fraction: the number after the last '=' in each 'analysis' string
    new_columns = [(level0, 'fraction') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
    all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'analysis')].applymap(lambda x: float(x.split('=')[-1]))

    # leave s_het effects only
    data[key] = all_results_df[all_results_df[all_results_df.columns[0]] == s_het_effects[key]]

In [None]:
fig = plt.figure(constrained_layout=True, figsize=(20, 12))

data_tags = sorted(s_het_effects.keys())

# One sub-figure (row) per analysis, each with two panels side by side:
# odds ratio on the left, p-value on the right.
subfigs = fig.subfigures(nrows=2, ncols=1)

for subfig, tag in zip(subfigs, data_tags):
    subfig.suptitle(tag)

    weghorn_results = data[tag]['Weghorn']

    ax = subfig.subplots(1, 2)
    odds_ratio_ax, p_value_ax = ax

    sns.lineplot(data=weghorn_results, x='fraction', y='odds_ratio', marker="D", ax=odds_ratio_ax)
    odds_ratio_ax.set_ylabel('Odds ratio')
    odds_ratio_ax.set_xlabel('Cohort size fraction')

    sns.lineplot(data=weghorn_results, x='fraction', y='p_value', marker="D", ax=p_value_ax)
    p_value_ax.set_ylabel('P-value')
    p_value_ax.set_xlabel('Cohort size fraction')

    # dashed horizontal reference line at p = 0.005, on a log-scaled axis
    p_value_ax.axhline(y=0.005, color='salmon', linestyle='--')
    p_value_ax.set_yscale('log')


plt.savefig("../../../../data/plots/supp_figure_4.pdf", format="pdf", bbox_inches="tight")

# Covariates

A

In [None]:
# Read the raw table. The context manager closes the workbook as soon as the
# sheet has been parsed; a bare pd.ExcelFile(...) would leave the file handle open.
with pd.ExcelFile('../../../../data/tables/table_basic_regressions_with_covariates.xlsx') as reader:
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# the first column is dropped before any further processing
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: add a 'p_value_pretty' column under every top-level
# column group, formatted with sci_notation
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

# leave s_het effects only: rows whose first column contains 's_het'
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het')]

In [None]:
# Covariate identifiers -> labels shown in the figure.
renaming_covariates = {
    'diagnosis_main_ICD10_cnt': 'With number of main ICD-10 diagnoses',
    'diagnosis_secondary_ICD10_cnt': 'With number of secondary ICD-10 diagnoses',
    'diagnosis_total_ICD10_cnt': 'With total number of ICD-10 diagnoses',
    'ICD_infertility': 'With infertility status',
    'basic': 'Original regression',
}

y_order = ['Weghorn', 'Cassa', 'pLI']

group_order = [
    'Original regression',
    'With infertility status',
    'With number of main ICD-10 diagnoses',
    'With number of secondary ICD-10 diagnoses',
    'With total number of ICD-10 diagnoses',
]


def _covariate_label(analysis):
    """Map an 'analysis' string to the covariate label used in the figure."""
    if ',' in analysis:
        # text after the last comma, minus its final character
        # (presumably a closing bracket -- confirm against the table)
        covariate = analysis.split(',')[-1][:-1].strip()
    else:
        covariate = 'basic'
    return renaming_covariates.get(covariate, covariate)


# childlessness rows for every s_het estimate, tagged with the estimate name
plot_data = pd.concat([
    get_plot_data(df=all_results_df[s_het_type], target='childlessness', tag=s_het_type)
    for s_het_type in ['Weghorn', 'Cassa', 'pLI']
])

plot_data['covariate'] = plot_data['analysis'].apply(_covariate_label)

# plot

fig, ax = plt.subplots(1, 1, figsize=(15, 8))

plot_errorbar_grouped(
    df=plot_data, axis=ax, y_column='tag', group_column='covariate',
    title='Association with childlessness',
    text_margin_ratio=0.53, ymargin=0.1, legend_loc='lower right', group_scale=0.15,
    y_order=y_order, group_order=group_order,
)

plt.savefig("../../../../data/plots/supp_figure_9.pdf", format="pdf", bbox_inches="tight")

# NOTE(review): if both expressions below live in the same cell, only the
# second one is displayed and the sorted table is discarded -- confirm intent.
plot_data.sort_values(by='tag')[['tag', 'covariate', 'p_value_pretty']]

plot_data[['tag', 'covariate', 'p_value_pretty']]

# Regressions plot

In [None]:
# Directory holding the per-estimate datasets; the Weghorn file is tab-separated.
datasets_path = '.../datasets'

weghorn_dataset_file = f"{datasets_path}/Weghorn-drift.csv"
dataset = pd.read_csv(weghorn_dataset_file, sep='\t')

def s_het_bin(s_het):
    """Assign an s_het burden value to one of three bins.

    Returns "0" for values <= 0, "0-0.1" for values in (0, 0.1],
    '> 0.1' for larger values, and None when the value is missing
    (None or NaN).
    """
    # NaN is not None and compares False against every threshold, so without
    # this guard a missing value would fall through to the '> 0.1' bin.
    if pd.isna(s_het):
        return None
    if s_het <= 0:
        return "0"
    elif s_het <= 0.1:
        return "0-0.1"
    else:
        return '> 0.1'

In [None]:
# (burden column, bin column) pairs; each burden column is binned with
# s_het_bin and rows where the burden is null get no bin.
burden_bin_columns = [
    ('s_het_recessive_ID-total', 's_het_recessive_ID_total_bin'),
    ('s_het_recessive_AR_without_ID', 's_het_recessive_AR_without_ID_bin'),
]

for burden_column, bin_column in burden_bin_columns:
    dataset[bin_column] = dataset[burden_column].apply(s_het_bin)
    dataset.loc[dataset[burden_column].isnull(), bin_column] = None

In [None]:
def plot_comparison_regressions(dataset, target, ax, s_hets=None, tags=None):
    """Draw a seaborn point plot of `target` against several binned columns.

    Parameters
    ----------
    dataset : DataFrame containing `target` and every column named in `s_hets`.
    target : name of the column plotted on the y-axis.
    ax : matplotlib axes to draw on.
    s_hets : column names plotted on the x-axis, one series per column.
    tags : legend label for each entry of `s_hets`, in the same order.

    Raises
    ------
    ValueError
        If `s_hets` is empty, or `s_hets` and `tags` differ in length.
    """
    # None defaults instead of shared mutable lists.
    s_hets = [] if s_hets is None else list(s_hets)
    tags = [] if tags is None else list(tags)

    if not s_hets:
        raise ValueError("s_hets must contain at least one column name")
    # zip() would silently drop the unmatched series otherwise
    if len(s_hets) != len(tags):
        raise ValueError(
            f"s_hets and tags must have the same length, got {len(s_hets)} and {len(tags)}"
        )

    # Stack the columns into long format: one frame per column, renamed to a
    # common 's_het' column and labelled with its tag.
    frames = []
    for s_het, tag in zip(s_hets, tags):
        d1 = dataset[[target, s_het]].copy()
        d1.columns = [target, 's_het']
        d1['tag'] = tag
        frames.append(d1)

    plot_data = pd.concat(frames)

    plot_data = plot_data.sort_values(by='s_het')
    sns.pointplot(data=plot_data, y=target, x='s_het', hue='tag', palette="Accent",
                  dodge=True, markers='s', ax=ax, linestyles='--')

    # legend without a title
    ax.legend().set_title('')

In [None]:
fig = plt.figure(constrained_layout=True, figsize=(20, 6))

subfigs = fig.subfigures(nrows=1, ncols=2)

# Both panels compare the same two binned burden columns.
burden_columns = ['s_het_recessive_ID_total_bin', 's_het_recessive_AR_without_ID_bin']
burden_tags = ['Recessive ID genes', 'Other recessive genes']

# (target column, y-axis label) for the left and right panel.
panels = [
    ('childlessness', "Proportion of \n childless individuals"),
    ('any_education_including_none', "Proportion of individuals \n having any education"),
]

for subfig, (target, y_label) in zip(subfigs, panels):
    ax = subfig.subplots(1, 1)

    plot_comparison_regressions(dataset=dataset, target=target,
                                ax=ax,
                                s_hets=burden_columns,
                                tags=burden_tags)

    ax.set_ylabel(y_label)
    ax.set_xlabel("Genetic burden")


plt.savefig("../../../../data/plots/supp_figure_10.pdf", format="pdf", bbox_inches="tight")