In [None]:
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import sci_notation, get_plot_data, plot_errorbar_grouped
from ukbb_recessive.data_collection.variants import VariantFeatures


sns.set_style("whitegrid")

import matplotlib

from matplotlib import font_manager
import numpy as np
from matplotlib.cm import get_cmap

import glob

import matplotlib.ticker as ticker


In [None]:
# Add fonts
# Register every font file found under the project font directory with
# matplotlib's font manager, so the families they provide can be selected
# through rc settings in this session.
font_dirs = ['../../../../data/fonts']
# With explicit `fontpaths`, the search is done in these directories
# instead of the standard system locations.
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    # Echo each registered file so a missing font is easy to spot.
    print ("Added:", font_file)

In [None]:
# Font sizes (in points) shared by all panels of the figure.
SMALL_SIZE = 5
MEDIUM_SIZE = 6
BIGGER_SIZE = 7

# Default text size and font family for everything matplotlib draws.
plt.rc('font', size=BIGGER_SIZE, family='Arimo')          # controls default text sizes
# Per-element size overrides, currently disabled (sizes are passed explicitly
# by the plotting helpers instead):
# plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
# plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
# plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
# plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Render text with matplotlib's own engine rather than an external LaTeX install.
plt.rcParams['text.usetex']= False

# Font type 42 embeds fonts as TrueType in PDF/PS output (matplotlib's default is Type 3).
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [None]:

def configure_axis(ax, ytick_size=MEDIUM_SIZE, xtick_size=MEDIUM_SIZE, xlabel_size=MEDIUM_SIZE, ylabel_size=MEDIUM_SIZE, x_label=None, y_label=None):
    """Apply the shared panel styling to a matplotlib axis.

    Removes the grid, keeps only black left/bottom spines, sets tick label
    sizes and tick mark geometry, and (re)sets both axis labels.

    Parameters
    ----------
    ax : axis to style (modified in place).
    ytick_size, xtick_size : font sizes of the y / x tick labels.
    xlabel_size, ylabel_size : font sizes of the x / y axis labels.
    x_label, y_label : label texts handed to ``set_xlabel`` / ``set_ylabel``;
        they are always applied, also when left at the default ``None``.
    """
    # No background grid
    ax.grid(False)

    # Visible spines are drawn in black, the other two are hidden
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('black')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Tick label sizes (y first, then x)
    for axis_name, label_size in (('y', ytick_size), ('x', xtick_size)):
        ax.tick_params(axis=axis_name, labelsize=label_size)

    # Tick marks only on the visible spines
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    for which, width, length in (('major', 1.00, 2.5), ('minor', 0.75, 1.25)):
        ax.tick_params(which=which, width=width, length=length)

    # Axis labels
    ax.set_xlabel(x_label, size=xlabel_size)
    ax.set_ylabel(y_label, size=ylabel_size)

# Prepare datasets

## PLP statistics dataset

In [None]:
# samples of interest: European & non-related
# Path to the list of samples of interest (European & non-related). It is kept
# under its own name so `european_non_rel_samples` only ever refers to the
# set of sample identifiers, never to a path string.
european_non_rel_samples_path = "../450k/samples/european_non_related_no_withdrawal_to_include.txt"

# The file is read without a header row; column 0 holds the sample identifiers.
european_non_rel_samples = set(
    pd.read_csv(european_non_rel_samples_path, header=None)[0].values.tolist()
)
print("Number of samples:", len(european_non_rel_samples))

In [None]:
# Read the per-chromosome PLP files matched by the glob pattern.
sample_plps = VariantFeatures().read_sample_plps(
    glob.glob("../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/chr*")
)

# Keep the samples of interest only and drop rows whose genotype is '1/1'.
sample_plps = sample_plps[
    sample_plps['s'].isin(european_non_rel_samples) & (sample_plps['GT'] != '1/1')
]

# Count rows per sample, add the samples of interest that have no row at all
# (count 0), then count how many samples fall into each PLP count.
per_sample_plp_count = (
    sample_plps
    .groupby('s').count()[['chrom']]
    .reset_index()
    .rename(columns={'chrom': 'plp_cnt'})
    .merge(pd.DataFrame({'s': list(european_non_rel_samples)}), how='outer')
    .fillna(0)
    .groupby('plp_cnt').count()
    .reset_index()
    .astype({'plp_cnt': int})
    .sort_values(by='plp_cnt')
)

per_sample_plp_count.head(3)

## Childlessness regressions on different variant sets

In [None]:
# Result workbook for every variant set shown in the figure
results_files = {
    'PLPs in recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx', 
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_basic_regressions_on_[s_het_lof_without_AR].xlsx',
    'PLPs in recessive genes w/out LoF carriers': '../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all]_without_LoF_carriers.xlsx'
}

# Value of the first column that identifies the s_het effect row(s) in each workbook
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all', 
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR',
    'PLPs in recessive genes w/out LoF carriers': 's_het_recessive_all'
}

# Load data

data = {}
for key, path in results_files.items():
    # read raw table; the context manager closes the workbook handle as soon
    # as the sheet is parsed instead of leaving it open for the whole session
    with pd.ExcelFile(path) as reader:
        all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

    # drop the first column of the sheet
    all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

    # prettify p-values: one extra 'p_value_pretty' column per top-level column group
    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed to
    # DataFrame.map); switch once the minimum supported pandas version allows it.
    new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
    all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

    # leave s_het effects only
    data[key] = all_results_df[all_results_df[all_results_df.columns[0]] == s_het_effects[key]]

In [None]:
# Dictionary to rename target columns
renaming_dict_target = {
    'is_blond': 'Hair color', 
    'childlessness': 'Childlessness'
}

# List to store the final plot data
plot_data = []

# Loop over the different scores for genetic burden
for s_het_type in ['Weghorn', 'Cassa', 'pLI']:

    # One frame per (dataset, phenotype); for every dataset the childlessness
    # frame is appended first, followed by the is_blond frame
    phenotypic_data = []
    for key, df in data.items():
        for target in ('childlessness', 'is_blond'):
            phenotypic_data.append(get_plot_data(df=df[s_het_type], target=target, tag=key))

    # Combine phenotypic data for all datasets
    phenotypic_data = pd.concat(phenotypic_data)

    # Keep the rows with gender == 'all', order them by tag (descending) and
    # record which score they belong to
    phenotypic_all = (
        phenotypic_data[phenotypic_data['gender'] == 'all']
        .sort_values(by='tag', ascending=False)
        .assign(type=s_het_type)
    )
    plot_data.append(phenotypic_all)

# Combine all the data from different scores for genetic burden
plot_data = pd.concat(plot_data)

# Rename the 'target' column values using the renaming_dict_target dictionary
# (values without an entry are kept as they are)
plot_data['target'] = plot_data['target'].apply(lambda x: renaming_dict_target.get(x, x))


## Covariate regressions dataset

In [None]:
# read raw table; the context manager closes the workbook handle as soon as
# the sheet is parsed instead of leaving it open for the whole session
with pd.ExcelFile('../../../../data/tables/table_basic_regressions_with_covariates.xlsx') as reader:
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column of the sheet
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: one extra 'p_value_pretty' column per top-level column group
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed to
# DataFrame.map); switch once the minimum supported pandas version allows it.
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
all_results_df[new_columns] = all_results_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

# leave s_het effects only; na=False treats missing names as "no match" so the
# boolean mask never contains NaN (which pandas refuses to index with)
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het', na=False)]

In [None]:
# Display names for the covariates; 'basic' marks the analysis without covariates
renaming_covariates = {
    'diagnosis_main_ICD10_cnt': '# main diagnoses',
    'diagnosis_secondary_ICD10_cnt': '# secondary diagnoses',
    'diagnosis_total_ICD10_cnt': '# total diagnoses',
    'ICD_infertility': 'Infertility',
    'basic': 'No covariates'
}


covariate_plot_data = []

# Only the Weghorn score is used here; the loop keeps the cell easy to extend
for s_het_type in ['Weghorn']:

    childlessness_all = get_plot_data(df=all_results_df[s_het_type], target='childlessness', tag=s_het_type)

    covariate_plot_data.append(childlessness_all)

covariate_plot_data = pd.concat(covariate_plot_data)

# Covariate name = text after the last comma of the 'analysis' string, minus
# its final character and surrounding whitespace; strings without a comma are
# the analysis without covariates ('basic')
covariate_plot_data['covariate'] = covariate_plot_data['analysis'].apply(lambda x: x.split(',')[-1][:-1].strip() if ',' in x else 'basic')

# Replace known covariate names by their display names, keep the others as they are
covariate_plot_data['covariate'] = covariate_plot_data['covariate'].apply(lambda x: renaming_covariates.get(x, x))

covariate_plot_data.head(3)

## P-values data

In [None]:
# NOTE: this cell rebinds `results_files` and `s_het_effects`, which an
# earlier cell defines with a different (three-entry) content.
results_files = {
    'Singleton LoFs in non-recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_lof_without_AR].xlsx', 
    'PLPs in recessive genes': '../../../../data/tables/table_reduced_samples_regressions_on_[s_het_recessive_all].xlsx'
}

# Value of the first column that identifies the s_het effect row(s) in each workbook
s_het_effects = {
    'PLPs in recessive genes': 's_het_recessive_all', 
    'Singleton LoFs in non-recessive genes': 's_het_lof_without_AR'
}

# Load data

p_values_data = {}
for key, path in results_files.items():
    # read raw table; the context manager closes the workbook handle as soon
    # as the sheet is parsed instead of leaving it open for the whole session
    with pd.ExcelFile(path) as reader:
        p_values_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

    # drop the first column of the sheet
    p_values_df = p_values_df.drop(p_values_df.columns[0], axis=1)

    # prettify p-values: one extra 'p_value_pretty' column per top-level column group
    # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (renamed to
    # DataFrame.map); switch once the minimum supported pandas version allows it.
    new_columns = [(level0, 'p_value_pretty') for level0 in p_values_df.columns.get_level_values(level=0).unique()]
    p_values_df[new_columns] = p_values_df.loc[:, (slice(None), 'p_value')].applymap(sci_notation)

    # parse fraction: the number after the last '=' of every 'analysis' string
    new_columns = [(level0, 'fraction') for level0 in p_values_df.columns.get_level_values(level=0).unique()]
    p_values_df[new_columns] = p_values_df.loc[:, (slice(None), 'analysis')].applymap(lambda x: float(x.split('=')[-1]))

    # leave s_het effects only
    p_values_data[key] = p_values_df[p_values_df[p_values_df.columns[0]] == s_het_effects[key]]

# Plot

In [None]:
def format_thousands(x):
    """Return ``x`` as text rounded to zero decimals, with ',' as thousands separator."""
    return format(x, ',.0f')


def plot_1a(ax):
    """Draw the bar chart of `per_sample_plp_count` (x: 'plp_cnt', y: 's') on `ax`.

    Every bar is annotated with its value, formatted with a thousands
    separator, and the axis gets the shared panel styling.
    """
    sns.barplot(data=per_sample_plp_count, x='plp_cnt', y='s', ax=ax, color='#61D4D4', edgecolor='k', linewidth=0.5)

    # The bars drawn by the call above are the most recently added container
    bars = ax.containers[-1]

    # add the annotation
    # Format the y-data using the format_thousands function
    labels = [format_thousands(val) for val in bars.datavalues]
    ax.bar_label(bars, label_type='edge', labels=labels, rotation=90, padding=3, fontsize=SMALL_SIZE)

    configure_axis(ax, x_label='Number of PLPs', y_label='Samples')


def plot_1b(ax):
    """Draw the grouped error-bar panel for the 'Weghorn' rows of `plot_data` on `ax`.

    Rows are the phenotypes ('target'), groups are the variant sets ('tag').
    The odds ratio and the formatted p-value of every plotted point are
    written as text next to the panel.
    """
    # Row order and order of the groups within each row
    target_order = ['Childlessness', 'Hair color']
    tag_order = [
        'PLPs in recessive genes',
        'PLPs in recessive genes w/out LoF carriers',
        'Singleton LoFs in non-recessive genes',
    ]
    palette = ['BlueViolet', '#006D5B', '#F89F5B']

    # Legend options handed to plot_errorbar_grouped
    legend_options = dict(
        bbox_to_anchor=(0.0, 1., 1, .102),
        frameon=False,
        mode="expand",
        ncol=1,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=MEDIUM_SIZE,
    )

    weghorn_rows = plot_data[plot_data['type'] == 'Weghorn']
    plotted_data = plot_errorbar_grouped(
        df=weghorn_rows,
        axis=ax,
        y_column='target',
        group_column='tag',
        title='',
        ymargin=0.12,
        legend_loc='upper left',
        legend_kwargs=legend_options,
        group_scale=0.03,
        y_scale=0.1,
        colors=palette,
        y_order=target_order,
        group_order=tag_order,
    )

    configure_axis(ax, x_label='OR (99% CI)', y_label=None)

    # Sidebar: odds ratio and p-value of every point, written at x = 2.5 and
    # slightly above the point's y position
    sidebar_rows = zip(plotted_data['p_value_pretty'], plotted_data['odds_ratio'], plotted_data['y'])
    for p_text, odds_ratio, y_pos in sidebar_rows:
        ax.text(2.5, y_pos+0.01, f'{odds_ratio:.2f}   {p_text}', va='center', size=MEDIUM_SIZE)


def plot_1c(ax):
    """Plot the 'Weghorn' odds ratio against the 'fraction' column, one line per variant set.

    One line is drawn for each entry of `p_values_data` (in sorted tag order),
    with a 99% percentile interval as error band.
    """
    # Take the tags from the data that is actually plotted. The notebook-level
    # `s_het_effects` is defined in two cells with different keys, so relying
    # on it made this panel depend on which of those cells was run last.
    data_tags = sorted(p_values_data.keys())

    colors=['BlueViolet', '#F89F5B']

    for tag, color in zip(data_tags, colors):
        sns.lineplot(data=p_values_data[tag]['Weghorn'], x='fraction', y='odds_ratio', marker="D", color=color,
                    ax=ax, errorbar=('pi', 99), linewidth=1, markersize=2, label=tag)

    ax.set_ylim([1,3])

    configure_axis(ax, x_label='Cohort size', y_label='Childlessness\nOR')

    # Legend placed above the axis, spread over its width in two columns
    legend_kwargs = {
        "bbox_to_anchor": (-0.1, 1.2, 1.1, .102), 
        "frameon": False,
        "mode": "expand", 
        "ncol": 2, 
        "labelspacing": 0.1, 
        "markerfirst": False, 
        'fontsize': MEDIUM_SIZE
    }
    ax.legend(**legend_kwargs)


def plot_1d(ax):
    """Plot the 'Weghorn' p-value against the 'fraction' column for PLPs in recessive genes.

    The y axis is logarithmic and a dotted gray reference line is drawn at
    0.005 (presumably the significance threshold - confirm).
    """
    recessive_tag = 'PLPs in recessive genes'

    # Reference line, drawn before the data
    ax.axhline(y=0.005, color='Gray', linestyle='dotted', linewidth=1, alpha=0.5)

    # Line with a 99% percentile interval as error band
    line_options = dict(marker="D", color='BlueViolet', errorbar=('pi', 99), linewidth=1, markersize=2)
    sns.lineplot(data=p_values_data[recessive_tag]['Weghorn'], x='fraction', y='p_value', ax=ax, **line_options)

    ax.set_yscale('log')

    configure_axis(ax, x_label='Cohort size', y_label='P-value')

In [None]:
import matplotlib.ticker as ticker


# Assemble figure 1: two sub-figures side by side. The left one stacks the
# bar chart over the two cohort-size panels, the right one holds the
# error-bar panel.
cm = 1/2.54  # centimeters in inches
k = 1.  # scale factor applied to both figure dimensions
fig = plt.figure(constrained_layout=True, figsize=(18*cm*k, 8*cm*k))

subfigs = fig.subfigures(nrows=1, ncols=2, width_ratios=[1,1], wspace=0.03)

# Left plots
subfigs_left = subfigs[0].subfigures(nrows=2, ncols=1, height_ratios=[1.2, 2], hspace=0.07)

ax_1 = subfigs_left[0].subplots(1, 1)

plot_1a(ax_1)

ax_2, ax_3 = subfigs_left[1].subplots(1, 2, gridspec_kw={'width_ratios': [2.5, 1]})

plot_1c(ax_2)
plot_1d(ax_3)

# Right plot
ax = subfigs[1].subplots(1, 1)

plot_1b(ax)

# Save through the figure object so the output does not depend on pyplot's
# notion of the "current" figure
fig.savefig("../../../../data/plots/figure_1.pdf", format="pdf", bbox_inches="tight")

plt.show()