In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager
from matplotlib.cm import get_cmap

from ukbb_recessive.regression.regressions import sci_notation, plot_errorbar_grouped, plot_errorbar_grouped_transposed

sns.set_style("whitegrid")

In [None]:
# Register every font file found under the project's data/fonts directory
# with matplotlib, so the family requested in the rc settings can be resolved.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    # echo each registered path so an empty or missing font directory is easy to spot
    print("Added:", font_file)

In [None]:
# Font sizes shared by every figure in this notebook
SMALL_SIZE = 16
MEDIUM_SIZE = 22
BIGGER_SIZE = 24

# All matplotlib defaults are set in one place; plt.rcParams and
# matplotlib.rcParams are the same object, so a single update covers both.
plt.rcParams.update({
    'font.size': SMALL_SIZE,          # default text size
    'font.family': 'Arimo',
    'axes.titlesize': BIGGER_SIZE,    # axes title
    'axes.labelsize': BIGGER_SIZE,    # x and y axis labels
    'xtick.labelsize': MEDIUM_SIZE,   # x tick labels
    'ytick.labelsize': MEDIUM_SIZE,   # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
    'text.usetex': False,
    # Type 42 (TrueType) font embedding in PDF/PS output
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Figure 3B

In [None]:
# Read the raw results of the basic regressions on s_het_recessive_all.
# The workbook is opened in a context manager so its file handle is closed as
# soon as the sheet is parsed (a bare pd.ExcelFile(...) would keep it open).
with pd.ExcelFile('../../../../data/tables/table_basic_regressions_on_[s_het_recessive_all].xlsx') as reader:
    # two header rows -> two-level column index; sheet row 2 (0-based) is skipped
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column of the sheet
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: add one 'p_value_pretty' column per top-level column group
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
# element-wise formatting via Series.map; replaces DataFrame.applymap, which is
# deprecated since pandas 2.1
all_results_df[new_columns] = (
    all_results_df.loc[:, (slice(None), 'p_value')]
    .apply(lambda column: column.map(sci_notation))
)

# leave s_het effects only (matched on the first remaining column)
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het')]

# keep an independent copy of the 'Weghorn' column group; the next cell
# appends it to the panel results
all_results_df_recessive = all_results_df['Weghorn'].copy()

In [None]:
# Read the raw results of the regressions on the ID / non-ID recessive panels.
# The workbook is opened in a context manager so its file handle is closed as
# soon as the sheet is parsed (a bare pd.ExcelFile(...) would keep it open).
with pd.ExcelFile('../../../../data/tables/table_panel_regressions_on_[s_het_recessive_AR_without_ID,_s_het_recessive_ID_total].xlsx') as reader:
    # two header rows -> two-level column index; sheet row 2 (0-based) is skipped
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column of the sheet
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: add one 'p_value_pretty' column per top-level column group
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
# element-wise formatting via Series.map; replaces DataFrame.applymap, which is
# deprecated since pandas 2.1
all_results_df[new_columns] = (
    all_results_df.loc[:, (slice(None), 'p_value')]
    .apply(lambda column: column.map(sci_notation))
)

# leave s_het effects only (matched on the first remaining column)
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het')]

# stack the panel results ('Weghorn' column group) on top of the
# all-recessive-genes results loaded in the previous cell
all_results_df = pd.concat([all_results_df['Weghorn'], all_results_df_recessive])

In [None]:
# Raw target name -> label shown in the figure
renaming_dict_target = {
    'any_education_including_none': "Any education",
    'is_blond': 'Hair color',
    'childlessness': 'Childlessness'
}

# Raw feature name -> gene-set label shown in the figure
renaming_dict_panel = {
    's_het_recessive_AR_without_ID': "Other recessive genes",
    's_het_recessive_ID_total': 'Recessive ID genes',
    's_het_recessive_all': "All recessive genes"
}

# Keep only the plotted targets and the rows with gender == 'all'. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
# NOTE: this cell overwrites all_results_df and renames the raw target values,
# so re-run the loading cells before re-running it.
all_results_df = all_results_df.loc[
    all_results_df['target'].isin(list(renaming_dict_target))
    & (all_results_df['gender'] == 'all')
].copy()

# Turn the interval bounds into distances from the point estimate
# (presumably the error-bar format plot_errorbar_grouped expects; confirm there).
all_results_df['odds_ratio_lower'] = all_results_df['odds_ratio'] - all_results_df['odds_ratio_lower']
all_results_df['odds_ratio_upper'] = all_results_df['odds_ratio_upper'] - all_results_df['odds_ratio']

# Human-readable labels; names missing from the dictionaries are left as they are
all_results_df['target'] = all_results_df['target'].map(lambda name: renaming_dict_target.get(name, name))
all_results_df['feature'] = all_results_df['feature'].map(lambda name: renaming_dict_panel.get(name, name))

all_results_df = all_results_df.sort_values(by='feature')



In [None]:
# Figure 3B: grouped error-bar plot for the three targets, one group per gene set
fig, ax = plt.subplots(1, 1, figsize=(12, 5))

y_order = ['Any education', 'Childlessness', 'Hair color']
group_order = ['All recessive genes', 'Recessive ID genes', 'Other recessive genes']

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9
colors = plt.get_cmap("Dark2").colors
colors = [plt.get_cmap("Accent").colors[0], colors[2], colors[5]]

printed_results_df = plot_errorbar_grouped(df=all_results_df, axis=ax, y_column='target', group_column = 'feature',  title='Association for ID and other recessive genes', 
                      text_margin_ratio=0.53, legend_loc='upper right', 
                      group_scale=0.25, y_order=y_order, group_order=group_order, ymargin=0.2, 
                      colors=colors)

# save through the figure handle rather than the implicit "current figure"
fig.savefig("../../../../data/plots/figure_3b.pdf", format="pdf", bbox_inches="tight")

# show the formatted p-values of the rows returned by the plotting helper
printed_results_df[['target', 'feature', 'p_value_pretty']]

# Panel figure

In [None]:
# Read the raw results of the regressions on the per-disorder s_het panels.
# The workbook is opened in a context manager so its file handle is closed as
# soon as the sheet is parsed (a bare pd.ExcelFile(...) would keep it open).
with pd.ExcelFile('../../../../data/tables/table_panel_regressions_on_s_het_panels.xlsx') as reader:
    # two header rows -> two-level column index; sheet row 2 (0-based) is skipped
    all_results_df = pd.read_excel(reader, sheet_name="Raw data", header=[0, 1], skiprows=[2])

# drop the first column of the sheet
all_results_df = all_results_df.drop(all_results_df.columns[0], axis=1)

# prettify p-values: add one 'p_value_pretty' column per top-level column group
new_columns = [(level0, 'p_value_pretty') for level0 in all_results_df.columns.get_level_values(level=0).unique()]
# element-wise formatting via Series.map; replaces DataFrame.applymap, which is
# deprecated since pandas 2.1
all_results_df[new_columns] = (
    all_results_df.loc[:, (slice(None), 'p_value')]
    .apply(lambda column: column.map(sci_notation))
)

# leave s_het effects only (matched on the first remaining column)
all_results_df = all_results_df[all_results_df[all_results_df.columns[0]].str.contains('s_het')]

# keep an independent copy of the 'Weghorn' column group, as done for the
# all-recessive-genes table, so later column assignments act on their own frame
all_results_df = all_results_df['Weghorn'].copy()

In [None]:
# Raw target name -> label shown in the figure
renaming_dict_target = {
    'any_education_including_none': "Any education",
    'is_blond': 'Hair color',
    'childlessness': 'Childlessness'
}

# Raw feature name -> disorder-group label shown in the figure
renaming_dict_panel = {
    's_het_recessive_Blindness': 'Blindness',
    's_het_recessive_Cilia_Kidney': 'Cilia + Kidney',
    's_het_recessive_Deafness': 'Deafness',
    's_het_recessive_Derm': 'Dermatologic',
    's_het_recessive_Endocrine': 'Endocrine',
    's_het_recessive_Hematologic': 'Hematologic',
    's_het_recessive_ID_total': 'ID',
    's_het_recessive_Immune_system': 'Immune system',
    's_het_recessive_Metabolic': 'Metabolic',
    's_het_recessive_Metabolic_ID': 'Metabolic-ID',
    's_het_recessive_Neuromuscular': 'Neuromuscular',
    's_het_recessive_Overlaps': 'Multi-system',
    's_het_recessive_Skeletal_Craniofacial': 'Skeletal'
}

# Keep only the plotted targets and the rows with gender == 'all'. The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
# NOTE: this cell overwrites all_results_df and renames the raw target values,
# so re-run the loading cell before re-running it.
all_results_df = all_results_df.loc[
    all_results_df['target'].isin(list(renaming_dict_target))
    & (all_results_df['gender'] == 'all')
].copy()

# Move the odds ratio and its interval bounds to the natural-log scale
ratio_columns = ['odds_ratio', 'odds_ratio_lower', 'odds_ratio_upper']
all_results_df[ratio_columns] = np.log(all_results_df[ratio_columns].to_numpy())

# Turn the (log-scale) interval bounds into distances from the point estimate
# (presumably the error-bar format the plotting helper expects; confirm there).
all_results_df['odds_ratio_lower'] = all_results_df['odds_ratio'] - all_results_df['odds_ratio_lower']
all_results_df['odds_ratio_upper'] = all_results_df['odds_ratio_upper'] - all_results_df['odds_ratio']

# Human-readable labels; names missing from the dictionaries are left as they are
all_results_df['target'] = all_results_df['target'].map(lambda name: renaming_dict_target.get(name, name))
all_results_df['feature'] = all_results_df['feature'].map(lambda name: renaming_dict_panel.get(name, name))

all_results_df = all_results_df.sort_values(by='feature')

In [None]:
# Panel figure: log-scale effects per disorder group, one series per target
fig, ax = plt.subplots(1, 1, figsize=(20, 6))

# reversed list of the three target labels
group_order = ['Any education', 'Childlessness', 'Hair color'][::-1]

# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# matplotlib 3.7 and removed in 3.9
colors = plt.get_cmap("tab20b").colors
colors = [colors[10], colors[2], colors[5]]

plot_errorbar_grouped_transposed(df=all_results_df, axis=ax, y_column='feature', group_column = 'target',  title='Associations for different disorder groups', 
                      ymargin=0.05, legend_loc='lower right', group_scale=0.2, vertical_loc=0, group_order=group_order, colors=colors)
plt.ylabel("Effect size")
# rotate the x tick labels by 90 degrees
plt.xticks(rotation=90)

# save through the figure handle rather than the implicit "current figure"
fig.savefig("../../../../data/plots/figure_2.pdf", format="pdf", bbox_inches="tight")