In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import font_manager

# Use seaborn's "whitegrid" theme as the global base style.
sns.set_style("whitegrid")


In [None]:
# Register every font file found under the project font directory with
# matplotlib's font manager so it can be selected by family name.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    font_manager.fontManager.addfont(font_file)
    # Echo each registered file so a missing font is easy to spot.
    print("Added:", font_file)

In [None]:
# Font sizes (in points) shared by the plotting helpers.
SMALL_SIZE = 5
MEDIUM_SIZE = 6
BIGGER_SIZE = 7

# Default text: Arimo at the largest of the three sizes.
plt.rc('font', family='Arimo', size=BIGGER_SIZE)

# Per-element size overrides, currently disabled (sizes are set per axis instead).
# plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
# plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
# plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
# plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

matplotlib.rcParams.update({
    # Render text with matplotlib's own engine rather than LaTeX.
    'text.usetex': False,
    # Font type 42 (TrueType) in PDF/PS output instead of the default Type 3.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

In [None]:
def configure_axis(ax, ytick_size=MEDIUM_SIZE, xtick_size=MEDIUM_SIZE, xlabel_size=MEDIUM_SIZE, ylabel_size=MEDIUM_SIZE, x_label=None, y_label=None):
    """Apply the shared axis styling used by the figure panels.

    Disables the grid, keeps only black left/bottom spines, puts ticks on
    those two sides, sets tick-label sizes and tick dimensions, and sets
    both axis labels.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to style; modified in place.
    ytick_size, xtick_size : number
        Font size of the y / x tick labels.
    xlabel_size, ylabel_size : number
        Font size of the x / y axis labels.
    x_label, y_label : str or None
        Passed straight to ``ax.set_xlabel`` / ``ax.set_ylabel``; the default
        ``None`` is passed through unchanged.
    """
    # The seaborn theme turns the grid on; panels are drawn without it.
    ax.grid(False)

    # Visible spines in black, the other two hidden.
    for side in ('left', 'bottom'):
        ax.spines[side].set_color('black')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Tick-label font sizes, per axis.
    for axis_name, label_size in (('y', ytick_size), ('x', xtick_size)):
        ax.tick_params(axis=axis_name, labelsize=label_size)

    # Tick marks only along the visible spines.
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    # Tick dimensions: (which, width, length).
    for which, width, length in (('major', 1.00, 2.5), ('minor', 0.75, 1.25)):
        ax.tick_params(which=which, width=width, length=length)

    ax.set_xlabel(x_label, size=xlabel_size)
    ax.set_ylabel(y_label, size=ylabel_size)

In [None]:
# Gene-to-panel assignment table; the file has no header row, so the two
# columns are named explicitly after loading.
gene_panel = pd.read_csv(
    "../gene-panel-gencode-v34.txt",
    header=None,
)
gene_panel.columns = ['gene', 'panel']

gene_panel.head(n=3)

In [None]:
# Load the CR panel table. It is only previewed here; none of the other cells
# visible in this part of the notebook reference `cr_df`.
cr_df = pd.read_csv("../../../../data/tables/CR_panel.csv")
cr_df.head(3)

In [None]:
# Cohort size and twice that number, which later serves as the denominator
# when allele counts are converted to allele frequencies.
cohort = 406194
alleles = 2 * cohort

# Per-variant table; `hets`, `homs` and `gene` are the columns used downstream.
plps = pd.read_csv(
    "../UKB/hila/450k/plp_selection/basic/new_gene_names/"
    "new_freq/all_chr_total_presumable_plps_HFE_final_sorted.txt",
    sep='\t',
)

# Attach the panel of each variant's gene. The default inner join keeps only
# variants whose gene appears in the gene-panel table.
plps = pd.merge(plps, gene_panel, on='gene')

print("Number of PLPs:", len(plps))

plps.head(3)

In [None]:
# Raw panel identifiers mapped to the names shown in the figures; panels not
# listed here keep their raw name.
renaming_dict_panel = {
    'Cilia+Kidney': 'Cilia + Kidney',
    'Derm': 'Dermatologic',
    'ID-total': 'ID',
    'Immune_system': 'Immune system',
    'Overlaps': 'Multi-system',
    'Skeletal+Craniofacial': 'Skeletal',
}

# Headerless list of variants flagged as mild.
mild_plps = pd.read_csv(
    "../450k/plp_selection/basic/new_gene_names/new_freq/all_chr_total_presumable_plp_mild_var.txt",
    sep='\t',
    header=None,
)
mild_plps.columns = ['chr', 'pos', 'ref', 'alt', 'gene', 'panel']
mild_plps['is_mild'] = 1

# No `on=` is given, so the left join uses every column name the two tables
# share. NOTE(review): which columns those are depends on the schema of the
# variant file, which is not visible here -- confirm they are the intended keys.
plps = (
    plps
    .merge(mild_plps, how='left')
    .assign(
        # Variants absent from the mild list get is_mild = 0.
        is_mild=lambda df: df['is_mild'].fillna(0),
        # Carrier counts split by the flag: `_s` keeps counts of non-mild
        # variants (zero for mild ones), `_m` keeps counts of mild variants.
        hets_s=lambda df: df['hets'] * (1 - df['is_mild']),
        homs_s=lambda df: df['homs'] * (1 - df['is_mild']),
        hets_m=lambda df: df['hets'] * df['is_mild'],
        homs_m=lambda df: df['homs'] * df['is_mild'],
        # Renaming happens after the merge, which may itself match on `panel`.
        panel=lambda df: df['panel'].apply(
            lambda raw_name: renaming_dict_panel.get(raw_name, raw_name)
        ),
    )
)

plps.head(3)

In [None]:
# Calculate the number of hets and homs per gene by summing across variants.
# Suffixes: no suffix = all variants, `_s` = non-mild variants, `_m` = mild variants.
plps_gene = (
    plps[['gene', 'panel', 'hets_s', 'homs_s', 'hets_m', 'homs_m', 'hets', 'homs']]
    .groupby(['gene', 'panel'])
    .sum()
    .reset_index()
)

# Drop the excluded panels. The explicit .copy() detaches the filtered frame
# from its parent so the column assignments below operate on an independent
# DataFrame and cannot raise SettingWithCopyWarning.
plps_gene = plps_gene[~plps_gene['panel'].isin(['Tumor', 'No_panel', 'Cardiovascular'])].copy()

# Calculate AC, AF, Hardy-Weinberg 2pq, at-risk-couples probability and
# consanguineous-risk probability per gene.

# Allele count: one allele per heterozygote, two per homozygote.
plps_gene['AC'] = plps_gene['hets'] + 2 * plps_gene['homs']
plps_gene['AC_s'] = plps_gene['hets_s'] + 2 * plps_gene['homs_s']
plps_gene['AC_m'] = plps_gene['hets_m'] + 2 * plps_gene['homs_m']

# Allele frequency: allele count over the total number of alleles.
plps_gene['AF'] = plps_gene['AC'] / alleles
plps_gene['AF_s'] = plps_gene['AC_s'] / alleles
plps_gene['AF_m'] = plps_gene['AC_m'] / alleles

# Heterozygote probability 2pq with p = AF and q = 1 - AF.
plps_gene['hets_prob'] = 2 * plps_gene['AF'] * (1 - plps_gene['AF'])
plps_gene['hets_prob_s'] = 2 * plps_gene['AF_s'] * (1 - plps_gene['AF_s'])
plps_gene['hets_prob_m'] = 2 * plps_gene['AF_m'] * (1 - plps_gene['AF_m'])

# At-risk-couples term: both partners carry a non-mild variant, plus the two
# orderings of one non-mild and one mild carrier.
plps_gene['ARC'] = (plps_gene['hets_prob_s'] * plps_gene['hets_prob_s']) + 2 * plps_gene['hets_prob_s'] * plps_gene['hets_prob_m']

# Consanguineous-risk term: non-mild heterozygote probability scaled by 1/8.
plps_gene['Cons'] = plps_gene['hets_prob_s'] / 8

plps_gene.head(3)

In [None]:
# Per-panel statistics: `Cons` and `ARC` are summed over the panel's genes,
# while the `AF` column holds the standard deviation of the per-gene AF.
plps_panel = (
    plps_gene
    .groupby('panel')
    .agg(Cons=('Cons', 'sum'), ARC=('ARC', 'sum'), AF=('AF', 'std'))
    .reset_index()
)

# Ratio of the two summed risk terms, panels ordered by it (ascending), plus
# a copy rounded to whole numbers.
plps_panel = (
    plps_panel
    .assign(CR_hila=lambda df: df['Cons'] / df['ARC'])
    .sort_values(by='CR_hila')
    .assign(CR_hila_round=lambda df: df['CR_hila'].round(0))
)

plps_panel.head(3)

In [None]:
# Names of the three allele-frequency bins, from lowest to highest.
labels = ['low', 'medium', 'high']

# Four log-spaced edges between 1e-6 and 0.076 delimit the three bins.
# Computed once and reused for both the binning and the printout below.
borders = np.logspace(np.log10(1e-6), np.log10(0.076), 4)

plps_gene = plps_gene.sort_values(by='AF')
# pd.cut leaves AF values outside the edges as NaN, and those genes then drop
# out of the grouped sums below.
plps_gene['AF_bins'] = pd.cut(plps_gene['AF'], borders, labels=labels)
plps_gene['AF_log'] = np.log10(plps_gene['AF'])

# Summed AF per (panel, bin). `AF_bins` is categorical, so observed=False is
# passed explicitly: every panel keeps a row for each of the three bins even
# when a bin is empty. The stacked-bar panel looks up one value per panel per
# bin and relies on those rows existing; the implicit default for this
# argument is deprecated and changes in newer pandas.
plps_gene_counts = (
    plps_gene
    .groupby(['panel', 'AF_bins'], observed=False)
    .agg({'AF': 'sum'})
    .reset_index()
)
plps_gene_counts['AF'] = plps_gene_counts['AF'].fillna(0)

plps_gene_counts = plps_gene_counts.sort_values(by=['panel', 'AF_bins'])

# Share of each bin in its panel's total AF, as a percentage with 2 decimals.
plps_gene_counts['AF_total'] = plps_gene_counts.groupby('panel')['AF'].transform('sum')
plps_gene_counts['AF_percentage'] = np.round(plps_gene_counts['AF'] * 100. / plps_gene_counts['AF_total'], 2)

# Panel order used by the plots: the row order of plps_panel.
panels = plps_panel['panel'].values

print(borders[0], borders[1], borders[2], borders[3])

In [None]:
def plot_9a_ext(ax):
    """Draw the per-panel scatter of `CR_hila` against the per-panel `AF` column.

    Reads the notebook-level `plps_panel` table, plots one diamond marker per
    panel, annotates each marker with the panel name and applies the shared
    axis styling.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    """
    # NOTE(review): `size=1` is seaborn's semantic size variable given as a
    # constant, not a marker area -- confirm this is the intended marker size.
    sns.scatterplot(data=plps_panel, x='CR_hila', y='AF', ax=ax, marker='D', size=1, legend=False, color='#00008b')

    # Label each point with its panel name, shifted by 1% of the point's own
    # coordinates so the text does not sit on top of the marker.
    points = plps_panel[['CR_hila', 'AF', 'panel']].itertuples(index=False, name=None)
    for x, y, label in points:
        ax.annotate(label, (x + 0.01 * x, y + 0.01 * y), size=SMALL_SIZE)

    # Axis label spelling corrected ("Consanguinity").
    configure_axis(ax, x_label='Consanguinity ratio', y_label='sd(AF per gene)')

def plot_9b_ext(ax):
    """Draw a stacked bar chart of the AF-bin percentages for every panel.

    Reads the notebook-level `panels`, `labels` and `plps_gene_counts`. One
    bar is drawn per panel (in the order of `panels`), stacked from the last
    bin in `labels` to the first, with each segment labelled by its value.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; every call targets this axes explicitly rather than
        pyplot's current axes.
    """
    colors = ['#00008b', '#df0057', '#ffa600']

    # Running top of each stack. Built as an explicit float array: deriving it
    # from `panels` with zeros_like would inherit the dtype of the panel names.
    bottom = np.zeros(len(panels), dtype=float)

    # Walk bins and colors from last to first so each bin keeps the color at
    # its own position in the list.
    for label, color in zip(reversed(labels), reversed(colors)):
        # Percentages for this bin, aligned to the panel order of the bars.
        cur_counts = (
            plps_gene_counts[plps_gene_counts['AF_bins'] == label]
            .set_index('panel')
            .loc[panels, 'AF_percentage']
            .values
        )

        p = ax.bar(panels, cur_counts, label=label, bottom=bottom, color=color)

        bottom = bottom + cur_counts

        ax.bar_label(p, label_type='center', size=SMALL_SIZE)

    # Vertical panel names on the x axis.
    ax.tick_params(axis='x', labelrotation=90)

    legend_kwargs = {
        "bbox_to_anchor": (0.0, 1., 0.5, .102),
        "frameon": False,
        "mode": "expand",
        "ncol": 3,
        "labelspacing": 0.1,
        "markerfirst": False,
        'fontsize': MEDIUM_SIZE
    }

    ax.legend(**legend_kwargs)

    configure_axis(ax)

In [None]:
# Figure dimensions are written in centimetres; this factor converts to the
# inches matplotlib expects.
cm = 1/2.54
# Scale factor applied to both figure dimensions.
k = 1.

fig = plt.figure(figsize=(18*cm*k, 12*cm*k), constrained_layout=True)

# Two stacked panels: 9a on top, 9b below.
axes = fig.subplots(nrows=2, ncols=1)

for draw_panel, panel_ax in zip((plot_9a_ext, plot_9b_ext), axes):
    draw_panel(panel_ax)

plt.savefig("../../../../data/plots/ext_figure_9.pdf", format="pdf", bbox_inches="tight")