In [None]:
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

from ukbb_recessive.regression.regressions import sci_notation, get_plot_data, plot_errorbar, plot_errorbar_grouped
from ukbb_recessive.data_collection.variants import VariantFeatures

sns.set_style("whitegrid")
sns.set_palette("Accent")

import matplotlib

from matplotlib import font_manager
import numpy as np

# Register every font file found under the project's font directory with
# matplotlib, so that families referenced later via plt.rc (e.g. 'Arimo')
# can be resolved.
font_dirs = ['../../../../data/fonts']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_path in font_files:
    font_manager.fontManager.addfont(font_path)
    print("Added:", font_path)

In [None]:
# Three font-size tiers shared by all figures in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 22
BIGGER_SIZE = 24

# Default text: size and family.
plt.rc('font', size=SMALL_SIZE, family='Arimo')
# Axes titles and x/y axis labels share the largest tier.
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=BIGGER_SIZE)
# Tick labels on both axes.
plt.rc('xtick', labelsize=MEDIUM_SIZE)
plt.rc('ytick', labelsize=MEDIUM_SIZE)
# Legend entries and figure-level title.
plt.rc('legend', fontsize=SMALL_SIZE)
plt.rc('figure', titlesize=BIGGER_SIZE)

# plt.rcParams and matplotlib.rcParams are the same object, so one update
# covers text rendering and the PDF/PS font embedding type.
plt.rcParams.update({
    'text.usetex': False,
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Figure 3A

In [None]:
# Read the wide table (one 'Panel' column plus one column per cohort), reshape
# it to long format with one row per (Panel, Cohort) pair, and drop the
# 'Severe genes' panel.
# The value column keeps the spelling 'Consaguinity ratio' because the
# plotting cell looks the column up under exactly that name.
cr_df = (
    pd.read_csv("../../../../data/tables/CR_panel.csv")
    .melt(
        id_vars='Panel',
        value_vars=['UKB', 'Dutch', 'Estonian'],
        var_name='Cohort',
        value_name='Consaguinity ratio',
    )
    .loc[lambda long_df: long_df['Panel'] != 'Severe genes']
)

cr_df.head(3)

In [None]:
# Grouped bar chart: one group per gene panel, one bar per cohort.
fig, ax = plt.subplots(1, 1, figsize=(20, 7))

# Draw on the axes created above rather than relying on pyplot's "current axes".
sns.barplot(data=cr_df, x='Panel', y='Consaguinity ratio', hue='Cohort', ax=ax)
plt.xticks(rotation=90)
ax.set_xlabel(None)
# The y label would otherwise be taken verbatim from the (misspelled) column
# name 'Consaguinity ratio'; set it explicitly so it matches the title.
ax.set_ylabel("Consanguinity ratio")
ax.set_title("Consanguinity ratio scores for three European cohorts")

# Each hue level (cohort) is a separate bar container; label the bars of all of them.
for container in ax.containers:
    ax.bar_label(container)

ax.grid(linestyle='dotted', axis='y')

# NOTE(review): the notebook heading calls this "Figure 3A" but the file is
# written as figure_4a.pdf -- confirm which numbering is intended.
plt.savefig("../../../../data/plots/figure_4a.pdf", format="pdf", bbox_inches="tight")


# Figure 3B

In [None]:
# Load the per-gene-set allele-frequency table and give its three columns
# the display names used by the plotting cell.
af_df = pd.read_csv("../../../../data/tables/figure_3b.csv").set_axis(
    ['Gene set', 'UK Biobank AF', 'Dutch cohort AF'], axis=1
)

af_df.head(2)

In [None]:
# Display the whole allele-frequency table (the loading cell only previews
# its first two rows).
af_df

In [None]:
plt.rc('font', size=15, family='Arimo')          # default text size for this figure

fig, ax = plt.subplots(1, 1, figsize=(20, 8))

# Scatter of the two cohorts' allele frequencies with a fitted regression line.
sns.regplot(data=af_df, x='UK Biobank AF', y='Dutch cohort AF', marker='D',
            scatter_kws={"color": "salmon"}, ax=ax)

# Hand-tuned label nudges: (substring of the gene-set name, shift applied to
# the UKB coordinate, shift applied to the Dutch coordinate). Every entry
# whose substring occurs in the name is applied, in this order.
label_nudges = (
    ('Blindness',    0.0,     -0.000005),
    ('Cilia',        0.00001,  0.00001),
    ('Multi',        0.00001,  0.00003),
    ('Derma',        0.0,      0.000005),
    ('Metabolic-ID', 0.0,      0.000005),
    ('Deafness',     0.0,      0.000025),
)

for gene_set, ukb_af, dutch_af in af_df.values:
    for keyword, ukb_shift, dutch_shift in label_nudges:
        if keyword in gene_set:
            ukb_af += ukb_shift
            dutch_af += dutch_shift

    # Horizontal label offset is chosen from the already nudged UKB value.
    if ukb_af >= 0.00045 and 'Deafness' not in gene_set:
        x_shift = 0.0001
    elif "ID (2.9" in gene_set:
        x_shift = 0.00006
    else:
        x_shift = -0.000005

    ax.annotate(gene_set, (ukb_af + x_shift, dutch_af - 0.000005))

ax.grid(linestyle='dotted')
ax.set_xlim([0.0001, 0.0009])

# Both axes are flipped, so larger frequencies sit towards the left/bottom.
ax.invert_xaxis()
ax.invert_yaxis()

ax.set_title("Average allele frequency correlation UK Biobank - Dutch cohorts")

# NOTE(review): heading says "Figure 3B" but the output file is figure_4b.pdf -- confirm.
plt.savefig("../../../../data/plots/figure_4b.pdf", format="pdf", bbox_inches="tight")


# Supplementary figures

In [None]:
import glob
from matplotlib.cm import get_cmap

In [None]:
# Samples of interest: European & non-related.
# The file has no header row; its first column holds the sample IDs, which
# are collected into a set for fast membership tests.
european_non_rel_samples = set(
    pd.read_csv(
        ".../450k/samples/european_non_related_no_withdrawal_to_include.txt",
        header=None,
    )[0].values.tolist()
)
print("Number of samples:", len(european_non_rel_samples))


In [None]:
# Tab-separated table of PLP variants (one row per variant -- presumably;
# verify against the file).
plps = pd.read_csv(
    ".../450k/plp_selection/basic/new_gene_names/new_freq/all_chr_total_presumable_plps_HFE_final_sorted.txt",
    sep='\t',
)

plps.head(1)

In [None]:
# Header-less two-column file; name the columns 'gene' and 'panel'.
gene_panel = pd.read_csv(".../gene-panel-gencode-v34.txt", header=None).set_axis(
    ['gene', 'panel'], axis=1
)


In [None]:
plt.rc('font', size=13, family='Arimo')          # default text size for the bar-chart panel

# Load the per-sample PLP calls from the per-chromosome files, keep only the
# samples of interest and drop rows whose genotype is '1/1'.
sample_plps = glob.glob(".../450k/RAP_output_per_chr/filtered_plps/basic/new_gene_names/new_freq/chr*")
sample_plps = VariantFeatures().read_sample_plps(sample_plps)
sample_plps = sample_plps[sample_plps['s'].isin(european_non_rel_samples)]
sample_plps = sample_plps[sample_plps['GT'] != '1/1']

# Number of PLP rows per sample ...
per_sample_plp_count = sample_plps.groupby('s').count()[['chrom']].reset_index().rename(columns={'chrom': 'plp_cnt'})
# ... with samples that have no rows at all added back with a count of 0 ...
per_sample_plp_count = per_sample_plp_count.merge(pd.DataFrame({'s': list(european_non_rel_samples)}), how='outer').fillna(0)
# ... then turned into a histogram: column 's' now holds the number of
# samples that carry exactly 'plp_cnt' PLPs.
per_sample_plp_count = per_sample_plp_count.groupby('plp_cnt').count().reset_index()
per_sample_plp_count['plp_cnt'] = per_sample_plp_count['plp_cnt'].astype(int)

per_sample_plp_count = per_sample_plp_count.sort_values(by='plp_cnt')

# plot
fig, ax = plt.subplots(1, 2, figsize=(22, 8))

sns.barplot(data=per_sample_plp_count, x='plp_cnt', y='s', ax=ax[0], palette="Accent")
# Annotate every bar. Depending on the seaborn version the bars are grouped
# into one container or into one container per bar (when `palette` is given
# without `hue`), so labelling only the last container can miss bars.
for container in ax[0].containers:
    ax[0].bar_label(container, label_type='edge')

ax[0].grid(linestyle='dotted', axis='y')


ax[0].set_xlabel("Number of PLPs")
ax[0].set_ylabel("Number of samples")

# Collapse all counts above 5 into a single '6+' category for the pie chart.
# The column is converted to strings first: writing the string '6+' into an
# integer column relies on silent upcasting, which pandas has deprecated.
# String keys '0'..'5' and '6+' still sort in the intended order.
more_than_five = per_sample_plp_count['plp_cnt'] > 5
per_sample_plp_count['plp_cnt'] = per_sample_plp_count['plp_cnt'].astype(str)
per_sample_plp_count.loc[more_than_five, 'plp_cnt'] = '6+'
per_sample_plp_count = per_sample_plp_count.groupby('plp_cnt').sum().reset_index()

plt.rc('font', size=15, family='Arimo')          # larger text for the pie labels created below


sns.set_palette("Accent")

ax[1].pie(per_sample_plp_count['s'],
          labels = per_sample_plp_count['plp_cnt'].astype(str),
          autopct='%.2f%%',
          explode=[0.1]*per_sample_plp_count.shape[0],
          startangle=180,
          counterclock=False)

ax[1].set_xlabel("Number of PLPs per sample")

plt.savefig("../../../../data/plots/supp_figure_1.pdf", format="pdf", bbox_inches="tight")


In [None]:
def hets_freq_bin(hets_freq):
    """Return the frequency-bin label for a carrier frequency given in percent.

    Bins are closed on the right: values <= 0 map to '0%', (0, 0.5] to
    '0-0.5%', (0.5, 1] to '0.5-1%', (1, 2] to '1-2%', (2, 5] to '2-5%'.
    Anything that satisfies none of the upper bounds -- values above 5, and
    also NaN, for which every comparison is false -- maps to '>5%'.
    """
    upper_bounds = (
        (0, '0%'),
        (0.5, '0-0.5%'),
        (1, '0.5-1%'),
        (2, '1-2%'),
        (5, '2-5%'),
    )
    for upper, label in upper_bounds:
        if hets_freq <= upper:
            return label
    return '>5%'

plt.rc('font', size=15, family='Arimo')          # default text size for this figure

# Per-variant 'hets' count as a percentage of the cohort size, and its bin label.
plps['hets_freq'] = plps['hets'] * 100. / len(european_non_rel_samples)
plps['hets_freq_bin'] = plps['hets_freq'].apply(hets_freq_bin)

# Number of variants per frequency bin; the '0%' bin is left out of the plot.
plps_freq_count = (
    plps.groupby('hets_freq_bin')
    .count()[['chr']]
    .reset_index()
    .rename(columns={'chr': 'plp_cnt'})
    .loc[lambda counts: counts['hets_freq_bin'] != '0%']
)

# plot
fig, ax = plt.subplots(1, 1, figsize=(22, 8))

sns.barplot(data=plps_freq_count, x='hets_freq_bin', y='plp_cnt', ax=ax)

# Caption each bar with its count and its share of all plotted bars.
bars = ax.containers[-1]
bar_heights = [bar.get_height() for bar in bars]
total_height = sum(bar_heights)
bar_captions = [
    f"{int(height)} \n({round(height*100/total_height, 4)}%)"
    for height in bar_heights
]

ax.bar_label(bars, labels=bar_captions, label_type='edge')
ax.grid(linestyle='dotted', axis='y')

ax.set_xlabel("Variant frequency")
ax.set_ylabel("Number of variants")

# Log scale so that sparsely populated bins remain visible.
ax.set_yscale("log")
plt.savefig("../../../../data/plots/supp_figure_2.pdf", format="pdf", bbox_inches="tight")


In [None]:
# Total 'hets' per gene, outer-joined with the gene panel so that genes
# present on only one side are kept; missing values become 0.
# NOTE(review): if gene_panel lists a gene more than once, the join yields
# one row per listing and that gene is counted repeatedly below -- confirm
# that 'gene' is unique in the panel file.
hets_per_gene = plps.groupby('gene').agg({'hets': 'sum'}).reset_index()
plps_gene = hets_per_gene.merge(gene_panel, how='outer').fillna(0)

# Per-gene 'hets' total as a percentage of the cohort size, and its bin label.
plps_gene['hets_freq'] = plps_gene['hets'] * 100. / len(european_non_rel_samples)
plps_gene['hets_freq_bin'] = plps_gene['hets_freq'].apply(hets_freq_bin)

# Number of genes per frequency bin (the '0%' bin is kept here).
plps_freq_gene_count = (
    plps_gene.groupby('hets_freq_bin')
    .count()[['gene']]
    .reset_index()
    .rename(columns={'gene': 'plp_cnt'})
)
plps_freq_gene_count['plp_cnt'] = plps_freq_gene_count['plp_cnt'].astype(int)

# plot
fig, ax = plt.subplots(1, 1, figsize=(22, 8))

sns.barplot(data=plps_freq_gene_count, x='hets_freq_bin', y='plp_cnt', ax=ax)

# Caption each bar with its count and its share of all plotted bars.
bars = ax.containers[-1]
bar_heights = [bar.get_height() for bar in bars]
total_height = sum(bar_heights)
bar_captions = [
    f"{int(height)} \n({round(height*100/total_height, 4)}%)"
    for height in bar_heights
]

ax.bar_label(bars, labels=bar_captions, label_type='edge')
ax.grid(linestyle='dotted', axis='y')

ax.set_xlabel("Carrier frequency")
ax.set_ylabel("Number of genes")

# Log scale so that sparsely populated bins remain visible.
ax.set_yscale("log")
plt.savefig("../../../../data/plots/supp_figure_3.pdf", format="pdf", bbox_inches="tight")
