# Map-to-pan gene PAV detection thresholds
This notebook analyzes how read-coverage cutoffs affect gene presence/absence variation (PAV) detection in the map-to-pan (MTP) pan-genome construction approach. We examine the effect on specific accessions and on the overall pan-genome composition.

In [None]:
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.colors import n_colors, named_colorscales, sample_colorscale
from scipy.stats import ttest_ind

In [None]:
# Use the white plotly template for every figure created in this notebook.
pio.templates.default = "plotly_white"
# Default palette. Note: `colors` is reassigned in the first plotting cell
# before it is read, so this particular list is never actually used.
colors = ['grey','purple','darkgreen','lightblue','orange']

## Paths

In [None]:
# Root directory of the A. thaliana pan-genome output (absolute path; adjust when running elsewhere).
base_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome"
# RESULT directory of the map-to-pan run stored under "x50".
mtp_pg = os.path.join(base_dir, "map_to_pan/x50/RESULT")

In [None]:
# Directory into which the figure PDFs are written by the export cells below.
figs_path = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/figs/FINAL"

## Grid search
We examined the effect of two parameters:  
1. Min depth - the minimum number of mapped reads required to call a position of the gene "covered"
2. Min fraction covered - the minimum fraction of the gene that must be covered in order to call the gene present

We search the grid of these two cutoffs to examine their effect on the number of present genes in one _A. thaliana_ ecotype.

In [None]:
# Load the per-gene coverage histogram of ecotype An-1 (run ERR3624579).
# The file is tab-separated without a header, so column names are supplied here.
# NOTE(review): presumably each row is one (gene, depth) bin, with 'cov' the read depth
# and 'frac' the fraction of the gene at exactly that depth - confirm against the tool
# that produced the .bedCovHist file.
bedCovHist = os.path.join(mtp_pg, 'per_sample/An-1/map_to_pan_ERR3624579/ERR3624579_map_to_pan.bedCovHist')
bedCovHist_df = pd.read_csv(bedCovHist, sep='\t',
                            names = ['chr','start','end','gene','cov','bases','gene_len','frac'])

In [None]:
def gene_frac_covered(bedCovHist_df, min_depth):
    """Sum the 'frac' column over the rows whose 'cov' is at least `min_depth`.

    Parameters
    ----------
    bedCovHist_df : pandas.DataFrame
        Table with (at least) the columns 'cov' and 'frac'. In this notebook it is
        called once per gene through ``groupby('gene').apply``.
    min_depth : number
        Lowest 'cov' value a row must have to be included in the sum.

    Returns
    -------
    The sum of 'frac' over the selected rows (0 when no row qualifies).
    """
    deep_enough = bedCovHist_df['cov'] >= min_depth
    return bedCovHist_df.loc[deep_enough, 'frac'].sum()

In [None]:
# Depth cutoffs (minimum 'cov' value) scanned in the grid search.
min_depth_cutoffs = [1, 5, 10, 15, 20, 30, 40, 50]

In [None]:
# distribution of fractions covered across all genes using different min_depth
# Per-gene covered fraction at every depth cutoff; one long-format table per cutoff.
frac_cov = []
for x in min_depth_cutoffs:
    print(x)  # progress indicator: the groupby/apply below is slow
    per_gene = bedCovHist_df.groupby('gene').apply(gene_frac_covered, x)
    # One row per gene, tagged with the cutoff that produced it.
    df = per_gene.to_frame('frac_cov').assign(min_depth=x)
    frac_cov.append(df)

In [None]:
# Stack the per-cutoff tables into one long table (index: gene; columns: frac_cov, min_depth).
frac_cov_df = pd.concat(frac_cov)

In [None]:
# Ridgeline-style figure: one horizontal half-violin of the per-gene covered fraction
# for every depth cutoff, drawn from the highest cutoff to the lowest.
# The palette is sized from the number of cutoffs (instead of a hard-coded 8) and is
# paired with the cutoffs via zip, so the palette list is no longer consumed by pop().
colors = sample_colorscale('jet', len(min_depth_cutoffs))[::-1]
fig = go.Figure()
for x, color in zip(reversed(min_depth_cutoffs), colors):
    data_line = frac_cov_df.query('min_depth == @x')['frac_cov']
    fig.add_trace(go.Violin(x=data_line, line_color=color, name=x))
# Only the upper half of each violin is drawn; width > 1 lets neighbouring violins overlap.
fig.update_traces(orientation='h', side='positive', width=2, points=False)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
fig.update_xaxes(mirror=True, showline=True, linecolor='black', title='Fraction covered')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False, title='Depth threshold')
fig.update_layout(autosize=False, width=800, legend_title_text='Depth threshold', showlegend=False)
fig.show()

In [None]:
# Export the current figure (fraction-covered violins) as figS5a.pdf.
fig5s_a = os.path.join(figs_path, 'figS5a.pdf')
fig.write_image(fig5s_a)

In [None]:
# Percent of genes called present as a function of the coverage threshold,
# with one curve per depth cutoff.
min_frac = list(range(0,101,5))  # coverage thresholds in percent; identical for every curve
fig = go.Figure()
for d in min_depth_cutoffs:
    # Select this cutoff's rows once instead of re-querying inside the inner loop.
    depth_frac_cov = frac_cov_df.query('min_depth == @d')['frac_cov']
    tot_genes = depth_frac_cov.shape[0]
    perc_present = []
    for m in min_frac:
        genes_present = (depth_frac_cov >= m/100).sum()
        perc_present.append(genes_present/tot_genes*100)
    fig.add_trace(go.Scatter(x=min_frac, y=perc_present, name=d))
fig.update_xaxes(mirror=True, showline=True, linecolor='black', title='Coverage threshold (%)')
# Palette sized from the number of cutoffs rather than a hard-coded 8.
colors = sample_colorscale('jet', len(min_depth_cutoffs))
fig.update_layout(colorway=colors)
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False, title='Genes presence(%)')
fig.update_layout(autosize=False, width=800, legend_title_text='Depth threshold')
fig.show()

In [None]:
# Export the current figure (gene presence vs. coverage threshold) as figS5b.pdf.
fig5s_b = os.path.join(figs_path, 'figS5b.pdf')
fig.write_image(fig5s_b)

## Pan-genome composition
Here we test the effect of the two thresholds on the pan-genome composition using two metrics: % core genes and overall occupancy.

In [None]:
# Ecotype name -> run identifier; both are used to build each sample's
# bedCovHist path in the loading loop below.
samples = {'An-1': 'ERR3624579',
           'C24': 'ERR3624577',
           'Cvi-0': 'ERR3624578',
           'Eri': 'ERR3624573',
           'Kyo': 'ERR3624576',
           'Ler': 'ERR3624574',
           'Sha': 'ERR3624575'}

In [None]:
# For every ecotype, compute the per-gene covered fraction at each depth cutoff.
# Result: {ecotype: long table with index gene and columns frac_cov, min_depth}.
# Note: this rebinds bedCovHist, bedCovHist_df and frac_cov from the single-sample cells above.
samples_frac_cov = {}
for sample, err in samples.items():
    print(sample)  # progress indicator
    bedCovHist = os.path.join(mtp_pg, 'per_sample/%s/map_to_pan_%s/%s_map_to_pan.bedCovHist' %(sample,err,err))
    bedCovHist_df = pd.read_csv(
        bedCovHist,
        sep='\t',
        names=['chr','start','end','gene','cov','bases','gene_len','frac'],
    )
    frac_cov = []
    for x in min_depth_cutoffs:
        print(x)  # progress indicator
        per_gene = bedCovHist_df.groupby('gene').apply(gene_frac_covered, x)
        df = per_gene.to_frame('frac_cov').assign(min_depth=x)
        frac_cov.append(df)
    samples_frac_cov[sample] = pd.concat(frac_cov)

In [None]:
def pan_pav(samples_frac_cov, min_depth, min_frac):
    """Build a presence/absence (1/0) table with one column per sample.

    Parameters
    ----------
    samples_frac_cov : dict
        Sample name -> DataFrame with the columns 'frac_cov' and 'min_depth'.
    min_depth : number
        Only rows whose 'min_depth' equals this value are used.
    min_frac : number
        Coverage threshold in percent; an entry is 1 when 'frac_cov' >= min_frac/100.

    Returns
    -------
    pandas.DataFrame
        Per-sample calls joined column-wise on the tables' index.
    """
    cutoff = min_frac / 100
    calls = []
    for s, tbl in samples_frac_cov.items():
        at_depth = tbl.loc[tbl['min_depth'] == min_depth, 'frac_cov']
        calls.append(at_depth.ge(cutoff).astype(int).rename(s))
    return pd.concat(calls, axis=1)

In [None]:
def perc_core(pg_df, c=95):
    """Return the percentage of rows of `pg_df` whose 'presence_perc' is at least `c`.

    `c` is the minimum presence (in %) for a gene to count as core.
    """
    core_genes = pg_df[pg_df['presence_perc'] >= c]
    return len(core_genes) / len(pg_df) * 100

def mean_presence(pg_df):
    """Return the mean of the 'presence_perc' column, i.e. the mean % presence across genes."""
    presence = pg_df['presence_perc']
    return presence.mean()

In [None]:
# Pan-genome statistics over the full (depth cutoff x coverage threshold) grid.
rows = []
min_frac = list(range(0,101,5))
for d in min_depth_cutoffs:
    for m in min_frac:
        # Presence/absence per sample; genes with a missing call in any sample are dropped.
        pg_pav = pan_pav(samples_frac_cov, d, m).dropna()
        # Reference column: Col-0 is marked present exactly for genes whose ID starts with 'transcript'.
        ref = pg_pav.index.str.startswith('transcript').astype(int)
        pg_pav['Col-0'] = ref
        # Percent of accessions (samples + Col-0) in which each gene is present.
        n_accessions = pg_pav.shape[1]
        pg_pav['presence_perc'] = pg_pav.sum(axis=1) / n_accessions * 100
        core = perc_core(pg_pav, c=100)
        occup = mean_presence(pg_pav)
        rows.append(pd.Series([d, m, core, occup]))

In [None]:
# One row per (min_depth, min_frac) grid point.
pg_stats = (
    pd.concat(rows, axis=1)
    .T
    .set_axis(['min_depth', 'min_frac', 'perc_core', 'occup'], axis='columns')
)

In [None]:
# Cast the depth cutoff to int (the column is used as the colour grouping in the line plots below).
pg_stats['min_depth'] = pg_stats['min_depth'].astype(int)

In [None]:
# Display the grid-search summary table.
pg_stats

In [None]:
# Percent core pan-genes vs. coverage threshold, one line per depth cutoff.
# `colors` is the palette assigned in an earlier plotting cell.
axis_frame = dict(mirror=True, showline=True, linecolor='black')
fig = px.line(
    pg_stats,
    x='min_frac',
    y='perc_core',
    color='min_depth',
    color_discrete_sequence=colors,
)
fig.update_xaxes(title='Coverage threshold (%)', **axis_frame)
fig.update_yaxes(title='Core pan-genes (%)', showgrid=False, **axis_frame)
fig.update_layout(autosize=False, width=800, legend_title_text='Depth threshold')
fig.show()

In [None]:
# Export the current figure (core pan-genes) as figS5c.pdf.
fig5s_c = os.path.join(figs_path, 'figS5c.pdf')
fig.write_image(fig5s_c)

In [None]:
# Overall gene occupancy vs. coverage threshold, one line per depth cutoff.
# `colors` is the palette assigned in an earlier plotting cell.
axis_frame = dict(mirror=True, showline=True, linecolor='black')
fig = px.line(
    pg_stats,
    x='min_frac',
    y='occup',
    color='min_depth',
    color_discrete_sequence=colors,
)
fig.update_xaxes(title='Coverage threshold (%)', **axis_frame)
fig.update_yaxes(title='Overall gene occupancy (%)', showgrid=False, **axis_frame)
fig.update_layout(autosize=False, width=800, legend_title_text='Depth threshold')
fig.show()

In [None]:
# Export the current figure (overall gene occupancy) as figS5d.pdf.
fig5s_d = os.path.join(figs_path, 'figS5d.pdf')
fig.write_image(fig5s_d)

## Effect of sequencing depth
Here we examine the effect of the sequencing depth. Instead of setting an absolute depth cutoff, we set the cutoff as a fraction of the raw sequencing depth.  
The analysis is performed by focusing on one ecotype and examining it across pan-genomes constructed with 10x, 20x, 30x, and 50x data.

In [None]:
# An-1 bedCovHist path for each pan-genome run, keyed by its sequencing depth (x10/x20/x30/x50).
# All entries are generated from one template so they cannot drift apart through copy-paste edits.
bch_dict = {
    depth: os.path.join(
        base_dir,
        "map_to_pan/x%s/RESULT/per_sample/An-1/map_to_pan_ERR3624579/ERR3624579_map_to_pan.bedCovHist" % depth,
    )
    for depth in (10, 20, 30, 50)
}

In [None]:
# Load each depth's bedCovHist table: {sequencing depth: DataFrame}.
bedCovHist_cols = ['chr','start','end','gene','cov','bases','gene_len','frac']
bch_df_dict = {
    depth: pd.read_csv(path, sep='\t', names=bedCovHist_cols)
    for depth, path in bch_dict.items()
}

In [None]:
# Number of genes called present when the depth cutoff is a fraction (dp) of the
# sequencing depth (x) rather than an absolute number of reads.
dp_l = [0.1, 0.3, 0.5, 0.7, 0.9]  # depth cutoff as a fraction of the sequencing depth
m = 0.5                           # min fraction of the gene that must be covered to call it present
res = []
for x in bch_df_dict:
    for dp in dp_l:
        df = pd.DataFrame(bch_df_dict[x].groupby('gene').apply(gene_frac_covered, dp*x))
        df.columns = ['frac_cov']
        # Use the threshold `m` here: previously a query using `m` was evaluated and
        # discarded, and the count used a hard-coded 0.5, so changing `m` had no effect.
        genes_present = df.query('frac_cov >= @m').shape[0]
        res.append(pd.Series([x,dp,genes_present]))

In [None]:
# Turn the list of per-(depth, dp) records into a table with one row per record.
res = (
    pd.concat(res, axis=1)
    .T
    .set_axis(['depth', 'dp', 'n_genes'], axis='columns')
)

In [None]:
# Express the present-gene counts as a percentage of 27960.
# NOTE(review): 27960 is a hard-coded denominator - presumably the total number of
# genes considered; confirm its source and consider deriving it from the data.
res['perc_present'] = res['n_genes']/27960*100

In [None]:
# Percent of genes present vs. sequencing depth, one line per depth-fraction cutoff.
colors = sample_colorscale('jet',5)
axis_frame = dict(mirror=True, showline=True, linecolor='black')
fig = px.line(
    res,
    x='depth',
    y='perc_present',
    color='dp',
    color_discrete_sequence=colors,
)
fig.update_xaxes(title='Mean sequencing depth', **axis_frame)
fig.update_yaxes(title='Genes presence(%)', showgrid=False, **axis_frame)
fig.update_layout(autosize=False, width=800, legend_title_text='Depth fraction threshold')

fig.show()

In [None]:
# Export the current figure (gene presence vs. sequencing depth) as figS5e.pdf.
fig5s_e = os.path.join(figs_path, 'figS5e.pdf')
fig.write_image(fig5s_e)