# The effect of annotation evidence
This notebook contains the analysis of the effect of annotation evidence on pan-genome results.  
Specifically, we compare pan-genomes constructed from the same data, differing only in the annotation evidence used:  
1) No evidence (liftover + ab-initio only)
2) Standard evidence
3) High quality (HQ) evidence

In [None]:
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
from intervaltree import Interval, IntervalTree
from itertools import chain

In [None]:
# Use the "plotly_white" template as the default for all plotly figures
pio.templates.default = "plotly_white"

In [None]:
# Pan-genome identifiers (no / standard / high-quality evidence), in display order
pg_order = ['no_ev', 'normal_ev', 'HQ_ev']
# Sample names; used to select and order the columns of the PAV matrices
samples = ['An-1', 'C24', 'Cvi-0', 'Eri', 'Kyo', 'Ler', 'Sha', 'TAIR10']
# Number of samples; used as the occupancy cutoff for core genes
n_samples = len(samples)

## Paths
Paths to the directories containing the pan-genome analysis results

In [None]:
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere
# Base directory of the de novo pan-genome runs
dn_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/de_novo"
# Base directory of the map-to-pan pan-genome runs
mtp_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/map_to_pan"

In [None]:
# RESULT sub-path of each run, keyed by pan-genome identifier
_result_subdirs = {
    'no_ev': "x50_no_ev/RESULT",
    'normal_ev': "x50/RESULT",
    'HQ_ev': "x50_HQ_ev/RESULT",
}

# de novo pan-genomes: identifier -> RESULT directory
dn_pan_genomes = {
    pg: os.path.join(dn_dir, subdir) for pg, subdir in _result_subdirs.items()
}

# map-to-pan pan-genomes: identifier -> RESULT directory
mtp_pan_genomes = {
    pg: os.path.join(mtp_dir, subdir) for pg, subdir in _result_subdirs.items()
}

In [None]:
# Directory of the final figures — presumably where plots are saved; TODO confirm against the cells that use it
# NOTE(review): absolute, machine-specific path
figs_path = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/figs/FINAL"

## Load and preprocess data

### PAV matrices

In [None]:
def _load_pav(result_dir):
    """Load the PAV matrix of one pan-genome.

    Reads all_samples/pan_genome/pan_PAV.tsv under `result_dir` with the
    'gene' column as index, truncates every column name at its first '_',
    and returns only the columns listed in `samples`, in that order.
    """
    pav = pd.read_csv(os.path.join(result_dir, "all_samples/pan_genome/pan_PAV.tsv"), sep='\t', index_col='gene')
    pav.columns = pav.columns.map(lambda x: x.split('_')[0])
    # return an independent copy so later cells can add columns to it
    # without triggering SettingWithCopyWarning
    return pav[samples].copy()

# de novo
dn_pav = {pg: _load_pav(dn_pan_genomes[pg]) for pg in dn_pan_genomes}

# map-to-pan
# (the original loop re-subset dn_pav here, leaving the map-to-pan matrices
# unrestricted; they are now restricted to `samples` like the de novo ones)
mtp_pav = {pg: _load_pav(mtp_pan_genomes[pg]) for pg in mtp_pan_genomes}

# replace ':' with '_' in the map-to-pan gene IDs
for pg in mtp_pav:
    mtp_pav[pg].index = mtp_pav[pg].index.str.replace(':','_')

### Calculate occupancy and occupancy class
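Occupancy is the number of samples in which a gene is present. Each gene is then classified as core (present in all samples), singleton (present in exactly one sample) or shell (everything in between).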

In [None]:
def occup_class(occup, core_cut):
    """Classify a gene by its occupancy.

    occup: occupancy of the gene (number of samples in which it is present)
    core_cut: minimal occupancy for a gene to be called core

    Returns 'Core' when occup >= core_cut; otherwise 'Singleton' when
    occup == 1, and 'Shell' in every other case.
    """
    # the core check comes first, so occupancy 1 is 'Core' when core_cut <= 1
    if occup >= core_cut:
        return 'Core'
    return 'Singleton' if occup == 1 else 'Shell'

In [None]:
for pg in dn_pav:
    pav = dn_pav[pg]
    # calculate occupancy over the sample columns only, so that re-running
    # this cell does not fold the columns added below into the sum.
    # skipna=False: a gene with a missing call gets NaN occupancy and is
    # dropped by the filter below (same outcome as a plain Python sum)
    occupancy = pav[samples].sum(axis=1, skipna=False)
    # discard genes with occupancy 0; .copy() gives an independent frame so
    # adding a column does not raise SettingWithCopyWarning
    pav = pav.assign(occupancy=occupancy).query('occupancy > 0').copy()
    # occupancy class (core cutoff = number of samples)
    pav['occup_class'] = pav['occupancy'].map(lambda occup: occup_class(occup, n_samples))
    dn_pav[pg] = pav

In [None]:
for pg in mtp_pav:
    pav = mtp_pav[pg]
    # calculate occupancy over the sample columns only: this keeps the count
    # consistent with the n_samples core cutoff and makes the cell safe to
    # re-run (the columns added below are never folded into the sum).
    # skipna=False: a gene with a missing call gets NaN occupancy and is
    # dropped by the filter below (same outcome as a plain Python sum)
    occupancy = pav[samples].sum(axis=1, skipna=False)
    # discard genes with occupancy 0; .copy() gives an independent frame so
    # adding a column does not raise SettingWithCopyWarning
    pav = pav.assign(occupancy=occupancy).query('occupancy > 0').copy()
    # occupancy class (core cutoff = number of samples)
    pav['occup_class'] = pav['occupancy'].map(lambda occup: occup_class(occup, n_samples))
    mtp_pav[pg] = pav

## Genes per sample

In [None]:
# Per pan-genome: table of samples x (Nonreference, Reference) gene counts
dn_gene_counts = {}
for pg in pg_order:
    pav = dn_pav[pg]
    per_sample = []
    for sample in samples:
        calls = pav[sample]
        # genes present in this sample
        present = calls[calls.eq(1)]
        # gene IDs starting with 'PanGene' are counted as nonreference
        is_nonref = pd.Series(present.index.str.startswith('PanGene'))
        counts = (
            is_nonref
            .map({False: 'Reference', True: 'Nonreference'})
            .value_counts()
            .sort_index()
            .rename(sample)
        )
        per_sample.append(counts)
    # one column per sample -> transpose to one row per sample
    dn_gene_counts[pg] = pd.concat(per_sample, axis=1).T

In [None]:
# Side-by-side table of the de novo counts: one column pair per evidence level.
# The labels below assume each per-PG table has its columns in the order
# Nonreference, Reference.
evidence_labels = ['No-evidence','Standard evidence','HQ evidence']
origin_labels = ['Nonreference','Reference']

dn_counts_in_order = [dn_gene_counts[pg] for pg in pg_order]
dn_gene_counts_df = pd.concat(dn_counts_in_order, axis=1)
dn_gene_counts_df.columns = pd.MultiIndex.from_product([evidence_labels, origin_labels])
dn_gene_counts_df

In [None]:
# Per pan-genome: table of samples x (Nonreference, Reference) gene counts
mtp_gene_counts = {}
for pg in pg_order:
    pav = mtp_pav[pg]
    per_sample = []
    for sample in samples:
        calls = pav[sample]
        # genes present in this sample
        present = calls[calls.eq(1)]
        # gene IDs starting with 'PanGene' are counted as nonreference
        is_nonref = pd.Series(present.index.str.startswith('PanGene'))
        counts = (
            is_nonref
            .map({False: 'Reference', True: 'Nonreference'})
            .value_counts()
            .sort_index()
            .rename(sample)
        )
        per_sample.append(counts)
    # one column per sample -> transpose to one row per sample
    mtp_gene_counts[pg] = pd.concat(per_sample, axis=1).T

In [None]:
# Side-by-side table of the map-to-pan counts: one column pair per evidence level.
# The labels below assume each per-PG table has its columns in the order
# Nonreference, Reference.
evidence_labels = ['No-evidence','Standard evidence','HQ evidence']
origin_labels = ['Nonreference','Reference']

mtp_counts_in_order = [mtp_gene_counts[pg] for pg in pg_order]
mtp_gene_counts_df = pd.concat(mtp_counts_in_order, axis=1)
mtp_gene_counts_df.columns = pd.MultiIndex.from_product([evidence_labels, origin_labels])
mtp_gene_counts_df

In [None]:
def _nonref_counts(gene_counts_df, pipeline):
    """Extract the nonreference gene counts of one pipeline.

    Returns a (wide, long) pair. `wide` has one row per sample and one
    column per evidence level, with rows containing a missing count dropped.
    `long` is its melted form with columns sample, PG, genes and pipeline.
    """
    evidence_levels = ['No-evidence','Standard evidence','HQ evidence']
    wide = gene_counts_df[[(level, 'Nonreference') for level in evidence_levels]]
    wide.columns = evidence_levels
    wide = wide.dropna()
    long = wide.reset_index().melt(id_vars='index', value_vars=evidence_levels)
    long.columns = ['sample','PG','genes']
    long['pipeline'] = pipeline
    return wide, long

dn_nonref_counts, dn_nonref_counts_melt = _nonref_counts(dn_gene_counts_df, 'De novo')
mtp_nonref_counts, mtp_nonref_counts_melt = _nonref_counts(mtp_gene_counts_df, 'Map-to-pan')

# long-format counts of both pipelines stacked into one table
nonref_counts_melt = pd.concat([dn_nonref_counts_melt, mtp_nonref_counts_melt])

In [None]:
# one integer x position per PG label, numbered in order of first appearance
pg_labels = nonref_counts_melt["PG"].unique()
xbase = pd.DataFrame({"x": range(len(pg_labels)), "PG": pg_labels})
# attach the x position to every row and index the table by pipeline
# NOTE: this rebinds nonref_counts_melt, so the cell cannot be re-run on its own
nonref_counts_melt = nonref_counts_melt.merge(xbase, on="PG").set_index("pipeline")

In [None]:
# one fixed marker colour per sample
sample_colors = dict(zip(
    samples,
    ['blue','red','green','purple','orange','brown','lightblue','darkgreen'],
))
nonref_counts_melt['color'] = [sample_colors[s] for s in nonref_counts_melt['sample']]

# one marker symbol per pipeline (the pipeline is the index of nonref_counts_melt)
pipeline_symbol_map = {'De novo': 'square',
                      'Map-to-pan': 'cross'}
nonref_counts_melt['symbol'] = [pipeline_symbol_map[p] for p in nonref_counts_melt.index]

In [None]:
# One scatter trace per pipeline; the i-th pipeline is shifted by i/5 along x
pipelines = nonref_counts_melt.index.get_level_values("pipeline").unique()
traces = []
for shift, pipeline in enumerate(pipelines):
    rows = nonref_counts_melt.loc[pipeline]
    traces.append(
        go.Scatter(
            name=pipeline,
            x=rows["x"] + shift/5,
            y=rows["genes"],
            text=rows["PG"],
            mode="markers",
            marker={"color": rows["color"], "symbol": rows["symbol"], "size":7},
            hovertemplate="(%{text},%{y})"
        )
    )
fig = go.Figure(traces)

# x ticks are labelled with the PG names; fixed figure width
fig.update_layout(
    xaxis={"tickmode":"array", "tickvals":xbase["x"], "ticktext":xbase["PG"]},
    yaxis={'title': 'Number of genes'},
    autosize=False,
    width=500,
)

# black frame on both axes, no grid or zero line
axis_frame = dict(mirror=True, showline=True, linecolor='black', showgrid=False, zeroline=False)
fig.update_xaxes(**axis_frame)
fig.update_yaxes(**axis_frame)

fig.show()

## Pan-genome size and composition
Basic statistics on the total size and occupancy-class composition of each pan-genome (PG)

In [None]:
# Number of genes per occupancy class in each de novo pan-genome.
# The columns are fixed to the three classes and missing counts become 0,
# so a class that is absent from a pan-genome neither turns 'Total' into
# NaN nor drops a column.
dn_pg_composition = (
    pd.concat([dn_pav[pg]['occup_class'].value_counts().rename(pg) for pg in pg_order], axis=1)
    .transpose()
    .reindex(columns=['Core', 'Shell', 'Singleton'])
    .fillna(0)
    .astype('int64')
)
dn_pg_composition['Total'] = dn_pg_composition.sum(axis=1)
dn_pg_composition.index = ['No-evidence','Standard evidence', 'HQ evidence']

In [None]:
# Number of genes per occupancy class in each map-to-pan pan-genome.
# The columns are fixed to the three classes and missing counts become 0,
# so a class that is absent from a pan-genome neither turns 'Total' into
# NaN nor drops a column.
mtp_pg_composition = (
    pd.concat([mtp_pav[pg]['occup_class'].value_counts().rename(pg) for pg in pg_order], axis=1)
    .transpose()
    .reindex(columns=['Core', 'Shell', 'Singleton'])
    .fillna(0)
    .astype('int64')
)
mtp_pg_composition['Total'] = mtp_pg_composition.sum(axis=1)
mtp_pg_composition.index = ['No-evidence','Standard evidence', 'HQ evidence']

In [None]:
# de novo columns first, then the map-to-pan ones (suffixed to avoid name clashes)
pg_composition = dn_pg_composition.join(mtp_pg_composition, rsuffix='_MTP')

# relabel the columns as pipeline x occupancy class
pipeline_labels = ['De novo','Map-to-pan']
class_labels = ['Core','Shell','Singletons','Total']
pg_composition.columns = pd.MultiIndex.from_product([pipeline_labels, class_labels])
pg_composition

In [None]:
# Colour palette; the stacked-bar figure uses colors[2:] as its colorway
colors = ['grey','purple','darkgreen','lightblue','orange']

In [None]:
# Stacked bars of the occupancy classes: de novo on top, map-to-pan below
bar_classes = ['Core', 'Shell', 'Singleton']

fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=('De novo', 'Map-to-pan'),
    y_title="Number of pan-genes",
)

# top panel: these traces carry the legend entries (legendrank 3, 2, 1)
for rank, occ_class in zip((3, 2, 1), bar_classes):
    fig.add_trace(
        go.Bar(x=dn_pg_composition.index, y=dn_pg_composition[occ_class], name=occ_class, legendrank=rank),
        row=1, col=1,
    )

# bottom panel: same classes, hidden from the legend
for occ_class in bar_classes:
    fig.add_trace(
        go.Bar(x=mtp_pg_composition.index, y=mtp_pg_composition[occ_class], name=occ_class, showlegend=False),
        row=2, col=1,
    )

fig.update_layout(barmode='stack', colorway=colors[2:])

# black frame on every axis; no horizontal grid lines
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)

fig.show()