# The effect of assembly quality and sequencing depth
This notebook contains the analysis of the effect of assembly quality and sequencing depth on pan-genome results.  
The analysis mainly consists of comparing several _A. thaliana_ pan-genomes, constructed with either the de novo (DN) or the map-to-pan (MTP) approach, using different data sets with increasing sequencing depth. Results are compared to a pan-genome constructed from high-quality (HQ) genome assemblies.

In [None]:
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
from Bio import SeqIO
from itertools import chain

In [None]:
pio.templates.default = "plotly_white"
colors = ['grey','purple','darkgreen','lightblue','orange']

## Paths

In [None]:
dn_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/de_novo"
mtp_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/map_to_pan"

In [None]:
# de novo pan-genomes
dn_pan_genomes = {
    'HQ-assembly': os.path.join(dn_dir, "HQ_assembly/RESULT"),
    "full_data": os.path.join(dn_dir, "full_data/RESULT"),
    "x50": os.path.join(dn_dir, "x50/RESULT"),
    "x30": os.path.join(dn_dir, "x30/RESULT"),
    "x20": os.path.join(dn_dir, "x20/RESULT"),
    "x10": os.path.join(dn_dir, "x10/RESULT"),
}

In [None]:
# map-to-pan pan-genomes
mtp_pan_genomes = {
    'HQ-assembly': os.path.join(mtp_dir, "HQ_assembly/RESULT"),
    "full_data": os.path.join(mtp_dir, "full_data/RESULT"),
    "x50": os.path.join(mtp_dir, "x50/RESULT"),
    "x30": os.path.join(mtp_dir, "x30/RESULT"),
    "x20": os.path.join(mtp_dir, "x20/RESULT"),
    "x10": os.path.join(mtp_dir, "x10/RESULT"),
}

In [None]:
compare_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/compare_pan_genomes"

In [None]:
# de novo comparison dirs
dn_compare = {
    "full_data": os.path.join(compare_dir, "DN_HQ_asm_vs_DN_full_data/RESULT"),
    "x50": os.path.join(compare_dir, "DN_HQ_asm_vs_DN_x50/RESULT"),
    "x30": os.path.join(compare_dir, "DN_HQ_asm_vs_DN_x30/RESULT"),
    "x20": os.path.join(compare_dir, "DN_HQ_asm_vs_DN_x20/RESULT"),
    "x10": os.path.join(compare_dir, "DN_HQ_asm_vs_DN_x10/RESULT"),
}

In [None]:
# map-to-pan comparison dirs
mtp_compare = {
    "full_data": os.path.join(compare_dir, "MTP_HQ_asm_vs_MTP_full_data/RESULT"),
    "x50": os.path.join(compare_dir, "MTP_HQ_asm_vs_MTP_x50/RESULT"),
    "x30": os.path.join(compare_dir, "MTP_HQ_asm_vs_MTP_x30/RESULT"),
    "x20": os.path.join(compare_dir, "MTP_HQ_asm_vs_MTP_x20/RESULT"),
    "x10": os.path.join(compare_dir, "MTP_HQ_asm_vs_MTP_x10/RESULT"),
}

In [None]:
figs_path = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/figs/FINAL"

## Assembly stats
Assemblies are shared by the DN and MTP pan-genomes, so assembly statistics are presented only once (loaded from the DN results).

In [None]:
# Pan-genome keys ordered by sequencing depth, with the HQ-assembly pan-genome last;
# pg_order[:-1] is therefore used throughout to select the depth-series pan-genomes only.
pg_order = ["x10", "x20", "x30", "x50","full_data", "HQ-assembly"]
# Sample names; also the column order imposed on every PAV matrix when it is loaded.
samples = ['An-1', 'C24', 'Cvi-0', 'Eri', 'Kyo', 'Ler', 'Sha', 'TAIR10']
# Number of samples; used as the occupancy cut-off for calling a gene "Core".
n_samples = len(samples)

In [None]:
dn_pg_asm_stats = {pg: pd.read_csv(os.path.join(dn_pan_genomes[pg],"all_samples/stats/assembly_stats.tsv"),
                                    sep='\t', index_col=0) for pg in pg_order[:-1]}

In [None]:
keep_columns = ['Input bases', 'Clean bases', '# contigs (>= 0 bp)', 'Total length (>= 0 bp)',
               'N50', '% Complete BUSCOs', '% unmapped (Chr0)']
for pg in dn_pg_asm_stats:
    dn_pg_asm_stats[pg] = dn_pg_asm_stats[pg][keep_columns]

In [None]:
def _stat_per_sample_frames(stat_column, value_name):
    """Collect one assembly statistic from every depth-series pan-genome.

    Parameters
    ----------
    stat_column : str
        Column of the assembly stats tables to extract.
    value_name : str
        Name given to the extracted column in the output frames.

    Returns
    -------
    list of pandas.DataFrame
        One long-format frame per pan-genome in ``pg_order[:-1]``, with the
        columns ``sample``, ``value_name`` and ``PG`` ('full_data' is
        displayed as 'Full data').
    """
    frames = []
    for pg in pg_order[:-1]:
        frame = dn_pg_asm_stats[pg][stat_column].to_frame().reset_index()
        frame.columns = ['sample', value_name]
        frame['PG'] = pg.replace('full_data', 'Full data')
        frames.append(frame)
    return frames

# N50
n50_dfs = _stat_per_sample_frames('N50', 'N50')
n50_df = pd.concat(n50_dfs)

# Assembly size
asm_size_dfs = _stat_per_sample_frames('Total length (>= 0 bp)', 'assembly_size')
asm_size_df = pd.concat(asm_size_dfs)

# BUSCOs
busco_dfs = _stat_per_sample_frames('% Complete BUSCOs', 'complete_buscos')
busco_df = pd.concat(busco_dfs)

In [None]:
sample_colors = ['blue','red','green','purple','orange','brown','lightblue','darkgreen']
sample_colors = dict(zip(samples, sample_colors))

In [None]:
n50_df['color'] = n50_df['sample'].map(sample_colors)
asm_size_df['color'] = asm_size_df['sample'].map(sample_colors)
busco_df['color'] = busco_df['sample'].map(sample_colors)

In [None]:
# Per-pan-genome median of each assembly statistic, ordered by increasing depth.
# Only the numeric statistic column is aggregated: the frames also carry the string
# columns 'sample' and 'color', and DataFrame-wide median() raises on those in pandas >= 2.0.
depth_labels = pg_order[:-2] + ['Full data']
n50_median = n50_df.groupby('PG')[['N50']].median().reindex(depth_labels)
asm_size_median = asm_size_df.groupby('PG')[['assembly_size']].median().reindex(depth_labels)
busco_median = busco_df.groupby('PG')[['complete_buscos']].median().reindex(depth_labels)

In [None]:
fig = make_subplots(rows=3, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.05)

for sample in samples[:-1]:
    tmp_df = n50_df.query('sample == @sample')
    fig.add_trace(go.Scatter(x=tmp_df['PG'], y=tmp_df['N50'], marker_color=sample_colors[sample], mode='markers', name=sample), row=1, col=1)
fig.add_trace(go.Scatter(x=n50_median.index, y=n50_median['N50'], mode='lines', line={'color':'black', 'dash':'dash'}, name='Median'), row=1, col=1)

fig.add_trace(go.Scatter(x=asm_size_df['PG'], y=asm_size_df['assembly_size'], marker_color=asm_size_df['color'], mode='markers', showlegend=False), row=2, col=1)
fig.add_trace(go.Scatter(x=asm_size_median.index, y=asm_size_median['assembly_size'], mode='lines', line={'color':'black', 'dash':'dash'}, showlegend=False), row=2, col=1)

fig.add_trace(go.Scatter(x=busco_df['PG'], y=busco_df['complete_buscos'], marker_color=busco_df['color'], mode='markers', showlegend=False), row=3, col=1)
fig.add_trace(go.Scatter(x=busco_median.index, y=busco_median['complete_buscos'], mode='lines', line={'color':'black', 'dash':'dash'}, showlegend=False), row=3, col=1)

fig.update_yaxes(title_text="N50", row=1, col=1)
fig.update_yaxes(title_text="Assembly size", row=2, col=1)
fig.update_yaxes(title_text="% complete <br> BUSCOs", row=3, col=1)
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.update_layout(width=600)

fig.show()

In [None]:
fig3_a = os.path.join(figs_path, 'fig3a.pdf')
fig.write_image(fig3_a)

## Gene PAV analysis
Analyze the effect of sequencing depth and assembly quality on gene PAV calling and nonreference gene detection.  
Each pan-genome was compared to a pan-genome constructed from HQ assemblies using the same pipeline.

### Load and pre-process data

In [None]:
# Load DN PAV matrices: one gene x sample table per pan-genome, indexed by the 'gene' column
dn_pav = {
    pg :
    pd.read_csv(os.path.join(dn_pan_genomes[pg],"all_samples/pan_genome/pan_PAV.tsv"), sep='\t', index_col='gene')
    for pg in dn_pan_genomes
}
# Ensure same samples order in all matrices
for pg in dn_pav:
    # keep only the part of each column header before the first '_' so it matches the names in `samples`
    dn_pav[pg].columns = [s.split('_')[0] for s in dn_pav[pg].columns]
    # select the sample columns in the fixed `samples` order (raises KeyError if a sample is missing)
    dn_pav[pg] = dn_pav[pg][samples]

# Load MTP PAV matrices (same layout and processing as the DN matrices)
mtp_pav = {
    pg :
    pd.read_csv(os.path.join(mtp_pan_genomes[pg],"all_samples/pan_genome/pan_PAV.tsv"), sep='\t', index_col='gene')
    for pg in mtp_pan_genomes
}
# Ensure same samples order in all matrices
for pg in mtp_pav:
    # keep only the part of each column header before the first '_' so it matches the names in `samples`
    mtp_pav[pg].columns = [s.split('_')[0] for s in mtp_pav[pg].columns]
    # select the sample columns in the fixed `samples` order (raises KeyError if a sample is missing)
    mtp_pav[pg] = mtp_pav[pg][samples]

In [None]:
# DN name matching
# Reads the first two columns of each max-weight matches table, names them
# 'orig_name' / 'HQ_name' and indexes the frame by 'orig_name'.
# NOTE(review): dn_nonref_name_match and mtp_nonref_name_match are both re-assigned in the
# "Nonreference gene matching" section further down (same files, read with index_col=0 only)
# and are not used in between, so the frames loaded here appear to be redundant - confirm and remove.
dn_nonref_name_match = {pg:
                        pd.read_csv(dn_compare[pg] + '/A_thaliana_DN_%s_vs_A_thaliana_DN_HQ_asm_max_weight_matches.tsv' % pg,
                                    sep='\t', index_col=0, header=0, usecols=[0,1], names=['orig_name','HQ_name'])
                        for pg in dn_compare
                       }

# MTP name matching (same layout as the DN tables)
mtp_nonref_name_match = {pg:
                        pd.read_csv(mtp_compare[pg] + '/A_thaliana_MTP_%s_vs_A_thaliana_MTP_HQ_asm_max_weight_matches.tsv' % pg,
                                    sep='\t', index_col=0, header=0, usecols=[0,1], names=['orig_name','HQ_name'])
                        for pg in mtp_compare
                       }

### Calculate occupancy and occupancy class
(core, shell, singleton)

In [None]:
def occup_class(occup, core_cut):
    """Return the occupancy class of a gene.

    Parameters
    ----------
    occup : number
        Number of samples in which the gene is present.
    core_cut : number
        Minimal occupancy for a gene to be called 'Core'.

    Returns
    -------
    str
        'Core' when ``occup >= core_cut``; otherwise 'Singleton' for an
        occupancy of exactly 1 and 'Shell' for anything else.
    """
    # The core test comes first, so an occupancy of 1 is 'Core' when core_cut <= 1.
    if occup >= core_cut:
        return 'Core'
    return 'Singleton' if occup == 1 else 'Shell'

In [None]:
def _with_occupancy(pav_df):
    """Return a copy of a PAV matrix annotated with occupancy information.

    The occupancy of a gene is the sum of its calls over the `samples`
    columns. Genes with occupancy 0 are discarded, and the columns
    'occupancy' and 'occup_class' (see `occup_class`, with `n_samples` as the
    core cut-off) are added.

    Summing over `samples` only (rather than over every column) and working
    on an explicit copy keeps this cell safe to re-run and avoids assigning
    columns to a slice of the original frame.
    """
    occupancy = pav_df[samples].sum(axis=1)
    detected = occupancy > 0
    annotated = pav_df.loc[detected].copy()
    annotated['occupancy'] = occupancy[detected]
    annotated['occup_class'] = annotated['occupancy'].map(lambda occ: occup_class(occ, n_samples))
    return annotated

for pg in dn_pav:
    dn_pav[pg] = _with_occupancy(dn_pav[pg])

for pg in mtp_pav:
    mtp_pav[pg] = _with_occupancy(mtp_pav[pg])

### Pan-genome size and composition
Basic stats of the total sizes and occupancy classes of the various PGs

In [None]:
dn_pg_composition = pd.concat([dn_pav[pg]['occup_class'].value_counts().rename(pg).sort_index()
           for pg in pg_order], axis=1).transpose()
dn_pg_composition['Total'] = dn_pg_composition.apply(sum, axis=1)

In [None]:
mtp_pg_composition = pd.concat([mtp_pav[pg]['occup_class'].value_counts().rename(pg).sort_index()
           for pg in pg_order], axis=1).transpose()
mtp_pg_composition['Total'] = mtp_pg_composition.apply(sum, axis=1)

In [None]:
pg_composition = dn_pg_composition.join(mtp_pg_composition, rsuffix='_MTP')
pg_composition.columns = pd.MultiIndex.from_product([['De novo','Map-to-pan'],['Core','Shell','Singletons','Total']])
pg_composition

In [None]:
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    vertical_spacing=0.1,
                   subplot_titles=('De novo', 'Map-to-pan'),
                   y_title="Number of pan-genes")

fig.add_trace(go.Bar(x=dn_pg_composition.index, y=dn_pg_composition['Core'], name='Core', legendrank=3), row=1, col=1)
fig.add_trace(go.Bar(x=dn_pg_composition.index, y=dn_pg_composition['Shell'], name='Shell', legendrank=2), row=1, col=1)
fig.add_trace(go.Bar(x=dn_pg_composition.index, y=dn_pg_composition['Singleton'], name='Singleton', legendrank=1), row=1, col=1)

fig.add_trace(go.Bar(x=mtp_pg_composition.index, y=mtp_pg_composition['Core'], name='Core', showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(x=mtp_pg_composition.index, y=mtp_pg_composition['Shell'], name='Shell', showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(x=mtp_pg_composition.index, y=mtp_pg_composition['Singleton'], name='Singleton', showlegend=False), row=2, col=1)


fig.update_layout(barmode='stack', colorway=colors[2:])
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)

fig.show()

In [None]:
fig3_b = os.path.join(figs_path, 'fig3b.pdf')
fig.write_image(fig3_b)

### Per-sample number of genes
Compare the number of genes detected as present per sample, across pan-genomes (increasing depth).

In [None]:
# generate df
dn_gene_counts = pd.concat([dn_pav[pg][samples].sum() for pg in pg_order], axis=1)
dn_gene_counts.columns = pg_order
mtp_gene_counts = pd.concat([mtp_pav[pg][samples].sum() for pg in pg_order], axis=1)
mtp_gene_counts.columns = pg_order

gene_counts = dn_gene_counts.join(mtp_gene_counts, rsuffix='MTP_')
gene_counts.columns = pd.MultiIndex.from_product([['De novo','Map-to-pan'],pg_order])
gene_counts

In [None]:
dn_gene_counts_melt = dn_gene_counts.reset_index().melt(id_vars='index', value_vars=pg_order)
dn_gene_counts_melt.columns = ['sample','PG','count']
mtp_gene_counts_melt = mtp_gene_counts.reset_index().melt(id_vars='index', value_vars=pg_order)
mtp_gene_counts_melt.columns = ['sample','PG','count']

In [None]:
dn_gene_counts_melt['pipeline'] = 'De novo'
mtp_gene_counts_melt['pipeline'] = 'Map-to-pan'
gene_counts_melt = pd.concat([dn_gene_counts_melt,mtp_gene_counts_melt])

In [None]:
xbase = pd.Series(gene_counts_melt["PG"].unique()).reset_index().rename(columns={"index":"x",0:"PG"})
gene_counts_melt = gene_counts_melt.merge(xbase, on="PG").set_index("pipeline")

In [None]:
#samples_color_map = dict(zip(gene_counts_melt['sample'].unique(), pio.templates['plotly'].layout.colorway[:8]))
gene_counts_melt['color'] = gene_counts_melt.apply(lambda row: sample_colors[row['sample']], axis=1)

pipeline_symbol_map = {'De novo': 'square',
                      'Map-to-pan': 'cross'}
gene_counts_melt['symbol'] = gene_counts_melt.apply(lambda row: pipeline_symbol_map[row.name], axis=1)

In [None]:
# Median number of genes per pan-genome. Only 'count' is aggregated: the melted frames
# also hold the string columns 'sample' and 'pipeline', and DataFrame-wide median()
# raises on those in pandas >= 2.0. The result stays a DataFrame indexed by 'PG'.
dn_gene_counts_median = dn_gene_counts_melt.groupby('PG')[['count']].median()
mtp_gene_counts_median = mtp_gene_counts_melt.groupby('PG')[['count']].median()

In [None]:
dn_gene_counts_median = dn_gene_counts_median.merge(xbase, on='PG').sort_values('x')
mtp_gene_counts_median = mtp_gene_counts_median.merge(xbase, on='PG').sort_values('x')
mtp_gene_counts_median['x'] = mtp_gene_counts_median['x'] + 1/5

In [None]:
fig = go.Figure(
    [
        go.Scatter(
            name=p,
            x=gene_counts_melt.loc[p, "x"] + i/5,
            y=gene_counts_melt.loc[p, "count"],
            text=gene_counts_melt.loc[p, "PG"],
            mode="markers",
            marker={"color": gene_counts_melt.loc[p, "color"], "symbol": gene_counts_melt.loc[p, "symbol"], "size":7},
            hovertemplate="(%{text},%{y})"
        )
        for i, p in enumerate(gene_counts_melt.index.get_level_values("pipeline").unique())
    ]
)
fig.add_trace(go.Scatter(x=dn_gene_counts_median['x'], y=dn_gene_counts_median['count'],
                    mode='lines',
                    name='De novo Median',
                    line={'color':'black','dash':'dash'}))

fig.add_trace(go.Scatter(x=mtp_gene_counts_median['x'], y=mtp_gene_counts_median['count'],
                    mode='lines',
                    name='Map-to-pan Median',
                    line={'color':'darkgrey','dash':'dash'}))

fig.update_layout(xaxis={"tickmode":"array", "tickvals":xbase["x"], "ticktext":xbase["PG"]},
                 yaxis={'title': 'Number of genes'},
                 )

fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)

fig.show()

In [None]:
fig3_c = os.path.join(figs_path, 'fig3c.pdf')
fig.write_image(fig3_c)

### Nonreference gene pool
Analyze the effect of sequencing depth on the number of detected nonreference genes. Compare the nonreference pool of each pan-genome to that of the corresponding HQ pan-genome.

#### Nonreference gene matching
First, rename nonreference pan-genes according to their matching to the HQ pan-genomes.

In [None]:
dn_nonref_name_match = {pg:
                        pd.read_csv(dn_compare[pg] + '/A_thaliana_DN_%s_vs_A_thaliana_DN_HQ_asm_max_weight_matches.tsv' % pg.replace('-','_'), sep='\t', index_col=0)
                        for pg in pg_order[:-1]
                       }

mtp_nonref_name_match = {pg:
                         pd.read_csv(mtp_compare[pg] + '/A_thaliana_MTP_%s_vs_A_thaliana_MTP_HQ_asm_max_weight_matches.tsv' % pg.replace('-','_'), sep='\t', index_col=0)
                         for pg in pg_order[:-1]
                        }

In [None]:
def convert_gene_name(name, conv_df, pg_name):
    """Translate a pan-gene name into the naming of the HQ pan-genome.

    Parameters
    ----------
    name : str
        Original gene name.
    conv_df : pandas.DataFrame
        Conversion table indexed by original gene name; the value in its
        first column is used as the converted name.
    pg_name : str
        Pan-genome label, embedded in the name given to unmatched genes.

    Returns
    -------
    str
        * names starting with 'transcript': the name with ':' replaced by '_';
        * names found in the index of ``conv_df``: the first-column value of that row;
        * anything else: ``'<name>__<pg_name>_unmatched'``.
    """
    if name.startswith('transcript'):
        return name.replace(':','_')
    if name in conv_df.index:
        # Explicit positional access: plain [0] on a label-indexed row relies on a
        # deprecated integer fallback in pandas.
        return conv_df.loc[name].iloc[0]
    return "%s__%s_unmatched" %(name, pg_name)

In [None]:
dn_name_match = {pg:
                 dn_pav[pg].apply(lambda row: convert_gene_name(row.name, dn_nonref_name_match[pg], pg), axis=1)
                 for pg in pg_order[:-1]
                }

mtp_name_match = {pg:
                  mtp_pav[pg].apply(lambda row: convert_gene_name(row.name, mtp_nonref_name_match[pg], pg), axis=1)
                  for pg in pg_order[:-1]
                 }

In [None]:
dn_pav_rename = {pg: dn_pav[pg].set_index(dn_pav[pg].index.map(dn_name_match[pg])) for pg in pg_order[:-1]}
mtp_pav_rename = {pg: mtp_pav[pg].set_index(mtp_pav[pg].index.map(mtp_name_match[pg])) for pg in pg_order[:-1]}

#### Nonreference stats

In [None]:
# How many non-ref genes per PG and how many of these match HQ non-ref
dn_nonref_matches = {pg:
                     pd.read_csv(os.path.join(dn_compare[pg],"A_thaliana_DN_%s_vs_A_thaliana_DN_HQ_asm_max_weight_matches.tsv" % pg.replace('-','_')), sep='\t')
                     for pg in pg_order[:-1]
                    }

mtp_nonref_matches = {pg:
                      pd.read_csv(os.path.join(mtp_compare[pg],"A_thaliana_MTP_%s_vs_A_thaliana_MTP_HQ_asm_max_weight_matches.tsv" % pg.replace('-','_')), sep='\t')
                      for pg in pg_order[:-1]
                     }

In [None]:
def count_nonref(pg1_pav_df, pg2_pav_df, matches_df):
    """Count matched and unmatched nonreference genes of two pan-genomes.

    Nonreference genes are those whose index label starts with 'PanGene';
    the number of matched genes is the number of rows in ``matches_df``.

    Returns
    -------
    tuple
        ``(matched, nonreference in pg1 minus matched, nonreference in pg2 minus matched)``.
    """
    n_matched = matches_df.shape[0]
    n_nonref_pg1, n_nonref_pg2 = (
        frame.index.str.startswith('PanGene').sum()
        for frame in (pg1_pav_df, pg2_pav_df)
    )
    return (n_matched, n_nonref_pg1 - n_matched, n_nonref_pg2 - n_matched)

In [None]:
dn_nonref_counts = {pg:
                    count_nonref(dn_pav[pg], dn_pav['HQ-assembly'], dn_nonref_matches[pg])
                    for pg in pg_order[:-1]
                   }

mtp_nonref_counts = {pg:
                     count_nonref(mtp_pav[pg], mtp_pav['HQ-assembly'], mtp_nonref_matches[pg])
                     for pg in pg_order[:-1]
                    }

In [None]:
dn_nonref_counts_df = pd.DataFrame.from_dict(dn_nonref_counts, orient='index')
dn_nonref_counts_df.columns = ['Matched','PG+|HQ-','PG-|HQ+']
dn_nonref_counts_df['Total nonref'] = dn_nonref_counts_df['Matched'] + dn_nonref_counts_df['PG+|HQ-'] + dn_nonref_counts_df['PG-|HQ+']

mtp_nonref_counts_df = pd.DataFrame.from_dict(mtp_nonref_counts, orient='index')
mtp_nonref_counts_df.columns = ['Matched','PG+|HQ-','PG-|HQ+']
mtp_nonref_counts_df['Total nonref'] = mtp_nonref_counts_df['Matched'] + mtp_nonref_counts_df['PG+|HQ-'] + mtp_nonref_counts_df['PG-|HQ+']

nonref_counts_df = dn_nonref_counts_df.join(mtp_nonref_counts_df, rsuffix='MTP_')
nonref_counts_df.columns = pd.MultiIndex.from_product([['De novo','Map-to-pan'],dn_nonref_counts_df.columns])

nonref_counts_df

In [None]:
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    subplot_titles=("De novo", "Map-to-pan"),
                    vertical_spacing=0.1,
                    y_title="Number of nonreference <br> pan-genes")

# (trace name, position within the count_nonref tuple, legend group, bar colour)
nonref_bar_specs = [
    ('Matched', 0, 'matched', 'darkseagreen'),
    ('PG+|HQ-', 1, 'PG+|HQ-', 'darkmagenta'),
    ('PG-|HQ+', 2, 'PG-|HQ+', 'royalblue'),
]
# Row 1: de novo, row 2: map-to-pan. Only the first row contributes legend entries,
# so each category appears once in the legend.
for subplot_row, counts_by_pg in enumerate([dn_nonref_counts, mtp_nonref_counts], start=1):
    legend_kwargs = {} if subplot_row == 1 else {'showlegend': False}
    for bar_name, tuple_pos, legend_group, bar_color in nonref_bar_specs:
        fig.add_trace(go.Bar(name=bar_name,
                             x=pg_order[:-1],
                             y=[counts_by_pg[pg][tuple_pos] for pg in pg_order[:-1]],
                             legendgroup=legend_group,
                             marker_color=bar_color,
                             **legend_kwargs),
                      row=subplot_row, col=1)

# Change the bar mode
fig.update_layout(barmode='stack')
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.update_layout(height=500)
fig.show()

In [None]:
fig3_d = os.path.join(figs_path, 'fig3d.pdf')
fig.write_image(fig3_d)

#### Unmatched nonreference transcript mapping
To better understand the origin of unmatched nonreference genes, transcripts of such genes were mapped to all  assemblies in the other pan-genome, and the number of transcripts that could not be mapped (95% transcript sequence coverage) to any assembly was calculated.  
It is assumed that unmatched transcripts that could not be mapped originate from the absence of the relevant sequences in the assembly, whereas mapped transcripts indicate another source, e.g. gene duplications or clustering issues.

In [None]:
# Load mapping data
dn_trans_mapping = {pg:[pd.read_csv(os.path.join(dn_compare[pg],"A_thaliana_DN_%s_trans_vs_A_thaliana_DN_HQ_asm_assemblies/transcript_mapping.tsv" % pg.replace('-','_')), sep='\t'),
                             pd.read_csv(os.path.join(dn_compare[pg],"A_thaliana_DN_HQ_asm_trans_vs_A_thaliana_DN_%s_assemblies/transcript_mapping.tsv" % pg.replace('-','_')), sep='\t')]
                        for pg in pg_order[:-1]}

mtp_trans_mapping = {pg: [pd.read_csv(os.path.join(mtp_compare[pg],"A_thaliana_MTP_%s_trans_vs_A_thaliana_MTP_HQ_asm_assemblies/transcript_mapping.tsv" % pg.replace('-','_')), sep='\t'),
                             pd.read_csv(os.path.join(mtp_compare[pg],"A_thaliana_MTP_HQ_asm_trans_vs_A_thaliana_MTP_%s_assemblies/transcript_mapping.tsv" % pg.replace('-','_')), sep='\t')]
                        for pg in pg_order[:-1]}

In [None]:
# Replace ":" in gene names with "_"
for pg in mtp_trans_mapping:
    mtp_trans_mapping[pg][0]['Query_sequence_name'] = mtp_trans_mapping[pg][0]['Query_sequence_name'].apply(lambda x: x.replace(':','_'))
    mtp_trans_mapping[pg][1]['Query_sequence_name'] = mtp_trans_mapping[pg][1]['Query_sequence_name'].apply(lambda x: x.replace(':','_'))

In [None]:
# Generate sets of nonreference gene names
dn_pg_nonref = {pg:
                set(dn_pav[pg].loc[dn_pav[pg].index.str.startswith('PanGene')].index)
                for pg in pg_order
               }

mtp_pg_nonref = {pg:
                set(mtp_pav[pg].loc[mtp_pav[pg].index.str.startswith('PanGene')].index)
                for pg in pg_order
               }

In [None]:
# Sets of unmatched nonreference genes:
# {pg: [nonreference genes of pg absent from the matches table,
#       nonreference genes of the HQ pan-genome absent from the matches table]}
# NOTE(review): assumes columns 1 and 2 of the matches table hold the pan-genome and HQ
# gene names respectively - TODO confirm against the file layout.
dn_unmatched = {pg: [dn_pg_nonref[pg] - set(dn_nonref_matches[pg].iloc[:,1]),
                    dn_pg_nonref['HQ-assembly'] - set(dn_nonref_matches[pg].iloc[:,2])] for pg in pg_order[:-1]}
mtp_unmatched = {pg: [mtp_pg_nonref[pg] - set(mtp_nonref_matches[pg].iloc[:,1]),
                    mtp_pg_nonref['HQ-assembly'] - set(mtp_nonref_matches[pg].iloc[:,2])] for pg in pg_order[:-1]}

# Unmatched transcript mapping: for each pan-genome and each direction i (same order as
# above), keep only the transcript-mapping rows whose query is an unmatched gene.
dn_unmatched_mapping = {pg:
                        [dn_trans_mapping[pg][i].loc[dn_trans_mapping[pg][i]['Query_sequence_name'].isin(dn_unmatched[pg][i])] for i in range(2)]
                        for pg in pg_order[:-1]
                       }
mtp_unmatched_mapping = {pg:
                         [mtp_trans_mapping[pg][i].loc[mtp_trans_mapping[pg][i]['Query_sequence_name'].isin(mtp_unmatched[pg][i])] for i in range(2)]
                         for pg in pg_order[:-1]
                        }

In [None]:
# Minimal score (second ';'-separated field of a mapping cell) for a transcript
# to count as mapped.
c = 0.95

def is_mapped(x):
    """Tell whether a single transcript-mapping cell is a valid hit.

    Parameters
    ----------
    x : str or float
        Mapping cell such as 'chr1:100-500;0.92'; a float (NaN) marks a
        missing hit.

    Returns
    -------
    True when the score is at least `c`, otherwise NaN (so that valid hits
    can be detected with ``isnull``).
    """
    # isinstance (rather than an exact type check) also covers numpy float NaNs
    if isinstance(x, float):
        return np.nan
    score = float(x.split(';')[1])
    return True if score >= c else np.nan

def unmatched_mapping(mapping_df):
    """Count transcripts that are mapped / unmapped across all mapping columns.

    Parameters
    ----------
    mapping_df : pandas.DataFrame
        Table with a 'Query_sequence_name' column; every other column holds
        mapping cells understood by `is_mapped`.

    Returns
    -------
    list
        ``[number of mapped, number of unmapped]``, where a transcript is
        unmapped when none of its cells is a valid hit. Either count may be 0.
    """
    hits = mapping_df.set_index('Query_sequence_name')
    # element-wise conversion, column by column
    hits = hits.apply(lambda col: col.map(is_mapped))
    unmapped = hits.isnull().all(axis=1)
    # Count directly instead of indexing value_counts(), which lacks a key
    # when every transcript falls into the same class.
    n_unmapped = int(unmapped.sum())
    return [len(unmapped) - n_unmapped, n_unmapped]

In [None]:
dn_unmatched_mapping_counts = {pg:
                               [unmatched_mapping(dn_unmatched_mapping[pg][0]), unmatched_mapping(dn_unmatched_mapping[pg][1])]
                               for pg in pg_order[:-1]
                              }

mtp_unmatched_mapping_counts = {pg:
                                [unmatched_mapping(mtp_unmatched_mapping[pg][0]), unmatched_mapping(mtp_unmatched_mapping[pg][1])]
                                for pg in pg_order[:-1]
                               } 

In [None]:
def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list."""
    return list(chain.from_iterable(t))

dn_unmatched_mapping_counts_df = pd.DataFrame.from_dict({k: flatten(dn_unmatched_mapping_counts[k]) for k in dn_unmatched_mapping_counts}, orient='index')
dn_unmatched_mapping_counts_df.columns = ['PG+|HQ- - mapped','PG+|HQ- - unmapped','PG-|HQ+ - mapped','PG-|HQ+ - unmapped']

mtp_unmatched_mapping_counts_df = pd.DataFrame.from_dict({k: flatten(mtp_unmatched_mapping_counts[k]) for k in mtp_unmatched_mapping_counts}, orient='index')
mtp_unmatched_mapping_counts_df.columns = ['PG+|HQ- - mapped','PG+|HQ- - unmapped','PG-|HQ+ - mapped','PG-|HQ+ - unmapped']

unmatched_mapping_counts_df = dn_unmatched_mapping_counts_df.join(mtp_unmatched_mapping_counts_df, rsuffix='MTP_')
unmatched_mapping_counts_df.columns = pd.MultiIndex.from_product([['De novo','Map-to-pan'],dn_unmatched_mapping_counts_df.columns])

unmatched_mapping_counts_df

In [None]:
# Two-level categorical x axis: pan-genome (outer) x discrepancy direction (inner).
# Both levels must have the same length, so both are built from the depth-series
# pan-genomes only (the HQ pan-genome itself is excluded).
# `x` is reused by the Map-to-pan figure in the following cell.
depth_pgs = pg_order[:-1]
x = [
    list(chain(*[[pg]*2 for pg in depth_pgs])),
    ['PG+|HQ-','PG-|HQ+']*len(depth_pgs)
]
# y1: mapped counts, y2: unmapped counts, interleaved per pan-genome in the same
# (PG+|HQ-, PG-|HQ+) order as the inner x level
y1 = list(chain(*[[dn_unmatched_mapping_counts[pg][0][0], dn_unmatched_mapping_counts[pg][1][0]] for pg in depth_pgs]))
y2 = list(chain(*[[dn_unmatched_mapping_counts[pg][0][1], dn_unmatched_mapping_counts[pg][1][1]] for pg in depth_pgs]))
fig = go.Figure()
fig.add_bar(name="Mapped", x=x, y=y1)
fig.add_bar(name="Unmapped", x=x, y=y2)
fig.update_layout(barmode="relative", title_text="De novo")

fig.update_yaxes(title_text="Number of nonreference <br> unmatched genes")
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()

In [None]:
y1 = list(chain(*[[mtp_unmatched_mapping_counts[pg][0][0], mtp_unmatched_mapping_counts[pg][1][0]] for pg in pg_order[:-1]]))
y2 = list(chain(*[[mtp_unmatched_mapping_counts[pg][0][1], mtp_unmatched_mapping_counts[pg][1][1]] for pg in pg_order[:-1]]))
fig = go.Figure()
fig.add_bar(name="Mapped", x=x, y=y1)
fig.add_bar(name="Unmapped", x=x, y=y2)
fig.update_layout(barmode="relative", title_text="Map-to-pan")

fig.update_yaxes(title_text="Number of nonreference <br> unmatched genes")
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()

### PAV discrepancies
Compare PAV tables of each pan-genome to the corresponding HQ-assemblies pan-genome.

In [None]:
# Calculate discrepancies tables

def _pav_discrepancies(pg_pav_df, hq_pav_df):
    """Return the gene x sample PAV discrepancy table of a pan-genome vs. the HQ pan-genome.

    Parameters
    ----------
    pg_pav_df : pandas.DataFrame
        PAV matrix of the tested pan-genome, with genes renamed to their HQ
        names; genes whose name ends with '_unmatched' are ignored.
    hq_pav_df : pandas.DataFrame
        PAV matrix of the HQ pan-genome; must contain every matched gene of
        ``pg_pav_df`` (a KeyError is raised otherwise).

    Returns
    -------
    pandas.DataFrame
        ``pg call - HQ call`` over the `samples` columns, for matched genes
        that are not 'Core' in both pan-genomes.
    """
    # matched genes only
    pg_matched = pg_pav_df.loc[~ pg_pav_df.index.str.endswith('_unmatched')]
    hq_matched = hq_pav_df.loc[pg_matched.index]
    # find which genes are core in both, and leave them out
    pg_core = set(pg_matched.query('occup_class == "Core"').index)
    hq_core = set(hq_matched.query('occup_class == "Core"').index)
    core_in_both = pg_core.intersection(hq_core)
    pg_variable = pg_matched.loc[~pg_matched.index.isin(core_in_both)][samples]
    hq_variable = hq_matched.loc[~hq_matched.index.isin(core_in_both)][samples]
    # find discrepancies
    return pg_variable - hq_variable

# De novo discrepancies
dn_pav_discrep = {pg: _pav_discrepancies(dn_pav_rename[pg], dn_pav['HQ-assembly'])
                  for pg in pg_order[:-1]}

# Map-to-pan discrepancies
# Gene names in the HQ map-to-pan matrix are normalised (':' -> '_') so they agree with
# the renamed matrices. This modifies mtp_pav['HQ-assembly'] in place, as before, but is
# now done once instead of on every loop iteration.
mtp_pav['HQ-assembly'].index = mtp_pav['HQ-assembly'].index.map(lambda x: x.replace(':','_'))
mtp_pav_discrep = {pg: _pav_discrepancies(mtp_pav_rename[pg], mtp_pav['HQ-assembly'])
                   for pg in pg_order[:-1]}

#### Total discrepancies and types
How many discrepancies are there in each PG, and of what type (PG-|HQ+ vs. PG+|HQ-)

In [None]:
def _discrep_type_table(pav_discrep):
    """Summarise PAV discrepancies per pan-genome.

    Parameters
    ----------
    pav_discrep : dict
        ``{pan-genome: discrepancy table}`` where each cell is -1, 0 or 1.

    Returns
    -------
    pandas.DataFrame
        One row per pan-genome in ``pg_order[:-1]`` with the columns
        'PG-|HQ+' (number of -1 cells), 'PG+|HQ-' (number of 1 cells),
        'Total discrepancies', 'Total PA calls' (number of cells) and
        '% Discrepancies'.
    """
    depth_pgs = pg_order[:-1]
    records = []
    for pg in depth_pgs:
        discrep = pav_discrep[pg]
        # Counting cells directly also works when one discrepancy type does not
        # occur at all (count 0), where label-based selection would raise.
        n_pg_absent = int((discrep == -1).sum().sum())
        n_pg_present = int((discrep == 1).sum().sum())
        records.append({
            'PG-|HQ+': n_pg_absent,
            'PG+|HQ-': n_pg_present,
            'Total discrepancies': n_pg_absent + n_pg_present,
            'Total PA calls': discrep.shape[0] * discrep.shape[1],
        })
    table = pd.DataFrame(records, index=depth_pgs,
                         columns=['PG-|HQ+', 'PG+|HQ-', 'Total discrepancies', 'Total PA calls'])
    table['% Discrepancies'] = table['Total discrepancies']/table['Total PA calls']*100
    return table

# DN
dn_discrep_types = _discrep_type_table(dn_pav_discrep)

# MTP
mtp_discrep_types = _discrep_type_table(mtp_pav_discrep)

In [None]:
discrep_types = dn_discrep_types.join(mtp_discrep_types, rsuffix='_MTP')
discrep_types.columns = pd.MultiIndex.from_product([['De novo','Map-to-pan'],dn_discrep_types.columns])
discrep_types

In [None]:
fig = make_subplots(rows=2, cols=1,
                    shared_xaxes=True,
                    subplot_titles=("De novo", "Map-to-pan"),
                   vertical_spacing=0.1,
                   y_title="Number of <br> PAV discrepancies")

fig.add_trace(go.Bar(name='PG-|HQ+', x=pg_order[:-1], y=dn_discrep_types['PG-|HQ+'], marker_color='grey'), row=1, col=1)
fig.add_trace(go.Bar(name='PG+|HQ-', x=pg_order[:-1], y=dn_discrep_types['PG+|HQ-'], marker_color='black'), row=1, col=1)
fig.add_trace(go.Bar(name='PG-|HQ+', x=pg_order[:-1], y=mtp_discrep_types['PG-|HQ+'], marker_color='grey', showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(name='PG+|HQ-', x=pg_order[:-1], y=mtp_discrep_types['PG+|HQ-'], marker_color='black', showlegend=False), row=2, col=1)

# Change the bar mode
fig.update_layout(barmode='stack')
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.update_layout(height=500)
fig.show()

In [None]:
fig3_e = os.path.join(figs_path, 'fig3e.pdf')
fig.write_image(fig3_e)

#### Mapped/unmapped discrepancies
To try and better understand the causes of discrepancies in gene PAV calling between pan-genomes, transcripts of genes detected as present in sample X in PG1, but absent in sample X in PG2 are searched in the genome assembly of sample X in PG2. Discrepancies are classified into "mapped" and "unmapped" based on whether or not they were found in the assembly. This is similar to the mapped/unmapped analysis performed for nonreference unmatched genes, but transcripts are only searched in the relevant assemblies.  
Unmatched genes are ignored in this analysis.

In [None]:
# create melted discrepancies tables
dn_pav_discrep_melt = {pg:
                       dn_pav_discrep[pg].melt(value_vars=dn_pav_discrep[pg].columns, ignore_index=False, var_name='sample', value_name='type').query('type != 0')
                       for pg in pg_order[:-1]
                      }

mtp_pav_discrep_melt = {pg:
                       mtp_pav_discrep[pg].melt(value_vars=mtp_pav_discrep[pg].columns, ignore_index=False, var_name='sample', value_name='type').query('type != 0')
                       for pg in pg_order[:-1]
                      }

In [None]:
# join discrepancies tables with transcript mappings
# {pg: [PG-/HQ+ discrepancies, PG+/HQ- discrepancies]}
# PG-/HQ+ (type -1) rows are joined with trans_mapping[pg][1] (HQ transcripts vs. the
# pan-genome's assemblies); PG+/HQ- (type 1) rows with trans_mapping[pg][0] (the
# pan-genome's transcripts vs. the HQ assemblies).
# merge() defaults to an inner join, so discrepancies whose gene has no row in the
# mapping table are dropped.
# NOTE(review): the discrepancy tables are indexed by HQ-matched gene names, while
# trans_mapping[pg][0] is presumably keyed by the pan-genome's own transcript names -
# confirm that renamed nonreference genes are not silently lost in the type 1 join.
dn_discrep_trans_mapping = {pg :
                            [dn_pav_discrep_melt[pg].query("type == -1").merge(dn_trans_mapping[pg][1], left_on='gene', right_on='Query_sequence_name'),
                            dn_pav_discrep_melt[pg].query("type == 1").merge(dn_trans_mapping[pg][0], left_on='gene', right_on='Query_sequence_name')]
                           for pg in pg_order[:-1]}

mtp_discrep_trans_mapping = {pg :
                            [mtp_pav_discrep_melt[pg].query("type == -1").merge(mtp_trans_mapping[pg][1], left_on='gene', right_on='Query_sequence_name'),
                            mtp_pav_discrep_melt[pg].query("type == 1").merge(mtp_trans_mapping[pg][0], left_on='gene', right_on='Query_sequence_name')]
                           for pg in pg_order[:-1]}

In [None]:
def trans_mapped(row):
    """Decide whether the transcript in a discrepancy row counts as mapped.

    The row's 'sample' entry names the column holding the mapping record for
    that sample. A float in that cell (presumably the NaN left when no mapping
    exists - TODO confirm) means the transcript is not mapped. Otherwise the
    cell is a ';'-separated string whose second field is parsed as a score and
    compared with the notebook-level threshold `c`, which is defined outside
    this cell.

    Returns True when the score is strictly greater than `c`, False otherwise.
    """
    mapping_record = row[row['sample']]
    if type(mapping_record) is float:
        return False
    # second ';'-separated field of the record is the score
    return bool(float(mapping_record.split(';')[1]) > c)

def count_mapped_discrep(df):
    """Count mapped and unmapped discrepancies in a joined discrepancy table.

    Adds (or overwrites) a 'Mapped' column on ``df`` in place, holding the
    result of ``trans_mapped`` for every row, and returns the two totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Discrepancy rows joined with transcript mappings, in the form
        ``trans_mapped`` expects.

    Returns
    -------
    list
        ``[number of mapped rows, number of unmapped rows]``; a category with
        no rows is reported as 0.
    """
    if df.empty:
        # DataFrame.apply(axis=1) on a frame without rows does not yield a
        # boolean Series, so create the empty column explicitly
        df['Mapped'] = pd.Series(index=df.index, dtype=bool)
    else:
        df['Mapped'] = df.apply(trans_mapped, axis=1)
    # Count directly instead of padding value_counts() with Series.append,
    # which was removed in pandas 2.0 and failed whenever one category was missing
    n_mapped = df['Mapped'].sum()
    return [n_mapped, len(df) - n_mapped]

In [None]:
# {pg: [[PG-/HQ+ mapped, PG-/HQ+ unmapped], [PG+/HQ- mapped, PG+/HQ- unmapped] ]}
# note: count_mapped_discrep also adds a 'Mapped' column to each joined table
dn_discrep_trans_mapped_count = {
    pg: [count_mapped_discrep(dn_discrep_trans_mapping[pg][direction]) for direction in (0, 1)]
    for pg in pg_order[:-1]
}

In [None]:
# {pg: [[PG-/HQ+ mapped, PG-/HQ+ unmapped], [PG+/HQ- mapped, PG+/HQ- unmapped] ]}
# note: count_mapped_discrep also adds a 'Mapped' column to each joined table
mtp_discrep_trans_mapped_count = {
    pg: [count_mapped_discrep(mtp_discrep_trans_mapping[pg][direction]) for direction in (0, 1)]
    for pg in pg_order[:-1]
}

In [None]:
def _mapped_count_table(mapped_count, approach_label):
    """Arrange per-pan-genome mapped/unmapped counts as a table.

    One row per pan-genome in pg_order[:-1]; the columns form a three-level
    header (approach label, discrepancy direction, Mapped/Unmapped) and the
    nested [[mapped, unmapped], [mapped, unmapped]] counts are flattened in
    that same order.
    """
    rows = [list(chain(*mapped_count[pg])) for pg in pg_order[:-1]]
    header = pd.MultiIndex.from_product([[approach_label],['PG-/HQ+','PG+/HQ-'],['Mapped','Unmapped']])
    return pd.DataFrame(rows, index=pg_order[:-1], columns=header)

dn_discrep_trans_mapped_count_df = _mapped_count_table(dn_discrep_trans_mapped_count, 'De novo')

mtp_discrep_trans_mapped_count_df = _mapped_count_table(mtp_discrep_trans_mapped_count, 'Map-to-pan')

# side-by-side table of both construction approaches, displayed as the cell output
discrep_trans_mapped_count_df = dn_discrep_trans_mapped_count_df.join(mtp_discrep_trans_mapped_count_df, rsuffix='MTP_')
discrep_trans_mapped_count_df

In [None]:
# Stacked bars of mapped/unmapped discrepancy counts for the de novo pan-genomes.
# Counts are stored as {pg: [[PG-/HQ+ mapped, PG-/HQ+ unmapped], [PG+/HQ- mapped, PG+/HQ- unmapped]]},
# so within each pan-genome the first bar is PG-/HQ+ and the second is PG+/HQ-.
# The inner labels follow that order and are repeated once per plotted pan-genome
# (pg_order[:-1]), keeping both levels of the multi-category axis the same length as y.
x = [
    list(chain(*[[pg]*2 for pg in pg_order[:-1]])),
    ['PG-|HQ+','PG+|HQ-']*len(pg_order[:-1])
]
# y1: mapped counts, y2: unmapped counts, both ordered like x
y1 = list(chain(*[[dn_discrep_trans_mapped_count[pg][0][0], dn_discrep_trans_mapped_count[pg][1][0]] for pg in pg_order[:-1]]))
y2 = list(chain(*[[dn_discrep_trans_mapped_count[pg][0][1], dn_discrep_trans_mapped_count[pg][1][1]] for pg in pg_order[:-1]]))
fig = go.Figure()
fig.add_bar(name="Mapped", x=x, y=y1)
fig.add_bar(name="Unmapped", x=x, y=y2)
# "relative" stacks the Mapped and Unmapped bars on top of each other
fig.update_layout(barmode="relative", title_text="De novo")

fig.update_yaxes(title_text="Number of discrepancies")
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()

In [None]:
# Stacked bars of mapped/unmapped discrepancy counts for the map-to-pan pan-genomes.
# Counts are stored as {pg: [[PG-/HQ+ mapped, PG-/HQ+ unmapped], [PG+/HQ- mapped, PG+/HQ- unmapped]]},
# so within each pan-genome the first bar is PG-/HQ+ and the second is PG+/HQ-.
# The axis labels are built here rather than taken from another cell, so this cell
# does not depend on a leftover `x` and the labels always match the order of y.
x = [
    list(chain(*[[pg]*2 for pg in pg_order[:-1]])),
    ['PG-|HQ+','PG+|HQ-']*len(pg_order[:-1])
]
# y1: mapped counts, y2: unmapped counts, both ordered like x
y1 = list(chain(*[[mtp_discrep_trans_mapped_count[pg][0][0], mtp_discrep_trans_mapped_count[pg][1][0]] for pg in pg_order[:-1]]))
y2 = list(chain(*[[mtp_discrep_trans_mapped_count[pg][0][1], mtp_discrep_trans_mapped_count[pg][1][1]] for pg in pg_order[:-1]]))
fig = go.Figure()
fig.add_bar(name="Mapped", x=x, y=y1)
fig.add_bar(name="Unmapped", x=x, y=y2)
# "relative" stacks the Mapped and Unmapped bars on top of each other
fig.update_layout(barmode="relative", title_text="Map-to-pan")

fig.update_yaxes(title_text="Number of discrepancies")
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()