# The effect of pan-genome construction approach
This notebook contains the analysis of the effect of construction approach on pan-genome results.  
The analysis mainly consists of comparing three cultivated soybean pan-genomes, constructed with the de novo (DN), the map-to-pan (MTP), or the iterative assembly (IA) approach, all based on the same 50x sequencing data and annotation evidence.

In [None]:
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
from Bio import SeqIO
from itertools import chain

In [None]:
# Styling shared by every figure in this notebook
pio.templates.default = "plotly_white"
# Colour sequence handed to the figures through `colorway`
colors = [
    'grey',
    'purple',
    'darkgreen',
    'lightblue',
    'orange',
]
# Show every column when a data frame is displayed
pd.options.display.max_columns = None

## Paths

In [None]:
# Root directory holding the output of all soybean pan-genome runs
base_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/soybean_pan_genome"
# Result directory of the de novo (DN), map-to-pan (MTP) and
# iterative assembly (IA) pan-genomes, in that order
dn_pg, mtp_pg, ia_pg = (
    os.path.join(base_dir, rel_dir)
    for rel_dir in (
        "de_novo/x50/RESULT_minia",
        "map_to_pan/x50/RESULT_minia",
        "iterative_mapping/x50/RESULT",
    )
)
# Result directory of the DN vs. MTP comparison
compare_dir = os.path.join(base_dir, "compare_pan_genomes/DN_x50_vs_MTP_x50/RESULT_minia")

In [None]:
# PAV table (pan_PAV.tsv) of each pan-genome; it sits at the same
# relative location inside every result directory
dn_pav_tsv, mtp_pav_tsv, ia_pav_tsv = (
    os.path.join(pg_dir, "all_samples/pan_genome/pan_PAV.tsv")
    for pg_dir in (dn_pg, mtp_pg, ia_pg)
)

In [None]:
# Directory into which the figure PDFs produced below are written
figs_path = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/figs/FINAL"

## Basic stats comparison
Extract and compare general stats of the three pan-genomes

In [None]:
# Load the three PAV tables. The first TSV column becomes the row index
# (gene IDs) and every column label is cut at its first underscore.
dn_pg_pav_df, mtp_pg_pav_df, ia_pg_pav_df = (
    pd.read_csv(pav_tsv, sep='\t', index_col=0).rename(columns=lambda col: col.split('_')[0])
    for pav_tsv in (dn_pav_tsv, mtp_pav_tsv, ia_pav_tsv)
)

In [None]:
# Relabel a column called 'TAIR10' as 'Col-0' in each PAV table; other
# columns are untouched.
# NOTE(review): later cells treat 'Wm82' as the reference sample, so this
# rename looks like a leftover that never matches in the soybean data - confirm.
dn_pg_pav_df.rename(columns={'TAIR10': 'Col-0'}, inplace=True)
mtp_pg_pav_df.rename(columns={'TAIR10': 'Col-0'}, inplace=True)
ia_pg_pav_df.rename(columns={'TAIR10': 'Col-0'}, inplace=True)

In [None]:
# Summary statistics of one PAV table
def stats_from_pav_df(df):
    """Summarise a presence/absence (PAV) table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pan-gene (gene IDs as string index), one column per
        sample. Row sums are used as the number of samples a gene occurs in.

    Returns
    -------
    pandas.Series
        Counts of total, reference, non-reference, core, shell and
        singleton pan-genes, in that order.
    """
    n_genes, n_samples = df.shape
    # Non-reference pan-genes are the rows whose ID starts with 'PanGene'
    n_nonref = int(df.index.str.startswith('PanGene').sum())
    occupancy = df.sum(axis=1)
    stats = {
        'Total pan-genes': n_genes,
        'Reference pan-genes': n_genes - n_nonref,
        'Non-reference pan-genes': n_nonref,
        # present in every sample
        'Core pan-genes': (occupancy == n_samples).sum(),
        # present in more than one sample but not in all of them
        'Shell pan-genes': ((occupancy > 1) & (occupancy < n_samples)).sum(),
        # present in exactly one sample
        'Singletons': (occupancy == 1).sum(),
    }
    return pd.Series(stats)

# Summary statistics of the DN, MTP and IA pan-genomes, in that order
dn_pg_stats, mtp_pg_stats, ia_pg_stats = map(
    stats_from_pav_df,
    (dn_pg_pav_df, mtp_pg_pav_df, ia_pg_pav_df),
)

In [None]:
# Side-by-side table: one row per statistic, one column per approach
stats_df = pd.concat(
    [dn_pg_stats, mtp_pg_stats, ia_pg_stats],
    axis=1,
    keys=['De novo', 'Map-to-pan', 'Iterative assembly'],
)
stats_df

In [None]:
# Split the stats into the two groupings plotted below and flip them so
# that rows are approaches and columns are gene categories
ref_nonref = stats_df.loc[['Reference pan-genes', 'Non-reference pan-genes']].T
pg_composition = stats_df.loc[['Core pan-genes', 'Shell pan-genes', 'Singletons']].T

In [None]:
# Two panels sharing one y axis, each with one stacked bar per approach:
# left = reference vs. non-reference pan-genes, right = core / shell / singletons
fig = make_subplots(rows=1, cols=2, shared_yaxes=True)

# (subplot column, source table, [(table column, legend label, legend rank), ...])
panel_specs = [
    (1, ref_nonref, [
        ('Reference pan-genes', 'Reference', 5),
        ('Non-reference pan-genes', 'Nonreference', 4),
    ]),
    (2, pg_composition, [
        ('Core pan-genes', 'Core', 3),
        ('Shell pan-genes', 'Shell', 2),
        ('Singletons', 'Singletons', 1),
    ]),
]
for panel_col, table, bar_specs in panel_specs:
    for column, label, rank in bar_specs:
        bar = go.Bar(x=table.index, y=table[column], name=label, legendrank=rank)
        fig.add_trace(bar, row=1, col=panel_col)

fig.update_layout(barmode='stack', colorway=colors, yaxis_title="Number of pan-genes")
# Black frame around each panel; grid lines of the y axes switched off
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()

In [None]:
# Save the figure built in the previous cell as figS2a.pdf inside figs_path
fig2s_a = os.path.join(figs_path, 'figS2a.pdf')
fig.write_image(fig2s_a)

## Per sample reference/nonreference

In [None]:
# Create table: label every pan-gene by its origin
def _gene_origin(pav_df):
    """Return a Series labelling each gene (row) of ``pav_df``.

    Genes whose ID starts with 'PanGene' are 'Nonreference', all others are
    'Reference'. The result is unnamed and shares the index of ``pav_df``.
    Only the index is inspected, so no row of the table is materialised.
    """
    labels = [
        'Nonreference' if gene_id.startswith('PanGene') else 'Reference'
        for gene_id in pav_df.index
    ]
    return pd.Series(labels, index=pav_df.index)


dn_ref_nonref = _gene_origin(dn_pg_pav_df)
mtp_ref_nonref = _gene_origin(mtp_pg_pav_df)
ia_ref_nonref = _gene_origin(ia_pg_pav_df)

In [None]:
def ref_nonref_per_sample(df, vec):
    """Count, per sample, the present genes that fall in each category of ``vec``.

    Parameters
    ----------
    df : pandas.DataFrame
        PAV table: one row per gene, one column per sample. A value of 1
        marks the gene as present in the sample.
    vec : pandas.Series
        Category label of each gene (e.g. 'Reference' / 'Nonreference'),
        indexed by gene ID. It may be named or unnamed.

    Returns
    -------
    pandas.DataFrame
        Rows are categories, columns are samples. A category that has no
        present gene in a given sample is NaN in that column.
    """
    per_sample = []
    for sample in df.columns:
        sample_pav = df[sample]
        present_genes = sample_pav[sample_pav == 1].index
        # Look the labels up by gene ID. Genes without a label become NaN,
        # which value_counts ignores, so only genes known to both inputs
        # are counted. This does not depend on whether `vec` has a name.
        counts = vec.reindex(present_genes).value_counts()
        counts.name = sample
        per_sample.append(counts)
    return pd.concat(per_sample, axis=1)

In [None]:
# Reference/nonreference gene counts per sample in each pan-genome;
# a category that is missing from a sample counts as 0
dn_ref_nonref_per_sample = ref_nonref_per_sample(dn_pg_pav_df, dn_ref_nonref).fillna(0)
mtp_ref_nonref_per_sample = ref_nonref_per_sample(mtp_pg_pav_df, mtp_ref_nonref).fillna(0)
ia_ref_nonref_per_sample = ref_nonref_per_sample(ia_pg_pav_df, ia_ref_nonref).fillna(0)
# Order columns alphabetically
dn_ref_nonref_per_sample = dn_ref_nonref_per_sample[dn_ref_nonref_per_sample.columns.sort_values()]
mtp_ref_nonref_per_sample = mtp_ref_nonref_per_sample[mtp_ref_nonref_per_sample.columns.sort_values()]
ia_ref_nonref_per_sample = ia_ref_nonref_per_sample[ia_ref_nonref_per_sample.columns.sort_values()]

# Combine the three tables under (sample, approach) column labels. The
# labels come from the tables themselves (concat keys), so a count can
# never end up under the wrong sample or approach.
approach_labels = ['De novo', 'Map-to-pan', 'Iterative assembly']
ref_nonref_per_sample_df = pd.concat(
    [dn_ref_nonref_per_sample, mtp_ref_nonref_per_sample, ia_ref_nonref_per_sample],
    axis=1,
    keys=approach_labels,
).swaplevel(axis=1)
# Samples in alphabetical order with the reference sample (Wm82) last;
# within each sample the approaches follow approach_labels
ref_last_samples = (
    [s for s in dn_ref_nonref_per_sample.columns if s != 'Wm82']
    + [s for s in dn_ref_nonref_per_sample.columns if s == 'Wm82']
)
ref_nonref_per_sample_df = ref_nonref_per_sample_df[
    pd.MultiIndex.from_product([ref_last_samples, approach_labels])
]

# Add _1MTP and _IA suffixes to the column names of the MTP and IA tables
# (kept because the plotting cell strips exactly these suffixes)
mtp_ref_nonref_per_sample.columns = [col + '_1MTP' for col in mtp_ref_nonref_per_sample.columns]
ia_ref_nonref_per_sample.columns = [col + '_IA' for col in ia_ref_nonref_per_sample.columns]

ref_nonref_per_sample_df

In [None]:
# Plot: reshape the per-sample counts so that rows are samples
# Row order used for all three tables: every sample in its current order,
# with the reference sample Wm82 moved to the end
samples_order = [s for s in dn_ref_nonref_per_sample.columns if s != 'Wm82'] + ['Wm82']
dn_ref_nonref_per_sample_t = dn_ref_nonref_per_sample.T.reindex(samples_order)

# The MTP and IA tables carry a suffix in their sample names; strip it so
# that the rows can be matched against samples_order
mtp_ref_nonref_per_sample_t = (
    mtp_ref_nonref_per_sample.T
    .rename(index=lambda s: s.replace('_1MTP', ''))
    .reindex(samples_order)
)

ia_ref_nonref_per_sample_t = (
    ia_ref_nonref_per_sample.T
    .rename(index=lambda s: s.replace('_IA', ''))
    .reindex(samples_order)
)

In [None]:
# Stacked bars of reference/nonreference gene counts: three bars
# (DN, MTP, IA) for every sample
fig = go.Figure()
# Two-level x axis: outer level = sample (repeated once per approach),
# inner level = approach
x = [
    [s for s in dn_ref_nonref_per_sample_t.index for _ in range(3)],
    ['DN', 'MTP', 'IA'] * len(dn_ref_nonref_per_sample_t.index),
]
# Interleave the three tables so the values follow the x axis order:
# sample 1 DN, sample 1 MTP, sample 1 IA, sample 2 DN, ...
y1 = [
    count
    for trio in zip(
        dn_ref_nonref_per_sample_t['Reference'],
        mtp_ref_nonref_per_sample_t['Reference'],
        ia_ref_nonref_per_sample_t['Reference'],
    )
    for count in trio
]
y2 = [
    count
    for trio in zip(
        dn_ref_nonref_per_sample_t['Nonreference'],
        mtp_ref_nonref_per_sample_t['Nonreference'],
        ia_ref_nonref_per_sample_t['Nonreference'],
    )
    for count in trio
]
fig.add_trace(go.Bar(name="Reference", x=x, y=y1, legendrank=2))
fig.add_trace(go.Bar(name="Nonreference", x=x, y=y2, legendrank=1))
fig.update_layout(
    barmode='stack',
    colorway=colors,
    yaxis_title="Number of pan-genes",
    bargap=0.1,
)
# Black frame around the plot; grid lines of the y axis switched off
fig.update_xaxes(mirror=True, showline=True, linecolor='black')
fig.update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
fig.show()

In [None]:
# Save the figure built in the previous cell as figS2b.pdf inside figs_path
fig2s_b = os.path.join(figs_path, 'figS2b.pdf')
fig.write_image(fig2s_b)

## Occupancy analysis

In [None]:
def occup_to_cat(occup, n_samples):
    """Translate an occupancy into its category name.

    Parameters
    ----------
    occup : number
        Number of samples in which the gene is present.
    n_samples : int
        Total number of samples.

    Returns
    -------
    str or None
        "Singleton" when ``occup`` is exactly 1 (checked first, so it also
        wins when ``n_samples`` is 1), "Shell" for any other value below
        ``n_samples`` (0 included), "Core" when ``occup`` equals
        ``n_samples``, and None for anything else.
    """
    if occup == 1:
        return "Singleton"
    if occup < n_samples:
        return "Shell"
    return "Core" if occup == n_samples else None

def occup_categories_per_sample(df):
    """Count, per sample, the present genes in each occupancy category.

    Parameters
    ----------
    df : pandas.DataFrame
        PAV table: one row per gene, one column per sample. A value of 1
        marks the gene as present in the sample.

    Returns
    -------
    pandas.DataFrame
        Rows are the categories produced by ``occup_to_cat``, columns are
        the samples. A category with no present gene in a given sample is
        NaN in that column.
    """
    n_samples = df.shape[1]
    # Category of every gene, derived from the number of samples it occurs in
    categories = df.sum(axis=1).apply(occup_to_cat, args=(n_samples,))

    def count_for(sample):
        calls = df[sample]
        present = calls[calls == 1]
        # Inner join on gene ID keeps only the genes present in this sample;
        # the unnamed category Series ends up under column label 0
        joined = pd.concat([present, categories], axis=1, join='inner')
        return joined[0].value_counts().rename(sample)

    return pd.concat([count_for(sample) for sample in df.columns], axis=1)

In [None]:
# Core/shell/singleton gene counts per sample in each pan-genome
dn_occup_cat_per_sample = occup_categories_per_sample(dn_pg_pav_df)
mtp_occup_cat_per_sample = occup_categories_per_sample(mtp_pg_pav_df)
ia_occup_cat_per_sample = occup_categories_per_sample(ia_pg_pav_df)
# Order columns alphabetically
dn_occup_cat_per_sample = dn_occup_cat_per_sample[dn_occup_cat_per_sample.columns.sort_values()]
mtp_occup_cat_per_sample = mtp_occup_cat_per_sample[mtp_occup_cat_per_sample.columns.sort_values()]
ia_occup_cat_per_sample = ia_occup_cat_per_sample[ia_occup_cat_per_sample.columns.sort_values()]

# Combine the three tables under (sample, approach) column labels. The
# labels come from the tables themselves (concat keys), so a count can
# never end up under the wrong sample or approach.
approach_labels = ['De novo', 'Map-to-pan', 'Iterative assembly']
occup_cat_per_sample_df = pd.concat(
    [dn_occup_cat_per_sample, mtp_occup_cat_per_sample, ia_occup_cat_per_sample],
    axis=1,
    keys=approach_labels,
).swaplevel(axis=1)
# Samples in alphabetical order with the reference sample (Wm82) last;
# within each sample the approaches follow approach_labels
ref_last_samples = (
    [s for s in dn_occup_cat_per_sample.columns if s != 'Wm82']
    + [s for s in dn_occup_cat_per_sample.columns if s == 'Wm82']
)
occup_cat_per_sample_df = occup_cat_per_sample_df[
    pd.MultiIndex.from_product([ref_last_samples, approach_labels])
]

# Add _1MTP and _IA suffixes to the column names of the MTP and IA tables
# (kept so these two tables are left exactly as before)
mtp_occup_cat_per_sample.columns = [col + '_1MTP' for col in mtp_occup_cat_per_sample.columns]
ia_occup_cat_per_sample.columns = [col + '_IA' for col in ia_occup_cat_per_sample.columns]

occup_cat_per_sample_df

## Nonreference gene pool
DN vs. MTP only

In [None]:
# Table pairing nonreference genes of the DN and MTP pan-genomes; built
# with os.path.join like every other path in this notebook
nonref_matched = os.path.join(compare_dir, 'soybean_DN_x50_vs_soybean_MTP_x50_max_weight_matches.tsv')
nonref_matched_df = pd.read_csv(nonref_matched, sep='\t')

In [None]:
# how many matched?
# (rows, columns) of the match table; presumably one row per matched
# DN/MTP gene pair - confirm against the comparison output
nonref_matched_df.shape

## Compare PAV matrices
To compare occupancies and detect presence/absence (PA) discrepancies between the DN and MTP pan-genomes, we focus on reference genes and matched nonreference genes. Genes which are core in both pan-genomes are removed as well.

In [None]:
# remove unmatched genes: keep every reference gene plus the nonreference
# ('PanGene...') genes that are listed in the DN/MTP match table
dn_pg_pav_matched_df = dn_pg_pav_df.loc[
    (~dn_pg_pav_df.index.str.startswith('PanGene'))
    | (dn_pg_pav_df.index.isin(nonref_matched_df['soybean_DN_x50']))
].copy()
mtp_pg_pav_matched_df = mtp_pg_pav_df.loc[
    (~mtp_pg_pav_df.index.str.startswith('PanGene'))
    | (mtp_pg_pav_df.index.isin(nonref_matched_df['soybean_MTP_x50']))
].copy()

# rename MTP matched nonreference to match DN
# Build the MTP -> DN name lookup once; if an MTP gene is listed more than
# once, its first row wins.
mtp_to_dn_name = (
    nonref_matched_df
    .drop_duplicates(subset='soybean_MTP_x50')
    .set_index('soybean_MTP_x50')['soybean_DN_x50']
    .to_dict()
)


def tmp_func(x):
    """Return the DN name of MTP gene ``x``.

    Matched nonreference genes are translated through the match table.
    Every other gene ID is returned with ':' replaced by '_'
    (presumably so the IDs agree with those of the DN table - confirm).
    """
    if x in mtp_to_dn_name:
        return mtp_to_dn_name[x]
    return x.replace(':','_')


mtp_pg_pav_matched_df.index = mtp_pg_pav_matched_df.index.map(tmp_func)
# sort rows and columns of PAV tables to get the same order
dn_pg_pav_matched_df = dn_pg_pav_matched_df.sort_index()
mtp_pg_pav_matched_df = mtp_pg_pav_matched_df.sort_index()
dn_pg_pav_matched_df = dn_pg_pav_matched_df[dn_pg_pav_matched_df.columns.sort_values()]
mtp_pg_pav_matched_df = mtp_pg_pav_matched_df[mtp_pg_pav_matched_df.columns.sort_values()]

In [None]:
# Both matched tables must list the same samples and the same genes in the
# same order before they are compared cell by cell. Index.equals also copes
# with tables of different size, where an element-wise `==` would raise.
assert dn_pg_pav_matched_df.columns.equals(mtp_pg_pav_matched_df.columns), \
    "DN and MTP matched PAV tables have different samples"
assert dn_pg_pav_matched_df.index.equals(mtp_pg_pav_matched_df.index), \
    "DN and MTP matched PAV tables have different genes"

In [None]:
# Calculate occupancies (number of samples in which each gene is present)
dn_pg_matched_occup = dn_pg_pav_matched_df.sum(axis=1)
mtp_pg_matched_occup = mtp_pg_pav_matched_df.sum(axis=1)
# Core sets: genes present in every sample. The number of samples is taken
# from each table instead of being hard-coded.
n_dn_samples = dn_pg_pav_matched_df.shape[1]
n_mtp_samples = mtp_pg_pav_matched_df.shape[1]
dn_pg_matched_core = set(dn_pg_matched_occup.loc[dn_pg_matched_occup == n_dn_samples].index)
mtp_pg_matched_core = set(mtp_pg_matched_occup.loc[mtp_pg_matched_occup == n_mtp_samples].index)
# Core in both DN and MTP
both_matched_core = dn_pg_matched_core.intersection(mtp_pg_matched_core)
print("Number of genes which are core in both DN and MTP: %s" % len(both_matched_core))
print("Out of %s matched genes" % len(dn_pg_matched_occup))

In [None]:
# Drop the genes that are core in both pan-genomes; what remains is
# non-core in at least one of them
dn_is_shared_core = dn_pg_pav_matched_df.index.isin(both_matched_core)
mtp_is_shared_core = mtp_pg_pav_matched_df.index.isin(both_matched_core)
dn_pg_pav_matched_noncore_df = dn_pg_pav_matched_df[~dn_is_shared_core]
mtp_pg_pav_matched_noncore_df = mtp_pg_pav_matched_df[~mtp_is_shared_core]

In [None]:
# Discrepancies table: DN call minus MTP call for every gene and sample
# (0 = same call, 1 = present only in DN, -1 = present only in MTP)
discrep_df = dn_pg_pav_matched_noncore_df.sub(mtp_pg_pav_matched_noncore_df)
# Remove the reference sample Wm82
discrep_df = discrep_df.loc[:, discrep_df.columns != 'Wm82']

In [None]:
# Count discrepancies per gene
def count_discrep_types(row):
    """Tally how often each value occurs in ``row``.

    The values -1, 0 and 1 are always reported, with a count of 0 when they
    do not occur. The result is sorted by value.

    Parameters
    ----------
    row : pandas.Series
        One row of the discrepancy table.

    Returns
    -------
    pandas.Series
        Number of occurrences, indexed by value in ascending order.
    """
    tally = row.value_counts()
    # Outcomes that never occur in this row still get an explicit zero
    absent = [outcome for outcome in (0, -1, 1) if outcome not in tally]
    for outcome in absent:
        tally[outcome] = 0
    return tally.sort_index()

# Apply the tally to every gene (row); the returned Series are expanded into
# columns, which are then renamed in the order the tally is sorted
# (-1, 0, 1): -1 -> 'DN-|MTP+', 0 -> 'match', 1 -> 'DN+|MTP-'
discrep_per_gene = discrep_df.apply(count_discrep_types, axis=1, result_type="expand")
discrep_per_gene.columns = ['DN-|MTP+', 'match', 'DN+|MTP-']

In [None]:
# How many with at least one discrepancy?
# A gene has no discrepancy only when every compared sample is a match, so
# compare against the number of samples in the discrepancy table instead of
# a hard-coded count
n_compared_samples = discrep_df.shape[1]
discrep_per_gene.loc[discrep_per_gene['match'] != n_compared_samples].shape[0]

In [None]:
# Totals over all genes: one number per outcome column
tot_discrep_types = discrep_per_gene.sum(axis=0)
# Every gene/sample pair falls in exactly one outcome column, so the grand
# total is the number of PAV calls that were compared
tot_pav_calls = tot_discrep_types.sum()
print("Total PAV calls: %s" % (tot_pav_calls,))
tot_discrep_types