# Pan-genomes comparison report

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
# Notebook inputs.
# NOTE(review): the "<...>" values look like template placeholders that are
# substituted before the notebook is executed - confirm against the caller.
# Display names of the two pan-genomes (used as labels and as column names
# when looking up the matches table)
pg1_name = "<PG1_NAME>"
pg2_name = "<PG2_NAME>"
# Paths to the PAV tables (read below as tab-separated files, first column as index)
pg1_pav = "<PG1_PAV>"
pg2_pav = "<PG2_PAV>"
# not referenced in the visible part of this notebook
true_pg_pav = "<TRUE_PAV>"
# Paths to the non-reference matches TSVs
pg1_vs_pg2_matches = "<PG1_VS_PG2_NON_REF_MATCHES>"
# the two "vs true" tables are not referenced in the visible part of this notebook
pg1_vs_true_matches = "<PG1_VS_TRUE_NON_REF_MATCHES>"
pg2_vs_true_matches = "<PG2_VS_TRUE_NON_REF_MATCHES>"

In [None]:
# Load both PAV matrices (tab-separated, first column used as the row index)
pg1_pav_df, pg2_pav_df = (
    pd.read_csv(pav_path, sep='\t', index_col=0)
    for pav_path in (pg1_pav, pg2_pav)
)
# Both tables are assumed to carry the same sample names; put PG2's columns
# in PG1's order (raises KeyError if a PG1 sample is missing from PG2)
pg2_pav_df = pg2_pav_df.loc[:, list(pg1_pav_df.columns)]
# Table of matched non-reference pan-genes between the two pan-genomes
pg1_vs_pg2_matches_df = pd.read_csv(pg1_vs_pg2_matches, sep='\t')

In [None]:
# Sample count = number of columns in the PG1 PAV table
n_samples = len(pg1_pav_df.columns)

In [None]:
# Map a pan-gene name onto the naming scheme shared by both pan-genomes
def match_name(row, matches_df, pg_name, other_pg_name, rename):
    """Return the common name for the pan-gene held in ``row.name``.

    Parameters:
        row: a PAV table row; only its ``name`` (the gene name) is used.
        matches_df: matches table with one column per pan-genome name.
        pg_name: column of ``matches_df`` holding this pan-genome's gene names.
        other_pg_name: column holding the matched names in the other pan-genome.
        rename: if true, a matched gene is given the other pan-genome's name;
            otherwise it keeps its own.

    Names that do not start with 'PanGene' are sanitised: every run of
    characters outside ``[0-9a-zA-Z-._]`` is replaced by a single underscore.
    'PanGene' names missing from the matches table get the suffix
    ``__<pg_name>_unmatched``.
    """
    gene = row.name
    if not gene.startswith('PanGene'):
        return re.sub(r'[^0-9a-zA-Z\-\._]+','_',gene)

    hits = matches_df.loc[matches_df[pg_name] == gene]
    if hits.empty:
        return gene + '__' + pg_name + "_unmatched"
    if rename:
        # first match wins when the gene appears more than once
        return hits[other_pg_name].iloc[0]
    return gene

In [None]:
# Build old-name -> new-name series (indexed by the original gene names).
# PG1 genes keep their own names; matched PG2 genes take the PG1 name.
pg1_rename = pg1_pav_df.apply(
    lambda row: match_name(row, pg1_vs_pg2_matches_df, pg1_name, pg2_name, False),
    axis=1,
)
pg2_rename = pg2_pav_df.apply(
    lambda row: match_name(row, pg1_vs_pg2_matches_df, pg2_name, pg1_name, True),
    axis=1,
)

In [None]:
# Replace the gene names in both PAV tables with their common names.
# The rename series are keyed by the original names and act as lookup tables.
# NOTE(review): this cell is presumably not safe to run twice - once the index
# holds the new names, a second lookup against the original names would miss
# for any gene whose name changed. Confirm before re-running.
pg1_pav_df.index = pg1_pav_df.index.map(pg1_rename)
pg2_pav_df.index = pg2_pav_df.index.map(pg2_rename)

In [None]:
# Pan-gene occupancy = row sum of the PAV table; rows summing to zero are dropped
pg1_occupancy = pg1_pav_df.sum(axis=1).loc[lambda occ: occ > 0]
pg2_occupancy = pg2_pav_df.sum(axis=1).loc[lambda occ: occ > 0]

## Basic stats

In [None]:
def stats_from_pav_df(df):
    """Summarise a PAV table as a Series of pan-gene counts.

    Parameters:
        df: PAV DataFrame with gene names as the index and one column per
            sample. Non-reference genes are recognised by the 'PanGene' name
            prefix; unmatched ones additionally end with '_unmatched'.

    Returns:
        pandas Series indexed by statistic name: total, reference,
        non-reference, matched / unmatched non-reference, core (row sum equal
        to the number of samples), shell (row sum strictly between 1 and the
        number of samples) and singletons (row sum equal to 1).
    """
    is_non_ref = df.index.str.startswith('PanGene')
    is_unmatched = is_non_ref & df.index.str.endswith('_unmatched')

    total_pangenes = df.shape[0]
    non_ref_pangenes = int(is_non_ref.sum())
    ref_pangenes = total_pangenes - non_ref_pangenes
    non_ref_unmatched = int(is_unmatched.sum())
    non_ref_matched = non_ref_pangenes - non_ref_unmatched

    n_samples = df.shape[1]
    occup = df.sum(axis=1)
    core = (occup == n_samples).sum()
    # Explicit comparisons instead of Series.between(..., inclusive=False):
    # the boolean form of `inclusive` is rejected by pandas >= 2.0.
    shell = ((occup > 1) & (occup < n_samples)).sum()
    singletons = (occup == 1).sum()

    index = ['Total pan-genes', 'Reference pan-genes', 'Non-reference pan-genes',
             'Matched non-reference pan-genes', 'Unmatched non-reference pan-genes',
             'Core pan-genes', 'Shell pan-genes', 'Singletons']
    values = [total_pangenes, ref_pangenes, non_ref_pangenes, non_ref_matched,
              non_ref_unmatched, core, shell, singletons]
    return pd.Series(values, index=index)

In [None]:
# Side-by-side summary table: one column per pan-genome
pg1_stats, pg2_stats = (stats_from_pav_df(pav) for pav in (pg1_pav_df, pg2_pav_df))
stats_df = pd.concat([pg1_stats, pg2_stats], axis=1, keys=[pg1_name, pg2_name])
stats_df

In [None]:
# Venn diagram of the non-reference ('PanGene'-prefixed) gene names of both pan-genomes
pg1_nonref_genes = set(pg1_pav_df.index[pg1_pav_df.index.str.startswith('PanGene')])
pg2_nonref_genes = set(pg2_pav_df.index[pg2_pav_df.index.str.startswith('PanGene')])
venn2([pg1_nonref_genes, pg2_nonref_genes], set_labels=[pg1_name,pg2_name])
plt.title('Overlap of non-reference genes')
plt.show()

In [None]:
# plot occupancy distributions
# Number of pan-genes at each occupancy value, ordered by occupancy
pg1_occup_counts = pg1_occupancy.value_counts().sort_index()
pg2_occup_counts = pg2_occupancy.value_counts().sort_index()
# Each trace gets its OWN occupancy values as x: the two pan-genomes need not
# have the same set of occupancy levels, and plotly pairs x and y by position,
# so sharing PG1's index would put PG2 counts on the wrong bars.
fig = go.Figure(data=[
    go.Bar(name=pg1_name, x=pg1_occup_counts.index, y=pg1_occup_counts.values),
    go.Bar(name=pg2_name, x=pg2_occup_counts.index, y=pg2_occup_counts.values),
])
# Draw the two pan-genomes as grouped (side-by-side) bars
fig.update_layout(barmode='group', title='Occupancy histogram', xaxis_title="Occupancy", yaxis_title="# of pan-genes")
fig.show()

In [None]:
# Bar chart of the per-sample column sums of each PAV table
pg1_genes_per_acc = pg1_pav_df.sum()
pg2_genes_per_acc = pg2_pav_df.sum()
# PG2's columns were put in PG1's order when the tables were read,
# so one shared x axis serves both traces
x = pg1_genes_per_acc.index
fig = go.Figure(data=[
    go.Bar(name=label, x=x, y=per_acc)
    for label, per_acc in ((pg1_name, pg1_genes_per_acc), (pg2_name, pg2_genes_per_acc))
])
# Grouped (side-by-side) bars
fig.update_layout(
    barmode='group',
    title='Pan-genes per accession',
    xaxis_title="Accession",
    yaxis_title="# of pan-genes",
)
fig.show()

## Discrepancies between pan-genomes

In [None]:
# Add unmatched pan-genes from each PG to the other PG (as absent in all samples)
# this ensures both PGs have the same set of genes
# Take an explicit copy and zero it through .loc rather than writing into
# `df[col].values`: that array is read-only when pandas Copy-on-Write is
# active, and writing through it relies on view semantics.
pg1_unmatched_df = pg1_pav_df.loc[~pg1_pav_df.index.isin(pg2_pav_df.index)].copy()
pg1_unmatched_df.loc[:, :] = 0
pg2_unmatched_df = pg2_pav_df.loc[~pg2_pav_df.index.isin(pg1_pav_df.index)].copy()
pg2_unmatched_df.loc[:, :] = 0

In [None]:
# Extend each PAV table with the other pan-genome's unmatched genes (all-zero rows).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pg1_pav_df_plus_pg2_unmatched = pd.concat([pg1_pav_df, pg2_unmatched_df])
pg2_pav_df_plus_pg1_unmatched = pd.concat([pg2_pav_df, pg1_unmatched_df])

In [None]:
# Sort columns and gene names in both DFs, so the order is identical
accessions = pg1_pav_df_plus_pg2_unmatched.columns.sort_values().tolist()
pg1_pav_df_plus_pg2_unmatched = pg1_pav_df_plus_pg2_unmatched.loc[:, accessions].sort_index()
pg2_pav_df_plus_pg1_unmatched = pg2_pav_df_plus_pg1_unmatched.loc[:, accessions].sort_index()

In [None]:
# Cell-wise difference PG1 - PG2: non-zero cells are discrepancies
pav_diff = pg1_pav_df_plus_pg2_unmatched.sub(pg2_pav_df_plus_pg1_unmatched)

In [None]:
# Two-column lookup tables: original gene name -> common (new) name
pg1_raname_df = pg1_rename.to_frame().reset_index().set_axis(
    [pg1_name + '_orig_name', 'new_name'], axis=1
)
pg2_raname_df = pg2_rename.to_frame().reset_index().set_axis(
    [pg2_name + '_orig_name', 'new_name'], axis=1
)

In [None]:
# create discrepancies table (long format: one row per gene/sample cell)
# Name the index explicitly so reset_index() always yields a 'gene' column,
# whatever the first column of the input PAV files was called.
discrep_df = (
    pav_diff.rename_axis(index='gene')
    .reset_index()
    .melt(id_vars='gene', value_vars=pav_diff.columns,
          var_name='sample', value_name='type')
)
# keep only the cells where the two pan-genomes disagree
discrep_df = discrep_df.loc[discrep_df['type'] != 0]
# add original gene names from each pan-genome; the helper 'new_name' key is
# dropped after each merge so the two merges do not produce suffixed columns
discrep_df = discrep_df.merge(
    pg1_raname_df, how='left', left_on='gene', right_on='new_name'
).drop(columns='new_name')
discrep_df = discrep_df.merge(
    pg2_raname_df, how='left', left_on='gene', right_on='new_name'
).drop(columns='new_name')
discrep_df = discrep_df[['gene', pg1_name + '_orig_name', pg2_name + '_orig_name', 'sample', 'type']]
# print to file
discrep_df.to_csv('discrepancies.tsv', sep='\t', index=False)

In [None]:
# Discrepancy counts for all genes, then separately for ref and non-ref genes
def _tally_discrepancies(diff):
    """Return (non-null cells, non-zero cells, cells == 1, cells == -1) of diff."""
    def _count(mask):
        return mask.astype(int).sum(axis=1).sum()
    return diff.count().sum(), _count(diff != 0), _count(diff == 1), _count(diff == -1)


# non-reference genes are the ones whose name starts with 'PanGene'
pav_diff_ref = pav_diff.loc[~(pav_diff.index.str.startswith('PanGene'))]
pav_diff_nonref = pav_diff.loc[pav_diff.index.str.startswith('PanGene')]

(total_cells, total_discrep,
 in_pg1_not_in_pg2, in_pg2_not_in_pg1) = _tally_discrepancies(pav_diff)
(total_ref_cells, ref_discrep,
 ref_in_pg1_not_in_pg2, ref_in_pg2_not_in_pg1) = _tally_discrepancies(pav_diff_ref)
(total_nonref_cells, nonref_discrep,
 nonref_in_pg1_not_in_pg2, nonref_in_pg2_not_in_pg1) = _tally_discrepancies(pav_diff_nonref)

In [None]:
# Summary table of discrepancies: one row per gene class, one column per measure
ind = ['All', 'Ref', 'Non-ref']
pg1_only_label = "P in %s and A in %s" %(pg1_name,pg2_name)
pg2_only_label = "P in %s and A in %s" %(pg2_name,pg1_name)
discrep_stats_df = pd.DataFrame(
    {
        'Cells': [total_cells, total_ref_cells, total_nonref_cells],
        "Total discrepancies": [total_discrep, ref_discrep, nonref_discrep],
        pg1_only_label: [in_pg1_not_in_pg2, ref_in_pg1_not_in_pg2, nonref_in_pg1_not_in_pg2],
        pg2_only_label: [in_pg2_not_in_pg1, ref_in_pg2_not_in_pg1, nonref_in_pg2_not_in_pg1],
    },
    index=ind,
)
discrep_stats_df

In [None]:
# discrepancies per gene: number of non-zero cells in each row of pav_diff
# (vectorised |diff| row sum instead of a Python-level lambda per row)
discrep_per_gene = pav_diff.abs().sum(axis=1)

In [None]:
# Distribution of the per-gene discrepancy counts
hist_labels = {'value': '# of discrepancies'}
fig = px.histogram(
    discrep_per_gene,
    title="Histogram of discrepancies per pan-gene",
    labels=hist_labels,
)
fig.show()

In [None]:
# discrepancies per gene - non-ref only
# (vectorised |diff| row sum instead of a Python-level lambda per row)
discrep_per_nonref_gene = pav_diff_nonref.abs().sum(axis=1)
fig = px.histogram(discrep_per_nonref_gene, title="Histogram of discrepancies per non-ref pan-gene",
                  labels={'value': '# of discrepancies'})
fig.show()

In [None]:
# occupancy diff
# The occupancy column is stored on the PAV tables themselves, so exclude any
# existing 'occupancy' column from the row sum: re-running this cell would
# otherwise add the previous occupancy into the new one.
pg1_pav_df_plus_pg2_unmatched['occupancy'] = (
    pg1_pav_df_plus_pg2_unmatched.drop(columns='occupancy', errors='ignore').sum(axis=1)
)
pg2_pav_df_plus_pg1_unmatched['occupancy'] = (
    pg2_pav_df_plus_pg1_unmatched.drop(columns='occupancy', errors='ignore').sum(axis=1)
)
# per-gene occupancy in PG1 minus occupancy in PG2
occup_diff = pg1_pav_df_plus_pg2_unmatched['occupancy'] - pg2_pav_df_plus_pg1_unmatched['occupancy']
fig = px.histogram(occup_diff, title="Histogram of occupancy differences",
                  labels={'value': 'Occupancy difference'})
fig.show()

In [None]:
# Occupancy difference (PG1 - PG2) restricted to non-reference pan-genes
nonref_rows = pav_diff.index.str.startswith('PanGene')
pg1_nonref_occupancy = pg1_pav_df_plus_pg2_unmatched.loc[nonref_rows, 'occupancy']
pg2_nonref_occupancy = pg2_pav_df_plus_pg1_unmatched.loc[nonref_rows, 'occupancy']
occup_diff_nonref = pg1_nonref_occupancy - pg2_nonref_occupancy
fig = px.histogram(
    occup_diff_nonref,
    title="Histogram of occupancy differences of non-reference pan-genes",
    labels={'value': 'Occupancy difference'},
)
fig.show()

In [None]:
# occupancy in PG1 vs. occupancy in PG2 (heatmap of row percentages)
# put the two occupancy columns side by side, one row per pan-gene
tmp_df = pd.concat([pg1_pav_df_plus_pg2_unmatched['occupancy'], pg2_pav_df_plus_pg1_unmatched['occupancy']], axis=1)
# helper column so that count() below has something to count
tmp_df['pan-gene'] = tmp_df.index
tmp_df.columns = [pg1_name + ' occupancy', pg2_name + ' occupancy','pan-gene']
# count pan-genes per (PG1 occupancy, PG2 occupancy) pair; unstack(level=0)
# moves PG1 occupancy to the columns, combinations that never occur become 0
tmp_df = tmp_df.groupby([pg1_name + ' occupancy', pg2_name + ' occupancy']).count().unstack(level=0).fillna(0)
# drop the 'pan-gene' level left over from count()
tmp_df.columns = tmp_df.columns.droplevel(0)
# after transposing: rows = PG1 occupancy, columns = PG2 occupancy
tmp_df = tmp_df.transpose()
# convert each row to percentages of its row total
# (label slice 0:n_samples on the columns - presumably covers every PG2
# occupancy value; TODO confirm)
tmp_df.loc[:,0:n_samples] = tmp_df.loc[:,0:n_samples].div(tmp_df.sum(axis=1), axis=0)*100
fig = px.imshow(tmp_df)
fig.show()

In [None]:
# occupancy vs. discrepancies (heatmap of row percentages)
# NOTE: the occupancies used here are pg1_occupancy (the first pan-genome);
# no separate "true" pan-genome table is read in this notebook.
# join='inner' keeps only genes present in both series
tmp_df = pd.concat([pg1_occupancy, discrep_per_gene], axis=1, join='inner')
# helper column so that count() below has something to count
tmp_df['pan-gene'] = tmp_df.index
tmp_df.columns = ['occupancy','discrepancies','pan-gene']
# count pan-genes per (occupancy, discrepancies) pair; unstack(level=0) moves
# occupancy to the columns, combinations that never occur become 0
tmp_df = tmp_df.groupby(['occupancy', 'discrepancies']).count().unstack(level=0).fillna(0)
# drop the 'pan-gene' level left over from count()
tmp_df.columns = tmp_df.columns.droplevel(0)
# after transposing: rows = occupancy, columns = number of discrepancies
tmp_df = tmp_df.transpose()
# convert each row to percentages of its row total
# (label slice 0:n_samples on the columns - presumably covers every
# discrepancy count; TODO confirm)
tmp_df.loc[:,0:n_samples] = tmp_df.loc[:,0:n_samples].div(tmp_df.sum(axis=1), axis=0)*100
fig = px.imshow(tmp_df)
fig.show()