# De novo orthology clustering inflation parameter
This notebook contains an analysis of the effect of the MCL inflation value used by OrthoFinder. We focus on the effect on the overall pan-genome composition.

In [None]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.colors import n_colors, named_colorscales, sample_colorscale
from scipy.stats import ttest_ind

In [None]:
# Make "plotly_white" the default template for figures created after this cell.
pio.templates.default = "plotly_white"
# Named colors for plotting. The list is not referenced by name in the cells
# visible here; its last three entries match the core / shell / singleton
# trace colors used in the composition plot.
colors = ['grey','purple','darkgreen','lightblue','orange']

## Paths

In [None]:
# Directory from which the per-inflation PAV matrices (PAV_I<value>.tsv) are read.
base_dir = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/output/A_thaliana_pan_genome/analyze_OF_inflation"

In [None]:
# Output directory for the figure PDFs written by this notebook.
figs_path = "/groups/itay_mayrose_nosnap/liorglic/Projects/PGCM/figs/FINAL"

## Extract stats from PAV matrices

In [None]:
def pav_matrix_stats(p):
    """
    Summarize a gene presence/absence variation (PAV) matrix.

    Parameters
    ----------
    p : str or path-like
        Path to a tab-separated PAV matrix. The first column is used as
        the row index (one row per pan-gene) and every remaining column
        is treated as one sample. A "TAIR10" column (the reference) is
        required. Cells are assumed to be 0/1 presence calls - TODO confirm.

    Returns
    -------
    tuple
        ``(pg_size, nonref, perc_core, perc_singleton, perc_occup)``:

        - pg_size: number of rows (pan-genes) in the matrix
        - nonref: number of pan-genes whose "TAIR10" value is 0
        - perc_core: % of pan-genes whose row sum equals the sample count
        - perc_singleton: % of pan-genes whose row sum equals 1
        - perc_occup: sum of all cells as a % of the matrix size

    Raises
    ------
    ValueError
        If the matrix contains no pan-genes, since the percentages
        would be undefined.
    KeyError
        If the matrix has no "TAIR10" column.
    """
    pav_df = pd.read_csv(p, sep='\t', index_col=0)
    pg_size, n_samples = pav_df.shape
    if pg_size == 0:
        # Fail with a clear message instead of dividing by zero below.
        raise ValueError(f"PAV matrix {p!r} contains no pan-genes")

    # Use pandas' vectorized reductions; the built-in sum() would iterate
    # over every element of the Series in Python.
    nonref = int((pav_df["TAIR10"] == 0).sum())
    occup = pav_df.sum(axis=1)  # per pan-gene: number of samples carrying it
    perc_core = float((occup == n_samples).sum() / pg_size * 100)
    perc_singleton = float((occup == 1).sum() / pg_size * 100)
    perc_occup = float(occup.sum() / (pg_size * n_samples) * 100)
    return pg_size, nonref, perc_core, perc_singleton, perc_occup

In [None]:
# Inflation values to evaluate: 21 evenly spaced points from 1.0 to 5.0
# (step 0.2). np.linspace fixes the number of points and includes the
# endpoint exactly, whereas np.arange with a fractional step derives the
# length from floating-point arithmetic and needs a padded stop value.
inflation_vals = np.linspace(1, 5, 21)

In [None]:
# Collect one row of pan-genome statistics per inflation value:
# [inflation value, pan-genome size, nonreference count,
#  % core, % singletons, % overall occupancy].
rows = []
for inflation in inflation_vals:
    # Round to one decimal so float noise in the generated values does not
    # leak into the file name.
    inflation = round(float(inflation), 1)
    # Whole values appear in the file names without a decimal part
    # (e.g. PAV_I2.tsv), fractional ones with it (e.g. PAV_I1.2.tsv).
    if inflation.is_integer():
        inflation = int(inflation)
    pav_path = os.path.join(base_dir, f'PAV_I{inflation}.tsv')
    rows.append([inflation, *pav_matrix_stats(pav_path)])

In [None]:
# Tabulate the collected rows. Column names are given in the same order in
# which the values were appended to each row. Passing them to the constructor
# (rather than assigning .columns afterwards) also yields a correctly
# labelled, empty table when no rows were collected.
inflation_stats_df = pd.DataFrame(
    rows,
    columns=['Inflation value', 'Pan-genome size', 'Nonreference pan-genes',
             "Core pan-genes (%)", "Singletons (%)", "Overall gene occupancy (%)"],
)
# Shell pan-genes are whatever is neither core nor singleton.
inflation_stats_df['Shell pan-genes (%)'] = (
    100
    - inflation_stats_df["Core pan-genes (%)"]
    - inflation_stats_df["Singletons (%)"]
)

In [None]:
# Pan-genome composition (core / shell / singletons, in %) as a function of
# the inflation value.
fig = go.Figure()

# (DataFrame column, legend label, color) for each composition class.
composition_traces = [
    ('Core pan-genes (%)', 'Core', 'darkgreen'),
    ('Shell pan-genes (%)', 'Shell', 'lightblue'),
    ('Singletons (%)', 'Singletons', 'orange'),
]
for column, label, color in composition_traces:
    # go.Scatter is the supported trace type for line plots; go.Line is a
    # deprecated legacy alias.
    fig.add_trace(go.Scatter(
        x=inflation_stats_df['Inflation value'],
        y=inflation_stats_df[column],
        name=label,
        mode='lines+markers',
        marker_color=color,
    ))

# Black, mirrored axis lines on both axes; fixed 0-100 % range and no
# grid lines on the y axis.
fig.update_xaxes(title='Inflation value', mirror=True, showline=True, linecolor='black')
fig.update_yaxes(title='Pan-genome composition (%)', range=[0,100],
                 mirror=True, showline=True, linecolor='black', showgrid=False)

fig.show()

In [None]:
# Write the figure currently bound to `fig` to figS6a.pdf inside figs_path.
fig6s_a = os.path.join(figs_path, 'figS6a.pdf')
fig.write_image(fig6s_a)

In [None]:
# Number of nonreference pan-genes as a function of the inflation value:
# a single black line with markers, black mirrored axis lines on both axes
# and no grid lines on the y axis.
fig = (
    px.line(
        inflation_stats_df,
        x='Inflation value',
        y='Nonreference pan-genes',
        markers=True,
        color_discrete_sequence=['black'],
    )
    .update_xaxes(mirror=True, showline=True, linecolor='black')
    .update_yaxes(mirror=True, showline=True, linecolor='black', showgrid=False)
)
fig.show()

In [None]:
# Write the figure currently bound to `fig` to figS6b.pdf inside figs_path.
fig6s_b = os.path.join(figs_path, 'figS6b.pdf')
fig.write_image(fig6s_b)