In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Annotation stats

This notebook contains all calculations for the statistics mentioned on the annotation status of the pVOGs.

Processing of all the annotation terms found on the database is based on the script `process_annotations.py`.
The purpose is to arrive at a single informative annotation per pVOG.
If that is not possible, the annotation status is set to unknown.

### How many pVOGs have a meaningful annotation in the database?

In [14]:
# Read in the result of the annotation processing step
ann_df = pd.read_csv(snakemake.input.annotations_tsv,
                    sep="\t")

# Sanity check - how many pVOGs are in there
all_pvogs = ann_df.shape[0]

# How many pVOGs have been processed to be 'unknown'
unknowns = (ann_df['annotation_processed'] == 'unknown').sum()

# Scale to percent *before* rounding: `round(x, 3) * 100` re-introduces
# floating point noise after the rounding (e.g. 0.29 * 100 prints as
# 28.999999999999996).
print(
    "pVOGs with processed annotation value set to 'unknown' : {} / {} ({} %)"
    .format(unknowns,
            all_pvogs,
            round(100 * unknowns / all_pvogs, 1)))

In [17]:
# Load the table holding the final predictions
predictions = pd.read_csv(snakemake.input.final_table_tsv, sep="\t")

# Every interaction is encoded as the string "pvog1-pvog2".
# Break each string up once and store both members as separate
# columns of the dataframe.
interaction_parts = [pair.split('-') for pair in predictions['interaction'].values]
pvog1 = [parts[0] for parts in interaction_parts]
pvog2 = [parts[1] for parts in interaction_parts]
predictions['pvog1'] = pvog1
predictions['pvog2'] = pvog2

###  How many unique pvogs are covered in this dataset?

In [23]:
# Every pVOG that shows up on either side of an interaction
unique_pvogs = set(predictions['pvog1'].values) | set(predictions['pvog2'].values)

# Scale to percent before rounding so no floating point noise
# (e.g. 28.999999999999996) ends up in the printed value.
print("pVOGs used for prediction : {} / {} ({} %)".
      format(len(unique_pvogs),
             all_pvogs,
             round(100 * len(unique_pvogs) / all_pvogs, 1))
     )

### How many are interacting/associated?

In [34]:
# Keep only the rows whose label equals 1.0 (the positive predictions)
positive_predictions = predictions.loc[predictions['label'].eq(1.0)]

print("All positive predictions: {}".format(len(positive_predictions)))

### How many of the positive interactions have a probability higher than or equal to 0.65 (arbitrary high-confidence cutoff)?

In [36]:
# Arbitrary probability cutoff for calling a positive prediction
# "high confidence". Kept in one place so the filter and the printed
# message cannot drift apart.
high_confidence_cutoff = 0.65

high_confidence = positive_predictions.loc[
    positive_predictions['proba'] >= high_confidence_cutoff]

# Scale to percent before rounding to avoid floating point noise
# (e.g. 28.999999999999996) in the printed value.
print('High confidence (proba >= {}) : {} ({} % )'
      .format(high_confidence_cutoff,
              high_confidence.shape[0],
              round(100 * high_confidence.shape[0] / positive_predictions.shape[0], 1)))

### How many of the positive interactions occur between pairs where both pVOGs are annotated?

In [40]:
# Positive pairs where neither processed annotation is 'unknown'
both_known = positive_predictions.loc[
    positive_predictions['pvog1_annotation_processed'].ne('unknown') &
    positive_predictions['pvog2_annotation_processed'].ne('unknown')]

# Scale to percent before rounding to avoid floating point noise
# (e.g. 28.999999999999996) in the printed value.
print("Number of positive interactions with both pVOGs annotated : {} ({} %)"
      .format(both_known.shape[0],
              round(100 * both_known.shape[0] / positive_predictions.shape[0], 1)))


### TOP 20 - Both annotated

In [64]:
# Columns to display for the highest-probability pairs
top_columns = ['pvog1',
               'pvog2',
               'proba',
               'pvog1_annotation_raw',
               'pvog2_annotation_raw']

# Rank by probability (highest first) and show the first 20 rows
both_known.sort_values(by='proba', ascending=False).loc[:, top_columns].head(20)

### How many of the positive interactions occur between pairs where exactly one pVOG is annotated?

In [41]:
# Positive pairs where exactly ONE of the two pVOGs is annotated.
# XOR of the two "is annotated" masks is True only when they differ,
# which is the same selection as (known & unknown) | (unknown & known).
one_unknown = positive_predictions.loc[
    positive_predictions['pvog1_annotation_processed'].ne('unknown')
    ^ positive_predictions['pvog2_annotation_processed'].ne('unknown')
]

# The message says "exactly one" because pairs with both pVOGs annotated
# are excluded by the mask above.
# Scale to percent before rounding to avoid floating point noise.
print("Number of positive interactions with exactly one pVOG annotated : {} ({} %)"
      .format(one_unknown.shape[0],
              round(100 * one_unknown.shape[0] / positive_predictions.shape[0], 1)))

### How many of the positive interactions occur between pairs where neither pVOG is annotated?

In [45]:
# Positive pairs where both processed annotations are 'unknown'
both_unknown = positive_predictions.loc[
    positive_predictions['pvog1_annotation_processed'].eq('unknown') &
    positive_predictions['pvog2_annotation_processed'].eq('unknown')
]

# Scale to percent before rounding to avoid floating point noise
# (e.g. 28.999999999999996) in the printed value.
print("Number of interactions neither pVOG annotated : {} ({} %)"
      .format(both_unknown.shape[0],
              round(100 * both_unknown.shape[0] / positive_predictions.shape[0], 1)))

## A picture is worth a thousand words - some say...

In [46]:
# Put the steps from above in a figure.
# Take an explicit copy of the column subset: a new column is added to
# `df` further down, and assigning into a slice of `predictions` would
# trigger pandas' SettingWithCopyWarning.
df = predictions[['interaction', 'label', 'proba',
                'pvog1_annotation_processed',
                'pvog2_annotation_processed']].copy()

In [47]:
# Approach taken from https://stackoverflow.com/a/60244752
# Build the three boolean masks that are going to be plotted;
# the resulting categories serve as hue for sns.displot()

# True where the processed annotation of the respective pVOG is not 'unknown'
first_known = df['pvog1_annotation_processed'].ne('unknown')
second_known = df['pvog2_annotation_processed'].ne('unknown')

conditions = [
    # Both unknown
    ~first_known & ~second_known,
    # One known (the two masks disagree)
    first_known ^ second_known,
    # Both known
    first_known & second_known,
]

# The label to fill in for a row, depending on which
# of the conditions above holds for it.
# e.g. (False, True, False) will give 'One known'
choices = ['Both unknown', 'One known', 'Both known']

In [49]:
# Fill in the values in an ann_status column.
# - `default` must be a string: the implicit default of np.select is the
#   integer 0, which cannot be combined with string choices on recent
#   NumPy versions (TypeError: no common dtype). The three conditions
#   cover every row, so the default label is not expected to show up.
# - assign() builds a new frame instead of writing into what may be a
#   slice of `predictions`, so no SettingWithCopyWarning is raised and
#   re-running the cell gives the same result.
df = df.assign(ann_status=np.select(conditions, choices, default='Undefined'))

In [52]:
# Stacked histogram of the prediction probabilities, coloured by
# annotation status.
# The palette is passed to displot() directly: a bare
# sns.color_palette("colorblind") call only *returns* the palette and
# does not change the colours used for plotting.
g = sns.displot(df,
                x='proba',
                hue='ann_status',
                kind='hist',
                multiple='stack',
                palette='colorblind',
                height=5,
                aspect=2,
                facet_kws={'legend_out': False}
               )
g.set_xlabels("Probability")
g.set_ylabels("No. of predictions")
g._legend.set_title("Annotation status")
# One tick every 0.1 over the [0, 1] probability range
g.ax.set_xticks(np.arange(0.0, 1.1, 0.1))
# Write the figure to the first output declared by the Snakemake rule
g.savefig(snakemake.output[0], dpi=600)

In [53]:
# And what does the density look like?
# Kernel density estimate of the prediction probabilities per
# annotation status. The palette is passed explicitly so the colours
# match the colour-blind friendly palette intended for the histogram.
gg = sns.displot(df,
                 x='proba',
                 hue='ann_status',
                 kind='kde',
                 palette='colorblind',
                 height=5,
                 aspect=2,
                 facet_kws={'legend_out': False}
               )
gg.set_xlabels("Probability")
gg.set_ylabels("Density")
gg._legend.set_title("Annotation status")
# One tick every 0.1 over the [0, 1] probability range;
# the trailing semicolon hides the list of tick objects in the cell output
gg.ax.set_xticks(np.arange(0.0, 1.1, 0.1));