# Setting up

In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml
import pyranges as pr
import plotly.io as pio
import re

# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.1)

# Show up to 100 rows/columns before pandas truncates the output.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
})

# Floats are rendered with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [None]:
# Read the project configuration; all data paths below are taken from it.
with Path("nguyenb_config.yaml").open("r") as config_file:
    config_dict = yaml.safe_load(config_file)

# Directories: map_dir and analysis_dir are resolved relative to root.
root = Path(config_dict['root'])
out_dir = Path(config_dict['out_dir'])
map_dir = root / config_dict['map_dir']
analysis_dir = root / config_dict['analysis_dir']

# Genome annotation; keep the CDS features as a plain DataFrame.
gff = pr.read_gff3(root / config_dict['gff_file'])
cds = gff[gff.Feature == 'CDS'].as_df()

# Sample sheet with a composite sample name and a normalised id column.
sd = pd.read_csv(root / config_dict['sample_data_file'])
sd['name'] = sd['mouse'] + "_" + sd['library'] + "_" + sd['day'] + "_" + sd['dnaid']
sd = sd.rename(columns={'sampleID': 'sample_id'})

# Colour palette shared by the figures below.
alphabetClrs = px.colors.qualitative.Alphabet
sushi_colors_dict = dict(
    red='#C0504D',
    orange='#F79646',
    medSea='#4BACC6',
    black='#000000',
    dgreen='#00B04E',
    lgreen='#92D050',
    dblue='#366092',
    lblue='#95B3D7',
    grey=alphabetClrs[8],
)

# Maps

In [None]:
# Collect every annotated map file below map_dir and tag its rows with the
# library name taken from the file name (text before ".annotated").
maps = list(map_dir.rglob("*annotated.csv.gz"))
library_frames = []
for map_file in maps:
    library_name = map_file.stem.split(".annotated")[0]
    library_frames.append(pd.read_csv(map_file).assign(library=library_name))
map_df = pd.concat(library_frames)

# Per-library number of unique barcodes and unique feature IDs.
map_sum = (map_df.groupby('library')
           .agg({'barcode': ['nunique'], 'ID': ['nunique']})
           .reset_index())
map_sum.columns = ['library', 'num_inserts', 'num_genes']

# Numeric library label, e.g. "library_11_1" -> 11.1, used for plot ordering.
library_label = map_df['library'].str.replace("library_", '').str.replace('_', '.')
map_df['Library'] = library_label.astype(float)

In [None]:
# List the library names found among the loaded map files.
map_df.library.unique()

In [None]:
def get_combined_site(insertion_sites, chrs):
    """
    Map per-replicon insertion sites onto one shared axis for plotting.

    Replicons are laid out one after another in the order 'FQ312003.1',
    'HE654724.1', 'HE654725.1', 'HE654726.1'. Each replicon is shifted by the
    total length of the replicons before it, plus a 100,000 bp gap per
    preceding replicon so that they stay visually separated.

    Parameters:
    insertion_sites (iterable of int): Insertion site positions.
    chrs (iterable of str): Chromosome identifier for each insertion site.
                            Valid identifiers are 'FQ312003.1', 'HE654724.1',
                            'HE654725.1', and 'HE654726.1'.

    Returns:
    list of int: Shifted positions, one per input site, in input order.

    Raises:
    ValueError: If a chromosome identifier is not one of the valid ones.
                Dropping such sites silently would leave the result out of
                step with the input.

    Example:
    >>> insertion_sites = [1000, 2000, 3000, 4000]
    >>> chrs = ['FQ312003.1', 'HE654724.1', 'HE654725.1', 'HE654726.1']
    >>> get_combined_site(insertion_sites, chrs)
    [1000, 4980012, 5174854, 5362762]

    """
    chr_len = {'FQ312003.1': 4878012,
               'HE654724.1': 93842,
               'HE654725.1': 86908,
               'HE654726.1': 8688}
    gap = 100000

    # Offset of each replicon = lengths of all previous replicons + one gap each.
    offsets = {}
    preceding_length = 0
    for position, (name, length) in enumerate(chr_len.items()):
        offsets[name] = preceding_length + position * gap
        preceding_length += length

    new_sites = []
    for s, c in zip(insertion_sites, chrs):
        if c not in offsets:
            raise ValueError(f"Unknown chromosome identifier: {c!r}")
        new_sites.append(s + offsets[c])
    return new_sites

In [None]:
# Position of every insertion on the shared (all replicons) plotting axis.
map_df['new_sites'] = get_combined_site(map_df.insertion_site, map_df.chr)

In [None]:
# Median across libraries of the per-library unique barcode and locus_tag counts.
map_df.groupby('library').agg({'barcode':['nunique'], 'locus_tag':['nunique']}).median()

## Coverage plots

- Uncomment #pio.write_image to save the figures

In [None]:

# Insertion coverage along the combined axis, one colour per library, with a
# log-scaled y axis. The extra leading colour is presumably meant for library
# 11.1, which is listed first in category_orders.
prefix = 'map_coverage_log_11-1_highlight'

fig = px.histogram(
    map_df.sort_values('Library'), x='new_sites', color='Library', nbins=200,
    template='plotly_white', width=2000, height=500,
    color_discrete_sequence=["#FDBE10"] + px.colors.sequential.deep,
    labels={'new_sites': 'Position, bp'},
    category_orders={"Library": [11.1, 9.1, 10.1, 10.2, 11.2, 12.1, 12.2, 13.1, 13.2, 14.2, 15.1]},
)

fig.update_layout(bargap=0.1, font=dict(size=20), font_family="Arial")
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black',
                 tickfont=dict(size=24, color='black'),
                 title_font=dict(size=24, color='black'),
                 tickvals=[0, 1000000, 2000000, 3000000, 4000000])
fig.update_yaxes(showline=True, linewidth=1, linecolor='black',
                 title='Number of insertions',
                 type="log",
                 tickvals=[0, 10, 100, 1000],
                 tickfont=dict(size=20, color='black'),
                 title_font=dict(size=30, color='black'))
# Dashed separators placed in the gaps between consecutive replicons.
for boundary in (4925000, 5125000, 5325000):
    fig.add_vline(boundary, line_width=1, line_dash="dash", line_color="grey")
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=2000, height=500, scale=2)
fig

In [None]:
# Insertion coverage along the combined axis, one colour per library,
# with a log-scaled y axis (no highlight colour).
prefix = 'map_coverage_log'
fig = px.histogram(
    map_df.sort_values('Library'), x='new_sites', color='Library', nbins=200,
    template='plotly_white', width=2000, height=500,
    color_discrete_sequence=px.colors.sequential.deep,
    labels={'new_sites': 'Position, bp'},
    category_orders={"Library": [11.1, 9.1, 10.1, 10.2, 11.2, 12.1, 12.2, 13.1, 13.2, 14.2, 15.1]},
    log_y=True,
)

fig.update_layout(bargap=0.1, font=dict(size=20), font_family="Arial")
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black',
                 tickfont=dict(size=24, color='black'),
                 title_font=dict(size=24, color='black'),
                 tickvals=[0, 1000000, 2000000, 3000000, 4000000])
fig.update_yaxes(showline=True, linewidth=1, linecolor='black',
                 title='Number of insertions',
                 tickvals=[0, 10, 100, 1000],
                 tickfont=dict(size=20, color='black'),
                 title_font=dict(size=30, color='black'))
# Dashed separators placed in the gaps between consecutive replicons.
for boundary in (4925000, 5125000, 5325000):
    fig.add_vline(boundary, line_width=1, line_dash="dash", line_color="grey")
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=2000, height=500, scale=2)

In [None]:
# Totals over all libraries combined.
print(f"Number unique barcodes: {map_df.barcode.nunique()}")
print(f"Number unique genes: {map_df.Name.nunique()}")

## Summary of insertions across libraries

In [None]:
# For every gene, count the libraries in which it carries at least one insertion,
# then plot how many genes are found in 1, 2, ... libraries.
map_gene_summary = map_df.groupby('Name').library.nunique().reset_index()
prefix = "number_of_libraries_with_gene_disruption"
fig = px.histogram(map_gene_summary, x='library',
                   template='plotly_white', width=900, height=700,
                   color_discrete_sequence=px.colors.sequential.gray,
                   log_y=False)

fig.update_layout(bargap=0.1, font_family="Arial")
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black',
                 title='Libraries containing the gene disruption',
                 tickvals=list(range(1, 12)),
                 tickfont=dict(size=20, color='black'),
                 title_font=dict(size=24, color='black'))
fig.update_yaxes(showline=True, linewidth=1, linecolor='black',
                 title='Number of genes',
                 tickfont=dict(size=24, color='black'),
                 title_font=dict(size=30, color='black'),
                 range=[0, 1000],
                 tickvals=[0, 250, 500, 750, 1000])
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=900, height=700, scale=2)
fig

In [None]:
# Genes that carry an insertion in exactly one library.
single_library_rows = map_gene_summary.loc[map_gene_summary.library == 1]
map_genes_once = set(single_library_rows.Name.unique())
print(f"Number of genes hit only once: {len(map_genes_once)}")

In [None]:
# Annotated genes with no insertion in any library: after an outer merge of
# the mapped genes with the annotation, these are the rows without a map ID.
annotated_genes = gff[gff.Feature == 'gene'].as_df()
gene_columns = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'locus_tag']
mapped_genes = map_df[['Name', 'ID']].drop_duplicates()
genes_with_no_hits = mapped_genes.merge(annotated_genes[gene_columns], on=['Name'], how='outer')
lacks_insertion = genes_with_no_hits.ID.isna()
genes_with_no_hits = (genes_with_no_hits.loc[lacks_insertion, gene_columns]
                      .sort_values(['Chromosome', 'Start']))
#genes_with_no_hits.to_csv(out_dir/'genes_without_hits.csv')

## Number of genes disrupted by library

In [None]:
from itertools import combinations 
import random

def get_genes_disrupted(df, all_libs, r=2, max_combs=1000):
    """
    Count genes with an in-gene insertion for combinations of `r` libraries.

    Parameters:
    df (pandas.DataFrame): Insertions with columns 'library', 'Name' and
                           'distance_to_feature'.
    all_libs (iterable): Library names to combine.
    r (int): Number of libraries per combination.
    max_combs (int): Upper limit on evaluated combinations. When there are
                     more, a random subset of this size is drawn without
                     replacement, so no combination is counted twice.

    Returns:
    list of int: For each evaluated combination, the number of unique 'Name'
                 values among rows of those libraries with
                 distance_to_feature == 0. Empty if `r` exceeds the number of
                 libraries.
    """
    lib_combs = list(combinations(all_libs, r))
    if len(lib_combs) > max_combs:
        lib_combs = random.sample(lib_combs, k=max_combs)
    # The in-gene filter does not depend on the combination, so apply it once.
    in_gene = df[df.distance_to_feature == 0]
    return [in_gene[in_gene.library.isin(c)].Name.nunique() for c in lib_combs]

# Number of disrupted genes as a function of how many libraries are combined.
# Only 1..len(all_libs) libraries can be combined; larger sizes produce no
# combinations and would only add empty frames to the concatenation.
dfl = []
all_libs = map_df.library.unique()
for i in range(1, len(all_libs) + 1):
    genes = get_genes_disrupted(map_df, all_libs, i)
    dfl.append(pd.DataFrame({'Number of libraries': i,
                             'Number of genes with an insertion': genes}))
num_genes_hit = pd.concat(dfl)
# sns.boxplot(data=num_genes_hit, x='Number of libraries', y='Number of genes with an insertion')
# plt.axvline(x=9, color='grey', linestyle='--')

In [None]:
# Make sure the library count is an integer column before plotting.
num_genes_hit['Number of libraries'] = num_genes_hit['Number of libraries'].astype(int)

In [None]:
# Genes disrupted in one library versus in all loaded libraries together.
print(f"Number of genes disrupted in library_11_1: {map_df[map_df.library == 'library_11_1'].Name.nunique()}")
print(f"Number of genes disrupted in {map_df.library.nunique()} libraries: {map_df.Name.nunique()}")

In [None]:
# Box plot of the number of disrupted genes per number of combined libraries.
font_size = 24
h = 600
w = 750
prefix = 'number_of_genes_with_insertion_over_libraries'
fig = px.box(num_genes_hit, x='Number of libraries', y='Number of genes with an insertion',
             color_discrete_sequence=['black'] * 20,
             height=h, width=w, template='plotly_white')

# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size - 6, color='black'),
                 title_font=dict(size=font_size, color='black'))
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size - 6, color='black'),
                 title_font=dict(size=font_size, color='black'),
                 range=[0, 4000])
#fig.add_vline(x=11, line_width=1, line_dash='dash', line_color=sushi_colors_dict['red'])
# NOTE(review): 853 and 3076 are hard-coded reference levels; presumably the
# gene counts for a single library and for all libraries - confirm against
# the values printed above.
fig.add_hline(y=853, line_width=2, line_dash='dash', line_color='black', annotation_text="853")
fig.add_hline(y=3076, line_width=2, line_dash='dash', line_color=sushi_colors_dict['red'], annotation_text="3076")
fig.update_layout(showlegend=False, font_size=24, font_family="Arial")
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=w, height=h, scale=2)
fig

# Results

In [None]:
# Some experiments were conducted with 15 control barcodes and some with 5;
# they were analyzed independently and are stored in separate folders.
result_files = list((analysis_dir / "15bc").glob("*rra_results.csv.gz"))
result_files5 = list((analysis_dir / "5bc").glob("*rra_results.csv.gz"))

# Load every result table, tagging it with its library name (file name up to
# "_rra") and the number of control barcodes of its folder.
df_list = []
for files, n_control_barcodes in ((result_files, 15), (result_files5, 5)):
    for results_path in files:
        library_name = results_path.stem.split("_rra")[0]
        df_list.append(pd.read_csv(results_path)
                       .assign(library=library_name, num_barcodes=n_control_barcodes))

fdf = pd.concat(df_list).drop_duplicates()
# Drop entries whose locus_tag contains ":".
keep_rows = ~fdf.locus_tag.str.contains(":")
fdf = fdf[keep_rows]
# Adjusted p-value: the smaller of the negative and positive selection FDRs.
fdf['padj'] = fdf[['neg_selection_fdr', 'pos_selection_fdr']].min(axis=1)


def identify_hits(row):
    """
    Classify one result row as a negative hit, a positive hit, or no hit.

    A row is a hit only when its adjusted p-value ('padj') is below 0.05 and
    its log fold change ('LFC') lies outside the closed interval [-1, 1]. The
    sign of the LFC gives the direction of the hit.

    Parameters:
    row (pandas.Series): A row providing the fields 'LFC' and 'padj'.

    Returns:
    int: -1 for a negative hit (LFC < -1), 1 for a positive hit (LFC > 1),
         0 otherwise.

    Example:
    >>> import pandas as pd
    >>> data = {'LFC': [-1.5, 2.0, 0.5], 'padj': [0.01, 0.04, 0.2]}
    >>> df = pd.DataFrame(data)
    >>> df['hit'] = df.apply(identify_hits, axis=1)
    >>> df
       LFC  padj  hit
    0 -1.5  0.01   -1
    1  2.0  0.04    1
    2  0.5  0.20    0
    """
    log_fold_change = row['LFC']
    if log_fold_change < -1:
        direction = -1
    elif log_fold_change > 1:
        direction = 1
    else:
        # Effect size too small: not a hit whatever the p-value is.
        return 0
    return direction if row['padj'] < 0.05 else 0

# Per-library call for every gene/contrast: -1, 1 or 0 (see identify_hits).
fdf['hit'] = fdf.apply(identify_hits, axis=1)
# Eliminate all genes that were not hits
rsig = fdf[fdf.hit != 0]
# How many unique libraries identify a specific gene as a hit (hit includes a direction)
num_hits = (rsig.groupby(['locus_tag', 'contrast', 'hit']).library.nunique()
            .reset_index()
            .rename(columns={'library': 'num_hit'}))
# Identify ambiguous hits, i.e. hits that were sometimes negative and sometimes positive.
# For every gene/contrast this is the fraction of hit calls in each direction.
amb_hits = rsig.groupby(['locus_tag', 'contrast']).hit.value_counts(normalize=True)
# Naming the series explicitly keeps the column layout independent of the
# name that value_counts gives its result.
amb_hits = amb_hits.rename('proportion').reset_index()
# These were identified as negative (or positive) hits 100% of the time
true_hits = amb_hits[amb_hits.proportion == 1]
# Discard the ones that are positive 50% of the time and negative 50% of the time
amb_hits = amb_hits.query("proportion != 1 & proportion != 0.5")
# Pick the direction of the majority. This has to be decided separately for
# each gene AND contrast: grouping by gene alone compares proportions across
# days and drops a gene's calls on every day but the one with the largest
# majority.
majority = amb_hits.groupby(['locus_tag', 'contrast'])['proportion'].transform('max')
amb_hits = amb_hits.loc[majority == amb_hits['proportion']]
amb_hits = pd.concat([true_hits, amb_hits])
# Keep the per-library call next to the consolidated direction ('hit').
rsig = (rsig.rename(columns={'hit': 'library_specific_hit'})
        .merge(amb_hits, on=['locus_tag', 'contrast'], how='inner'))
rsig = rsig.merge(num_hits, on=['locus_tag', 'contrast', 'hit'], how='left')


## Table 1

In [None]:
# Per library and day: number of genes that passed QC and number of hits.
# Hits are coded -1/+1, so they are counted as "hit != 0"; summing the codes
# would let negative and positive hits cancel each other.
table1 = (fdf.assign(is_hit=fdf['hit'] != 0)
          .groupby(['library', 'contrast'])
          .agg(number_genes_passing_qc=('locus_tag', 'nunique'),
               number_hits=('is_hit', 'sum'))
          .reset_index()
          .rename(columns={'contrast': 'day'}))
table1 = table1.pivot(index=['library'], columns=['day'],
                      values=['number_genes_passing_qc', 'number_hits']).fillna(0)
# Flatten the (value, day) column pairs into names such as "number_hits_d1".
# Using the actual columns keeps the labels right even if a day is missing.
table1.columns = [f'{col_name}_{day}' for col_name, day in table1.columns]
table1 = table1.astype(int)

# Add the per-library mouse summary (joined on the library index).
mice_summary = pd.read_csv(root/config_dict['mice_file'], index_col=0)
print(mice_summary.sum())
table1 = table1.merge(mice_summary, left_index=True, right_index=True)

In [None]:
# Display Table 1.
table1

In [None]:
#table1.to_csv(out_dir/'table1_number_of_hits_per_library.csv')

## Correlation between libraries

In [None]:
# Get correlations
# Identify each dataset by library name plus its number of control barcodes.
fdf['library_num_barcodes'] = fdf['library'] + '_' + fdf['num_barcodes'].astype(str)
lib_col = 'library_num_barcodes'
# Wide table with one LFC column per dataset and one row per gene/contrast,
# then a pairwise correlation matrix of the datasets within each contrast.
cor_df = (fdf[['locus_tag', 'contrast', 'LFC', lib_col]]
          .pivot(index=['locus_tag', 'contrast'], columns=lib_col)
          .reset_index()
          .set_index('locus_tag')
          .groupby('contrast')
          .corr()
          .reset_index())
df_list = []
for i, g in cor_df.groupby('contrast'):
    df = g.drop(['level_1'], axis=1).set_index(['contrast', lib_col])
    # Mask the upper triangle including the diagonal so that every pair of
    # datasets is kept once and self-correlations are dropped; stack() then
    # turns the remaining cells into long format.
    df = (df.mask(np.triu(np.ones(df.shape, dtype=np.bool_)))
          .stack()
          .rename_axis(('contrast', 'lib1', 'lib2'))
          .reset_index()
          .rename(columns={'LFC': 'R'}))
    df_list.append(df)
# Long table: one row per contrast and dataset pair with its correlation R.
cor_df = pd.concat(df_list)

In [None]:
# Distribution of the between-library correlations, one box per day.
font_size = 24
w = 400
h = 500
prefix = 'correlations_between_libraries'
cor_df['Day'] = cor_df.contrast.replace({'d1': 'Day 1', 'd2': 'Day 2', 'd3': 'Day 3', 'd4': 'Day 4'})
fig = px.box(cor_df, x='Day', y='R', color='Day',
             color_discrete_map={'Day 1': sushi_colors_dict['red'],
                                 'Day 2': sushi_colors_dict['dgreen'],
                                 'Day 3': sushi_colors_dict['dblue'],
                                 'Day 4': sushi_colors_dict['orange']},
             labels={"Day": '', 'R': "Pearson's <i>r</i>"},
             height=h, width=w, template='plotly_white', hover_data=['lib1', 'lib2'])
fig.update_layout(showlegend=False, font_family="Arial")
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size - 6, color='black'),
                 tickangle=-50,
                 title_font=dict(size=font_size, color='black'))
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', range=[0, 1],
                 tickfont=dict(size=font_size - 6, color='black'),
                 title_font=dict(size=font_size, color='black'))
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=w, height=h, scale=2)
fig

## Show example for 2 libraries

In [None]:
# LFCs of two datasets side by side, restricted to gene/contrast pairs
# that are present in both (dropna).
# NOTE(review): the query selects "library_10_2_15" and "library_11_2_15",
# but the columns below (and the axis labels and file prefix of the plot that
# follows) call them library 10_1 and 12_1. Confirm which pair is intended and
# make the selection and the labels agree.
ex_df = (fdf.query('(library_num_barcodes=="library_10_2_15" | library_num_barcodes == "library_11_2_15") ')[['locus_tag', 'LFC', 'library_num_barcodes', 'contrast']]
        .pivot(index=['locus_tag', 'contrast'], columns='library_num_barcodes')
        .dropna()
        .reset_index())
# Positional renaming: relies on the pivot emitting the two library columns
# in sorted order.
ex_df.columns = ['locus_tag','day', 'library_10_1', 'library_12_1']
ex_df['Day'] = ex_df.day.replace({'d1':'Day 1', 'd2':'Day 2', 'd3': 'Day 3', 'd4': 'Day 4'})

In [None]:
# Scatter plot of the two example libraries with one OLS trend line per day.
# `font_size` is the value set for the previous figure.
h = 600
w = 800
prefix = 'example_correlation_10_1_12_1'

fig = px.scatter(ex_df, x='library_10_1', y='library_12_1', color='Day',
                 height=h, width=w,
                 template='plotly_white',
                 labels={'library_10_1': 'LFC(library 10-1)',
                         'library_12_1': 'LFC (library 12-1)'},
                 color_discrete_map={'Day 1': sushi_colors_dict['red'],
                                     'Day 2': sushi_colors_dict['dgreen'],
                                     'Day 3': sushi_colors_dict['dblue'],
                                     'Day 4': sushi_colors_dict['orange']},
                 #hover_data=['locus_tag', 'gene'],
                 category_orders={'Day': ['Day 1', 'Day 2', 'Day 3', 'Day 4']},
                 trendline='ols')

fig.update_traces(marker=dict(size=14, line=dict(width=1, color='DarkSlateGrey'),
                              opacity=0.9),
                  selector=dict(mode='markers'))
# `title_font` replaces the deprecated `titlefont` axis attribute.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size - 6, color='black'),
                 title_font=dict(size=font_size, color='black'), range=[-14, 8])
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size - 6, color='black'),
                 title_font=dict(size=font_size, color='black'), range=[-14, 8])

fig.update_layout(legend=dict(font=dict(size=font_size - 2)),
                  legend_title=dict(font=dict(size=font_size)), font_family="Arial")

# The trend lines are the traces drawn in 'lines' mode; make them thicker.
tr_line = [k for k, trace in enumerate(fig.data) if trace.mode == 'lines']
print(tr_line)

for trace_index in tr_line:
    fig.data[trace_index].update(line_width=6)

#pio.write_image(fig, out_dir/f"{prefix}.svg", width=w, height=h, scale=2)
fig

In [None]:
#fdf.to_csv(out_dir/'final-results-all-libraries.csv', index=False)

## Generate final results table

In [None]:
def _format_padj(value):
    """Render an adjusted p-value for display; values below 0.01 become '< 0.01'."""
    return '< 0.01' if value < 0.01 else str(round(value, 2))


def _flatten_pivot_columns(columns):
    """Join (value, day) column pairs as 'value_day'; keep index columns (empty day) as-is."""
    return [f'{name}_{day}' if day else name for name, day in columns]


# Aggregate information
# One row per gene/contrast with the consolidated direction and the number of
# libraries supporting it.
rsig_agg = rsig[['locus_tag', 'contrast', 'hit', 'num_hit']].drop_duplicates().copy()
# Per gene/contrast over all libraries: libraries containing the mutant,
# median LFC and the range of adjusted p-values.
num_hit = (fdf.groupby(['contrast', 'locus_tag'])
           .agg(num_lib=('library', 'nunique'),
                LFC_median=('LFC', 'median'),
                padj_min=('padj', 'min'),
                padj_max=('padj', 'max'))
           .reset_index())
num_hit = num_hit.merge(rsig_agg, on=['contrast', 'locus_tag'], how='outer')
# Genes that were never called a hit get 0 for both the count and the direction.
num_hit['num_hit'] = num_hit['num_hit'].fillna(0).astype(int)
num_hit['hit'] = num_hit['hit'].fillna(0).astype(int)
num_hit['Detection frequency'] = num_hit['num_hit'].astype(str) + "/" + num_hit['num_lib'].astype(str)
# Rename by column name rather than by position so labels cannot shift.
num_hit = num_hit.rename(columns={'contrast': 'day',
                                  'locus_tag': 'Name',
                                  'num_lib': 'number_of_libraries_with_mutant',
                                  'num_hit': 'number_of_times_detected_as_hit'})
num_hit['padj_min_formatted'] = num_hit['padj_min'].apply(_format_padj)
num_hit['padj_max_formatted'] = num_hit['padj_max'].apply(_format_padj)
num_hit['padj_range'] = num_hit['padj_min_formatted'] + " - " + num_hit['padj_max_formatted']
# Fold change from the median log2 fold change.
num_hit['FC'] = 2**num_hit['LFC_median']
num_hit['FC_with_detection_frequency'] = num_hit['FC'].round(2).astype(str) + " (" + num_hit['Detection frequency'] + ")"
num_hit['FC'] = num_hit["FC"].round(4)

summary_values = ['FC_with_detection_frequency', 'padj_range', 'FC', 'LFC_median',
                  'padj_min_formatted', 'padj_max_formatted', 'padj_min', 'padj_max', 'hit']
num_hit_short = (num_hit[['Name', 'day', 'FC_with_detection_frequency', 'padj_range', 'FC',
                          'LFC_median', 'Detection frequency', 'padj_min_formatted',
                          'padj_max_formatted', 'padj_min', 'padj_max', 'hit']]
                 .sort_values(['day', 'LFC_median']))

# Wide tables (one row per gene, one column per value and day), for all genes
# and for the genes disrupted in a single library only.
only_once = num_hit_short[num_hit_short.Name.isin(map_genes_once)]
final_summary_only_once = only_once.pivot(index=['Name'], columns='day',
                                          values=summary_values).reset_index()
final_summary = num_hit_short.pivot(index=['Name'], columns='day',
                                    values=summary_values).reset_index()
# Column names come from the pivoted columns themselves, so they stay
# correct when a day is absent from one of the tables.
final_summary.columns = _flatten_pivot_columns(final_summary.columns)
final_summary_only_once.columns = _flatten_pivot_columns(final_summary_only_once.columns)
col_names = [c for c in final_summary.columns if c != 'Name']

In [None]:
# Keep the fold change and the formatted minimum padj of each day, and
# shorten "padj_min_formatted_<day>" to "padj_<day>".
days = ('d1', 'd2', 'd3', 'd4')
fc_columns = [f'FC_{day}' for day in days]
padj_columns = [f'padj_min_formatted_{day}' for day in days]
final_summary_only_once = final_summary_only_once[['Name'] + fc_columns + padj_columns]
final_summary_only_once.columns = [
    column.replace("_min_formatted", '') for column in final_summary_only_once.columns
]

In [None]:
# Put FC and padj of the same day next to each other and sort by gene name.
day_ordered_columns = ["Name"] + [
    f"{stat}_{day}" for day in ("d1", "d2", "d3", "d4") for stat in ("FC", "padj")
]
final_summary_only_once = final_summary_only_once[day_ordered_columns].sort_values("Name")

In [None]:
#final_summary_only_once.to_csv(out_dir/'final-results-gene-level-summary-only-once.csv', index=0)

In [None]:
# Attach the replicon of each gene; genes missing from the annotation keep a
# NaN Chromosome because this is a left merge.
final_summary = final_summary.merge(annotated_genes[['Name', 'Chromosome']].drop_duplicates(), on='Name', how='left')

In [None]:
# Late hits: no hit on days 1 and 2, but a hit in the same direction on both
# day 3 and day 4. Rows lacking a day 3 or day 4 value are removed first.
late = final_summary.dropna(subset=['hit_d3', 'hit_d4']).query("hit_d1 == 0 & hit_d2 == 0 & hit_d3 != 0 & hit_d4 != 0 & hit_d3 == hit_d4" )
#late.to_csv(out_dir/'final-results-late-hits-only.csv', index=0)

In [None]:
# Split the summary into genes on 'FQ312003.1' and genes elsewhere.
final_summary_chr = final_summary[final_summary.Chromosome == 'FQ312003.1'].drop(columns=['Chromosome'])
#final_summary_chr.to_csv(out_dir/'final-results-gene-level-summary-chromosome.csv', index=0)
# NOTE(review): "!=" also keeps rows whose Chromosome is NaN (genes without an
# annotation match), so they end up in the plasmid table - confirm that this
# is intended.
final_summary_plasmid = final_summary[final_summary.Chromosome != 'FQ312003.1'].drop(columns=['Chromosome']).drop_duplicates()
#final_summary_plasmid.to_csv(out_dir/'final-results-gene-level-summary-plasmid.csv', index=0)

In [None]:
#final_summary.to_csv(out_dir/'final-results-gene-level-summary.csv', index=0)

# Functional annotation

In [None]:
# Load the STRING annotation and join it with the CDS annotation from the gff.
cds_short = cds.loc[:, ['Name', 'gene', 'locus_tag', 'product']].copy()
# Genes without a gene name fall back to their locus tag.
gene_or_locus = cds_short['gene'].fillna(cds_short['locus_tag'])
# Keep only the part of the CDS name before the first ".".
name_parts = cds_short["Name"].str.split(".", expand=True)
cds_short['gene'] = gene_or_locus
cds_short['Name'] = name_parts[0]

string = pd.read_table(root / config_dict['string_enrichment_file'])
# The STRING protein id is dot-separated; its second field is used as the
# name to join on.
string_id_parts = string['#string_protein_id'].str.split(".", expand=True)
string['Name'] = string_id_parts[1]
anot = string.merge(cds_short, on='Name', how='left')

In [None]:
# Keep only annotation rows of genes that are hits, matched either through
# the gene name or through the locus tag.
hits = list(rsig.locus_tag.unique())
matches_gene = anot.gene.isin(hits)
matches_locus_tag = anot.locus_tag.isin(hits)
hits_anot = anot[matches_gene | matches_locus_tag]

In [None]:
# Group by term to see the biggest / most representative cats
# (number of distinct hit locus tags per term, largest first).
(hits_anot.groupby(['term', 'description'])
 .locus_tag.nunique()
 .reset_index()
 .sort_values('locus_tag', ascending=False))

In [None]:
# Curated mapping of annotation terms to the functional categories used in the
# figures. Each entry is (term id, category label, number); the trailing
# comments give the original term description where it differs from the label.
# The order matters: the loop that fills `final_anot` gives a gene the category
# of the first listed term it is annotated with.
# NOTE(review): the third element is not read anywhere in the visible code;
# presumably it is the number of genes annotated with the term - confirm.
terms = [('GO:0005976','Lipopolysaccharide/O-Antigen biosynthesis',56), #Polysaccharide metabolic process
        ('GO:0008653','Lipopolysaccharide/O-Antigen biosynthesis',33), # Lipopolysaccharide metabolic process
        ('GO:1903509','Lipopolysaccharide/O-Antigen biosynthesis',33), # Liposaccharide metabolic process
        ('CL:7099',"Secretion, and Virulence",35), #"Mixed, incl. Secretion, and Virulence"
         ('map03070','Secretion, and Virulence',24), #'Bacterial secretion system'
         ('GO:0006259','DNA metabolic process',53),
         ('GO:0006310','DNA metabolic process',24), #'DNA recombination'
         ('CL:546',"DNA metabolic process",15), #"DNA replication, and DNA repair"
         ('GO:0006520','Cellular amino acid metabolic process',79),
         ('GO:0006629','Lipid metabolic process',55),
        ('GO:0006950','Response to stress/stimulus',63), #'Response to stress' 
         ('map02024','Response to stress/stimulus',57), #'Quorum sensing'
         ('GO:0050896','Response to stress/stimulus',130), #'Response to stimulus'
         ('GO:0019222','Regulation of metabolic process',146),
         ('map01100','Metabolic process',465), #'Metabolic pathways'
         ('GO:0008152','Metabolic process',625),
         ('GO:0006810','Transport',243),
         ('map02010','Transport',52), #'ABC transporters'
         ('CL:4188','Transport',38), #ABC transporters
        ('CL:4747','Lipopolysaccharide/O-Antigen biosynthesis',22), #"Mixed, incl. O-Antigen nucleotide sugar biosynthesis, and Extracellular polysaccharide metabolic process"
         ('map00010','Glycolysis / Gluconeogenesis',24),
         ('map00020','Citrate cycle (TCA cycle)',17),
         ('GO:0043711','Pilus/Fimbria',10), #'Pilus organization'
         ('GO:0015473','Pilus/Fimbria',7), #'Fimbrial usher porin activity'
         ('GOCC:0042995','Pilus/Fimbria',18), #'Cell projection'
         ('GO:0006457','Protein folding', 23),
         ('GO:0044183','Protein folding',10), #'Protein folding chaperone'
         ('GO:0042254','Ribosome biogenesis',16),
         ('CL:8097',"Bacteriophage",39), #Mixed, incl. Viral life cycle, and Coiled coil
         ('GO:0009117','Nucleotide metabolic process',39),
         ('CL:7533',"Motility",41), #"Mixed, incl. Flagellar assembly, and Bacterial chemotaxis"
         ('GO:0005975','Carbohydrate metabolic process',131),
         ('GO:0010467','Gene expression',51),
         ('GO:0009058','Biosynthetic process', 218), #Biosynthetic process
         ('GO:0009056','Catabolic process',94),
         ('GO:0003677','DNA binding',148),
         ('GO:0031224','Membrane',346), #Intrinsic component of membrane
         ('GO:0016020','Membrane',454),
         ('KW-0732','Signal',163),
         ]

In [None]:
# Give every hit gene one functional category: the category of the first term
# in `terms` that the gene is annotated with.
final_anot = {}
for term_id, category, *_ in terms:
    annotated_loci = hits_anot.loc[hits_anot.term == term_id, 'locus_tag'].values
    for locus in annotated_loci:
        # setdefault keeps an earlier (higher priority) assignment untouched.
        final_anot.setdefault(locus, category)
print(len(final_anot))

# Hit genes that matched none of the curated terms are labelled 'Other'.
for locus in hits_anot.locus_tag.unique():
    final_anot.setdefault(locus, 'Other')
print(len(final_anot))

final_anot = pd.DataFrame.from_dict(final_anot, orient='index').reset_index()
final_anot.columns = ['locus_tag', 'Function']

In [None]:
# Palette for the functional categories. One entry used to carry an invisible
# zero-width space after "#CCEBC5", which made it an invalid colour code; it
# has been removed. NOTE(review): this leaves "#CCEBC5" in the list twice -
# presumably one of the two was meant to be a different colour.
sushi_colors = ["#C0504D", "#F79646", "#4BACC6",
                "#00B050", "#92D050", "#366092", "#95B3D7",
                "#808080", "#D9D9D9", "#FB8072", "#BC80BD",
                "#377EB8", "#CCEBC5", "#BEBADA", "#CCEBC5", "#FCCDE5",
                "#FFFFB3", "#4DAF4A"]

# Categories shown in the figures, in display order.
to_keep = ['Metabolic process',
           'Lipopolysaccharide/O-Antigen biosynthesis',
           'Regulation of metabolic process',
           'Response to stress/stimulus',
           'Transport',
           'DNA metabolic process',
           'Cellular amino acid metabolic process',
           'Secretion, and Virulence', 'Other',
           ]
# The i-th category gets the i-th palette colour.
category_colors = dict(zip(to_keep, sushi_colors))


def format_color_groups(df):
    """
    Build the cell styles that colour each row of `df` by its 'Function'.

    Intended for `DataFrame.style.apply(format_color_groups, axis=None)`.

    Parameters:
    df (pandas.DataFrame): Table with a 'Function' column whose values are
                           keys of `category_colors`.

    Returns:
    pandas.DataFrame: Same index and columns as `df`; every cell holds the CSS
                      string of its row's category. `df` is left unchanged.

    Raises:
    KeyError: If a 'Function' value has no colour in `category_colors`.
    """
    # Start from an all-string frame instead of a copy of the data, so that
    # no CSS text is written into numeric columns.
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    for factor in df['Function'].unique():
        style = f'background-color: {category_colors[factor]}; opacity: 0.5'
        styles.loc[(df['Function'] == factor).to_numpy(), :] = style
    return styles

In [None]:
# Compact summary: gene name plus fold change and padj range of every day.
summary_days = ('d1', 'd2', 'd3', 'd4')
short_columns = (['Name']
                 + [f'FC_{day}' for day in summary_days]
                 + [f'padj_range_{day}' for day in summary_days])
final_summary_short = final_summary[short_columns].copy()

In [None]:
# Fitness defects
# Gene/day rows called a negative hit in more than one library and with a
# negative median LFC.
s = num_hit.query('number_of_times_detected_as_hit > 1 & hit == -1 & LFC_median < 0')
s = (s.drop(['number_of_libraries_with_mutant'], axis=1)
          .drop_duplicates()
          .rename(columns={'Name':'gene'}))
# Add the CDS annotation (joined on 'gene') and the functional category
# (joined on 'locus_tag'); both are inner merges, so unmatched rows drop out.
s= s.merge(cds_short, on='gene').merge(final_anot, on='locus_tag')

# Fold categories with fewer than 15 rows into 'Other'.
s['cat_ab'] = s.groupby('Function').locus_tag.transform('count')
s.loc[s.cat_ab < 15, 'Function'] = 'Other'

# Gene-level table behind the figure, with the per-day summary values attached.
hi_conf_anotated_hits = s[['gene', 'locus_tag', 'product', 'Function', 'day']].drop_duplicates().copy()
hits_from_figure = (hi_conf_anotated_hits[['gene', 'locus_tag','product', 'Function']]
                    .merge(final_summary_short, left_on='gene', right_on='Name', how='left').drop_duplicates())
# (hits_from_figure.sort_values(['Function', 'locus_tag'])
#  .style.apply(format_color_groups, axis=None)
#  .to_excel(out_dir/'functional_annotation_of_hits_down_matching_figure1H.xlsx', engine='openpyxl'))

# From here on `s` holds the number of distinct genes per day and category.
s = s.groupby(['day', 'Function']).locus_tag.nunique().reset_index()
s = s.sort_values('locus_tag', ascending=False)

# Horizontal bar chart of genes per category, one panel per day.
prefix = 'functional_annotation_of_hits_down'
w = 2000
h=500
fig = px.bar(s, y='Function', x='locus_tag', 
       color='Function', width=w, height=h, text = 'locus_tag', 
       template='plotly_white',  orientation='h', facet_col='day', facet_col_spacing=0.02,
       #color_discrete_sequence=list(sushi_colors.values()),
       color_discrete_map=category_colors,
       category_orders={"day": ["d1", "d2", "d3", "d4"],
                        "Function": to_keep},
                                   labels=({'Function': "", "locus_tag":""}))
fig.update_layout(font=dict(size=20), showlegend=False, font_family="Arial")
#pio.write_image(fig, out_dir/f"{prefix}.svg", width=w, height=h, scale=2)
fig

In [None]:
# Fitness advantage
# Keep genes called as a positive hit (hit == 1) more than once, then attach
# the CDS record and the functional annotation.
s = (
    num_hit
    .query('number_of_times_detected_as_hit > 1 & hit == 1')
    .drop(columns=['number_of_libraries_with_mutant'])
    .drop_duplicates()
    .rename(columns={'Name': 'gene'})
    .merge(cds_short, on='gene')
    .merge(final_anot, on='locus_tag')
)

# Functional categories with fewer than 6 rows (counted over locus_tag) are
# pooled into a single 'Other' category.
s['cat_ab'] = s.groupby('Function')['locus_tag'].transform('count')
s.loc[s['cat_ab'] < 6, 'Function'] = 'Other'

# Per-day list of annotated hits, and the same hits (without the day column)
# joined to the short summary table.
hi_conf_anotated_hits = (
    s.loc[:, ['gene', 'locus_tag', 'product', 'day', 'Function']]
    .drop_duplicates()
    .copy()
)
hits_from_figure = (
    hi_conf_anotated_hits
    .drop(columns=['day'])
    .merge(final_summary_short, left_on='gene', right_on='Name', how='left')
    .drop_duplicates()
)
# Optional export of the table behind the figure:
# (hits_from_figure.sort_values(['Function', 'locus_tag'])
#  .style.apply(format_color_groups, axis=None)
#  .to_excel(out_dir/'functional_annotation_of_hits_up_matching_figure1H.xlsx', engine='openpyxl'))

# Number of distinct loci per day and functional category, largest first.
s = (
    s.groupby(['day', 'Function'])['locus_tag']
    .nunique()
    .reset_index()
    .sort_values('locus_tag', ascending=False)
)

prefix = 'functional_annotation_of_hits_up'
w, h = 2000, 500
# Horizontal bars, one facet per day; bar labels show the locus counts.
fig = px.bar(
    s,
    x='locus_tag',
    y='Function',
    orientation='h',
    color='Function',
    color_discrete_map=category_colors,
    text='locus_tag',
    facet_col='day',
    facet_col_spacing=0.02,
    category_orders={
        'day': ['d1', 'd2', 'd3', 'd4'],
        'Function': [
            'Metabolic process',
            'Secretion, and Virulence',
            'Regulation of metabolic process',
            'Response to stress/stimulus',
            'Transport',
            'DNA metabolic process',
            'Cellular amino acid metabolic process',
            'Other',
        ],
    },
    labels={'Function': '', 'locus_tag': ''},
    template='plotly_white',
    width=w,
    height=h,
)
fig.update_layout(font={'size': 20}, showlegend=False, font_family='Arial')
# Fixed axis ranges applied to every facet.
fig.update_xaxes(range=[0, 45])
fig.update_yaxes(range=[-0.5, 8])

# Optional export of the figure:
# pio.write_image(fig, out_dir/f"{prefix}.svg", width=w, height=h, scale=2)
fig

# Inoculum

In [None]:
# Import data
# Collect the per-library "*_rra_results.csv" tables of the inoculum analysis.
# The paths are sorted because rglob yields them in arbitrary order, which
# would otherwise make the row order of `fdf`/`rsig` differ between runs.
data_dir = root/config_dict['inoculum_dir']
result_files = sorted(data_dir.rglob("*_rra_results.csv"))
if not result_files:
    # pd.concat on an empty list only reports "No objects to concatenate";
    # fail with a message that names the directory instead.
    raise FileNotFoundError(f"No '*_rra_results.csv' files found under {data_dir}")
# The library label is the part of the file name that precedes "_rra".
fdf = pd.concat([pd.read_csv(f).assign(library=f.stem.split("_rra")[0]) for f in result_files])
# Adjusted p-value per row: the smaller of the negative/positive selection FDRs.
fdf['padj'] = fdf[['neg_selection_fdr', 'pos_selection_fdr']].min(axis=1)
# identify_hits is defined elsewhere in the notebook; below, 0 means "not a hit".
fdf['hit'] = fdf.apply(identify_hits, axis=1)
# Keep only rows whose Name is shorter than 10 characters.
# NOTE(review): presumably this drops non-gene identifiers - confirm.
fdf = fdf[fdf.Name.str.len() < 10]
rsig = fdf[fdf.hit != 0].copy()
# Number of distinct libraries in which each (Name, hit) pair was called.
# NOTE(review): the grouping does not include `contrast`, so the count is
# pooled over all contrasts present in these files - confirm this is intended.
num_hits = rsig.groupby(['Name', 'hit']).library.nunique().reset_index().rename(columns={'library': 'num_hit'})
# Identify ambiguous hits, i.e. hits that were sometimes negative and sometimes positive
amb_hits = rsig.groupby(['Name']).hit.value_counts(normalize=True)
# Name the value column 'proportion' before reset_index so it cannot clash
# with the 'hit' index level (the rename is a no-op if it already has that name).
amb_hits = pd.DataFrame(amb_hits).rename(columns={'hit':'proportion'}).reset_index()

# These were identified as negative (or positive) hits 100% of the time
true_hits = amb_hits[amb_hits.proportion == 1]
# Left-merge from the unambiguous genes so that ambiguous ones drop out of rsig.
rsig = true_hits.rename(columns={'hit': 'library_specific_hit'}).merge(rsig, on=['Name'],how='left')
rsig = rsig.merge(num_hits, on=['Name',  'hit'], how='left')

In [None]:
# Aggregate information
# One row per (Name, contrast) among the hits kept in rsig.
rsig_agg = rsig.loc[:, ['Name', 'contrast', 'hit', 'num_hit']].drop_duplicates().copy()

# Per contrast and gene: number of libraries, median LFC and padj extremes,
# combined with the hit calls (genes without a call are kept by the outer merge).
num_hit = (
    fdf.groupby(['contrast', 'Name'])
    .agg(
        num_lib=('library', 'nunique'),
        LFC_median=('LFC', 'median'),
        padj_min=('padj', 'min'),
        padj_max=('padj', 'max'),
    )
    .reset_index()
    .merge(rsig_agg, on=['contrast', 'Name'], how='outer')
)

# Genes that were never called as a hit get 0 for both the count and the call.
num_hit = num_hit.assign(
    num_hit=num_hit['num_hit'].fillna(0).astype(int),
    hit=num_hit['hit'].fillna(0).astype(int),
)
# "<times called as hit>/<libraries containing the gene>"
num_hit['Detection frequency'] = (
    num_hit['num_hit'].astype(str).str.cat(num_hit['num_lib'].astype(str), sep='/')
)
num_hit = num_hit.rename(columns={
    'contrast': 'day',
    'num_lib': 'number_of_libraries_with_mutant',
    'num_hit': 'number_of_times_detected_as_hit',
})

# Text versions of the padj bounds: values below 0.01 are shown as '< 0.01',
# everything else is rounded to two decimals.
num_hit = num_hit.assign(**{
    f'padj_{bound}_formatted': num_hit[f'padj_{bound}'].map(
        lambda p: '< 0.01' if p < 0.01 else str(round(p, 2)))
    for bound in ('min', 'max')
})
num_hit['padj_range'] = num_hit['padj_min_formatted'].str.cat(
    num_hit['padj_max_formatted'], sep=' - ')

# FC is 2 raised to the median LFC. The display string uses the unrounded
# value rounded to 2 decimals; the FC column itself is stored with 4 decimals.
num_hit = num_hit.assign(FC=2 ** num_hit['LFC_median'])
num_hit = num_hit.assign(
    FC_with_detection_frequency=(
        num_hit['FC'].round(2).astype(str) + ' (' + num_hit['Detection frequency'] + ')'
    ),
    FC=num_hit['FC'].round(4),
)

num_hit_short = num_hit.loc[
    :,
    [
        'Name', 'day', 'FC_with_detection_frequency', 'padj_range', 'FC',
        'LFC_median', 'Detection frequency', 'padj_min_formatted',
        'padj_max_formatted', 'padj_min', 'padj_max', 'hit',
    ],
].sort_values(['day', 'LFC_median'])


In [None]:
#num_hit_short.sort_values("Detection frequency", ascending=False).to_csv(out_dir/"inoculum-results-gene-level-summary.csv", index=0)