In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml

# Notebook-wide display defaults. The seaborn context is applied first, so the
# explicit matplotlib rcParams set afterwards win wherever the two overlap.
sns.set_context("notebook", font_scale=1.1)

# Allow up to 100 rows / columns before pandas truncates a DataFrame repr.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
})
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

In [None]:
# %load ../snippets/load_server_config.py
import yaml

config_file = "../nguyenb_config.yaml"

# Parse the YAML config into plain Python objects (nested dicts / scalars).
# NOTE(review): FullLoader can build more than plain data types; if this config only
# holds strings and mappings, yaml.safe_load(file) would be the safer call — confirm.
with open(config_file) as file:
    configs = yaml.load(file, Loader=yaml.FullLoader)

# Run on server: both base directories come from the 'server' entry of the config.
root = Path(configs['root']['server'])
scratchDir = Path(configs['scratchDir']['server'])

# Every remaining location is stored in the config relative to `root`.
mapDir, countDir, resultDir, sample_data_file = (
    root / configs[key]
    for key in ('mapDir', 'libraryCountsDir', 'resultDir', 'sampleData')
)

In [None]:
# Read the CSV at `sample_data_file` (the 'sampleData' entry of the config, relative
# to `root`) into a DataFrame.
sampleData = pd.read_csv(sample_data_file)

In [None]:
# Libraries taken into account when counting, per gene, the number of libraries
# that carry an insertion (used as an `isin` filter on the `library` column).
libraries = [
    'library_15_1',
    'library_13_2',
    'library_9_1',
    'library_10_1',
    'library_11_2',
    'library_12_1',
    'library_12_2',
    'library_13_1',
    'library_10_2',
    'library_14_2',
]

# Maps

In [None]:
# All annotated map tables found directly inside `mapDir`.
mapFiles = list(mapDir.glob('*.annotated.csv'))

In [None]:
# Stack every annotated map into one frame; the `library` column holds the file
# name with its '.annotated.csv' suffix removed.
mapsDf = pd.concat([
    pd.read_csv(map_path).assign(library=map_path.name.partition('.annotated.csv')[0])
    for map_path in mapFiles
])

## Summary Statistics

In [None]:
# Per-library summary of the annotated insertion maps, indexed by library name.
#   '# of insertions'                            -> unique barcodes
#   '# of barcodes mapped to multiple locations' -> sum of the `multimap` column
#   '# of insertions outside of CDS'             -> rows with distance_to_feature != 0
table1 = (mapsDf.groupby('library')
          .agg(**{
              '# of insertions': ('barcode', 'nunique'),
              '# of barcodes mapped to multiple locations': ('multimap', 'sum'),
              # Vectorised count; the builtin sum() would iterate the Series in Python.
              '# of insertions outside of CDS': ('distance_to_feature',
                                                 lambda x: (x != 0).sum()),
          })
          .rename_axis('Library'))

# Unique feature names hit at distance 0, per library. Reindexing on table1's index
# gives libraries without any such insertion an explicit 0 instead of NaN.
table1['# of gene with insertion'] = (mapsDf[mapsDf.distance_to_feature == 0]
                                      .groupby('library').Name.nunique()
                                      .reindex(table1.index, fill_value=0))
# # table1['Library'] = table1.Library.str.replace("_", '-')

In [None]:
# Display the per-library summary table.
table1

In [None]:
# Number of unique barcodes for every (library, Name) pair.
table2 = (
    mapsDf
    .groupby(['library', 'Name'])['barcode']
    .agg('nunique')
    .reset_index()
)

In [None]:
# (library, Name) pairs carrying more than 20 unique barcodes.
table2.loc[table2['barcode'] > 20]

## Number of times each gene was disrupted

In [None]:
# Location of the GFF annotation file that is parsed in the next cell.
# NOTE(review): this is a hard-coded absolute path, unlike the other inputs, which
# are resolved from the YAML config — consider moving it into the config as well.
gff_file = Path("/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/Projects_NCCR/"+
                "ref/SL1344/ncbi-genomes-2021-08-25/GCA_000210855.2_ASM21085v2_genomic.gff")

In [None]:
# Parse the GFF annotation into a per-gene table.
# skiprows=7 drops the leading lines of this particular file (presumably the GFF
# header pragmas — re-check the count if the annotation file is replaced).
gffDf = pd.read_table(gff_file, skiprows=7, header=None)
gffDf.columns = ['chr', 'src', 'feat_id', 'start', 'end', 'DN', 'strand', 'DN2', 'attribute']

# Keep only `gene` features on sequence FQ312003.1 and the columns used downstream.
# `.copy()` makes the subset an independent frame, so the column assignments below
# do not trigger SettingWithCopyWarning.
gffDf = gffDf.loc[(gffDf.chr == 'FQ312003.1') & (gffDf.feat_id == 'gene'),
                  ['chr', 'feat_id', 'start', 'end', 'strand', 'attribute']].copy()

# Gene length in kb. GFF coordinates are 1-based and inclusive at both ends, so a
# feature spans end - start + 1 bases.
gffDf['geneLen'] = ((gffDf['end'] - gffDf['start']).abs() + 1) / 1000

# Pull single keys out of the ';'-separated attribute string. Anchoring on the start
# of the string or a ';' stops `locus_tag=` from matching inside a longer key such as
# `old_locus_tag=`; a missing key yields NaN instead of raising.
gffDf['Name'] = gffDf.attribute.str.extract(r'(?:^|;)Name=([^;]*)', expand=False)
gffDf['locus_tag'] = gffDf.attribute.str.extract(r'(?:^|;)locus_tag=([^;]*)', expand=False)
gffDf.sample(5)

In [None]:
# For every feature name, count in how many of the selected libraries it is hit.
num_libs = (mapsDf[mapsDf.library.isin(libraries)]
            .groupby('Name').library.nunique()
            .rename('num_libs')
            .reset_index())
# Drop names containing ':'.
num_libs = num_libs[~num_libs.Name.str.contains(":")]

# Outer merge so that annotated genes without any insertion are kept as rows.
num_libs = num_libs.merge(gffDf, how='outer', on=['Name'])
# Only the count is filled with 0. A blanket fillna(0) would also write 0 into the
# annotation columns (chr, start, strand, ...) of names missing from the GFF table.
num_libs['num_libs'] = num_libs['num_libs'].fillna(0).astype(int)
num_libs.sample(10)

In [None]:
# Display the merged per-name table built in the previous cell.
num_libs

In [None]:
# Histogram: number of libraries in which each gene carries an insertion.
sns.set(font_scale=1.5)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(12, 6))
# With discrete=True seaborn draws unit-width bars centred on the integer values,
# so a separate `bins` argument is redundant and has been dropped.
sns.histplot(data=num_libs, x='num_libs', discrete=True,
             color=(0.20973515, 0.09747934, 0.24238489), ax=ax)
# num_libs is bounded by the number of selected libraries, so the ticks are derived
# from `libraries` rather than hard-coded.
ax.set_xticks(range(len(libraries) + 1))
ax.set_xlim(-1, len(libraries) + 1)
ax.set_xlabel("Number of libraries with gene disruption");



## Number of insertions per gene

In [None]:
# Unique barcodes per feature name within one library.
lib = 'library_13_1'
# NOTE(review): the variable is called `lib11` although it holds whatever `lib` selects.
# na=False keeps the mask boolean when Name contains missing values; without it,
# `~` raises TypeError on the NaN entries that str.contains returns for them.
lib11 = mapsDf[(mapsDf.library == lib) & ~mapsDf.Name.str.contains(':', na=False)]
bc_per_gene = lib11.groupby('Name').barcode.nunique().reset_index()
#bc_per_gene = bc_per_gene.merge(gffDf,how='outer', on=['Name']).fillna(0)

In [None]:
# (rows, columns) of the per-name barcode table for the selected library.
bc_per_gene.shape

In [None]:
# Shape of the subset with zero unique barcodes.
bc_per_gene.loc[bc_per_gene['barcode'] == 0].shape

In [None]:
# Shape of the subset with exactly one unique barcode.
bc_per_gene.loc[bc_per_gene['barcode'] == 1].shape

In [None]:
# Shape of the subset with more than one unique barcode.
bc_per_gene.loc[bc_per_gene['barcode'] > 1].shape

In [None]:
# Distribution of unique-barcode counts per gene for the library selected in `lib`.
# With discrete=True seaborn draws unit-width bars centred on the integer values,
# so the `bins` argument was redundant and has been dropped. Explicit axis labels
# replace the default column name 'barcode', which actually holds a count.
sns.histplot(data=bc_per_gene, x='barcode', discrete=True,
             color=(0.20973515, 0.09747934, 0.24238489)).set(
    xlabel=f"Unique barcodes per gene ({lib})", ylabel="Number of genes");

In [None]:
# Unique barcodes per feature name across all libraries, names containing ':' excluded.
# na=False keeps the mask boolean when Name contains missing values; without it,
# `~` raises TypeError on the NaN entries that str.contains returns for them.
num_insertions = (mapsDf[~mapsDf.Name.str.contains(':', na=False)]
                  .groupby('Name').barcode.nunique()
                  .rename('num_insertions')
                  .reset_index())
# Outer merge keeps annotated genes that have no insertion at all.
gene_insertions = num_insertions.merge(gffDf, how='outer', on=['Name'])

In [None]:
# Rows that came only from the annotation side of the merge have no count: use 0.
gene_insertions = gene_insertions.assign(
    num_insertions=lambda frame: frame['num_insertions'].fillna(0)
)


In [None]:
# Inspect 10 random rows (no random_state is set, so the selection differs per run).
gene_insertions.sample(10)

In [None]:
# Insertion density: insertions per unit of `geneLen`.
gene_insertions['geneIns'] = gene_insertions['num_insertions'].div(gene_insertions['geneLen'])

In [None]:
# Histogram of the per-gene insertion density, 300 bins.
gene_insertions['geneIns'].hist(bins=300)
#plt.yscale('log')

In [None]:
# Largest insertion density observed.
gene_insertions['geneIns'].max()

In [None]:
# Rows whose insertion density exceeds 60.
gene_insertions.loc[gene_insertions['geneIns'] > 60]

In [None]:
# Interactive scatter of insertion density against start coordinate;
# hovering over a point shows its Name.
px.scatter(
    data_frame=gene_insertions,
    x='start',
    y='geneIns',
    hover_data=['Name'],
)

# Results

In [None]:
# All '*_rra_results.csv' files found directly inside `scratchDir`.
# NOTE(review): these are read from scratchDir although a resultDir is configured — confirm.
result_files = list(scratchDir.glob("*_rra_results.csv"))

In [None]:
# Stack every result table; `library` is the file stem up to the first "_rra".
res = pd.concat([
    pd.read_csv(result_path, index_col=0).assign(library=result_path.stem.partition("_rra")[0])
    for result_path in result_files
])

In [None]:
# Inspect 10 random rows (no random_state is set, so the selection differs per run).
res.sample(10)

In [None]:
# Unique names in contrast 'd1' with LFC below -1 and neg_selection_fdr below 0.01.
day1_genes = res.loc[
    (res['contrast'] == 'd1')
    & (res['LFC'] < -1)
    & (res['neg_selection_fdr'] < 0.01),
    'Name',
].unique()

In [None]:
# Drop names containing ':'. Non-string entries (e.g. NaN coming from rows without
# a Name) are skipped as well instead of raising TypeError on the `in` test.
day1_genes = [g for g in day1_genes if isinstance(g, str) and ':' not in g]

In [None]:
# One name per line, in list order.
for gene_name in day1_genes:
    print(gene_name)

In [None]:
# 'd1' results for the sapA / sapF / sapD genes.
res.loc[res['Name'].isin(['sapA', 'sapF', 'sapD']) & (res['contrast'] == 'd1')]

In [None]:
# 'd1' results for the listed rec genes, ordered by name.
(
    res.loc[res['Name'].isin(['recA', 'recB', 'recC','recD', 'recG', 'recO'])
            & (res['contrast'] == 'd1')]
    .sort_values(by='Name')
)