In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


In [None]:
# IPython line magic: list the contents of the hard-coded l0 scratch directory.
%ls /nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/08_21/l0

In [None]:
# Project root and the scratch directory that holds the mapping results.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
dataDir = root / "scratch/08_21/"

# Annotated barcode maps, one per sub-directory of `maps/` and `l0/maps/`.
# glob() yields paths in filesystem order, which is not stable across runs or
# machines; sorting keeps the row order of anything built from these lists
# reproducible.
mapFiles = sorted((dataDir / 'maps').glob('*/*barcode_map.annotated.csv'))
mapFilesUnFiltered = sorted((dataDir / 'l0/maps').glob('*/*barcode_map.annotated.csv'))

- For each library: the number of insertions, the number of genes disrupted, and a histogram of the number of insertions per gene. Which genes have the most insertions?

In [None]:
# Read every annotated barcode map and tag its rows with the library name,
# i.e. the file name with the '.barcode_map.annotated.csv' suffix removed.
libraryFrames = []
for mapFile in mapFiles:
    libraryName = mapFile.name.split('.barcode_map.annotated.csv')[0]
    libraryFrames.append(pd.read_csv(mapFile).assign(library=libraryName))
mapsDf = pd.concat(libraryFrames)

In [None]:
# Per-library summary: unique barcodes, unique ShortName values, rows with a
# missing ShortName, and the sum of the `multimap` column.
table1 = (
    mapsDf.groupby('library')
    .agg(**{
        '# of insertions': ('barcode', 'nunique'),
        '# of genes with insertion': ('ShortName', 'nunique'),
        '# of insertions outside of CDS': ('ShortName', lambda names: names.isna().sum()),
        '# of barcodes mapped to multiple locations': ('multimap', 'sum'),
    })
    .reset_index()
    .rename(columns={'library': 'Library'})
)
# Library names are shown with dashes instead of underscores.
table1['Library'] = table1['Library'].str.replace("_", '-')

In [None]:
#table1.to_csv(dataDir/"14-10-2021-table1.csv")

In [None]:
# Display the per-library summary table.
table1

In [None]:
# Count barcodes per (library, ShortName), then compute, within each library,
# the fraction of ShortName entries having each barcode count.
y = (
    mapsDf.groupby(['library', 'ShortName'])['barcode']
    .count()
    .reset_index()
    .groupby('library')['barcode']
    .value_counts(normalize=True)
    .rename('num_b')
    .reset_index()
)
# Fraction of entries with exactly one barcode, per library.
y.loc[y['barcode'] == 1, 'num_b']

In [None]:
# Per-library statistics of the barcode count per (library, ShortName) pair.
table2 = (
    mapsDf.groupby(['library', 'ShortName'])['barcode']
    .count()
    .reset_index()
    .groupby('library')
    .agg(**{
        'Median insertions per gene': ('barcode', 'median'),
        'Mean insertions per gene': ('barcode', lambda counts: round(counts.mean(), 2)),
        'Min insertions per gene': ('barcode', 'min'),
        'Max insertions per gene': ('barcode', 'max'),
    })
    .reset_index()
    .rename(columns={'library': 'Library'})
)
# Library names are shown with dashes instead of underscores.
table2['Library'] = table2['Library'].str.replace("_", '-')

In [None]:
# Show only the median and maximum columns next to the library name.
table2.loc[:, ['Library', 'Median insertions per gene', 'Max insertions per gene']]

In [None]:
# Barcode count for every (library, locus_tag) pair.
insertionSum = (
    mapsDf.groupby(['library', 'locus_tag'])['barcode']
    .count()
    .reset_index()
)

# Keep only the pairs with at least 10 barcodes.
hiDisrupt = insertionSum.loc[insertionSum['barcode'] >= 10]

# For each locus tag: in how many libraries it passes the threshold, and the
# median barcode count over those libraries.
table3 = (
    hiDisrupt.groupby('locus_tag')
    .agg(**{
        '# of libraries with insertion': ('library', 'nunique'),
        'Median # of insertions': ('barcode', 'median'),
    })
    .reset_index()
    .rename(columns={'locus_tag': 'Locus Tag'})
)

# Smallest sstart per (locus_tag, ShortName, sseqid), used to order the table.
gene_loc = (
    mapsDf[['locus_tag', 'ShortName', 'sseqid', 'sstart']]
    .drop_duplicates()
    .groupby(['locus_tag', 'ShortName', 'sseqid'])['sstart']
    .min()
    .reset_index()
)

# Attach the location columns and sort by sequence id, then start position.
table3 = (
    table3.merge(gene_loc, how='left', left_on='Locus Tag', right_on='locus_tag')
    .loc[:, ['ShortName', '# of libraries with insertion', 'Median # of insertions',
             'sseqid', 'sstart']]
    .sort_values(['sseqid', 'sstart'])
)

In [None]:
# Display the table of locus tags with at least 10 barcodes in a library.
table3

In [None]:
# This cell originally displayed `table3_fq` before any cell had assigned it,
# which raises NameError on Restart Kernel -> Run All. Show the subset straight
# from `table3` so the cell no longer relies on out-of-order execution.
# NOTE(review): the FQ312003.1 subset is presumed from the `_fq` suffix - confirm.
# `sstart` is shown unscaled here.
table3[table3.sseqid == 'FQ312003.1']

In [None]:
# Rows of table3 on sequence HE654725.1, with sstart divided by 1,000,000.
table3_fq = (
    table3.loc[table3['sseqid'] == 'HE654725.1']
    .assign(sstart=lambda frame: frame['sstart'] / 1000000)
)

# (p9.ggplot(table3_fq, p9.aes(x='sstart', y='Median # of insertions', color='# of libraries with insertion'))
#  + p9.geom_point(size=6)
#  + p9.geom_segment(p9.aes(x='sstart', xend='sstart',
#                    y='Median # of insertions', yend=0), size=3, alpha=0.8)
#  + p9.theme_classic()
#  + p9.theme(text=p9.element_text(size=24), figure_size=(20, 10))
#  + p9.xlab(f"Postion, Mb")
#  + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r'))
#     + p9.scale_y_log10())

In [None]:
# Rows of table3 on sequence FQ312003.1, with sstart divided by 1,000,000.
table3_fq = (
    table3.loc[table3['sseqid'] == 'FQ312003.1']
    .assign(sstart=lambda frame: frame['sstart'] / 1000000)
)

# (p9.ggplot(table3_fq, p9.aes(x='sstart', y='Median # of insertions', color='# of libraries with insertion'))
#  + p9.geom_point(size=6)
#  + p9.geom_segment(p9.aes(x='sstart', xend='sstart',
#                    y='Median # of insertions', yend=0), size=3, alpha=0.8)
#  + p9.theme_classic()
#  + p9.theme(text=p9.element_text(size=24), figure_size=(20, 10))
#  + p9.xlab(f"Postion, Mb")
#  + p9.scale_color_gradientn(colors=sns.color_palette('rocket_r'))
#     + p9.scale_y_log10())

In [None]:
# Histogram of barcode counts per locus tag for library_16_2, with the x axis
# limited to 0-10.
sns.set(font_scale=2)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(
    data=insertionSum.loc[insertionSum['library'] == 'library_16_2'],
    x='barcode',
    color=(0.20973515, 0.09747934, 0.24238489),
    ax=ax,
)
ax.set_xticks([1, 2, 3, 4, 5, 10])
ax.set_title("Library 16-2")
ax.set_xlabel('Number of insertions per gene')
ax.set_xlim(0, 10);

In [None]:
# Stacked histogram of barcode counts per locus tag, coloured by library, with
# the x axis limited to 0-10.
sns.set(font_scale=1.5)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(
    data=insertionSum,
    x='barcode',
    hue='library',
    multiple='stack',
    palette='rocket',
    ax=ax,
)
ax.set_xticks([1, 2, 3, 4, 5, 10])
ax.set_xlim(0, 10);

In [None]:
# Display the combined barcode-map frame (all libraries concatenated).
mapsDf

Overlaps at gene level: histogram of # of insertions per gene across all libraries. Genes that were disrupted in most libraries.

In [None]:
# Number of distinct libraries in which each locus tag appears.
geneDisruption = (
    mapsDf.groupby('locus_tag')['library']
    .nunique()
    .reset_index()
)

In [None]:
# Histogram of how many libraries each locus tag appears in.
sns.set(font_scale=2)
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(
    data=geneDisruption,
    x='library',
    bins=20,
    color=(0.20973515, 0.09747934, 0.24238489),
    ax=ax,
)
ax.set_xticks(range(1, 21))
ax.set_title("Number of libraries with gene disruption")
ax.set_xlabel('Number of libraries with gene disruption')
ax.set_xlim(0, 21);




In [None]:
# Number of locus tags that appear in more than two libraries.
geneDisruption.loc[geneDisruption['library'] > 2, 'locus_tag'].nunique()

In [None]:
# Locus tags that appear in exactly 20 libraries.
geneDisruption.loc[geneDisruption['library'] == 20]

In [None]:
# Distinct (barcode, sseqid, sstart) combinations across all libraries.
mapsDf.loc[:, ['barcode', 'sseqid', 'sstart']].drop_duplicates()