In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults. The seaborn context is applied first so that
# the explicit matplotlib values below win wherever the two overlap.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns and render floats as 1,234.56
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: figure geometry, font sizes and line styling
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

In [None]:
# Project root and the scratch directory holding this run's outputs.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
dataDir = root / "scratch/08_21/"

# Annotated barcode maps live one directory level below `maps/`; the same
# pattern is collected from `maps/` and from `l0/maps/`.
_MAP_PATTERN = '*/*barcode_map.annotated.csv'
mapFiles = list((dataDir / 'maps').glob(_MAP_PATTERN))
mapFilesUnFiltered = list((dataDir / 'l0/maps').glob(_MAP_PATTERN))

# Loading mapping files

In [None]:
# Read every annotated map and tag its rows with the library name, i.e. the
# file name with the '.barcode_map.annotated.csv' suffix removed.
_MAP_SUFFIX = '.barcode_map.annotated.csv'
_libraryFrames = []
for _mapPath in mapFiles:
    _libraryName = _mapPath.name.split(_MAP_SUFFIX)[0]
    _libraryFrames.append(pd.read_csv(_mapPath).assign(library=_libraryName))
mapsDf = pd.concat(_libraryFrames)

In [None]:
# Keep only the hits whose subject sequence id is FQ312003.1.
_onReference = mapsDf['sseqid'] == 'FQ312003.1'
mapsDf = mapsDf.loc[_onReference]

In [None]:
# Preview five randomly drawn rows (no random_state is set, so the rows differ between runs).
mapsDf.sample(5)

# Count number of insertions per gene

In [None]:
# Number of distinct barcodes observed per gene (ShortName), as a two-column
# frame: ShortName, numIns.
num_insertions = (
    mapsDf
    .groupby('ShortName')['barcode']
    .nunique()
    .rename('numIns')
    .reset_index()
)

In [None]:
# (number of genes with at least one barcode, 2 columns)
num_insertions.shape

# Get all genes and gene length

In [None]:
# GFF annotation of the reference genome, given as directory / relative file path.
gff_file = (
    Path("/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/Projects_NCCR/")
    / "ref/SL1344/ncbi-genomes-2021-08-25/GCA_000210855.2_ASM21085v2_genomic.gff"
)

In [None]:
# Load the GFF annotation and keep one row per gene on sequence FQ312003.1.
#
# The nine names follow the tab-separated GFF column order; 'DN' and 'DN2' are
# the 6th and 8th fields, which are not used afterwards.
GFF_COLUMNS = ['chr', 'src', 'feat_id', 'start', 'end', 'DN', 'strand', 'DN2', 'attribute']
GFF_KEEP = ['chr', 'feat_id', 'start', 'end', 'strand', 'attribute']

# skiprows=7 drops the first seven lines of this particular file.
# NOTE(review): presumably the header block - confirm if the GFF file is ever replaced.
gffDf = pd.read_table(gff_file, skiprows=7, header=None, names=GFF_COLUMNS)

_isReferenceGene = (gffDf['chr'] == 'FQ312003.1') & (gffDf['feat_id'] == 'gene')

# .assign() returns a fresh frame, so later column assignments on gffDf do not
# operate on a slice of the raw table (no SettingWithCopyWarning).
# GFF start/end are 1-based and inclusive, hence the +1: a feature with
# start == end is 1 bp long, not 0. geneLen is expressed in kb.
gffDf = (
    gffDf
    .loc[_isReferenceGene, GFF_KEEP]
    .assign(geneLen=lambda g: ((g['end'] - g['start']).abs() + 1) / 1000)
)

In [None]:
def _gff_attribute(attributes, key):
    """Return the value of `key` in a 'k1=v1;k2=v2;...' attribute string, or None.

    Keys are compared exactly, so looking up 'locus_tag' never picks up the
    value of 'old_locus_tag', and a missing key yields None instead of raising.
    """
    for field in attributes.split(";"):
        name, sep, value = field.partition("=")
        if sep and name == key:
            return value
    return None


# Add gene name and locus tag parsed from the attribute column. assign() builds
# a new frame, so this never writes into a slice of an earlier frame.
gffDf = gffDf.assign(
    ShortName=gffDf['attribute'].apply(_gff_attribute, key="Name"),
    locus_tag=gffDf['attribute'].apply(_gff_attribute, key="locus_tag"),
)
gffDf.sample(5)

In [None]:
# (number of gene rows kept from the GFF, number of columns)
gffDf.shape

# Calculate gene insertion index (number of insertions per kb of gene length)

In [None]:
# Outer-join insertion counts with the gene table on ShortName; rows that have
# no insertion count after the join get numIns = 0.
df = (
    num_insertions
    .merge(gffDf, on='ShortName', how='outer')
    .fillna({'numIns': 0})
)

In [None]:
# Insertion index: insertions divided by gene length (geneLen is stored in kb).
df['geneIns'] = df['numIns'].div(df['geneLen'])

In [None]:
# (rows after the outer merge, number of columns)
df.shape

In [None]:
# Distribution of the insertion index, zoomed to the 0-10 range.
_histAx = df['geneIns'].hist(bins=2000)
_histAx.set_xlim(0, 10)

In [None]:
# Genes with an insertion index above 50, drawn as ticks at their start coordinate.
lotsInsertion = df.loc[df['geneIns'] > 50]
_fig, _eventAx = plt.subplots(figsize=(25, 5))
_eventAx.eventplot(lotsInsertion['start'], orientation="horizontal")

In [None]:
# Genes without any insertion (index exactly 0), drawn as ticks at their start coordinate.
noInsertion = df.loc[df['geneIns'] == 0]
_fig, _eventAx = plt.subplots(figsize=(25, 5))
_eventAx.eventplot(noInsertion['start'], orientation="horizontal")

In [None]:
# Summary statistics of the gene start coordinates (count, mean, min/max, quartiles).
df.start.describe()

In [None]:
# Three sub-tables selected by gene start coordinate (open intervals):
#   dfq1: start < 1.5 Mb, dfq2: 1.5-2.0 Mb, dfq3: 3.9-4.1 Mb
_geneStart = df['start']
_inQ1 = _geneStart < 1500000
_inQ2 = (_geneStart > 1500000) & (_geneStart < 2000000)
_inQ3 = (_geneStart > 3900000) & (_geneStart < 4100000)

dfq1 = df.loc[_inQ1]
dfq2 = df.loc[_inQ2]
dfq3 = df.loc[_inQ3]

In [None]:
# Overlay the insertion-index distributions of regions 1 and 3 on one set of
# axes, zoomed to 0-10 with a logarithmic count axis.
for _region, _tag in ((dfq1, '1'), (dfq3, '3')):
    _region['geneIns'].hist(bins=1000, alpha=0.7, label=_tag)

_regionAx = plt.gca()
_regionAx.set_xlim(0, 10)
_regionAx.set_yscale('log')
_regionAx.legend()

In [None]:
# Rolling mean of the insertion index over ROLLING_WINDOW consecutive genes in
# genomic order. df is the result of an outer merge, which orders rows by the
# join key (ShortName), so the window has to run over a start-sorted view;
# the assignment then aligns the result back onto df by index label.
ROLLING_WINDOW = 300
df['rolAvIns'] = (
    df.sort_values('start')['geneIns']
    .rolling(window=ROLLING_WINDOW)
    .mean()
)

In [None]:
# Order genes along the genome and draw the rolling average as a black line.
df = df.sort_values(by='start')
_fig, _rollAx = plt.subplots()
_rollAx.plot(df['start'], df['rolAvIns'], 'k')

In [None]:
# Show the genes whose start lies between 3.9 Mb and 4.1 Mb.
dfq3

# Calculate permutations

In [None]:
import random

In [None]:
# Permutation table for the dfq3 region: the observed per-gene insertion
# indices are randomly reassigned to the genes N_PERMUTATIONS times.
N_PERMUTATIONS = 10000
PERMUTATION_SEED = 0  # fixed so that a re-run reproduces the same numbers

dfq3_shuffle = pd.DataFrame()
dfq3_shuffle['gene'] = dfq3['ShortName']

# Work on a private copy: shuffling the array returned by `.values` in place
# would scramble dfq3.geneIns itself (or fail if that array is read-only).
l = dfq3['geneIns'].to_numpy(copy=True)

# Generator.permutation returns a new shuffled array each call and leaves `l` untouched.
rng = np.random.default_rng(PERMUTATION_SEED)
shuffles = [rng.permutation(l) for _ in range(N_PERMUTATIONS)]

# One row per permutation, one column per gene (columns labelled by ShortName).
shDf = pd.DataFrame(
    shuffles,
    columns=pd.Index(dfq3['ShortName'].to_numpy(), name='ShortName'),
)

# Fraction of permutations in which sadA receives an insertion index <= x.
x = 0
(shDf['sadA'] <= x).mean()

In [None]:
# Build the permutation table from `shuffles`: one row per permutation, one
# column per gene (columns labelled by ShortName).
shDf = pd.DataFrame(
    shuffles,
    columns=pd.Index(dfq3['ShortName'].to_numpy(), name='ShortName'),
)

# Fraction of permutations in which sadA receives an insertion index <= x.
# Using the mean of the boolean mask keeps the denominator tied to the actual
# number of permutations instead of a hard-coded 10000.
x = 0
(shDf['sadA'] <= x).mean()

In [None]:
# Show the permutation table (rows: permutations, columns: genes by ShortName).
shDf

In [None]:
# Distribution of the insertion index assigned to sadA across all permutations.
shDf['sadA'].hist(bins=100)

In [None]:
# Show the permutation table once more (rows: permutations, columns: genes by ShortName).
shDf

In [None]:
# Fraction of permutations in which sadA receives an insertion index <= x.
# The mean of the boolean mask divides by the real number of rows in shDf,
# so the result stays correct if the number of permutations changes.
x = 0
(shDf['sadA'] <= x).mean()