In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml

# Seaborn context goes first: it writes font/line sizes into matplotlib's
# rcParams, and the explicit overrides below then replace those values.
sns.set_context("notebook", font_scale=1.1)

# Let pandas show up to 100 rows and 100 columns before truncating a frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Matplotlib defaults for every figure drawn in this notebook.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
    "font.serif": "cm",
})
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

In [None]:
# Directory that holds the pipeline outputs read by the cells below.
# The cluster path stays the default; set TNSEQ_FASTQ_ROOT to point the
# notebook at a copy of the data somewhere else without editing this cell.
root = Path(os.environ.get(
    "TNSEQ_FASTQ_ROOT",
    "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/deutschbauer/fastq",
))

# Maps

In [None]:
# Load the two annotated map CSVs found under `root`.
map1, map2 = (
    pd.read_csv(root / csv_name)
    for csv_name in (
        "test_out/TnSeq_SB2B_ML5_l0.annotated.csv",
        "test_out/TnSeq_SB2B_ML5_tn2_l0.annotated.csv",
    )
)

In [None]:
# Histogram of number_of_reads for a random subset of map1 rows, zoomed to 0-100.
# The subset size is capped at the table length (sample() raises when asked for
# more rows than exist) and the draw is seeded so the figure is reproducible.
ax = map1.number_of_reads.sample(n=min(20000, len(map1)), random_state=0).hist(bins=1000)
ax.set_xlim(0, 100);

In [None]:
# (rows, columns) of the first annotated map.
map1.shape

In [None]:
# Shape of the map1 subset with more than 100 reads.
map1.loc[map1['number_of_reads'].gt(100)].shape

In [None]:
# Scratch calculation: log2 of 20 (20 is also the read-count threshold
# applied to map2 in nearby cells).
np.log2(20)

In [None]:
# (rows, columns) of the second annotated map.
map2.shape

In [None]:
# Shape of the map2 subset with more than 20 reads.
map2.loc[map2['number_of_reads'].gt(20)].shape

In [None]:
# Histogram of number_of_reads for a random subset of map2 rows, zoomed to 0-100.
# The subset size is capped at the table length (sample() raises when asked for
# more rows than exist) and the draw is seeded so the figure is reproducible.
ax = map2.number_of_reads.sample(n=min(40000, len(map2)), random_state=0).hist(bins=1000)
ax.set_xlim(0, 100);

In [None]:
# The 20 map2 rows with the highest number_of_reads, in ascending order.
map2.sort_values(by='number_of_reads', ascending=True).iloc[-20:]

In [None]:
# Inspect the map1 row(s) for one specific barcode.
map1.loc[map1['barcode'].eq('CTCAACATTTGAAGATGTTT')]

In [None]:
# Inspect the map2 row(s) for one specific barcode.
map2.loc[map2['barcode'].eq("CTTATGCTTCACAAATTGAG")]

In [None]:
# 10th percentile of the map2 read counts.
np.quantile(map2['number_of_reads'], q=0.1)

In [None]:
# Shape of the map2 subset with more than 10 reads.
map2.loc[map2['number_of_reads'].gt(10)].shape

In [None]:
# Shape of the map2 subset with more than 20 reads (same check as an earlier cell).
map2.loc[map2['number_of_reads'].gt(20)].shape

In [None]:
#map2[map2.multimap == True].sample(20000).number_of_reads.hist(bins=500)
# log2(number_of_reads) histogram for a random subset of map2 rows. The subset
# is capped at the table length and seeded so the figure is reproducible.
# NOTE(review): a read count of 0 would give -inf here - confirm counts are >= 1.
np.log2(map2.number_of_reads.sample(n=min(20000, len(map2)), random_state=0)).hist(bins=500);

In [None]:
# Path of the blastn output that the following cells parse.
blast_file = root.joinpath("test_out/TnSeq_SB2B_ML5_l0.blastn")

In [None]:
# The blast output is tab-separated with no header row; names are set in the next cell.
df = pd.read_csv(blast_file, sep="\t", header=None)

In [None]:
# Column names for the blast table, in file order.
blast_fields = "qseqid sseqid pident length qstart qend sstart send evalue bitscore qseq sstrand"
df.columns = blast_fields.split()

In [None]:
# (rows, columns) of the blast table before filtering.
df.shape

In [None]:
# Keep only hits with evalue below 0.1 and alignment length above 20.
df = df.loc[df['evalue'].lt(0.1) & df['length'].gt(20)]

In [None]:
# Preview the filtered blast hits. (The cell previously referenced `df3`,
# which is never defined in this notebook and raised NameError on a fresh run.)
df.head()

In [None]:
# Highest bitscore per query sequence. Selecting the column and calling max()
# with as_index=False yields flat columns ['qseqid', 'bitscore'] directly,
# instead of the MultiIndex columns produced by agg({'bitscore': ['max']}).
best_hits = df.groupby('qseqid', as_index=False)['bitscore'].max()

In [None]:
# Flatten the column labels, then pull the barcode out of the query id:
# it is the third '_'-separated field of qseqid.
best_hits.columns = ['qseqid', 'bitscore']
# .str[2] yields a Series; the previous `expand=True)[[2]]` assigned a
# one-column DataFrame to the new column.
best_hits['barcode'] = best_hits['qseqid'].str.split('_').str[2]

In [None]:
# Check the result of the barcode extraction.
best_hits.head()

In [None]:
# The fifth '_'-separated field of qseqid is parsed as an integer count.
# .str[4] yields a Series; the previous `expand=True)[[4]]` assigned a
# one-column DataFrame to the new column.
best_hits['cnt'] = best_hits['qseqid'].str.split('_').str[4].astype(int)

In [None]:
# Check the result of adding the cnt column.
best_hits.head()

In [None]:
#total_count = best_hits.groupby('barcode').cnt.sum().reset_index()

In [None]:
#total_count.columns = ['barcode', 'total_count']

In [None]:
#total_count.head()

In [None]:
#best_hits = best_hits.merge(total_count, how='left', on='barcode')

In [None]:
# Attach the remaining blast columns to each best hit. A query can still have
# several rows here when multiple hits share its maximum bitscore.
query_best_hits = pd.merge(best_hits, df, how='left', on=['qseqid', 'bitscore'])

In [None]:
# Inspect the merged best hits for one specific barcode.
query_best_hits.loc[query_best_hits['barcode'].eq('CTCTTGGACGTTGGCGCGAG')]

In [None]:
# Sum cnt per (barcode, sstart) pair ...
total_counts = (
    query_best_hits
    .groupby(['barcode', 'sstart'])['cnt']
    .sum()
    .rename('total_cnt')
    .reset_index()
)
# ... and express each pair's count as a fraction of its barcode's total.
barcode_totals = total_counts.groupby('barcode')['total_cnt'].transform('sum')
total_counts['tts'] = total_counts['total_cnt'].div(barcode_totals)

In [None]:
# Inspect the per-position counts for one specific barcode.
total_counts.loc[total_counts['barcode'].eq("AAGACGCCCTGCAGGGATGT")]

In [None]:
# (barcode, sstart) pairs holding more than 75% of the barcode's count
# and with a summed count above 10.
total_counts.loc[total_counts['tts'].gt(0.75) & total_counts['total_cnt'].gt(10)]

In [None]:
# Per barcode, sum the counts of positions whose share lies strictly between
# 0.1 and 0.75, then report how many barcodes exceed a total of 10.
mp = (
    total_counts
    .loc[total_counts['tts'].gt(0.1) & total_counts['tts'].lt(0.75)]
    .groupby('barcode')['total_cnt']
    .sum()
    .reset_index()
)
mp.loc[mp['total_cnt'].gt(10)].shape




In [None]:
# Distribution of tts over (barcode, sstart) pairs with a count above 10.
total_counts.loc[total_counts['total_cnt'].gt(10), 'tts'].hist(bins=500)

In [None]:
# Order rows by barcode and cnt, both descending, so that rank 0 within each
# barcode is the row with the highest cnt.
query_best_hits = query_best_hits.sort_values(by=['barcode', 'cnt'], ascending=[False, False])
query_best_hits['rank'] = query_best_hits.groupby('barcode').cumcount()
#query_best_hits = query_best_hits[query_best_hits['rank'] == 0].copy()
#query_best_hits.drop('rank', axis=1, inplace=True)

In [None]:
# Inspect the ranked best hits for one specific barcode.
query_best_hits.loc[query_best_hits['barcode'].eq('CTCTTGGACGTTGGCGCGAG')]

In [None]:
# Same barcode in the pre-merge best_hits table, for comparison.
best_hits.loc[best_hits['barcode'].eq('CTCTTGGACGTTGGCGCGAG')]

In [None]:
# Inspect the ranked best hits for one specific barcode (repeat of an earlier cell).
query_best_hits.loc[query_best_hits['barcode'].eq('CTCTTGGACGTTGGCGCGAG')]

In [None]:


        # NOTE(review): this cell appears to have been pasted from a class method:
        # it is indented and originally ended with `self.positions = ...`.
        # `self` does not exist in the notebook (NameError), so the result is
        # bound to a plain `positions` variable instead.
        #
        # WARNING: the cell overwrites query_best_hits and is not re-runnable on
        # its own; re-create query_best_hits (merge + rank cells) before running
        # it again, otherwise the merge below adds multimap_x/multimap_y columns.

        # There can still be multiple hits for each qseqid if they share the same
        # blast score. Flag a barcode as multimapping when the population
        # standard deviation (ddof=0) of its sstart values exceeds 5.
        multimap = (query_best_hits.groupby(['barcode']).sstart.std(ddof=0) > 5).reset_index().rename(
            {'sstart': 'multimap'},
            axis=1)
        query_best_hits = query_best_hits.merge(multimap, on='barcode')
        # For each barcode select the position supported by most reads:
        # sort by cnt descending and keep only the first row per barcode.
        query_best_hits = query_best_hits.sort_values(['barcode', 'cnt'], ascending=False)
        query_best_hits['rank'] = query_best_hits.groupby(['barcode']).cumcount()
        query_best_hits = query_best_hits[query_best_hits['rank'] == 0].copy()
        query_best_hits.drop('rank', axis=1, inplace=True)
        positions = query_best_hits