In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml

# Plotting and display defaults for this notebook.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows / columns, floats with thousands separators and 2 decimals.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

# matplotlib: large figures, serif fonts, no LaTeX rendering.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
    "text.usetex": False,  # set to True to render text through LaTeX
    "font.family": "serif",
    "font.serif": "cm",
})

In [None]:
# Directory holding the test output. Set the TNSEQ_TEST_OUT environment variable to run
# the notebook against another location; without it the original path is used.
root = Path(os.environ.get(
    "TNSEQ_TEST_OUT",
    "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/deutschbauer/fastq/test_out",
))

In [None]:
%ls /nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/deutschbauer/fastq/test_out

In [None]:
# blastn result table analysed in the cells below.
blast_file = root.joinpath("TnSeq_SB2B_ML5_l10.blastn")

In [None]:
def _find_most_likely_positions_v2(temp_blastn_file, filter_below, perc_primary_location=0.75) -> pd.DataFrame:
    """
    Take a tabular blastn output file and report the most likely location of each barcode.

    Steps: drop weak hits, keep the best-scoring hit(s) of every query, merge hits of a
    barcode on the same subject sequence whose start lies at most 5 positions after the
    previous hit's start, and keep the positions holding most of a barcode's reads.

    :param temp_blastn_file: headerless, tab-separated blastn table with the columns
        qseqid sseqid pident length qstart qend sstart send evalue bitscore qseq sstrand.
        The qseqid is split on "_": field 2 is used as the barcode, field 4 as its read count.
    :param filter_below: positions whose summed read count is not above this value are dropped
    :param perc_primary_location: share of a barcode's reads a position has to exceed
        in order to be reported
    :return: pd.DataFrame with the columns barcode, sseqid, sstrand, sstart, total_count,
        prop_read_per_position
    """
    print("Running")

    def merge_similar_locations(df):
        # Called once per (barcode, sseqid) group, so 'Group' alone identifies a cluster.
        # Not grouping on 'barcode' here also keeps this working when groupby.apply
        # does not hand the grouping columns to the function.
        df = df.sort_values(['sstart']).reset_index()
        # A new cluster starts whenever a hit begins after the previous hit's `end`
        # (= sstart + 5); the running sum of those breaks is the cluster id.
        df['Group'] = ((df['end'].rolling(window=2, min_periods=1).min()
                        - df['sstart'].rolling(window=2, min_periods=1).max()) < 0).cumsum()
        cluster_totals = (df.groupby('Group', as_index=False)['cnt'].sum()
                          .rename(columns={'cnt': 'total_count'}))
        # Represent every cluster by its hit with the highest read count.
        top_hit = df.loc[df.groupby('Group')['cnt'].idxmax()]
        top_hit = top_hit.merge(cluster_totals, on=['Group'])
        return top_hit[['sstart', 'sstrand', 'total_count']]

    print('reading the file')
    # NOTE: only the first 5,000,000 rows are read; anything beyond that is ignored.
    df = pd.read_table(temp_blastn_file, nrows=5000000, header=None)
    df.columns = "qseqid sseqid pident length qstart qend sstart send evalue bitscore qseq sstrand".split()
    # Filter out spurious hits
    print("Filter out spurious hits")
    df = df[(df.evalue < 0.1) & (df.length > 20)]
    # Get a best hit for each qseqID( barcode:host combo): group by qseqid, find max bitscore
    print("Get a best hit for each qseqID")
    best_hits = df.groupby('qseqid', as_index=False)['bitscore'].max()
    # Split the query id once; field 2 is the barcode, field 4 the read count.
    qseqid_fields = best_hits['qseqid'].str.split('_', expand=True)
    best_hits['barcode'] = qseqid_fields[2]
    best_hits['cnt'] = qseqid_fields[4].astype(int)
    # Hits tying for the best bitscore of a query are all kept by this merge.
    query_best_hits = best_hits.merge(df, how='left', on=['qseqid', 'bitscore'])
    query_best_hits['end'] = query_best_hits['sstart'] + 5
    print("merge similar positions")
    # for each barcode, find all positions detected, and count how many reads per position
    total_counts = query_best_hits.groupby(['barcode', 'sseqid']).apply(merge_similar_locations).reset_index()
    print("calculate prop / positions")
    total_counts['prop_read_per_position'] = (
        total_counts['total_count'] / total_counts.groupby('barcode')['total_count'].transform('sum'))
    likely_positions = total_counts[total_counts['prop_read_per_position'] > perc_primary_location].reset_index()
    # Both conditions must be evaluated on `total_counts`: `likely_positions` has been
    # re-indexed, so a mask taken from it would be matched against the wrong rows.
    likely_multimappers = (total_counts[(total_counts['prop_read_per_position'] < perc_primary_location)
                                        & (total_counts['total_count'] > filter_below)]
                           .barcode.nunique())
    print(likely_multimappers)
    likely_positions = likely_positions[likely_positions.total_count > filter_below]

    return likely_positions[['barcode', 'sseqid', 'sstrand', 'sstart', 'total_count', 'prop_read_per_position']]

In [None]:
# Map barcodes with the apply-based implementation: keep positions with more than
# 10 reads that hold over 75% of their barcode's reads.
lp2 = _find_most_likely_positions_v2(
    blast_file,
    filter_below=10,
    perc_primary_location=0.75,
)

In [None]:
def _find_most_likely_positions_v3(temp_blastn_file, filter_below, perc_primary_location=0.75) -> pd.DataFrame:
    """
    Take a tabular blastn output file and report the most likely location of each barcode.

    The file is read in chunks of 1,000,000 rows and weak hits are dropped per chunk.
    Position clusters are then built with vectorised operations on the whole table
    instead of one ``groupby.apply`` call per barcode.

    :param temp_blastn_file: headerless, tab-separated blastn table with the columns
        qseqid sseqid pident length qstart qend sstart send evalue bitscore qseq sstrand.
        The qseqid is split on "_": field 2 is used as the barcode, field 4 as its read count.
    :param filter_below: positions whose summed read count is not above this value are dropped
    :param perc_primary_location: share of a barcode's reads a position has to exceed
        in order to be reported
    :return: pd.DataFrame with the columns barcode, sseqid, sstart, sstrand, total_count,
        prop_read_per_position
    """
    print("Running")
    print('reading the file')

    # Read from the path passed in, not from a notebook-level variable.
    chunks = pd.read_table(temp_blastn_file, header=None,
                           names="qseqid sseqid pident length qstart qend sstart send evalue bitscore qseq sstrand".split(),
                           usecols="qseqid sseqid pident length sstart evalue bitscore sstrand".split(),
                           chunksize=1000000)
    # Drop spurious hits one chunk at a time.
    df = pd.concat([chunk[(chunk.evalue < 0.1) & (chunk.length > 20)] for chunk in chunks])

    # Get a best hit for each qseqID( barcode:host combo): group by qseqid, find max bitscore
    print("Get a best hit for each qseqID")
    best_hits = df.groupby('qseqid', as_index=False)['bitscore'].max()
    # Split the query id once; field 2 is the barcode, field 4 the read count.
    qseqid_fields = best_hits['qseqid'].str.split('_', expand=True)
    best_hits['barcode'] = qseqid_fields[2]
    best_hits['cnt'] = qseqid_fields[4].astype(int)
    # Hits tying for the best bitscore of a query are all kept by this merge.
    query_best_hits = best_hits.merge(df, how='left', on=['qseqid', 'bitscore'])
    query_best_hits['end'] = query_best_hits['sstart'] + 5
    print("merge similar positions")
    query_best_hits = query_best_hits.sort_values(['barcode', 'sseqid', 'sstart'])
    # Running break counter: it increases whenever a hit starts after the previous row's
    # `end` (= sstart + 5). The counter runs over the whole sorted table, so it is only
    # meaningful once prefixed with barcode and sseqid below.
    query_best_hits['Group'] = ((query_best_hits.end.rolling(window=2, min_periods=1).min()
                                 - query_best_hits.sstart.rolling(window=2, min_periods=1).max()) < 0).cumsum()
    query_best_hits['Group'] = (query_best_hits.barcode + "_" + query_best_hits.sseqid
                                + "_" + query_best_hits.Group.astype(str))
    cnt = (query_best_hits.groupby('Group', as_index=False)['cnt'].sum()
           .rename(columns={'cnt': 'total_count'}))
    # Represent every cluster by its hit with the highest read count.
    loc = query_best_hits.loc[query_best_hits.groupby(['Group'])['cnt'].idxmax()]
    loc = loc.merge(cnt, on=['Group'])
    total_counts = loc[['barcode', 'sseqid', 'sstart', 'sstrand', 'total_count']].copy()
    print("calculate prop / positions")
    total_counts['prop_read_per_position'] = (
        total_counts['total_count'] / total_counts.groupby('barcode')['total_count'].transform('sum'))
    likely_positions = total_counts[total_counts['prop_read_per_position'] > perc_primary_location]
    # Number of barcodes with a well-supported position that is not their primary one.
    likely_multimappers = (total_counts[(total_counts['prop_read_per_position'] < perc_primary_location)
                                        & (total_counts.total_count > filter_below)]
                           .barcode.nunique())
    print(likely_multimappers)
    likely_positions = likely_positions[likely_positions.total_count > filter_below]
    return likely_positions
    
    

In [None]:
# Map barcodes with the vectorised implementation: keep positions with more than
# 10 reads that hold over 75% of their barcode's reads.
lp3 = _find_most_likely_positions_v3(
    blast_file,
    filter_below=10,
    perc_primary_location=0.75,
)

In [None]:
# Persist the barcode map next to the BLAST output (the DataFrame index is written too).
lp3.to_csv(root.joinpath("24-06-2022-sb2b-l10-map.csv"))

In [None]:
# Histogram of the summed read count per reported position, zoomed to 0-100 reads.
lp3["total_count"].hist(bins=1000)
plt.gca().set_xlim(0, 100)

In [None]:

def add(previous_result, new_result):
    """Return ``previous_result.add(new_result, fill_value=0)``.

    For pandas objects this is a label-aligned sum in which a label present on
    only one side is added to 0 instead of producing NaN.
    """
    combined = previous_result.add(new_result, fill_value=0)
    return combined

In [None]:
# Load the complete BLAST table in one go, label its twelve columns,
# then keep the hits with evalue < 0.1 and length > 20.
chunks = pd.read_table(blast_file, header=None)
chunks.columns = [
    "qseqid", "sseqid", "pident", "length", "qstart", "qend",
    "sstart", "send", "evalue", "bitscore", "qseq", "sstrand",
]
df = chunks.loc[(chunks["evalue"] < 0.1) & (chunks["length"] > 20)]

In [None]:
def filter_chunks(chunk):
    """Return the rows of ``chunk`` whose ``evalue`` is below 0.1 and whose ``length`` exceeds 20."""
    low_evalue = chunk["evalue"] < 0.1
    long_enough = chunk["length"] > 20
    return chunk[low_evalue & long_enough]

In [None]:
# Stream the BLAST table in chunks of 1,000,000 rows, loading only the columns needed
# downstream, and filter each chunk before concatenating.
chunks = pd.read_table(
    blast_file,
    header=None,
    names=[
        "qseqid", "sseqid", "pident", "length", "qstart", "qend",
        "sstart", "send", "evalue", "bitscore", "qseq", "sstrand",
    ],
    usecols=["qseqid", "sseqid", "pident", "length", "sstart", "evalue", "bitscore", "sstrand"],
    chunksize=1000000,
)
df = pd.concat([filter_chunks(chunk) for chunk in chunks])


In [None]:
# (rows, columns) of the filtered hit table `df`.
df.shape

In [None]:
# NOTE(review): `fdf` is not defined in any cell visible here; unless another cell
# creates it, this raises NameError on a fresh kernel — confirm and restore its definition.
fdf.sample(20)

In [None]:
# First rows of `fdf`.
fdf.head()

In [None]:
# Interactive scatter of the two `sstart` columns of `fdf`.
# NOTE(review): the _x/_y suffixes presumably come from a pandas merge of two position tables — confirm.
px.scatter(fdf, x='sstart_x', y='sstart_y')