In [None]:
import pandas as pd
import plotnine as p9
from pathlib import Path
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotnine as p9

In [None]:
# Root directory holding one sub-directory of count/result files per sequencing run (dnaid).
results_dir = "../../data/processed/results"

# Every visible sub-directory is treated as one sample (dnaid). Plain files and hidden
# entries are skipped because the loaders below expect
# `<results_dir>/<sample>/<sample>_counts_<exp>.txt`. Sorting makes the load order
# reproducible, since `Path.iterdir()` yields entries in arbitrary order.
samples = sorted(
    entry.stem
    for entry in Path(results_dir).iterdir()
    if entry.is_dir() and not entry.name.startswith(".")
)

# sample -> experiment ids, taken from the third "_"-separated field of each counts
# file stem (`<sample>_counts_<exp>`).
libraries = {
    sample: sorted(f.stem.split("_")[2] for f in (Path(results_dir)/sample).glob('*counts*.txt'))
    for sample in samples
}

# Placeholder; not filled in by any cell visible here.
library_to_mapping_file = {}

In [None]:
# Display the discovered sample (dnaid) names.
samples

In [None]:

# Re-assigned here with the same value as in the setup cell above.
results_dir = "../../data/processed/results"
# Table read from controls.txt (no header row; the four column names are supplied here).
# `controls` is not read by any cell visible in this part of the notebook.
controls = pd.read_table('../../data/metadata/controls.txt', header=None, 
                        names = ['DN', 'barcode', 'phenotype', 'conc'])
# Directory holding the per-sample `<sample>_metadata.txt` files.
meta_dir = "../../data/metadata"


def read_count_files(sample, exp, results_dir=results_dir):
    """Load one experiment's barcode count table and reshape it to long format.

    Reads ``<results_dir>/<sample>/<sample>_counts_<exp>.txt`` (space separated).
    The row index of the parsed table becomes the ``barcode`` column; ``Position``,
    ``Element``, ``Strand``, ``Feature`` and ``ShortName`` are kept as identifier
    columns and every remaining column is treated as one sequenced sample.

    Parameters
    ----------
    sample : str
        Run identifier (dnaid); names both the sub-directory and the file prefix.
    exp : str
        Experiment identifier; the file suffix, also stored in the ``exp`` column.
    results_dir : str or Path
        Root directory of the per-run result folders.

    Returns
    -------
    pandas.DataFrame
        One row per (barcode, sample) with the identifier columns, ``exp``,
        ``sample``, ``cnts``, ``proportion`` (share of that sample's total counts)
        and ``mouse``/``day``/``organ`` (the first three ``-``-separated fields of
        the sample label; fields that are absent are NaN).
    """
    counts_path = Path(results_dir)/f'{sample}/{sample}_counts_{exp}.txt'
    df = pd.read_table(counts_path, sep=" ").assign(exp=exp)
    df = (df.reset_index().rename({'index':'barcode'}, axis=1)
          .melt(id_vars=['barcode','Position', 'Element', 'Strand', 'Feature', 'ShortName', 'exp']))
    df['proportion'] = df['value']/ df.groupby('variable')['value'].transform('sum')
    # `expand=True` only creates as many columns as the longest label has fields, so a
    # file whose labels all have fewer than three fields used to raise KeyError below.
    # Reindexing guarantees columns 0..2 exist (NaN where a field is missing).
    parts = df['variable'].str.split('-', expand=True).reindex(columns=range(3))
    df['mouse'], df['day'], df['organ'] = parts[0], parts[1], parts[2]
    return df.rename({'variable':'sample', 'value':'cnts'}, axis=1)


def load_sample(sample, meta_dir = meta_dir, results_dir = results_dir):
    """Load the long-format counts of every experiment belonging to one run (dnaid).

    The experiment ids are the distinct values of the ``exp`` column in
    ``<meta_dir>/<sample>_metadata.txt`` (tab separated, no header row).

    Parameters
    ----------
    sample : str
        Run identifier (dnaid).
    meta_dir : str or Path
        Directory containing ``<sample>_metadata.txt``.
    results_dir : str or Path
        Root directory of the per-run result folders, forwarded to
        ``read_count_files``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``read_count_files`` output for each experiment, with an
        extra ``dnaid`` column set to ``sample``.
    """
    meta = (pd.read_table(Path(meta_dir)/f'{sample}_metadata.txt', header=None,
                          names=['DN', 'lib', 'exp', 'DN2', 'sample', 'day', 'organ'])
            .drop(['DN', 'DN2'], axis=1))
    # Forward results_dir explicitly: previously the argument was accepted but the
    # count files were always read from read_count_files' own default directory.
    dfs = [read_count_files(sample, exp, results_dir) for exp in meta.exp.unique()]
    return pd.concat(dfs).assign(dnaid=sample)


def load_samples(samples, meta_dir=meta_dir, results_dir=results_dir):
    """Load several runs (dnaids) with ``load_sample`` and stack them into one frame."""
    per_sample = []
    for name in samples:
        per_sample.append(load_sample(name, meta_dir, results_dir))
    return pd.concat(per_sample)


# Load the long-format count data for every dnaid listed in `samples`.

fdf = load_samples(samples)

# Feature -> ShortName lookup: one row per distinct (Feature, ShortName) pair, indexed by Feature.
gene_info = fdf[['Feature', 'ShortName']].drop_duplicates().set_index('Feature')



def load_meta(sample, meta_dir = meta_dir, results_dir = results_dir):
    """Read ``<meta_dir>/<sample>_metadata.txt`` and return it without the DN/DN2 columns.

    The file is parsed as tab separated with no header row; ``results_dir`` is
    accepted but not used.
    """
    column_names = ['DN', 'lib', 'exp', 'DN2', 'sample', 'day', 'organ']
    metadata_path = Path(meta_dir)/f'{sample}_metadata.txt'
    raw = pd.read_table(metadata_path, header=None, names=column_names)
    return raw.drop(['DN', 'DN2'], axis=1)
            
def load_all_meta(samples, meta_dir=meta_dir, results_dir=results_dir):
    """Read the metadata table of every sample with ``load_meta`` and stack them."""
    tables = []
    for name in samples:
        tables.append(load_meta(name, meta_dir, results_dir))
    return pd.concat(tables)
            
# Build an `exp` -> `lib` lookup from the combined metadata of all samples.
meta = load_all_meta(samples)
# to_dict() on the frame gives {'lib': {exp: lib, ...}}; the inner mapping is kept.
meta_dict = meta[['lib', 'exp']].set_index('exp').to_dict()
meta_dict = meta_dict['lib']
meta_dict

In [None]:
# Add the library name to the count table. `replace` leaves any `exp` value that is
# missing from `meta_dict` unchanged. This mutates `fdf` in place.
fdf['lib'] = fdf.exp.replace(meta_dict)
# Spot check: inoculum rows for one barcode annotated as hilD.
new = fdf[(fdf.barcode =='ATGGCCAGCCAATCTAG') & (fdf.ShortName == 'hilD') & (fdf.mouse == 'inoculum')]

new[['barcode', 'ShortName', 'cnts', 'lib', 'dnaid']].drop_duplicates()

In [None]:
# `df`: inoculum rows with more than 1000 counts, restricted to the columns used below.
# The explicit .copy() makes `df` an independent frame, so adding the `lib` column
# does not write into a slice of `fdf` (which triggers SettingWithCopyWarning).
is_abundant_inoculum = (fdf.mouse == 'inoculum') & (fdf.cnts > 1000)
df = fdf[is_abundant_inoculum][['barcode', 'Position', 'Element','Feature','exp', 'dnaid', 'cnts']].copy()
# Map each experiment id to its library name (ids missing from `meta_dict` stay unchanged).
df['lib'] = df.exp.replace(meta_dict)

df.head()

In [None]:
# Directory scanned by `all_libraries`: each entry is expected to contain a `barcode_map.txt`.
# NOTE(review): absolute path on a network share; it only resolves where that share is mounted.
map_dir = "/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/"\
                "chris/hardt/nguyenb/tnseq_mapping_2031/scratch/"


def get_library(mapping_file):
    """Read one headerless, tab-separated barcode map and tag it with its library name.

    The first column of the file is used as the row index and the remaining ones
    are named ``lib_counts``, ``position``, ``chr``, ``strand``, ``norm_count``,
    ``locus`` and ``gene``. The added ``library`` column holds the stem of the
    directory that contains ``mapping_file``.
    """
    column_names = ['lib_counts', 'position', 'chr', 'strand', 'norm_count', 'locus', 'gene']
    library_name = Path(mapping_file).parent.stem
    table = pd.read_table(mapping_file, header=None, index_col=0, names=column_names)
    return table.assign(library=library_name)

def all_libraries(mapping_dir):
    """Load and stack the barcode map of every library found under ``mapping_dir``.

    Parameters
    ----------
    mapping_dir : str or Path
        Directory whose immediate sub-directories each hold a ``barcode_map.txt``.
        Entries without that file are skipped.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of ``get_library`` output, in sorted path order.

    Raises
    ------
    FileNotFoundError
        If no ``*/barcode_map.txt`` exists under ``mapping_dir``.
    """
    # Use the argument: the previous body ignored `mapping_dir` and always scanned
    # the notebook-level `map_dir`.
    map_files = sorted(Path(mapping_dir).glob("*/barcode_map.txt"))
    if not map_files:
        raise FileNotFoundError(f"No '*/barcode_map.txt' files found under {mapping_dir}")
    return pd.concat([get_library(map_file) for map_file in map_files])


# Stack every library's barcode map, then move the row index into a regular `barcode` column.
all_lib = all_libraries(map_dir)
all_lib = all_lib.reset_index().rename({'index':'barcode'}, axis=1)
# Random preview; no seed is set, so the five rows differ between runs.
all_lib.sample(5)

In [None]:
# Preview of `df`.
df.head()

In [None]:
# (rows, columns) of the stacked mapping table.
all_lib.shape

In [None]:
# For each library name present in `df`:
#   lib_mapping_bc[lib] - all barcodes listed for that library in `all_lib`
#   lib_barcodes[lib]   - the subset of those barcodes that also occur in `df`
lib_barcodes = {}
lib_mapping_bc = {}
for i, g in df.groupby('lib'):
    mapping_bc = set(all_lib[all_lib.library == i].barcode.values)
    inoculum_bc = set(g.barcode.values)
    #print(len(set(g.barcode.values)))
    #print(len(set(all_lib[all_lib.library == i].barcode.values)))
    barcodes = inoculum_bc.intersection(mapping_bc)
    #print(len(barcodes))
    lib_barcodes[i] = barcodes
    lib_mapping_bc[i] = mapping_bc

In [None]:
# Print each library name followed by the number of shared barcodes.
for k,v in lib_barcodes.items():
    print(k)
    print(len(v))

In [None]:
# Number of shared barcodes common to these two libraries (raises KeyError if either key is absent).
len(lib_barcodes['library_12_1'].intersection(lib_barcodes['library_10_2']))

In [None]:
# `lib_hi`: the rows of `all_lib` whose barcode is in that library's `lib_barcodes` set.
# Libraries without an entry in `lib_barcodes` are skipped.
# The per-library subset gets its own name: it used to be assigned to `df`, which
# overwrote the inoculum count table that later cells pass to `vis_library_insertions`.
hc_df = []
for lib_name, lib_rows in all_lib.groupby('library'):
    if lib_name not in lib_barcodes:
        continue
    lib_subset = lib_rows[lib_rows.barcode.isin(lib_barcodes[lib_name])]
    hc_df.append(lib_subset)
lib_hi = pd.concat(hc_df)

In [None]:
# Preview of the filtered mapping table.
lib_hi.head()

In [None]:
# Barcodes listed in the mapping files of both libraries (raises KeyError if either key is absent).
len(lib_mapping_bc['library_12_1'].intersection(lib_mapping_bc['library_10_2']))

In [None]:
lib_hi.head()

In [None]:
# Distinct `Element` values in `df`. Not rendered: it is not the last expression of this cell.
df.Element.unique() # Chromosome and 3 plasmids


def vis_library_insertions(df, element= 'FQ312003.1', div=1000000, unit='Mb'):
    """Plot per-barcode counts along one replicon, coloured by per-gene library coverage.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Element``, ``Feature``, ``barcode``, ``lib``,
        ``Position`` and ``cnts``. It is not modified.
    element : str
        Value of ``Element`` to keep.
    div : int or float
        Divisor applied to ``Position`` before plotting.
    unit : str
        Unit name shown in the x-axis label; should correspond to ``div``.

    Returns
    -------
    tuple
        ``(plot_df, plot)``: the annotated rows of ``element`` with scaled
        ``Position`` plus the per-``Feature`` columns ``barcode_uniq`` and
        ``num_lib_present``, and the plotnine figure built from them.

    Notes
    -----
    Sets the global ``p9.options.figure_size`` to ``(35, 10)``.
    """
    on_element = df[df['Element'] == element]

    # Only interested in annotated insertions: rows whose Feature is '-' are dropped.
    annotated = on_element[on_element.Feature != '-']
    # Per feature: number of distinct barcodes and number of distinct libraries.
    per_gene_summary = annotated.groupby(['Feature']).agg({'barcode':'nunique',  'lib':'nunique'}).reset_index()
    per_gene_summary.columns = ['Feature',  'barcode_uniq','num_lib_present']
    plot_df = annotated.merge(per_gene_summary[['Feature','barcode_uniq','num_lib_present']], how= 'left',on='Feature')
    plot_df['Position'] = plot_df['Position']/div
    p9.options.figure_size = (35, 10)
    # NOTE(review): the stems end at y=0 while the y axis is log10-scaled; confirm they
    # render as intended.
    g = (p9.ggplot(plot_df, p9.aes(x='Position', y='cnts', color='num_lib_present'))
         + p9.geom_point()
         + p9.geom_segment(p9.aes(x='Position', xend='Position', y='cnts', yend=0), alpha=0.5)
         + p9.theme_classic()
         + p9.theme(text = p9.element_text(size=24))
         + p9.xlab(f"Position, {unit}")  # label previously misspelled as "Postion"
         + p9.scale_y_log10())
    return plot_df, g
# Earlier per-position aggregation, kept for reference (`lib_sum` is not defined in the cells shown here):
# to_plot = lib_sum.groupby(['Position']).agg({'lib':'nunique', 'cnts':'mean', 'barcode':'nunique'}).reset_index()
# to_plot['Position'] = to_plot['Position']/1000000
# to_plot.tail()




# Plot with the function defaults (element 'FQ312003.1', positions divided by 1e6, unit 'Mb').
# NOTE(review): `df` must still be the inoculum count table (columns Element, Feature,
# barcode, lib, Position, cnts) when this cell runs.
ldf, g = vis_library_insertions(df)
g


In [None]:
# Same kind of plot as `vis_library_insertions`, built from the mapping table `lib_hi`
# (columns `chr`, `locus`, `position`, `lib_counts`, `library`) for element 'FQ312003.1'.
# Cell-specific names are used so that the notebook-level `ldf` (frame returned by
# `vis_library_insertions`) and `fdf` (count table) are not overwritten here.
lib_hi_chr = lib_hi[lib_hi['chr'] == 'FQ312003.1']

# Drop rows whose locus is '-'.
lib_hi_chr = lib_hi_chr[lib_hi_chr.locus != '-']
# Per locus: number of distinct barcodes and number of distinct libraries.
lib_hi_gene_summary = lib_hi_chr.groupby(['locus']).agg({'barcode':'nunique',  'library':'nunique'}).reset_index()
lib_hi_gene_summary.columns = ['locus',  'barcode_uniq','num_lib_present']
lib_hi_plot = lib_hi_chr.merge(lib_hi_gene_summary[['locus','barcode_uniq','num_lib_present']], how= 'left',on='locus')
lib_hi_plot['position'] = lib_hi_plot['position']/1000000
p9.options.figure_size = (40, 10)
g = (p9.ggplot(lib_hi_plot, p9.aes(x='position', y='lib_counts', color='num_lib_present'))
     + p9.geom_point()
     + p9.geom_segment(p9.aes(x='position', xend='position', y='lib_counts', yend=0), alpha=0.5)
     + p9.theme_classic()
     + p9.theme(text = p9.element_text(size=24))
     + p9.xlab("Position, Mb")  # label previously misspelled as "Postion"
     + p9.scale_y_log10())

In [None]:
# Render the most recently built plot.
g

In [None]:
lib_hi.head()

In [None]:
# Rows for one feature. Requires `ldf` to have a `Feature` column, as the frame returned by
# `vis_library_insertions` does.
# NOTE(review): make sure `ldf` has not been rebound to a `lib_hi` subset (which has
# `locus`, not `Feature`) before running this.
ldf[ldf.Feature == 'gene-SL1344_4404'].head()

In [None]:

# Element 'NC_017720.1', positions divided by 1000 (unit label 'Kb').
ldf, g = vis_library_insertions(df, element = 'NC_017720.1', div=1000, unit='Kb')
g


In [None]:
# Element 'NC_017718.1', positions divided by 1000 (unit label 'Kb').
ldf, g = vis_library_insertions(df, element = 'NC_017718.1', div=1000, unit='Kb')
g


In [None]:
# Element 'NC_017719.1', positions divided by 1000 (unit label 'Kb').
ldf, g = vis_library_insertions(df, element = 'NC_017719.1', div=1000, unit='Kb')
g

In [None]:
# Distinct `Element` values in `df`.
df.Element.unique()

In [None]:
#to_plot3 = lib_sum3.groupby(['Position']).agg({'cnts':'mean', 'barcode':'nunique'}).reset_index()
# NOTE(review): `lib_sum3` (with columns Position, cnts, lib_y) is not assigned in any
# cell shown here; this cell raises NameError on a fresh run. TODO: restore its definition.
p9.options.figure_size = (30, 10)
(p9.ggplot(lib_sum3, p9.aes(x='Position', y='cnts', color='lib_y'))
 + p9.geom_point()
 + p9.geom_segment(p9.aes(x='Position', xend='Position', y='cnts', yend=0), alpha=0.5)
 + p9.theme_classic()
 + p9.theme(text = p9.element_text(size=24))
 + p9.xlab("Position, Kb")  # label previously misspelled as "Postion"
 + p9.scale_y_log10())

In [None]:
# All rows of `df` that are not on element 'FQ312003.1', summarised per position:
# distinct libraries, mean counts and distinct barcodes. Positions are divided by 1000.
lib_sum2 = df[df['Element'] != 'FQ312003.1']
to_plot2 = lib_sum2.groupby(['Position']).agg({'lib':'nunique', 'cnts':'mean', 'barcode':'nunique'}).reset_index()
to_plot2['Position'] = to_plot2['Position']/1000
to_plot2.shape

In [None]:
# NOTE(review): `to_plot` is not assigned by any earlier cell shown here (only by a later
# one, without a `barcode` column); this fails on a fresh top-to-bottom run.
to_plot.sort_values('barcode').tail()

In [None]:
# Distinct barcodes per library, summed over libraries (a barcode present in several
# libraries is counted once per library).
df.groupby('lib').agg({'barcode':'nunique'}).sum()

In [None]:
df.head()

In [None]:
# NOTE(review): same `to_plot` dependency as above.
to_plot.shape

In [None]:
# Experiment TV5490A corresponds to library_13_1.
# NOTE(review): the four paths below are absolute network-share paths; they only resolve
# where those shares are mounted.
counts_file = "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/"\
              "scratch/results/dnaid2023/dnaid2023_counts_TV5490A.txt"

mapping_file = "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/"\
                "scratch/results/dnaid2023/dnaid2023_features_TV5490A.txt"

results_file = "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/"\
                "scratch/results/dnaid2023/dnaid2023_results_TV5490A.txt"

omapping_file = "/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/"\
                "chris/hardt/nguyenb/tnseq_mapping_2031/scratch/library_13_1/barcode_map.txt"

In [None]:
# NOTE(review): `lib_13` is first assigned in the next cell, so this preview fails on a
# fresh top-to-bottom run.
lib_13.head()

In [None]:
# Mapping rows of library_13_1 and the barcodes among them that are members of `inoc_bc`.
# NOTE(review): `inoc_bc` is not assigned in any cell shown here. TODO: confirm where it is defined.
lib_13 = all_lib[all_lib.library == 'library_13_1']
bc_in_lib = [c for c in lib_13.barcode.values if c in inoc_bc]

In [None]:
len(bc_in_lib)

In [None]:
# Overlaid histograms of `lib_counts`: all library_13_1 barcodes, then only those in
# `bc_in_lib`, on a log-scaled y axis.
lib_13.lib_counts.hist(bins=100, alpha=1)
lib_13[lib_13.barcode.isin(bc_in_lib)].lib_counts.hist(bins=100, alpha=0.7)
plt.yscale('log')

In [None]:
# Per library, restricted to rows with lib_counts > 100: distinct barcodes, genes and positions.
all_lib[all_lib.lib_counts > 100].groupby('library').agg({'barcode':['nunique'], 'gene':['nunique'], 'position':['nunique']})

In [None]:
# Same summary over all rows, plus mean/max/min of lib_counts.
lib_info = all_lib.groupby('library').agg({'barcode':['nunique'], 'gene':['nunique'], 
                                'position':['nunique'], 'lib_counts': ['mean', 'max', 'min']})

In [None]:
# Replace the two-level column index with flat labels, in the same order as the aggregations above.
lib_info.columns = ["# uniq barcodes", "# uniq genes", '# unique positions', 'mean counts per barcode', 
                   "max count", "min count"]
lib_info.sort_index()

In [None]:
# Histogram of lib_counts for library_13_1 (150 bins, log-scaled y axis).
plt.figure(figsize=(10, 6))
sns.set_style("white")
sns.set_context("notebook", font_scale=1.5)
all_lib[all_lib.library == 'library_13_1'].lib_counts.hist(bins=150)
plt.yscale('log')
plt.ylabel("Frequency")
plt.xlabel("Counts")
plt.title("library_13_1")

In [None]:
# Same histogram for library_9_3.
plt.figure(figsize=(10, 6))
sns.set_style("white")
sns.set_context("notebook", font_scale=1.5)

all_lib[all_lib.library == 'library_9_3'].lib_counts.hist(bins=150)
plt.yscale('log')
plt.ylabel("Frequency")
plt.xlabel("Counts")
plt.title("library_9_3")

In [None]:
# Preview of the mapping rows on element 'FQ312003.1'.
all_lib[all_lib.chr == 'FQ312003.1'].head()

In [None]:
# Spot checks of individual barcodes.
all_lib[all_lib.barcode == 'AGCTAGACGGAAGGACT']

In [None]:
all_lib[all_lib.barcode == 'ATGGCCAGCCAATCTAG']

In [None]:
df[df.barcode == 'ATGGCCAGCCAATCTAG']

In [None]:
all_lib[all_lib.barcode == 'ACGGTGAGGAGAGGGAG'][['barcode', 'lib_counts', 'position', 'chr', 'gene', 'library']]

In [None]:
# Number of `lib_hi` rows per (barcode, chr, position); the 25 largest values are shown.
# The third column label is spelled 'Positon' in the assigned names.
t = lib_hi.groupby(['barcode', 'chr', 'position']).agg({'library':['count']}).reset_index()
t.columns = ['Barcode', 'Chromosome', 'Positon', 'num_lib']
t.sort_values('num_lib').tail(25)

In [None]:
# Mapping rows on element 'FQ312003.1'.
chr_df = all_lib[all_lib.chr == 'FQ312003.1']

In [None]:
# Same table as above, rebuilt from the unfiltered `all_lib`; this rebinds `t`.
t = all_lib.groupby(['barcode', 'chr', 'position']).agg({'library':['count']}).reset_index()
t.columns = ['Barcode', 'Chromosome', 'Positon', 'num_lib']
t.sort_values('num_lib').tail(25)

In [None]:
# Histogram of `num_lib` with seaborn styling, log-scaled y axis.
sns.set_style("white")
sns.set_context("notebook", font_scale=1.5)
t.num_lib.hist(bins=25)
plt.yscale("log")
plt.xlabel('# of libraries with same insertion')
plt.ylabel("# of barcodes")
#plt.xlim(1, 22)

In [None]:
# Same histogram without the seaborn style calls.
t.num_lib.hist(bins=25)
plt.yscale("log")
plt.xlabel('# of libraries with same insertion')
plt.ylabel("# of barcodes")

In [None]:
t.barcode.nunique()

In [None]:
t[t.num_lib == 1].shape[0]/t.shape[0]

In [None]:
all_lib.reset_index().rename({'index':'barcode'}, axis=1).groupby('library').barcode.nunique()

In [None]:
# NOTE(review): this cell and the next read `t.chr`, `position` and `counts`, but the `t`
# assigned by the preceding cells has the columns Barcode/Chromosome/Positon/num_lib.
# They presumably expect a different `t`. TODO: confirm the intended cell order.
chrom = t[t.chr == 'FQ312003.1']
chrom.head()

In [None]:
# Rows with counts > 1000, ordered by position; positions are then divided by 1e6.
test = chrom.sort_values(by='position')[chrom.counts > 1000]
test['position'] = test['position']/1000000
test

In [None]:
# Mapping rows at one specific position.
all_lib[all_lib.position == 4877761]

In [None]:
# Per position on element 'FQ312003.1': number of distinct libraries and mean read count.
all_lib_chr = all_lib[all_lib['chr'] == 'FQ312003.1']
# `all_lib` stores the read counts in `lib_counts` (there is no `counts` column); the
# aggregate is renamed to `counts`, the column name the following plot cell uses.
to_plot = (all_lib_chr.groupby('position')
           .agg({'library':'nunique', 'lib_counts':'mean'})
           .rename(columns={'lib_counts': 'counts'})
           .reset_index())
# Scale positions by 1e6.
to_plot['position'] = to_plot['position']/1000000
to_plot.head()

In [None]:
# Mean counts per position from `to_plot`, coloured by the number of libraries, log-scaled y axis.
p9.options.figure_size = (30, 10)
(p9.ggplot(to_plot, p9.aes(x='position', y='counts', color='library'))
 #+ p9.geom_line()
 + p9.geom_point()
 + p9.geom_segment(p9.aes(x='position', xend='position', y='counts', yend=0), alpha=0.5)
 + p9.theme_classic()
#  + p9.ylab("Count")
#  + p9.xlab("Conc")
  + p9.scale_y_log10())
 #+ p9.scale_color_distiller(palette=1, ))
#  + p9.facet_wrap('~ sample'))

In [None]:
# NOTE(review): `library_qc` is not defined in any cell shown here. TODO: confirm where it comes from.
# The following cells read `counts`, `chr` and `position` from the `t` it returns.
t = library_qc(omapping_file)
# Histogram of counts, view limited to x in [0, 20000] and y in [0, 100].
t.counts.hist(bins=5000)
plt.xlim(0, 20000)
plt.ylim(0, 100)

In [None]:
# Rows of `t` on element 'FQ312003.1', with `counts` renamed to `read_num`.
chrom = t[t.chr == 'FQ312003.1']
chrom = chrom.rename({'counts': 'read_num'}, axis=1)

# Boolean per position: True where more than one row maps to that position.
bc_per_position = (chrom.groupby('position').read_num.count()>1)
pos_keep = bc_per_position[bc_per_position].index
# For a group of values: frequency of each distinct value, ordered by value, first two entries.
fnc = lambda x: x.value_counts().sort_index().head(2)
# Keep only positions with more than one row, then apply `fnc` to read_num per position.
chrom2 = chrom[chrom.position.isin(pos_keep)]
grp = chrom2.groupby('position')['read_num'].apply(fnc)#.reset_index(1, name='cnts')
# Earlier approach, kept for reference: get positions with more than one barcode mapped
#
##pos_with_multiple_bc = bc_per_position.where(bc_per_position > 1).dropna()
#position = pos_with_multiple_bc.index
# Subset to these positions, group by position 

In [None]:
grp.reset_index()

In [None]:
grp

In [None]:
# NOTE(review): `x` is not assigned at notebook level in any cell shown here; this raises
# NameError on a fresh run.
x

In [None]:
# Rows at one specific position.
chrom[chrom.position == 832]

In [None]:
# Per (chr, position): number of rows and mean of `counts`. The result has two-level
# column labels such as ('counts', 'count').
t2 = t.groupby(['chr', 'position']).agg({'counts':['count', 'mean']}).reset_index()

In [None]:
# Positions with more than one row.
t2[t2[('counts', 'count')] >1]

In [None]:
# Second-largest count at this position. `.iloc[1]` selects by rank within the nlargest
# result; plain `[1]` would be looked up as an index label instead.
t[t.position == 4816912]['counts'].nlargest(2).iloc[1]

In [None]:
# Histogram of rows-per-position for element 'NC_017720.1'.
t2[t2.chr == 'NC_017720.1'][[('counts', 'count')]].hist(bins=50)

In [None]:
# (rows, columns) of the entries with counts > 2.
t[t.counts > 2].shape

In [None]:
# Raw mapping file read without column names, so its columns are labelled 0, 1, 2, ...
# This assignment was commented out, which left `old_map` undefined for the lines below.
old_map = pd.read_table(omapping_file, header=None)
# Number of rows whose column 6 equals this identifier (raises KeyError if it never occurs).
old_map[6].value_counts().loc['gene-SL1344_RS24690']

In [None]:
old_map.head()

In [None]:
# Load the counts, features and results files of one experiment (space separated).
# NOTE: this rebinds `df`; from here on it is the wide count table of this experiment,
# with the row index moved into an `index` column and the annotation columns dropped.
df = (pd.read_csv(counts_file, sep=" ").reset_index()
    .drop(['Position', 'Element', 'Strand', 'Feature', 'ShortName' ], axis=1))
features = pd.read_csv(mapping_file, sep=" ")
results = pd.read_csv(results_file, sep=" ")

In [None]:
results.head()

In [None]:
# Long format: one row per (index, sample column). The column label is split on '-' into
# mouse/day/organ, and the values are renamed to `count`.
df2 = pd.melt(df, id_vars = 'index')
expansion = df2['variable'].str.split('-', expand=True)
df2['mouse'], df2['day'],df2['organ'] = expansion[0], expansion[1], expansion[2]
df2 = df2.set_index('index').drop(['variable'], axis=1).rename({'value': 'count'}, axis=1)
# Attach Feature/ShortName by index. NOTE: this rebinds `fdf` to the single-experiment table.
fdf = df2.merge(features[['Feature', 'ShortName']], how='left', left_index=True, right_index=True)
fdf = fdf.reset_index().rename({'index':'barcode'}, axis=1)

In [None]:
fdf.head()

### How many barcodes are present in the inoculum vs. the library?

In [None]:
# Rows of the single-experiment table whose `mouse` field is 'inoculum'.
inoc = fdf[fdf.mouse == 'inoculum']
inoc.head()

In [None]:
print("\n")
print(f"Number of unique barcodes identified: {inoc.barcode.nunique()}")
# This is a row count (rows whose Feature is '-'), not a distinct-barcode count.
print(f"Number of barcodes that were not mapped: {sum(inoc.Feature == '-')}")
print("\n")
print("Dropping Unmapped Barcodes")
print("\n")

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the printed output above. TODO: confirm.
3036-579

In [None]:
# Keep only inoculum rows with an annotated feature (rows whose Feature is '-' are dropped).
inocm = inoc[inoc.Feature != '-']
# Number of rows per feature, stored in a column named 'Feature' (the histogram cell below
# plots that column). The explicit rename pins the column name: pandas >= 2.0 names the
# result of value_counts() 'count' instead of re-using the Series name.
geneInfo = inocm.Feature.value_counts().rename('Feature').to_frame()
geneInfo

In [None]:
# Histogram of the `Feature` column of `geneInfo` (rows per feature), 50 bins.
(p9.ggplot(geneInfo, p9.aes('Feature'))
 + p9.geom_histogram(bins=50, fill='#428bca')
 + p9.theme_classic()
 + p9.ylab("Count")
 + p9.xlab("# of barcodes per feature"))


In [None]:
inocm.head()

In [None]:
# Per feature: mean, standard deviation and number of rows of `count`.
inoc_sum =inocm.groupby('Feature').agg({'count':['mean', 'std', 'count']})

In [None]:
inoc_sum.tail()

In [None]:
# Spot check of one feature.
inocm[inocm.Feature == 'gene-SL1344_RS27340']

In [None]:
# Report distinct barcodes, as the message states (the row count was printed before;
# the two differ whenever a barcode occurs in more than one inoculum row).
print(f"After removal of unmapped barcodes there are {inocm.barcode.nunique()} unique BC")
# Number of rows with a count of exactly 0.
print(f"There are {inocm[inocm['count']==0].shape[0]} barcodes with 0 counts")
print("\n")
print("Removing barcodes with 0 counts")

In [None]:
# Drop rows with zero counts and recompute the per-feature mean/std/row count.
inocm2 = inocm[inocm["count"] >0]
inoc_sum2 =inocm2.groupby('Feature').agg({'count':['mean', 'std', 'count']})

In [None]:
# Compare one feature before ...
inoc_sum[inoc_sum.index == 'gene-SL1344_4197']

In [None]:
# ... and after removing zero-count rows.
inoc_sum2[inoc_sum2.index == 'gene-SL1344_4197']

In [None]:
# Spot check of one feature.
inocm2[inocm2.Feature == 'gene-InvR']

In [None]:
# Rows per feature after removing zero-count rows, stored in a column named 'Feature'.
# The explicit rename pins the column name that `aes('Feature')` relies on:
# pandas >= 2.0 names the result of value_counts() 'count'.
geneInfo2 = inocm2.Feature.value_counts().rename('Feature').to_frame()
print(geneInfo2.describe())
(p9.ggplot(geneInfo2, p9.aes('Feature'))
 + p9.geom_histogram(bins=50, fill='#428bca')
 + p9.theme_classic()
 + p9.ylab("Count")
 + p9.xlab("# of barcodes per feature"))

In [None]:
# Histogram of `count` over the non-zero inoculum rows, 50 bins.
(p9.ggplot(inocm2, p9.aes('count'))
 + p9.geom_histogram(bins=50, fill='#428bca')
 + p9.theme_classic()
 + p9.ylab("# of BC")
 + p9.xlab("Count"))

In [None]:
# Summary statistics of the same column.
inocm2['count'].describe()

In [None]:
# All rows (every sample, not just inoculum) for one feature.
fdf[fdf.Feature == 'gene-InvR']