In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml

# Apply the seaborn context first so the explicit matplotlib values set
# further down take precedence over anything it configures.
sns.set_context("notebook", font_scale=1.1)

# pandas display: show up to 100 rows/columns, and render floats with a
# thousands separator and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

# matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
    "text.usetex": False,  # True activates latex output in fonts!
    "font.family": "serif",
    "font.serif": "cm",
})

# Load project configurations and set up data files

In [None]:
config_file = "../nguyenb_config.yaml"
# Parse the project configuration into a plain Python dictionary.
# safe_load only builds standard YAML types (mappings, lists, strings,
# numbers, ...), so a Python-specific tag in the file cannot construct
# arbitrary objects the way the more permissive loaders can.
with open(config_file) as file:
    configs = yaml.safe_load(file)

In [None]:
# Run locally: uncomment the two assignments below and skip the server cell that follows.
# root = Path(configs['root']['local'])
# scratchDir = configs['scratchDir']['local']

In [None]:
# Run on server: pick the 'server' variant of each configured location.
root = Path(configs["root"]["server"])
scratchDir = configs["scratchDir"]["server"]

In [None]:
# Join each configured location onto the project root, then read the
# sample sheet from its configured path.
mapDir = root.joinpath(configs['mapDir'])
countDir = root.joinpath(configs['libraryCountsDir'])
resultDir = root.joinpath(configs['resultDir'])
sampleData = pd.read_csv(root.joinpath(configs['sampleData']))

# QC Maps
## Load new and old maps

In [None]:
# New maps: a single concatenated table in the configured map directory.
new_maps = pd.read_csv(mapDir/"14-04-22-concatenated_map.csv")

# Old maps: one annotated CSV per sub-directory of the 08_21 scratch run.
# NOTE(review): absolute server path; consider moving it into the project config.
oldMapDir = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/08_21/maps")
# Sort the matches: glob order depends on the filesystem, and sorting keeps
# the row order of old_maps the same from run to run.
old_map_files = sorted(oldMapDir.glob("*/*annotated.csv"))
# Fail early with a readable message instead of pd.concat's generic
# "No objects to concatenate" when nothing matches.
if not old_map_files:
    raise FileNotFoundError(f"No '*annotated.csv' maps found under {oldMapDir}")
# Tag every row with its library, taken as the file name up to the first dot.
old_maps = pd.concat([pd.read_csv(f).assign(library=f.stem.split('.')[0]) for f in old_map_files])

In [None]:
# Five random rows of the new maps, as a quick look at the columns.
new_maps.sample(n=5)

In [None]:
# Five random rows of the old maps, as a quick look at the columns.
old_maps.sample(n=5)

In [None]:
# Outer join on (barcode, library): keeps entries present in only one of the
# two maps, with NaN in the columns that come from the other map.
merged_map = pd.merge(new_maps, old_maps, how='outer', on=['barcode', 'library'])

In [None]:
# Columns of the merged map that the comparisons below look at.
to_show = [
    'barcode',
    'cnt',
    'number_of_reads',
    'insertion_site',
    'chr',
    'sseqid',
    'sstart',
    'library',
]

In [None]:
# Five random rows of the merged map, restricted to the columns of interest.
merged_map.loc[:, to_show].sample(n=5)

## See if any barcodes are missing from the new maps

In [None]:
# Rows of the outer merge that have no insertion_site value.
merged_map.loc[merged_map['insertion_site'].isna(), to_show]

## See if there are new barcodes in the new maps

In [None]:
# Read-count distribution of the rows that have no sstart value, i.e. the
# entries of the outer merge without an old-map position.
ax = merged_map.loc[merged_map['sstart'].isna(), 'number_of_reads'].hist(bins=50)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel='number_of_reads',
    ylabel='number of barcodes',
    title='Barcodes without an old-map position (sstart is NaN)',
);

In [None]:
# Rows without an sstart value that are nevertheless backed by more than
# 1000 reads.
merged_map.loc[
    merged_map['sstart'].isna() & (merged_map['number_of_reads'] > 1000),
    to_show,
]

## See if the locations are consistent between maps

In [None]:
# Keep only the rows where every column of interest is populated, then
# narrow the frame to those columns.
mmap = merged_map.dropna(subset=to_show)[to_show]

In [None]:
# (rows, columns) of the merged entries that have every `to_show` column populated.
mmap.shape

In [None]:
# (rows, columns) of the full outer merge, for comparison with mmap.shape.
merged_map.shape

In [None]:
# For one library, plot sstart against insertion_site to see whether the
# positions recorded in the two maps agree.
mmap.loc[mmap['library'] == 'library_9_1'].plot.scatter(x="sstart", y="insertion_site")

# QC Counts

## Load new and old counts

In [None]:
# Collect the merged mbarq count tables. Sorted so that the row order of
# newCounts does not depend on the filesystem's listing order.
new_count_files = sorted(countDir.glob("*mbarq_merged_counts.csv"))
# Fail early with a readable message instead of pd.concat's generic
# "No objects to concatenate" when nothing matches.
if not new_count_files:
    raise FileNotFoundError(f"No '*mbarq_merged_counts.csv' files found in {countDir}")
# Tag every row with the part of the file name that precedes '_mbarq'.
newCounts = pd.concat([pd.read_csv(f).assign(library=f.stem.split('_mbarq')[0]) for f in new_count_files])

In [None]:
# Five random rows of the new count table, as a quick look at the columns.
newCounts.sample(n=5)

In [None]:
# Locations of the previous analysis run (08_21) on the server.
old_root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
old_dataDir = old_root / "scratch/08_21/counts/"
controls_file = old_root / "data/metadata/controls.txt"
metafile = old_root / "scratch/08_21/complete_metadata.tsv"

# Per-sample count files, one level below the counts directory.
files = list(old_dataDir.glob("*/*_mapped.csv"))
files_unmapped = list(old_dataDir.glob("*/*_unmapped.csv"))

# The metadata table has no header row: the first column becomes the index
# and the remaining columns are named here.
metadata = pd.read_table(metafile, index_col=0, header=None)
metadata.columns = ["library", "experiment", "mouse", "day", "tissue", "dnaid", "sampleID"]

# Stack the mapped counts, tagging each row with the sample ID taken from the
# file name (the part before '_counts'), attach the metadata, and drop rows
# that end up without a mouse value.
df = (
    pd.concat([pd.read_csv(f, index_col=0).assign(sampleID=f.stem.split('_counts')[0]) for f in files])
    .merge(metadata, how='left', on='sampleID')
    .dropna(subset=['mouse'])
)

# Wide table: one row per barcode, one ('barcode_cnt', sampleID) column per
# sample; reset_index turns the barcode index into the ('barcode', '') column.
old_counts = df[['sampleID', 'barcode', 'barcode_cnt']].drop_duplicates()
old_counts = old_counts.pivot(index='barcode', columns='sampleID').reset_index()

In [None]:
# Compare new and old counts for a single sample, barcode by barcode.
sampleID = 'dnaid2019_65'
left = newCounts[['barcode', sampleID]]
# old_counts has two-level columns; pick the barcode column and this sample's
# counts, then flatten the names so the frames can be merged on 'barcode'.
right = old_counts[[('barcode',''), ('barcode_cnt', sampleID)]]
right.columns = ['barcode', f"{sampleID}_old"]
# Inner join: only barcodes present in both count tables are compared.
df2 = left.merge(right, how='inner', on='barcode')
plt.plot(df2[sampleID], df2[f"{sampleID}_old"], 'k.')
plt.xscale('log')
plt.yscale('log')
# Label the axes so it is clear which one holds the new and which the old counts.
plt.xlabel(f"{sampleID} (new counts)")
plt.ylabel(f"{sampleID} (old counts)")
plt.title(f"{sampleID}: new vs. old barcode counts");