In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults. The seaborn context is applied first; the
# explicit matplotlib rcParams below are assigned afterwards.
sns.set_context("notebook", font_scale=1.1)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
    "font.family": "serif",
    "font.serif": "cm",
})

# pandas display: up to 100 rows/columns; floats rendered with a thousands
# separator and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

In [None]:
import sys
import os
import datetime
import math

In [None]:
# Project locations; every other path is derived from `root`.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
dataDir = root.joinpath("scratch", "08_21", "counts")            # count CSVs
controls_file = root.joinpath("data", "metadata", "controls.txt")  # control barcode table
outDir = root.joinpath("scratch", "08_21", "results", "nguyenb")   # output files
metafile = root.joinpath("scratch", "08_21", "complete_metadata.tsv")  # sample sheet

In [None]:
# Count tables live exactly one directory below dataDir.
files = list(dataDir.glob("*/*_mapped.csv"))
files_unmapped = list(dataDir.glob("*/*_unmapped.csv"))

# Headerless sample sheet: the first column is used as the index and the
# remaining columns are named explicitly here.
metadata = pd.read_table(metafile, header=None, index_col=0)
metadata.columns = [
    "library",
    "experiment",
    "mouse",
    "day",
    "tissue",
    "dnaid",
    "sampleID",
]

In [None]:
def _load_mapped(path):
    """Read one mapped count CSV and tag its rows with the sample ID.

    The sample ID is the part of the file stem that precedes '_counts'.
    """
    sample = path.stem.split('_counts')[0]
    return pd.read_csv(path, index_col=0).assign(sampleID=sample)


# Stack all mapped tables, attach sample metadata, and keep only rows whose
# sample has a `mouse` entry after the left join.
df = (
    pd.concat([_load_mapped(p) for p in files])
    .merge(metadata, how='left', on='sampleID')
    .dropna(subset=['mouse'])
)

In [None]:
def _load_unmapped(path):
    """Read one unmapped count CSV and tag its rows with the sample ID.

    The sample ID is the part of the file stem that precedes '_counts'.
    """
    sample = path.stem.split('_counts')[0]
    return pd.read_csv(path, index_col=0).assign(sampleID=sample)


# Same pipeline as for the mapped tables: stack, left-join metadata, and drop
# rows whose sample has no `mouse` entry.
df_unmapped = (
    pd.concat([_load_unmapped(p) for p in files_unmapped])
    .merge(metadata, how='left', on='sampleID')
    .dropna(subset=['mouse'])
)

In [None]:
# Control barcode table (headerless).
# NOTE(review): `barcode` must still be available as a column below; confirm
# how index_col=0 interacts with the three names for this file's layout.
controls = pd.read_table(
    controls_file,
    header=None,
    index_col=0,
    names=['barcode', 'phenotype', 'conc'],
)

# Zero-filled grid with one row per control barcode and one column per sample
# ID seen in df. It is only scaffolding to enumerate every barcode/sample pair.
a = np.zeros(shape=(controls.shape[0], df.sampleID.nunique()))
zdf = pd.DataFrame(a, columns=list(df.sampleID.unique()))
zdf = zdf.set_index(controls.barcode.values).reset_index()
zdf = zdf.rename(columns={'index': 'barcode'})

# Long format (one row per barcode/sample), annotated with sample metadata;
# the placeholder zero column is dropped again.
zdf = zdf.melt(id_vars=['barcode'], var_name='sampleID', value_name='zero_cnt')
zdf = zdf.merge(metadata, on='sampleID').drop(columns='zero_cnt')

In [None]:
# Expand the control table to every barcode/sample pair via zdf, pull in the
# observed counts from the unmapped tables, and treat pairs without a count
# as 0. ShortName is "<phenotype>-<conc>".
# NOTE: this cell reassigns `controls` and appends to `df`, so running it a
# second time without re-running the cells above changes the result.
controls = (
    controls
    .merge(zdf, on=['barcode'])
    .merge(
        df_unmapped[['barcode', 'sampleID', 'barcode_cnt']],
        how='left',
        on=['barcode', 'sampleID'],
    )
    .assign(
        barcode_cnt=lambda t: t['barcode_cnt'].fillna(0),
        ShortName=lambda t: t['phenotype'] + '-' + t['conc'].astype(str),
    )
)

# Append the control rows to the mapped counts and remember the control barcodes.
df = pd.concat([df, controls])
control_barcodes = controls.barcode.unique()

In [None]:
def calculate_correlation(controls, concentration_col = 'conc', cnt_col='barcode_cnt',
                          phenotype_col='phenotype', wt_phenotype='wt',
                          for_each='sampleID', cutoff=0.8):
    """
    Correlate log10(concentration) with log10(count + 1) per phenotype and sample.

    Parameters
    ----------
    controls : pandas.DataFrame
        Control barcode rows. Must contain the columns named by
        ``concentration_col``, ``cnt_col``, ``phenotype_col`` and ``for_each``.
        The input frame is not modified.
    concentration_col : str
        Column with the control concentration; it is passed to ``log10``
        unchanged, so values are expected to be positive.
    cnt_col : str
        Column with the barcode count; 1 is added before taking ``log10``.
    phenotype_col : str
        Column used, together with ``for_each``, to group rows.
    wt_phenotype : str
        Value of ``phenotype_col`` whose R2 decides whether a sample is good.
    for_each : str
        Column identifying the unit (e.g. sample) the correlation is computed for.
    cutoff : float
        A unit is good when its ``wt_phenotype`` group has R2 strictly above this.

    Returns
    -------
    control_cnts : pandas.DataFrame
        Copy of ``controls`` with added columns ``logConc``, ``logCnts``,
        ``R`` (correlation within the row's phenotype/unit group) and ``R2``.
    good_samples : numpy.ndarray
        Unique ``for_each`` values passing the cutoff, in order of appearance.
    """
    control_cnts = controls.copy()
    control_cnts['logConc'] = np.log10(control_cnts[concentration_col])
    control_cnts['logCnts'] = np.log10(control_cnts[cnt_col] + 1)

    # groupby(...).corr() yields one 2x2 matrix per group; the innermost index
    # level holds the matrix row label. Take the 'logConc' row and read its
    # 'logCnts' entry, addressing the level by position rather than by the
    # auto-generated 'level_2' column name.
    pairwise = control_cnts.groupby([phenotype_col, for_each])[['logConc', 'logCnts']].corr()
    corr_df = pairwise.xs('logConc', level=-1)['logCnts'].rename('R').reset_index()

    control_cnts = control_cnts.merge(corr_df, on=[for_each, phenotype_col])
    control_cnts['R2'] = control_cnts['R'] ** 2

    # Use phenotype_col here (not a hard-coded `.phenotype` attribute) so that
    # a non-default phenotype column name works.
    passes = (control_cnts['R2'] > cutoff) & (control_cnts[phenotype_col] == wt_phenotype)
    good_samples = control_cnts.loc[passes, for_each].unique()
    return control_cnts, good_samples

In [None]:
# Score the controls per sample; samples whose wild-type R2 exceeds 0.8 are
# returned as good_samples.
control_cnts, good_samples = calculate_correlation(
    controls,
    concentration_col='conc',
    cnt_col='barcode_cnt',
    phenotype_col='phenotype',
    wt_phenotype='wt',
    for_each='sampleID',
    cutoff=0.8,
)

In [None]:
# Sample IDs whose `mouse` label contains 'unenriched'; these are excluded from
# the analysis set. na=False makes a missing `mouse` value count as "no match";
# without it the mask contains NaN and boolean indexing raises.
unenriched = metadata[metadata.mouse.str.contains('unenriched', na=False)].sampleID.values

# Set membership avoids scanning the whole array once per sample.
unenriched_ids = set(unenriched)
good_samples = [s for s in good_samples if s not in unenriched_ids]
print(len(good_samples))

# Counts restricted to samples that passed the control-correlation filter.
clean_df = df[df.sampleID.isin(good_samples)]

In [None]:
# First MAGeCK test set: library_10_1, days d0 and d1, quality-filtered samples.
magDf = clean_df[clean_df.library == 'library_10_1']
print(magDf.experiment.nunique())
magDf = magDf[magDf.day.isin(['d0', 'd1'])]

# Comma-separated sample IDs per day (presumably pasted into the MAGeCK
# command line as treatment/control lists; confirm against the actual call).
print(",".join(magDf[magDf.day == 'd0'].sampleID.unique()))
print(",".join(magDf[magDf.day == 'd1'].sampleID.unique()))

# Wide count table: one row per barcode/ShortName, one column per sample.
# Barcode/sample pairs with no row in magDf become NaN after the pivot; they
# are filled with 0, as the other two MAGeCK exports in this notebook do, so
# the file has no empty count fields.
magDf2 = magDf[['barcode', 'ShortName', 'barcode_cnt', 'sampleID']]
magDf2 = (magDf2.pivot(index=['barcode', 'ShortName'], columns='sampleID', values='barcode_cnt')
          .reset_index().rename({'barcode': 'sgRNA', 'ShortName': 'gene'}, axis=1)
          .fillna(0))

# Wild-type control barcodes, written one per line.
negCntrl = magDf[magDf.phenotype == 'wt'].barcode.unique()
with open(outDir/'mageck_control-sgrna.txt', 'w') as fc:
    fc.writelines(f"{c}\n" for c in negCntrl)
magDf2.to_csv(outDir/'mageck_counts.txt', index=False, sep='\t')

In [None]:
# Display the wild-type control barcodes currently held in negCntrl.
negCntrl

In [None]:
# Number of distinct experiments per library among the quality-filtered samples.
clean_df.groupby('library')['experiment'].nunique()

In [None]:
# Second MAGeCK test set: library_14_2 (quality-filtered samples), exported
# together with a sample/experiment/day batch table.
magDf = clean_df.loc[clean_df['library'] == 'library_14_2']
print(magDf['experiment'].nunique())
magDf = magDf.loc[magDf['day'].isin(['d0', 'd1'])]

# Comma-separated sample IDs for each of the two days.
print(",".join(magDf.loc[magDf['day'] == 'd0', 'sampleID'].unique()))
print(",".join(magDf.loc[magDf['day'] == 'd1', 'sampleID'].unique()))

# Wide count table (rows: barcode/ShortName, columns: samples); missing
# entries are written as 0.
magDf2 = magDf.loc[:, ['barcode', 'ShortName', 'barcode_cnt', 'sampleID']]
magDf2 = (
    magDf2
    .pivot(index=['barcode', 'ShortName'], columns='sampleID', values='barcode_cnt')
    .reset_index()
    .rename(columns={'barcode': 'sgRNA', 'ShortName': 'gene'})
    .fillna(0)
)

# One row per distinct sample/experiment/day combination.
batchFile = magDf.loc[:, ['sampleID', 'experiment', 'day']].drop_duplicates()
batchFile.to_csv(outDir/'mageck_14_2_batch.txt', sep='\t', index=False)

# The control-sgRNA file is intentionally not rewritten for this test set.
magDf2.to_csv(outDir/'mageck_14_2_counts.txt', index=False, sep='\t')

In [None]:
# Third MAGeCK test set: library_14_2 without the control-correlation filter.
# Only samples flagged as unenriched are removed.
all_samples = [s for s in df['sampleID'].unique() if s not in unenriched]
print(len(all_samples))
dirty_df = df.loc[df['sampleID'].isin(all_samples)]

magDf = dirty_df.loc[dirty_df['library'] == 'library_14_2']
print(magDf['experiment'].nunique())
magDf = magDf.loc[magDf['day'].isin(['d0', 'd1'])]

# Comma-separated sample IDs for each of the two days.
print(",".join(magDf.loc[magDf['day'] == 'd0', 'sampleID'].unique()))
print(",".join(magDf.loc[magDf['day'] == 'd1', 'sampleID'].unique()))

# Wide count table (rows: barcode/ShortName, columns: samples); missing
# entries are written as 0.
magDf2 = magDf.loc[:, ['barcode', 'ShortName', 'barcode_cnt', 'sampleID']]
magDf2 = (
    magDf2
    .pivot(index=['barcode', 'ShortName'], columns='sampleID', values='barcode_cnt')
    .reset_index()
    .rename(columns={'barcode': 'sgRNA', 'ShortName': 'gene'})
    .fillna(0)
)

# Wild-type control barcodes of this subset.
negCntrl = magDf.loc[magDf['phenotype'] == 'wt', 'barcode'].unique()

# One row per distinct sample/experiment/day combination.
batchFile = magDf.loc[:, ['sampleID', 'experiment', 'day']].drop_duplicates()
batchFile.to_csv(outDir/'mageck_dirty_14_2_batch.txt', sep='\t', index=False)

# NOTE(review): this writes to the same 'mageck_control-sgrna.txt' path that
# the library_10_1 cell writes, replacing that file's contents. Confirm this
# is intended.
with open(outDir/'mageck_control-sgrna.txt', 'w') as fc:
    fc.writelines(f"{c}\n" for c in negCntrl)
magDf2.to_csv(outDir/'mageck_dirty_counts.txt', index=False, sep='\t')