In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import subprocess 


# Notebook-wide display defaults. The seaborn context is applied first so the
# explicit matplotlib values below take precedence over it.
sns.set_context("notebook", font_scale=1.1)

# Show up to 100 rows/columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
})
#pd.set_option('display.float_format', lambda x: '{:,.2f}'.format(x))

# Loading metadata

In [None]:
# Project root. Defaults to the shared NAS location; set the TNSEQ_ROOT
# environment variable to run the notebook against another copy of the project
# without editing this cell.
root = Path(os.environ.get("TNSEQ_ROOT",
                           "/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq"))
dataDir = root/"scratch/08_21/counts/"
controls_file = root/"data/metadata/controls.txt"
# Directory that receives the control-barcode, batch and count files and the
# result tables written by the cells below.
outDir = root/"scratch/02_22_mageck"
# Headerless, tab-separated sample metadata read in the next cell.
metafile = root/"scratch/08_21/complete_metadata.tsv"

In [None]:
# Read the headerless metadata table (index_col=0) and derive a `batch` label
# for every row by joining its experiment and DNA id with an underscore.
meat = (
    pd.read_table(
        metafile,
        index_col=0,
        header=None,
        names=['library', 'experiment', 'mouse', 'day', 'tissue', 'dnaid', 'sampleID'],
    )
    .assign(batch=lambda frame: frame['experiment'] + '_' + frame['dnaid'])
)

In [None]:
# Get good samples from the notebook 20-01-2022-creating-test-data-for-mageck (restored via %store in the next cell)

In [None]:
# Restore variables previously persisted with IPython's %store magic.
# `good_samples` is used below to filter the metadata; `clean_df` supplies the
# barcode counts and phenotypes used by the following cells.
%store -r good_samples
%store -r clean_df

In [None]:
# Keep only the metadata rows whose sampleID is listed in good_samples.
meat = meat.loc[meat['sampleID'].isin(good_samples)]

# Write out control barcodes

In [None]:
# Unique barcodes of rows whose phenotype is 'wt'. They are written one per
# line to control-sgrna.txt, the file later handed to `mageck test` through
# its --control-sgrna option.
negCntrl = clean_df[clean_df.phenotype == 'wt'].barcode.unique()

# No visible cell creates the output directory, so make sure it exists before
# opening a file inside it.
outDir.mkdir(parents=True, exist_ok=True)
with open(outDir/'control-sgrna.txt', 'w') as fc:
    fc.writelines(f"{c}\n" for c in negCntrl)

# Write out batch and count files

In [None]:
def prepare_mageck_dataset(clean_df, meta_df, library):
    """Write the batch table and the barcode count table for one library.

    Both files are tab-separated and are written into the notebook-level
    ``outDir`` as ``{library}_batch.txt`` and ``{library}_count.txt``.

    The batch table holds the sampleID, batch and day columns of ``meta_df``
    for this library, sorted by day and then batch. The count table has one
    row per (barcode, gene) pair and one column per sampleID, with missing
    values filled with 0.

    Returns the tuple ``(batch_file, count_file)`` of the written paths.
    """
    batch_file = outDir/f"{library}_batch.txt"
    count_file = outDir/f"{library}_count.txt"

    # Batch table: metadata rows of this library, ordered by day then batch.
    library_meta = meta_df.loc[meta_df.library == library, ['sampleID', 'batch', 'day']]
    library_meta.sort_values(['day', 'batch']).to_csv(batch_file, index=False, sep='\t')

    # Count table: long-format counts reshaped to barcode x sample.
    library_counts = clean_df.loc[clean_df.library == library,
                                  ['barcode', 'ShortName', 'barcode_cnt', 'sampleID']]
    count_table = library_counts.pivot(index=['barcode', 'ShortName'],
                                       columns='sampleID',
                                       values='barcode_cnt')
    count_table = count_table.reset_index()
    count_table = count_table.rename(columns={'ShortName': 'gene'})
    count_table = count_table.fillna(0)
    count_table.to_csv(count_file, index=False, sep='\t')

    return batch_file, count_file

In [None]:
# Write the batch and count files for every library present in the filtered
# metadata and remember the returned (batch_file, count_file) paths, keyed by
# library name.
libraries = {}
for library, g in meat.groupby('library'):  # g (the group's rows) is not used
    print(library)
    libraries[library] = prepare_mageck_dataset(clean_df, meat, library)

# Run Batch Correction (if needed)

In [None]:

def run_command(args):
    """Run an external command, letting its stdout/stderr pass through.

    args is the sequence of program arguments handed to ``subprocess.run``
    (no shell is involved).

    Returns the ``subprocess.CompletedProcess`` of the finished command.
    Raises ``subprocess.CalledProcessError`` if the command exits with a
    non-zero status.
    """
    # check=True makes subprocess.run raise CalledProcessError on failure,
    # which replaces the manual check_returncode()/re-raise sequence.
    return subprocess.run(args, check=True)
        

def batch_correct(outDir, library,  r_path="./batchCorrect.R"):
    """Invoke the batch-correction R script for one library.

    The script at ``r_path`` is run through ``Rscript`` with four arguments:
    the library's count file (``{library}_count.txt``), its batch file
    (``{library}_batch.txt``), the library name and the output directory.
    The command is printed before it is executed.

    Raises ``subprocess.CalledProcessError`` (via ``run_command``) if the
    script exits with a non-zero status.
    """
    count_path = outDir / f"{library}_count.txt"
    batch_path = outDir / f"{library}_batch.txt"
    # Build the argument list directly rather than splitting a formatted
    # string, so paths that contain whitespace remain single arguments.
    args = ['Rscript', str(r_path), str(count_path), str(batch_path),
            str(library), str(outDir)]
    print(" ".join(args))
    run_command(args)


def get_contrast_samples(library_df, treat_col = 'day', treatment='d1', control='d0', sampleID = 'sampleID'):
    """Return ``(controls, treats)`` for one control/treatment comparison.

    Each element is a comma-separated string of the unique values found in
    the ``sampleID`` column among rows whose ``treat_col`` equals ``control``
    or ``treatment`` respectively. A level with no rows gives an empty string.
    """
    def joined_ids(level):
        # Unique sample IDs of the rows at this level, in order of appearance.
        rows_at_level = library_df.loc[library_df[treat_col] == level]
        return ",".join(rows_at_level[sampleID].unique())

    controls = joined_ids(control)
    treats = joined_ids(treatment)
    return controls, treats


def run_mageck(count_file, treated, controls, out_prefix, control_barcode_file):
    """Run ``mageck test`` for one treatment-vs-control comparison.

    count_file is passed with -k, the treated and control sample strings
    with -t and -c, out_prefix with -n and control_barcode_file with
    --control-sgrna; --normcounts-to-file is always set. The command is
    printed before it is executed.

    Raises ``subprocess.CalledProcessError`` (via ``run_command``) if mageck
    exits with a non-zero status.
    """
    # Build the argument list directly rather than splitting a formatted
    # string, so paths or sample names that contain whitespace are not broken
    # into several arguments.
    args = ['mageck', 'test',
            '-k', str(count_file),
            '-t', str(treated),
            '-c', str(controls),
            '-n', str(out_prefix),
            '--control-sgrna', str(control_barcode_file),
            '--normcounts-to-file']
    print(" ".join(args))
    run_command(args)


In [None]:
# For every library, look up the (controls, treated) sample strings of days
# d1-d4, each compared against get_contrast_samples' default control 'd0'.
all_contrasts = {}
for library in meat.library.unique():
    all_contrasts[library] = {}
    for day in ('d1', 'd2', 'd3', 'd4'):
        all_contrasts[library][day] = get_contrast_samples(meat[meat.library == library], 'day', day)

In [None]:
# For each library:
# NOTE(review): these three assignments look like interactive scratch values
# for a single library. `library` and `meta` are reassigned by the loops in
# later cells, and mageck_library below takes its inputs as parameters.
library='library_12_2'
meta = meat[meat.library == library]
c = all_contrasts[library]


def mageck_library(library, meta, outDir, contrasts, control_barcode_file, batch_corr=True, batch_col='batch'):
    """
    Run MAGeCK for every contrast of one library and collect the gene summaries.

    1. Check if batch correction is needed, run if yes -> different count file as input for mageck
    2. For each contrast check if there are samples, run mageck
    3. Concatenate results for multiple days

    Parameters:
        library: library name; used to locate the count files in outDir and
            added as a `library` column to the result.
        meta: DataFrame with a `batch_col` column for this library.
        outDir: pathlib.Path of the directory holding the count files; MAGeCK
            output is written there with the prefix `{library}-{contrast}`.
        contrasts: mapping of contrast name -> (controls, treated), as built
            with get_contrast_samples.
        control_barcode_file: file passed to mageck as --control-sgrna.
        batch_corr: when truthy and `meta` has more than one batch, run the
            batch-correction script and use `{library}_count_batchcorrected.txt`.
        batch_col: name of the batch column in `meta`.

    Returns:
        DataFrame of the concatenated `<prefix>.gene_summary.txt` tables with
        added `contrast` and `library` columns. If every contrast is skipped,
        an empty DataFrame with just those two columns is returned.
    """
    n_batches = meta[batch_col].nunique()
    print(n_batches)
    # Truthiness instead of `is True`, so values such as numpy.bool_ also
    # enable batch correction.
    if batch_corr and n_batches > 1:
        batch_correct(outDir, library,  r_path="./batchCorrect.R")
        count_file = outDir/f"{library}_count_batchcorrected.txt"
    else:
        count_file = outDir/f"{library}_count.txt"

    result_dfs = []
    for contrast, samples in contrasts.items():
        print(contrast)
        controls, treated = samples[0], samples[1]
        # A contrast needs samples on both sides; skip it otherwise.
        if len(controls) == 0 or len(treated) == 0:
            continue
        out_prefix = outDir/f"{library}-{contrast}"
        run_mageck(count_file, treated, controls, out_prefix, control_barcode_file)
        res = pd.read_table(f'{out_prefix}.gene_summary.txt').assign(contrast=contrast)
        result_dfs.append(res)

    # pd.concat raises ValueError on an empty list; return an empty frame so
    # a library without any usable contrast does not abort a multi-library run.
    if not result_dfs:
        return pd.DataFrame(columns=['contrast', 'library'])
    return pd.concat(result_dfs).assign(library=library)



In [None]:
# Display the {library: (batch_file, count_file)} mapping built earlier.
libraries

In [None]:
# Contrast settings: every level of `treat_col` other than `baseline` is
# compared against `baseline`.
baseline, treat_col, sampleID, batch_col = 'd0', 'day', 'sampleID', 'batch'
control_barcode_file = outDir/'control-sgrna.txt'

# Batch-corrected run: one MAGeCK result frame per library.
all_results = []
for library, files in libraries.items():
    print(library)
    meta = pd.read_table(files[0])  # files[0] is the library's batch file
    treatments = [level for level in meta[treat_col].unique() if level != baseline]
    # get all comps
    contrasts = {}
    for treat in treatments:
        contrasts[treat] = get_contrast_samples(
            meta, treat_col, treatment=treat, control=baseline, sampleID=sampleID)
    results = mageck_library(library, meta, outDir, contrasts, control_barcode_file,
                             batch_corr=True, batch_col=batch_col)
    all_results.append(results)

fdf = pd.concat(all_results)

In [None]:
# Number of libraries processed. The result file names in the surrounding
# cells hard-code "9-libraries", so this is the value to compare against.
len(libraries)

In [None]:
# Save the batch-corrected results.
# NOTE(review): `fdf` is reassigned by the non-batch-corrected cell below;
# re-running this cell after that one would write the uncorrected results
# under the batch-corrected file name.
fdf.to_csv(outDir/'16-02-2022-batch-corrected-9-libraries.csv', index=False)

In [None]:
# Same analysis as the batch-corrected run, but with batch correction switched
# off. Note that this rebinds `all_results` and `fdf`.
baseline, treat_col, sampleID, batch_col = 'd0', 'day', 'sampleID', 'batch'
control_barcode_file = outDir/'control-sgrna.txt'

all_results = []
for library, files in libraries.items():
    print(library)
    meta = pd.read_table(files[0])  # files[0] is the library's batch file
    treatments = [level for level in meta[treat_col].unique() if level != baseline]
    # get all comps
    contrasts = {}
    for treat in treatments:
        contrasts[treat] = get_contrast_samples(
            meta, treat_col, treatment=treat, control=baseline, sampleID=sampleID)
    results = mageck_library(library, meta, outDir, contrasts, control_barcode_file,
                             batch_corr=False, batch_col=batch_col)
    all_results.append(results)

fdf = pd.concat(all_results)
fdf.to_csv(outDir/'16-02-2022-not-batch-corrected-9-libraries.csv', index=False)

In [None]:
# NOTE(review): `res12_1` is not assigned in any cell visible above. Unless it
# is defined elsewhere, this cell raises NameError on a fresh kernel; it looks
# like a leftover from interactive exploration.
res12_1

In [None]:
# Rebuild the per-library, per-day contrast lookup (days d1-d4, each against
# get_contrast_samples' default control 'd0').
all_contrasts = {}
for library in meat.library.unique():
    all_contrasts[library] = {}
    for day in ('d1', 'd2', 'd3', 'd4'):
        all_contrasts[library][day] = get_contrast_samples(meat[meat.library == library], 'day', day)

# Inputs for a single comparison: library_14_2, d1 vs d0, using the
# batch-corrected count file.
count_file = outDir/"library_14_2_count_batchcorrected.txt"
controls, treated = all_contrasts['library_14_2']['d1']
control_barcode_file = outDir/'control-sgrna.txt'
library="library_14_2"