In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import subprocess
import sys
import plotly.express as px
import yaml


# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns; floats with thousands separator, 2 decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: large figures, serif fonts, no LaTeX rendering.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

In [None]:
from typing import Optional, Tuple, List, Union

In [None]:
config_file = "../nguyenb_config.yaml"
with open(config_file) as file:
    # Parse the YAML config; the cells below index the result like a nested
    # dictionary (e.g. configs['root']['server']).
    # NOTE(review): FullLoader is more permissive than yaml.safe_load. If this
    # file only holds plain scalars/mappings, safe_load would be the safer
    # choice — confirm before switching.
    configs = yaml.load(file, Loader=yaml.FullLoader)

In [None]:
# Run on server:
root = Path(configs['root']['server'])
scratchDir = Path(configs['scratchDir']['server'])

In [None]:
mapDir = root/configs['mapDir']
countDir = root/configs['libraryCountsDir']
resultDir = root/configs['resultDir']


In [None]:
# Utilities
def run_command(args):
    """Run an external command and raise if it exits with a non-zero status.

    stdout/stderr are not captured, so the child process writes to the same
    streams as this process.

    Parameters
    ----------
    args : sequence of str
        Program name followed by its arguments; passed unchanged to
        ``subprocess.run`` (no shell is involved).

    Returns
    -------
    subprocess.CompletedProcess
        The finished process (previously nothing was returned).

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero return code.
    """
    # check=True raises CalledProcessError directly, replacing the former
    # check_returncode() wrapped in a try/except that only re-raised.
    return subprocess.run(args, check=True)

# Cleaning the data 
- Using data from library_11_1 as a test case

In [None]:

def read_merged_count_file(merged_count_file):
    """Load a merged count table and split its header into annotation/sample columns.

    The first two columns are taken as annotation columns and every remaining
    column as a sample ID; this is purely positional, no content checks.

    Returns
    -------
    tuple
        ``(counts, annotation_cols, sampleIDs)`` — the full DataFrame and two
        lists of column names.
    """
    counts = pd.read_csv(merged_count_file)
    header = counts.columns.tolist()
    return counts, header[:2], header[2:]


def calculate_cpms(merged_df, annotation_cols, sampleIDs):
    """Return log2 counts-per-million for every non-annotation column.

    Rows whose numeric columns sum to 10 or less are dropped first. The
    remaining counts are divided by their column total, scaled to one million,
    offset by 0.5 and log2-transformed.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Count table containing ``annotation_cols`` plus count columns.
    annotation_cols : sequence of str
        Columns moved into the index during normalisation and restored after.
    sampleIDs : list
        Accepted for interface symmetry; not used by the computation.

    Returns
    -------
    pd.DataFrame
        Annotation columns followed by the log2-CPM columns.
    """
    row_totals = merged_df.sum(axis=1, numeric_only=True)
    kept = merged_df[row_totals > 10]
    # Move annotations into the index so arithmetic only touches count columns.
    indexed = kept.copy().set_index(list(annotation_cols))
    library_depth = indexed.sum()
    log_cpms = np.log2(indexed / library_depth * 1000000 + 0.5)
    return log_cpms.reset_index()

## Extract counts for control barcodes

In [None]:
def read_in_control_file(control_file: Union[str, Path]):
    """
    Load a header-less control table and pick out the wild-type barcodes.

    Expected layout (one control per row, one to three columns):

    [barcode],[conc],[phenotype]

    Columns are named by position: 'barcode', 'concentration', 'genotype'.

    - Three columns: wild-type barcodes are the rows whose genotype is one of
      ['wt', 'WT', 'wildtype']; if no row matches, an empty list is returned.
    - Fewer than three columns: every barcode is treated as wild type.

    Column contents are not validated yet (barcode format, numeric
    concentrations and string genotypes are all taken on trust).

    Returns (cntrl_df, wt_barcodes).
    """
    wt_labels = ['wt', 'WT', 'wildtype']
    cntrl_df = pd.read_csv(control_file, header=None)
    n_cols = cntrl_df.shape[1]
    cntrl_df.columns = ['barcode', 'concentration', 'genotype'][:n_cols]

    # Without a genotype column there is nothing to filter on.
    if n_cols != 3:
        return cntrl_df, cntrl_df.barcode.values

    is_wt = cntrl_df.genotype.isin(wt_labels)
    if not any(is_wt):
        return cntrl_df, []
    return cntrl_df, cntrl_df[is_wt].barcode.values


def get_control_counts(cntrl_df: pd.DataFrame, wt_barcodes: List, counts_df: pd.DataFrame,
                      annotations_col: List) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Attach per-sample values from ``counts_df`` to the control barcodes.

    Parameters
    ----------
    cntrl_df : control table with a 'barcode' column (plus any other columns).
    wt_barcodes : barcodes considered wild type; may be empty.
    counts_df : table laid out as ``barcode,geneName,sample1,sample2,...``.
    annotations_col : annotation column names of ``counts_df``; the first one
        is the barcode column and is renamed to 'barcode' if it is called
        something else.

    Returns
    -------
    (wt_df, fdf)
        ``fdf`` is ``cntrl_df`` left-merged with ``counts_df`` on 'barcode';
        every missing value produced by the merge is replaced by 0.
        ``wt_df`` holds the rows of ``fdf`` whose barcode is in
        ``wt_barcodes``. When there are no wild-type barcodes it is an empty
        frame that still carries all columns of ``fdf``, so downstream column
        access (e.g. ``wt_df.barcode``) keeps working.
    """
    barcode_col = annotations_col[0]
    if barcode_col != 'barcode':
        counts_df = counts_df.rename({barcode_col: 'barcode'}, axis=1)
    # Left merge keeps every control, including those absent from counts_df.
    fdf = cntrl_df.merge(counts_df, how='left', on='barcode').fillna(0)
    wt_df = fdf[fdf.barcode.isin(wt_barcodes)]
    return wt_df, fdf
    

## Calculate Correlations

In [None]:
def calculate_correlation(control_df: pd.DataFrame, sampleIDs: List, cutoff: float=0.8):
    """
    Correlate each sample's control values with the log2 control concentration.

    ``control_df`` must provide a 'concentration' column and one column per
    entry of ``sampleIDs`` (normalised counts). No explicit checks are made;
    missing columns surface as pandas errors.

    Returns
    -------
    (corr_df, good_samples)
        ``corr_df`` is indexed by sample ID with columns 'R' (correlation with
        log2 concentration) and 'R2' (its square); ``good_samples`` lists the
        sample IDs whose R2 is strictly greater than ``cutoff``.
    """
    log_concentration = np.log2(control_df.concentration)
    per_sample_r = control_df[sampleIDs].corrwith(log_concentration)
    corr_df = pd.DataFrame(per_sample_r, columns=['R'])
    corr_df["R2"] = corr_df.R**2
    passing = corr_df.R2 > cutoff
    return corr_df, list(corr_df[passing].index)

## Draw correlation plots

In [None]:
def draw_correlation_plots(control_df, sampleIDs):
    """
    Placeholder — not implemented yet; currently does nothing and returns None.

    Intended behaviour: given a complete control_df, draw a correlation plot
    for each sampleID for each genotype.
    """
    return None

In [None]:
# Hard-coded barcode list; by its name, barcodes not found in dnaid1315.
# NOTE(review): 'GTGTATAGCAGGAACCC' appears twice — possibly one entry was meant
# to be a different barcode; confirm against the source of this list.
not_found_in_dnaid1315 = ['AACAACACGGTAAGCAA', 'AGAATGACCCGGAGGCT', 'AGTCATCGATGCTATAT', 'CCGACGACTGATTGTCC',
           'CTACGACAGGGACTTAA', 'GTGTATAGCAGGAACCC', 'GTGTATAGCAGGAACCC', 'TAAGTCCGGGCTAAGTC',
           'TATAACACCCCCGATTC', 'TCTCACGCAGCGTTTCG']

In [None]:
# cdf1 = cdf[[1]]
# cdf1.to_csv(root/"controls_1col.csv", index=False, header=None)
# cdf2 = cdf[[1,3]]
# cdf2.to_csv(root/"controls_2col.csv", index=False,header=None)
# cdf3 = cdf[[1,3,2]]
# cdf3.to_csv(root/"controls_3col.csv", index=False,header=None)
#wt_df[['barcode', 'concentration']].to_csv(root/"controls_6barcodes.csv", header=None, index=False)

# MAGeCK Analysis

## Prepare MAGeCK dataset

In [None]:
def prepare_mageck_dataset(counts_df, sampleData, control_barcodes, annotation_cols, good_samples, name,
                           batch_col, treatment_col, outDir):
    """
    Write the two tab-separated input tables for a MAGeCK run.

    Parameters
    ----------
    counts_df : count table holding ``annotation_cols`` plus one raw-count
        column per sample ID.
    sampleData : sample sheet; its FIRST column is assumed to hold sample IDs.
    control_barcodes : barcodes (values of the first annotation column) whose
        second annotation column is overwritten with 'control'.
    annotation_cols : two column names: barcode column, then gene-name column.
    good_samples : sample IDs to keep in both output tables.
    name : prefix used for the output file names.
    batch_col, treatment_col : sample-sheet columns written to the batch file.
    outDir : output directory (``str`` or ``Path``).

    Returns
    -------
    (batch_file, count_file)
        Paths of ``<outDir>/<name>_batch.txt`` (sample ID, batch, treatment for
        the good samples, sorted by treatment then batch) and
        ``<outDir>/<name>_count.txt`` (annotations + good-sample counts; rows
        missing an annotation are dropped, remaining gaps filled with 0).
    """
    outDir = Path(outDir)  # plain strings are accepted as well as Path objects
    # Coerce to lists so arrays/Index objects concatenate instead of broadcasting.
    annotation_cols = list(annotation_cols)
    good_samples = list(good_samples)

    batch_file = outDir/f"{name}_batch.txt"
    count_file = outDir/f"{name}_count.txt"
    sampleID_col = sampleData.columns[0]

    batch_df = (sampleData[sampleData[sampleID_col].isin(good_samples)]
                [[sampleID_col, batch_col, treatment_col]]
                .sort_values([treatment_col, batch_col]))
    batch_df.to_csv(batch_file, index=False, sep='\t')

    magDf = counts_df[annotation_cols + good_samples].copy()
    # Label control barcodes before dropping rows without annotation, so that
    # controls lacking a gene name are kept.
    magDf.loc[magDf[annotation_cols[0]].isin(control_barcodes), annotation_cols[1]] = 'control'
    magDf = magDf.dropna(subset=annotation_cols).fillna(0)
    magDf.to_csv(count_file, index=False, sep='\t')
    return batch_file, count_file


## Run MAGeCK batch correction

In [None]:
def read_in_sample_data(sample_data_file, sampleIDs, treatment_col="", batch_col=""):
    """
    Load the sample sheet CSV and return it unchanged.

    ``sampleIDs``, ``treatment_col`` and ``batch_col`` are accepted but not
    used yet; data validation is still to be added.
    """
    sample_data = pd.read_csv(sample_data_file)
    return sample_data
    

In [None]:
def batch_correct(outDir, name,  r_path="../snippets/batchCorrect.R"):
    """
    Run the batch-correction R script on a previously written dataset.

    The script is invoked as
    ``Rscript <r_path> <outDir>/<name>_count.txt <outDir>/<name>_batch.txt <name> <outDir>``;
    both input files are expected to exist already. What the script writes is
    defined by the R script itself, not here.

    Parameters
    ----------
    outDir : directory holding the input files (``str`` or ``Path``).
    name : dataset prefix used in the file names.
    r_path : path to the R script.

    Raises
    ------
    subprocess.CalledProcessError
        Propagated from ``run_command`` when Rscript exits non-zero.
    """
    outDir = Path(outDir)
    count_path = outDir / f"{name}_count.txt"
    batch_path = outDir / f"{name}_batch.txt"
    # Build the argument list directly instead of splitting a formatted string,
    # so paths containing whitespace reach Rscript as single arguments.
    args = ['Rscript', str(r_path), str(count_path), str(batch_path), str(name), str(outDir)]
    print(" ".join(args))
    run_command(args)


## Run MAGeCK RRA

In [None]:
def get_contrast_samples(sampleData, good_samples, treat_col = 'day',
                         treatment='d1', control='d0', sampleID = 'sampleID'):
    """
    Build the comma-separated sample ID strings for one contrast.

    Only rows whose ``sampleID`` value is in ``good_samples`` are considered.
    Returns ``(controls, treats)``: the unique sample IDs where ``treat_col``
    equals ``control`` and ``treatment`` respectively, each joined with ','.
    A group without samples yields an empty string.
    """
    usable = sampleData[sampleData[sampleID].isin(good_samples)]

    def _joined_ids(level):
        # Unique sample IDs at one level of the treatment column.
        rows = usable[usable[treat_col] == level]
        return ",".join(rows[sampleID].unique())

    return _joined_ids(control), _joined_ids(treatment)


def run_mageck(count_file, treated, controls, out_prefix, control_barcode_file):
    """
    Run ``mageck test`` for one treatment-vs-control contrast.

    The count file may come from before or after batch correction.

    Parameters
    ----------
    count_file : count table passed to ``-k``.
    treated, controls : sample IDs for ``-t`` / ``-c``; either a ready-made
        comma-separated string or an iterable of IDs (joined with ',').
    out_prefix : output prefix passed to ``-n``.
    control_barcode_file : file passed to ``--control-sgrna``.

    Raises
    ------
    subprocess.CalledProcessError
        Propagated from ``run_command`` when mageck exits non-zero.
    """
    def _as_csv(ids):
        # Strings are used as given; other iterables are comma-joined.
        return ids if isinstance(ids, str) else ",".join(map(str, ids))

    # Argument list built directly (not via str.split) so paths containing
    # whitespace stay intact.
    args = ["mageck", "test",
            "-k", str(count_file),
            "-t", _as_csv(treated),
            "-c", _as_csv(controls),
            "-n", str(out_prefix),
            "--control-sgrna", str(control_barcode_file),
            "--normcounts-to-file"]
    print(" ".join(args))
    run_command(args)

def write_control_barcodes_to_file(wt_barcodes, name, outDir):
    """Write one barcode per line to ``<outDir>/<name>_wt_barcodes.txt`` and return that path."""
    fname = outDir/f"{name}_wt_barcodes.txt"
    with open(fname, "w") as fo:
        fo.writelines(f"{bc}\n" for bc in wt_barcodes)
    return fname
    
def process_mageck_matrix_file():
    """Placeholder — not implemented yet; does nothing and returns None."""
    return None

def run_mageck_mle(count_file, design_file, control_barcode_file, out_prefix):
    """
    Run ``mageck mle`` with control-based normalisation.

    Parameters
    ----------
    count_file : count table passed to ``-k``.
    design_file : design matrix passed to ``-d``.
    control_barcode_file : file passed to ``--control-sgrna``.
    out_prefix : output prefix passed to ``-n``.

    Raises
    ------
    subprocess.CalledProcessError
        Propagated from ``run_command`` when mageck exits non-zero.
    """
    # Argument list built directly (not via str.split) so paths containing
    # whitespace stay intact.
    args = ["mageck", "mle",
            "-k", str(count_file),
            "-d", str(design_file),
            "-n", str(out_prefix),
            "--norm-method", "control",
            "--genes-varmodeling", "0",  # should be more?
            "--permutation-round", "2",  # suggested 10
            "--control-sgrna", str(control_barcode_file)]
    print(" ".join(args))
    run_command(args)



# Analyse Library

In [None]:
merged_count_file = countDir/"library_11_1_mbarq_merged_counts.csv"
control_file = root/"controls_3col.csv"
control_file_short = root/"controls_6barcodes.csv"
sample_data_file = root/configs['sampleData']
name = "library_11_1"

In [None]:
def clean_samples(merged_count_file, control_file):
    """
    Load one library's counts and extract its control barcodes.

    Steps: read the merged count table, compute log2 CPMs, read the control
    file, then merge control barcodes with the CPM table.

    Returns
    -------
    tuple
        ``(counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df)``
        — raw counts, annotation column names, sample IDs, log2-CPM table,
        wild-type control rows and all control rows.
    """
    # Read in merged_count table and get sampleIDs
    counts, annotation_cols, sampleIDs = read_merged_count_file(merged_count_file)
    # Calculate cpms
    cpms = calculate_cpms(counts, annotation_cols, sampleIDs)

    # Read in control file
    cntrl_df, wt_barcodes = read_in_control_file(control_file)
    wt_df, full_control_df = get_control_counts(cntrl_df,  wt_barcodes, cpms, annotation_cols)
    return counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df


# A module-level calculate_correlation(wt_df, sampleIDs) call used to follow
# here; wt_df and sampleIDs are not defined at this point on a fresh kernel
# (NameError under Restart & Run All). The library_11_1 pipeline cell further
# down performs the same call once both names exist.


In [None]:
def run_analysis(counts, sample_data_file, good_samples, wt_df,
                 annotation_cols, name, batch_col='experiment', treatment_col='day',
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 ):
    """
    Run the full MAGeCK RRA workflow for one library and collect the results.

    Steps: read the sample sheet, write the MAGeCK count/batch files, run the
    batch-correction script, write the control barcode file, run
    ``mageck test`` for every contrast that has both baseline and treatment
    samples among ``good_samples``, then concatenate the per-contrast gene
    summaries.

    Parameters
    ----------
    counts : raw count table (annotation columns + one column per sample).
    sample_data_file : CSV sample sheet.
    good_samples : sample IDs that passed QC.
    wt_df : control table with a 'barcode' column; these barcodes are used as
        MAGeCK controls.
    annotation_cols : [barcode column, gene-name column] of ``counts``.
    name : library name, used as prefix of every file written.
    batch_col, treatment_col, sampleID : sample-sheet column names.
    contrasts : treatment levels to compare against ``baseline`` (read only,
        never mutated).
    baseline : treatment level used as control.
    outDir : directory for ALL intermediate and result files (previously only
        the first step honoured it and later steps used the global scratchDir;
        the default is unchanged).

    Returns
    -------
    pd.DataFrame
        One row per gene and contrast with columns
        ``[<gene column>, number_of_barcodes, LFC, neg_selection_fdr,
        pos_selection_fdr, contrast]``; also written to
        ``<outDir>/<name>_rra_results.csv``.

    Raises
    ------
    ValueError
        If no contrast has samples on both sides, so nothing was run.
    """
    outDir = Path(outDir)

    # Read in sample data
    sampleData = read_in_sample_data(sample_data_file, good_samples)
    wt_barcodes = wt_df.barcode.values
    # subset df on only good samples and write out to file
    batch_file, count_file = prepare_mageck_dataset(counts, sampleData, wt_barcodes,
                                                    annotation_cols, good_samples, name,
                                                    batch_col, treatment_col, outDir)

    # run batch correction
    batch_correct(outDir, name,  r_path="../snippets/batchCorrect.R")

    fname = write_control_barcodes_to_file(wt_barcodes, name, outDir)

    # NOTE(review): assumes the R script writes "<name>_count.batchcorrected.txt"
    # next to the count file — confirm against batchCorrect.R.
    corrected_count_file = Path(count_file).with_suffix('.batchcorrected.txt')

    # run MAGeCK RRA for every contrast that has samples on both sides
    contrasts_ran = []
    for contrast in contrasts:
        controls, treat = get_contrast_samples(sampleData, good_samples, treatment_col,
                                               contrast, baseline, sampleID)
        if len(treat) == 0 or len(controls) == 0:
            continue
        run_mageck(corrected_count_file, treat, controls, outDir/f"{name}-{contrast}", fname)
        contrasts_ran.append(contrast)

    if not contrasts_ran:
        # pd.concat([]) would raise an uninformative ValueError; say why instead.
        raise ValueError(f"No contrast of {list(contrasts)} has both '{baseline}' and "
                         f"treatment samples among the good samples for {name}")

    res = pd.concat([pd.read_table(outDir/f"{name}-{i}.gene_summary.txt").assign(treat=i)
                     for i in contrasts_ran])
    # .copy() so that renaming the columns does not operate on a view of res
    fres = res[['id', 'num', 'neg|lfc', 'neg|fdr', 'pos|fdr', 'treat']].copy()
    fres.columns = [annotation_cols[1], 'number_of_barcodes', 'LFC', 'neg_selection_fdr', 'pos_selection_fdr', 'contrast']
    fres.to_csv(outDir/f'{name}_rra_results.csv')
    return fres


In [None]:


# Read in merged_count table and get sampleIDs
counts, annotation_cols, sampleIDs = read_merged_count_file(merged_count_file)

# Calculate cpms
cpms = calculate_cpms(counts, annotation_cols, sampleIDs)

# Read in control file
cntrl_df, wt_barcodes = read_in_control_file(control_file_short)
wt_df, full_control_df = get_control_counts(cntrl_df,  wt_barcodes, cpms, annotation_cols)

# Figure out good samples
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)

# Read in sample data
sampleData = read_in_sample_data(sample_data_file, good_samples)

# subset df on only good samples and write out to file
batch_file, count_file = prepare_mageck_dataset(counts, sampleData, wt_df.barcode.values, annotation_cols, good_samples, name, 
                           batch_col='experiment', treatment_col='day', outDir=scratchDir)

# run batch correction
batch_correct(scratchDir, name,  r_path="../snippets/batchCorrect.R")


In [None]:
fname = write_control_barcodes_to_file(wt_barcodes, name, scratchDir)

# run MAGeCK RRA for day 1

controls, treat = get_contrast_samples(sampleData, good_samples, treat_col = 'day', 
                                       treatment='d1', control='d0', sampleID = 'sampleID')
count_file2 = count_file.with_suffix('.batchcorrected.txt')
run_mageck(count_file2, treat, controls, scratchDir/f"{name}-d1", fname)

In [None]:
%store good_samples

In [None]:
# run MAGeCK RRA for day 2

controls, treat = get_contrast_samples(sampleData, good_samples, treat_col = 'day', 
                                       treatment='d2', control='d0', sampleID = 'sampleID')
count_file2 = count_file.with_suffix('.batchcorrected.txt')
run_mageck(count_file2, treat, controls, scratchDir/f"{name}-d2", fname)

In [None]:
# run MAGeCK RRA for day 3

controls, treat = get_contrast_samples(sampleData, good_samples, treat_col = 'day', 
                                       treatment='d3', control='d0', sampleID = 'sampleID')
count_file2 = count_file.with_suffix('.batchcorrected.txt')
run_mageck(count_file2, treat, controls, scratchDir/f"{name}-d3", fname)

In [None]:
# run MAGeCK RRA for day 4

controls, treat = get_contrast_samples(sampleData, good_samples, treat_col = 'day', 
                                       treatment='d4', control='d0', sampleID = 'sampleID')
count_file2 = count_file.with_suffix('.batchcorrected.txt')
run_mageck(count_file2, treat, controls, scratchDir/f"{name}-d4", fname)

In [None]:
res = pd.concat([pd.read_table(scratchDir/f"library_11_1-{i}.gene_summary.txt").assign(treat=i) for i in ['d1','d2','d3',
                                                                                               'd4']])

In [None]:
res[(res['pos|goodsgrna'] != 0) & (res['pos|fdr'] < 0.01)]['pos|lfc'].hist(bins=100)

In [None]:
res[(res['neg|fdr'] < 0.01) & (res['neg|lfc'] < -0.5) ]

In [None]:
fres = res[['id', 'num', 'neg|lfc', 'neg|fdr', 'pos|fdr', 'treat']]
fres.columns = [annotation_cols[1], 'number_of_barcodes', 'LFC', 'neg_selection_fdr', 'pos_selection_fdr', 'contrast']

In [None]:
fres

In [None]:
fres.to_csv(scratchDir/'library_11_1_rra_results.csv')

# Look at controls across libraries

In [None]:
# There are 12 libraries:
sampleData = pd.read_csv(sample_data_file)
libraries = sampleData[sampleData.library != 'LibraryA'].library.unique()
libraries

## Library 14_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
lib_file = countDir/"library_14_1_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == 'library_14_1']
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

In [None]:
sampleData

In [None]:
lib14_missing = list(wt_df[wt_df.dnaid1428_117 == 0].barcode.sort_values().values)
lib14_missing

In [None]:
lib14_present = wt_df[wt_df.dnaid1428_117 > 0]
corr_df, good_samples = calculate_correlation(lib14_present, sampleIDs)

In [None]:
corr_df

In [None]:
good_samples

Library 14_1 did not have any samples passing quality control.
dnaid1428 seems to be missing 9 barcodes.

## Library 15_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_15_1'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

dnaid2025 and dnaid2026 have all 15 control barcodes

None of the samples in dnaid1428 passed QC, no matter which barcodes are used as controls.

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)

In [None]:
# Re-run the QC correlation using only the dnaid1428 sample columns.
dnaid1428_cols = [c for c in wt_df.columns if 'dnaid1428' in c]
dnaid1428 = wt_df[['barcode', 'concentration'] + dnaid1428_cols]
dnaid1428 = dnaid1428[dnaid1428.dnaid1428_124 > -1]
# calculate_correlation returns (corr_df, good_samples); unpack it so that
# good_samples1428 is the list of passing samples rather than the whole tuple.
corr_df1428, good_samples1428 = calculate_correlation(dnaid1428, dnaid1428_cols)

In [None]:
res15_1 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res15_1

In [None]:
%store -r maDf

In [None]:
maDf15 = maDf[maDf.library == 'library_15_1']
maDf15.columns = ['Name', 'neg|fdr', 'neg|lfc', 'pos|fdr', 'contrast', 'library', 'fdr']

## Library 13_2

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_13_2'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

dnaid1428 and dnaid1457 also seem to be missing some controls. dnaid2023 and dnaid2024 look good.

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)

In [None]:
good_samples

In [None]:
res13_2 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

## Library 9_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_9_1'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

dnaid1429 is also missing control barcodes.

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res9_1 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res9_1.sample(10)

## Library 10_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_10_1'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

dnaid15 and 16 are fine;
dnaid1429 is not.

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res10_1 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
genes = list(res10_1[(res10_1.contrast == 'd1') & 
        (res10_1.LFC < -0.5 ) & 
        (res10_1.neg_selection_fdr<0.01)].Name.values)


## Library 11_2

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_11_2'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

dnaid1457 is still of poor quality.

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res11_2 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res11_2.sample(10)

## Library 12_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_12_1'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file) 

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res12_1 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res12_1.sample(10)

## Library 12_2

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_12_2'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file)

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res12_2 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res12_2.sample(10)

## Library 13_1

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_13_1'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file)

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res13_1 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res13_1.sample(10)

## Library 10_2

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_10_2'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file)

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res10_2 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res10_2.sample(10)

## Library 14_2

In [None]:
sampleData = pd.read_csv(sample_data_file)
library = 'library_14_2'
lib_file = countDir/f"{library}_mbarq_merged_counts.csv"
sampleData = sampleData[sampleData.library == library]
counts, annotation_cols, sampleIDs, cpms, wt_df, full_control_df = clean_samples(lib_file, control_file)

In [None]:
wt_df[list(sampleData[sampleData.tissue == 'inoculum'].sampleID.values)]

In [None]:
corr_df, good_samples = calculate_correlation(wt_df, sampleIDs)
good_samples

In [None]:
res14_2 = run_analysis(counts, sample_data_file, good_samples, wt_df, 
                 annotation_cols, library, batch_col='experiment', treatment_col='day', 
                 contrasts = ['d1', 'd2', 'd3', 'd4'], baseline='d0',
                 sampleID='sampleID',
                 outDir=scratchDir
                 )

In [None]:
res14_2.sample(10)

# MLE

In [None]:
matrix = (pd.get_dummies(sampleData[sampleData.sampleID.isin(good_samples)][['sampleID', 'day']]
                         .set_index('sampleID'))
         .reset_index())

In [None]:
matrix.to_csv(scratchDir/'design_matrix.tsv', sep='\t', index=False)

In [None]:
# NOTE(review): in top-to-bottom order, count_file2 and fname were last assigned
# at module level in the library_11_1 cells, while the design matrix above is
# built from the most recently assigned sampleData/good_samples (the last
# library section). Confirm both refer to the same library before running.
run_mageck_mle(count_file2, scratchDir/'design_matrix.tsv', fname, scratchDir/"mle_test")

In [None]:
mle_res = pd.read_table(scratchDir/"mle_test.gene_summary.txt")

In [None]:
mle_res

In [None]:
mle_res[(mle_res['day_d1|wald-fdr'] < 0.05) ]

In [None]:
# For each library:
library='library_12_2'
# NOTE(review): `meat` and `all_contrasts` are not defined anywhere in the
# visible part of this notebook; `meat` may be a typo for a metadata frame —
# confirm where both come from, otherwise this cell fails on a fresh kernel.
meta = meat[meat.library == library]
c = all_contrasts[library]


def mageck_library(library, meta, outDir, contrasts, control_barcode_file, batch_corr=True, batch_col='batch'):
    """
    Run MAGeCK RRA for every contrast of one library and stack the results.

    1. If batch correction is requested and ``meta[batch_col]`` has more than
       one distinct value, run the batch-correction script and use its output
       as the count file; otherwise use ``<outDir>/<library>_count.txt``.
    2. For each contrast that has samples on both sides, run ``mageck test``.
    3. Concatenate the per-contrast gene summaries.

    Parameters
    ----------
    library : library name; prefix of all input/output files.
    meta : sample metadata for this library; must contain ``batch_col``.
    outDir : directory with the count/batch files (``str`` or ``Path``).
    contrasts : mapping ``contrast -> (controls, treated)``.
    control_barcode_file : file passed to mageck as ``--control-sgrna``.
    batch_corr : any truthy value enables batch correction (previously only
        the literal ``True`` did, so e.g. a numpy bool was silently ignored).
    batch_col : metadata column identifying batches.

    Returns
    -------
    pd.DataFrame
        Gene summaries of all contrasts run, with added 'contrast' and
        'library' columns.

    Raises
    ------
    ValueError
        If no contrast has samples on both sides, so nothing was run.
    """
    outDir = Path(outDir)
    n_batches = meta[batch_col].nunique()
    print(n_batches)
    if batch_corr and n_batches > 1:
        # NOTE(review): script path ("./batchCorrect.R") and corrected file name
        # ("_count_batchcorrected.txt") differ from run_analysis, which uses
        # "../snippets/batchCorrect.R" and "_count.batchcorrected.txt" —
        # confirm which naming the R script actually produces.
        batch_correct(outDir, library,  r_path="./batchCorrect.R")
        count_file = outDir/f"{library}_count_batchcorrected.txt"
    else:
        count_file = outDir/f"{library}_count.txt"

    result_dfs = []
    for contrast, samples in contrasts.items():
        print(contrast)
        controls, treated = samples[0], samples[1]
        if len(controls) == 0 or len(treated) == 0:
            continue
        out_prefix = outDir/f"{library}-{contrast}"
        run_mageck(count_file, treated, controls, out_prefix, control_barcode_file)
        res = pd.read_table(f'{out_prefix}.gene_summary.txt').assign(contrast=contrast)
        result_dfs.append(res)

    if not result_dfs:
        # pd.concat([]) would raise an uninformative ValueError; say why instead.
        raise ValueError(f"No contrast with both control and treated samples for {library}")
    results = pd.concat(result_dfs).assign(library=library)
    return results

