In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults. The seaborn context is applied first so the
# explicit matplotlib settings below take precedence over it.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns, floats with thousands separators and 2 decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: figure geometry, font sizes and line/marker defaults.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'font.family': "serif",
    'font.serif': "cm",
})

In [None]:
import sys
import os
import datetime
import math

# Data preprocessing for Bidong's RB-TNseq screen

## Loading the data
 - Need count files generated by `tnseq2`
 - `controls.txt` lists all the control barcodes, their phenotypes and concentrations
 - `complete_metadata.tsv` provides metadata for each sample (library, experiment, mouse, day, tissue, dnaid, sampleID)
 - `mapped.csv` files will contain mapped barcodes, `unmapped.csv` files will contain control barcodes

In [None]:
# Project root on the shared filesystem; every other path is derived from it.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")

# Inputs and outputs of the 08_21 run live under one scratch directory.
run_dir = root / "scratch" / "08_21"
dataDir = run_dir / "counts"
outDir = run_dir / "results" / "nguyenb"
metafile = run_dir / "complete_metadata.tsv"

# The control barcode table is shared metadata stored outside the run directory.
controls_file = root / "data" / "metadata" / "controls.txt"

In [None]:
# Collect the per-sample count files. glob yields paths in arbitrary order, so
# sort them to make the load order (and therefore row order) reproducible.
files = sorted(dataDir.glob("*/*_mapped.csv"))
files_unmapped = sorted(dataDir.glob("*/*_unmapped.csv"))
# Fail here with a clear message rather than later inside pd.concat.
if not files or not files_unmapped:
    raise FileNotFoundError(f"No *_mapped.csv / *_unmapped.csv count files found under {dataDir}")

# The metadata table has no header row: its first column becomes the index and
# the remaining columns are named explicitly below.
metadata = pd.read_table(metafile, index_col=0, header=None)
metadata.columns = ["library", "experiment", "mouse", "day", "tissue", "dnaid", "sampleID"]

## Make sure we are using files from Nov 4-5, 2021

In [None]:
# Report every mapped count file whose modification date falls outside the
# expected window. This only prints a warning; nothing is filtered out.
start_date = datetime.date(2021, 11, 4)
end_date = datetime.date(2021, 11, 5)
for file in files:
    # Read the mtime once and reuse it for both the check and the message.
    modified = datetime.datetime.fromtimestamp(os.path.getmtime(file)).date()
    # A plain condition instead of assert/except: asserts are removed under `python -O`.
    if not (start_date <= modified <= end_date):
        print(f'{file.name}: Date modified is not the expected date')
        print("last modified: %s" % modified)

In [None]:
# Read every mapped count file and tag its rows with the sample ID, which is
# the part of the file name before '_counts'.
mapped_frames = []
for counts_path in files:
    sample_id = counts_path.stem.split('_counts')[0]
    mapped_frames.append(pd.read_csv(counts_path, index_col=0).assign(sampleID=sample_id))
df = pd.concat(mapped_frames)

# Attach sample metadata; rows whose sample has no 'mouse' value after the merge are dropped.
df = df.merge(metadata, how='left', on='sampleID')
df = df.dropna(subset=['mouse'])

## Getting the control barcodes out of the unmapped files

- Somewhat complicated way of getting control counts for all samples, can use some refactoring
- Potentially something that `mBARq` can output

In [None]:
# Same loading scheme as for the mapped files: one frame per unmapped count
# file, tagged with the sample ID taken from the file name before '_counts'.
unmapped_frames = []
for counts_path in files_unmapped:
    sample_id = counts_path.stem.split('_counts')[0]
    unmapped_frames.append(pd.read_csv(counts_path, index_col=0).assign(sampleID=sample_id))
df_unmapped = pd.concat(unmapped_frames)

# Attach sample metadata; rows whose sample has no 'mouse' value after the merge are dropped.
df_unmapped = df_unmapped.merge(metadata, how='left', on='sampleID')
df_unmapped = df_unmapped.dropna(subset=['mouse'])

In [None]:
controls = pd.read_table(controls_file, header=None, index_col=0, names=['barcode', 'phenotype', 'conc'])
# NOTE(review): `index_col=0` turns the first parsed column into the index, while
# `controls.barcode` below needs `barcode` to be a regular column - confirm this
# against the layout of the controls file.
control_barcode_values = controls.barcode.values
sample_ids = df.sampleID.unique()

# Build one row per (control barcode, sample) pair directly, instead of melting
# a throwaway zero matrix. Rows are ordered sample by sample, with the barcodes
# in file order within each sample.
zdf = pd.DataFrame({
    'barcode': np.tile(control_barcode_values, len(sample_ids)),
    'sampleID': np.repeat(sample_ids, len(control_barcode_values)),
})
# Inner merge: only samples present in the metadata table are kept.
zdf = zdf.merge(metadata, on='sampleID')

In [None]:
# Expand the control table over the barcode x sample grid so that every control
# barcode has a row in every sample, then pull in the observed counts from the
# unmapped files. Barcodes with no matching count row get a count of 0.
observed_cnts = df_unmapped[['barcode', 'sampleID', 'barcode_cnt']]
controls = (controls
            .merge(zdf, on=['barcode'])
            .merge(observed_cnts, how='left', on=['barcode', 'sampleID'])
            .assign(barcode_cnt=lambda d: d['barcode_cnt'].fillna(0),
                    # Controls are labelled '<phenotype>-<concentration>'.
                    ShortName=lambda d: d['phenotype'] + '-' + d['conc'].astype(str)))
control_barcodes = controls.barcode.unique()

# Append the control rows to the mapped counts.
df = pd.concat([df, controls])

In [None]:
# Sanity check: every sample is expected to list 30 distinct control barcodes.
controls.groupby(['sampleID'])['barcode'].nunique().reset_index()

# Quality Control

## Calculating WITS correlations

In [None]:
def calculate_correlation(controls, concentration_col = 'conc', cnt_col='barcode_cnt',
                          phenotype_col='phenotype', wt_phenotype='wt',
                          for_each='sampleID', cutoff=0.8):
    """
    Correlate log10(concentration) with log10(count + 1) for the control barcodes.

    The correlation is computed separately for every (phenotype, `for_each`) group.

    Parameters
    ----------
    controls : DataFrame holding at least the columns named by `concentration_col`,
        `cnt_col`, `phenotype_col` and `for_each`. It is not modified.
    concentration_col : column with the control concentrations.
    cnt_col : column with the barcode counts.
    phenotype_col : column with the control phenotype.
    wt_phenotype : phenotype value whose correlation decides whether a sample is good.
    for_each : column identifying the unit (e.g. sample) to correlate within.
    cutoff : a unit is good when R2 of its `wt_phenotype` controls is strictly above this.

    Returns
    -------
    control_cnts : copy of `controls` with added columns logConc, logCnts, R and R2.
    good_samples : array of the unique `for_each` values that pass the cutoff.
    """
    control_cnts = controls.copy()
    control_cnts['logConc'] = np.log10(control_cnts[concentration_col])
    control_cnts['logCnts'] = np.log10(control_cnts[cnt_col] + 1)

    # .corr() yields a 2x2 matrix per group; after reset_index the unnamed third
    # index level (the matrix row label) shows up as column 'level_2'.
    corr_df = (control_cnts.groupby([phenotype_col, for_each])[['logConc', 'logCnts']]
               .corr()
               .reset_index())
    # Keep the logConc row of each matrix: its logCnts entry is the correlation R.
    corr_df = corr_df[corr_df['level_2'] == 'logConc'].drop(['level_2', 'logConc'], axis=1)
    corr_df.columns = [phenotype_col, for_each, 'R']

    control_cnts = control_cnts.merge(corr_df, on=[for_each, phenotype_col])
    control_cnts['R2'] = control_cnts['R'] ** 2

    # Use the configured phenotype column here, not a hard-coded `.phenotype` attribute.
    passes = (control_cnts['R2'] > cutoff) & (control_cnts[phenotype_col] == wt_phenotype)
    good_samples = control_cnts.loc[passes, for_each].unique()
    return control_cnts, good_samples

In [None]:
# Correlate the control counts per phenotype and sample; samples whose wt
# controls reach R2 > 0.8 are returned as good_samples.
control_cnts, good_samples = calculate_correlation(
    controls,
    concentration_col='conc',
    cnt_col='barcode_cnt',
    phenotype_col='phenotype',
    wt_phenotype='wt',
    for_each='sampleID',
    cutoff=0.8,
)

In [None]:
def graph_wits_correlation(control_cnts, sampleName, ax, phenotype='wt', label_cols = ('mouse', 'day', 'experiment')):
    """
    Draw the logConc vs logCnts regression of one sample's controls onto `ax`.

    Parameters
    ----------
    control_cnts : DataFrame with columns phenotype, sampleID, logConc, logCnts,
        R2 and every column named in `label_cols`.
    sampleName : sampleID value to plot.
    ax : matplotlib Axes to draw on.
    phenotype : control phenotype to plot.
    label_cols : columns whose values are joined with '_' to form the title,
        followed by the R2 rounded to two decimals.

    Raises
    ------
    IndexError : if `control_cnts` has no rows for this sample and phenotype.
    """
    is_selected = (control_cnts.phenotype == phenotype) & (control_cnts.sampleID == sampleName)
    sample_cnts = control_cnts[is_selected]

    # Read the labels from the selected rows instead of converting the whole
    # table to a dict on every call. Assumes the label columns are constant
    # within a sample.
    first_row = sample_cnts.iloc[0]
    r2 = round(sample_cnts['R2'].iloc[0], 2)
    # str() keeps the join working for non-string metadata such as a numeric day.
    label = "_".join(str(first_row[c]) for c in label_cols) + f"; R2 = {r2}"

    sns.regplot(data=sample_cnts, x='logConc', y='logCnts', ax=ax, x_jitter=0.05, color='black')
    ax.set_title(label)
    ax.set_xlim(sample_cnts.logConc.min() - 0.5, sample_cnts.logConc.max() + 0.5)

In [None]:
# One figure per library: a 4-column grid with one panel per sample showing the
# wt control correlation, saved as controls/<library>_WT_correlations.png.
libraries = df.library.unique()

# The plot style does not change between libraries, so set it once.
sns.set(font_scale=1.1)
sns.set_style('whitegrid')

# savefig does not create missing directories.
(outDir/"controls").mkdir(parents=True, exist_ok=True)

for library in libraries:
    lib_control = control_cnts[control_cnts.library == library]
    nrows = math.ceil(lib_control.sampleID.nunique()/4)
    # Panels are ordered by mouse, then day.
    samples = (lib_control[['mouse', 'experiment', 'day', 'sampleID']]
               .drop_duplicates()
               .sort_values(['mouse', 'day'])
               .sampleID.values)
    fig, axes = plt.subplots(nrows, 4, figsize=(20, nrows*4))
    axes = axes.flatten()
    for i, name in enumerate(samples):
        graph_wits_correlation(lib_control, name, axes[i])
        # Shared y-range so panels are directly comparable.
        axes[i].set_ylim(-0.5, 4.5)
    fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)
    fig.savefig(outDir/f"controls/{library}_WT_correlations.png")

In [None]:
# Distinct control phenotypes present in the correlation table.
control_cnts['phenotype'].unique()

In [None]:
# Per-library correlation grids for the remaining control phenotypes. This was
# three copies of the same cell differing only in the phenotype; it is now one
# loop. Files are written as controls/<library>_<phenotype>_correlations.png.
libraries = df.library.unique()

# The plot style does not change between figures, so set it once.
sns.set(font_scale=1.1)
sns.set_style('whitegrid')

# savefig does not create missing directories.
(outDir/"controls").mkdir(parents=True, exist_ok=True)

for phenotype in ('hyb', 'chey', 'ssaV_invG'):
    for library in libraries:
        lib_control = control_cnts[control_cnts.library == library]
        nrows = math.ceil(lib_control.sampleID.nunique()/4)
        # Panels are ordered by mouse, then day.
        samples = (lib_control[['mouse', 'experiment', 'day', 'sampleID']]
                   .drop_duplicates()
                   .sort_values(['mouse', 'day'])
                   .sampleID.values)
        fig, axes = plt.subplots(nrows, 4, figsize=(20, nrows*4))
        axes = axes.flatten()
        for i, name in enumerate(samples):
            graph_wits_correlation(lib_control, name, axes[i], phenotype=phenotype)
            # Shared y-range so panels are directly comparable.
            axes[i].set_ylim(-0.5, 4.5)
        fig.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.5)
        fig.savefig(outDir/f"controls/{library}_{phenotype}_correlations.png")

# Summarizing at gene level

## Dropping unenriched samples

In [None]:
# Samples whose mouse label contains 'unenriched' are excluded from the analysis.
# na=False treats a missing mouse label as "no match"; without it a NaN in the
# mask makes the boolean indexing raise.
unenriched = metadata[metadata.mouse.str.contains('unenriched', na=False)].sampleID.values

# Test membership against a set rather than scanning the array for every sample.
unenriched_ids = set(unenriched)
good_samples = [s for s in good_samples if s not in unenriched_ids]
print(len(good_samples))

# Keep only rows from samples that passed QC and are not unenriched.
clean_df = df[df.sampleID.isin(good_samples)]

## Summing barcodes for each ShortName

In [None]:
# Collapse barcodes to one row per ShortName within each sample by summing
# their counts. The sample metadata columns are part of the key so they are
# carried through to the result.
gene_level_keys = ['library', 'sampleID', 'ShortName', 'experiment', 'mouse', 'day', 'tissue']
clean_gene_df = clean_df.groupby(gene_level_keys)['barcode_cnt'].sum().reset_index()

## Writing out clean data

In [None]:
# Save the QC-filtered gene-level counts. to_csv is called with its defaults,
# so the row index is written as the first column.
clean_gene_df.to_csv(outDir/'09-11-2021-annotated_gene_counts_after_qc.csv')

## Summarizing clean data

In [None]:
# For every library / experiment / day combination, count the distinct samples
# and the distinct ShortName values that remain after QC.
summary_table = (
    clean_gene_df
    .groupby(['library', 'experiment', 'day'])
    .agg({'sampleID': ['nunique'], 'ShortName': ['nunique']})
)

In [None]:
# Replace the aggregation's column header with readable labels, in order:
# distinct sampleID count, then distinct ShortName count.
summary_table.columns = ['Number of samples', 'Number of genes']

In [None]:
# Save the per library / experiment / day summary to the results directory.
summary_table.to_csv(outDir/'24-11-2021-summary-clean-samples.csv')