# Loading the data

- Analysis of dnaid2030 for Saane Skroon
- Inoculum, LPS injected, control, fecal and cecum samples
- Described in  dnaid2030: LPS injections:  13-10-2021-data-cleaning & 19-10-2021-data-cleaning

In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults (pulled in via the `%load ../snippets/basic_settings.py` magic).
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns; floats get a thousands separator and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: figure geometry, font sizes and line/marker defaults, applied in one call.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "Nimbus Roman",
    'font.serif': "cm",
})

In [None]:
import sys
import os
# Extend the module search path so the import below can be resolved
# (presumably `quality_control` lives in ../../mbarq_analysis — confirm).
sys.path.append("../../mbarq_analysis")
from quality_control import calculate_correlation
import datetime

In [None]:
# Input/output locations for this analysis, all anchored at the shared tnseq project root.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")
# Per-sample count tables for dnaid2030.
dataDir = root.joinpath("scratch/08_21/counts/skroon/dnaid2030")
# Table describing the spiked-in control barcodes.
controls_file = root.joinpath("data/metadata/controls.txt")
# Where results (and the sample metadata sheet) live.
outDir = root.joinpath("scratch/08_21/results/skroon")
metadata_file = outDir.joinpath('skroon_metadata.csv')

In [None]:
# Count tables to load: one mapped and one unmapped CSV per sample.
files = list(dataDir.glob("*_mapped.csv"))
files_unmapped = list(dataDir.glob("*_unmapped.csv"))

# Sample sheet: the header row is skipped and replaced with fixed column names;
# rows with any missing value are dropped.
metadata = pd.read_csv(metadata_file, skiprows=1, names=['sampleID', 'treatment']).dropna()

# The original sampleID is a '-'-separated string; split it into its seven components.
metadata_ext = metadata['sampleID'].str.split("-", expand=True)
metadata_ext.columns = ['dnaid', 'demux_code', 'library', 'experiment', 'mouse', 'day', 'tissue']
# A day recorded as '2' is relabelled 'd0'.
metadata_ext['day'] = metadata_ext['day'].replace({'2': 'd0'})

metadata = pd.concat([metadata, metadata_ext], axis=1)
# Rebuild sampleID as 'dnaid<dnaid>_<demux_code>', the key used when merging with the count tables.
metadata['sampleID'] = "dnaid" + metadata['dnaid'] + "_" + metadata['demux_code']

## Make sure we are using the count files from 19 Oct 2021

In [None]:
# Warn about every mapped count file whose modification date is not the expected counting date.
# A plain comparison is used instead of `assert` inside try/except: assert statements are
# removed when Python runs with -O, which would silently disable this check.
counted_date = datetime.date(2021, 10, 19)
for file in files:
    # Read the modification date once and reuse it for both the check and the message.
    modified_date = datetime.datetime.fromtimestamp(os.path.getmtime(file)).date()
    if modified_date != counted_date:
        print(f'{file.name}: Date modified is not the expected date')
        print(f"last modified: {modified_date}")


In [None]:
# Stack all mapped count tables, tagging each row with the sample ID taken from the
# file name (the part of the stem before '_counts').
df = pd.concat(
    pd.read_csv(f, index_col=0).assign(sampleID=f.stem.split('_counts')[0])
    for f in files
)
# Attach the sample metadata and keep only rows for which a mouse is known.
df = pd.merge(df, metadata, how='left', on='sampleID')
df = df[df['mouse'].notna()]

In [None]:
# Stack all unmapped count tables, tagging rows with the sample ID parsed from the file name.
df_unmapped = pd.concat(
    pd.read_csv(f, index_col=0).assign(sampleID=f.stem.split('_counts')[0])
    for f in files_unmapped
)
# Attach the sample metadata and keep only rows for which a mouse is known.
df_unmapped = pd.merge(df_unmapped, metadata, how='left', on='sampleID')
df_unmapped = df_unmapped[df_unmapped['mouse'].notna()]

# Control barcode table: no header row, first column is used as the index.
controls = pd.read_table(controls_file, header=None, index_col=0)
controls.columns = ['barcode', 'phenotype', 'conc']
# Left merge: every control barcode is kept, with its counts from the unmapped tables.
controls = pd.merge(controls, df_unmapped, how='left', on='barcode')
# Label each control as '<phenotype>-<conc>'.
controls['ShortName'] = controls['phenotype'] + '-' + controls['conc'].astype(str)

# Append the control rows to the mapped counts.
df = pd.concat([df, controls])
control_barcodes = controls['barcode'].values

# Quality Control

## Looking at the inoculum

In [None]:
# Rows belonging to the inoculum sample (mouse label 'inoc12'); copied so it can be modified freely.
inoculum = df.loc[df['mouse'] == 'inoc12'].copy()

In [None]:
# Summary statistics of the raw barcode counts in the inoculum.
inoculum['barcode_cnt'].describe()

In [None]:
# Histogram of inoculum barcode counts, with the x-axis limited to the range -100 .. 20000.
inoculum['barcode_cnt'].hist(bins=300)
plt.xlim(-100, 20000);

In [None]:
# Number of distinct barcodes with counts > 1000 in the inoculum
inoculum.loc[inoculum['barcode_cnt'] > 1000, 'barcode'].nunique()

In [None]:
def calc_relAb(x):
    """Return each element of *x* as a percentage of the sum of *x*.

    *x* must support ``.sum()`` and element-wise division
    (e.g. a pandas Series or a NumPy array).
    """
    total = x.sum()
    return x / total * 100

In [None]:
# Relative abundance (%) of every barcode within its own sample.
# The count column is selected explicitly before transforming, so the division is only
# ever applied to `barcode_cnt`. Applying it to the whole group frame (as before) makes
# the result depend on whether the installed pandas version passes the string `sampleID`
# grouping column to the applied function.
# The result is a one-column DataFrame ('barcode_cnt') indexed like `df`.
relAb = df.groupby('sampleID')[['barcode_cnt']].transform(calc_relAb)

In [None]:
# Relative abundance (%) of each barcode within the inoculum.
inoculum['relAb'] = calc_relAb(inoculum['barcode_cnt'])

In [None]:
# Histogram of inoculum relative abundances.
inoculum['relAb'].hist(bins=300)

In [None]:
# Summary statistics of inoculum relative abundances.
inoculum['relAb'].describe()

In [None]:
# Barcodes to analyze: those above 0.01 % relative abundance in the inoculum.
bc_present = inoculum.loc[inoculum['relAb'] > 0.01, 'barcode'].values
len(bc_present)

## Calculating WITS correlation 

In [None]:
# Per-sample correlation of the control barcodes, computed by the project helper.
# NOTE(review): later cells read `logConc` / `logCnts` columns from `controls`;
# presumably this call adds them in place — confirm against quality_control.
corr_df, good_samples = calculate_correlation(
    controls,
    concentration_col='conc',
    cnt_col='barcode_cnt',
    phenotype_col='phenotype',
    for_each='sampleID',
    how='log_w_0',
    wt_phenotype='wt',
    cutoff=0.8,
)

In [None]:
def graph_wits_correlation(controls, metadata, sampleName, ax):
    """Draw the wt control regression (logCnts vs. logConc) for one sample on *ax*.

    Parameters
    ----------
    controls : DataFrame with 'phenotype', 'sampleID', 'logConc' and 'logCnts' columns.
    metadata : DataFrame with 'sampleID', 'mouse' and 'treatment' columns.
    sampleName : sampleID value to plot.
    ax : matplotlib axes to draw on.

    The axes title is '<mouse>_<treatment>; R2 = <r2>', where r2 is the squared
    correlation between logConc and logCnts rounded to two decimals.
    """
    lookup = metadata.set_index('sampleID').to_dict()

    # wt control rows belonging to the requested sample.
    is_wt = controls.phenotype == 'wt'
    is_sample = controls.sampleID == sampleName
    wt_points = controls[is_wt & is_sample].copy()

    r_squared = round(wt_points.logConc.corr(wt_points.logCnts) ** 2, 2)
    title = lookup['mouse'][sampleName] + '_' + lookup['treatment'][sampleName] + f"; R2 = {r_squared}"

    sns.regplot(data=wt_points, x='logConc', y='logCnts', ax=ax)
    ax.set_title(title)
    # Leave half a log unit of padding on either side of the data.
    lower = wt_points.logConc.min() - 0.5
    upper = wt_points.logConc.max() + 0.5
    ax.set_xlim(lower, upper)

In [None]:
# One regression panel per sample, three panels per row, saved as a single figure.
sns.set(font_scale=0.9)
# Controls absent from every sample carry a NaN sampleID after the left merge;
# they have no metadata entry, so they are skipped.
sample_names = controls.sampleID.dropna().unique()
n_cols = 3
# Keep the 6x3 layout, but add rows when there are more than 18 samples so that
# indexing into the axes cannot run past the end of the grid.
n_rows = max(6, (len(sample_names) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()
for i, name in enumerate(sample_names):
    graph_wits_correlation(controls, metadata, name, axes[i])
fig.savefig(outDir/"wt_correlations.png")

## Plotting wt strain relative abundance by concentration

In [None]:
# Relative abundance (%) of each barcode within each sample:
#   1. keep the unique (barcode, barcode_cnt, sampleID) rows and pivot to a wide table
#      with one row per barcode and one column per sample;
#   2. divide every column by its own sum and multiply by 100;
#   3. melt back to long format and keep the barcode / sampleID / relAb columns.
relAb_df = (df[['barcode', 'barcode_cnt', 'sampleID']].drop_duplicates().pivot(index='barcode', columns='sampleID')
 .apply(lambda x: x/x.sum()*100).reset_index()
.melt(id_vars='barcode', value_name='relAb')[['barcode', 'sampleID', 'relAb']])
# Attach relAb to the main table, matching on barcode and sample (merge's default join type).
df = df.merge(relAb_df, on=['barcode', 'sampleID'])

In [None]:
# wt control barcodes across all samples.
# NOTE(review): the original cell first filtered on good_samples and then immediately
# overwrote that result with an unfiltered selection, so only the unfiltered selection
# ever took effect. That behaviour is kept here — confirm that samples failing QC are
# really meant to be included.
wt_bc = df[df.barcode.isin(control_barcodes) & (df.phenotype == 'wt')]
# Concentrations at which the wt controls are present.
wt_bc.conc.unique()

In [None]:
# Relative abundance of the wt controls at concentration 1.5e-03, one box per demux code.
plt.figure(figsize=(20, 10))
# Select the concentration with a relative tolerance rather than exact float equality,
# so the match does not depend on the literal equalling the parsed value bit for bit.
data = wt_bc[np.isclose(wt_bc.conc, 1.50000e-03, rtol=1e-4, atol=0)].sort_values('sampleID')
sns.boxplot(data=data, x='demux_code', y='relAb')
sns.stripplot(data=data, x='demux_code', y='relAb', color='black')
plt.yscale('log')
plt.legend('')
plt.ylim(1e-05, 1);

In [None]:
# Relative abundance of the wt controls at concentration 5.0e-04, one box per demux code.
plt.figure(figsize=(20, 10))
# Select the concentration with a relative tolerance rather than exact float equality,
# so the match does not depend on the literal equalling the parsed value bit for bit.
data = wt_bc[np.isclose(wt_bc.conc, 5.00000e-04, rtol=1e-4, atol=0)].sort_values('sampleID')
sns.boxplot(data=data, x='demux_code', y='relAb')
sns.stripplot(data=data, x='demux_code', y='relAb', color='black')
plt.yscale('log')
plt.legend('')
plt.ylim(1e-05, 1);

In [None]:
# Relative abundance of the wt controls at concentration 1.66667e-04, one box per demux code.
plt.figure(figsize=(20, 10))
# Select the concentration with a relative tolerance rather than exact float equality,
# so the match does not depend on the literal equalling the parsed value bit for bit.
data = wt_bc[np.isclose(wt_bc.conc, 1.66667e-04, rtol=1e-4, atol=0)].sort_values('sampleID')
sns.boxplot(data=data, x='demux_code', y='relAb')
sns.stripplot(data=data, x='demux_code', y='relAb', color='black')
plt.yscale('log')
plt.legend('')
plt.ylim(1e-05, 1);

In [None]:
# Relative abundance of the wt controls at concentration 5.55556e-05, one box per demux code.
plt.figure(figsize=(20, 10))
# Select the concentration with a relative tolerance rather than exact float equality,
# so the match does not depend on the literal equalling the parsed value bit for bit.
data = wt_bc[np.isclose(wt_bc.conc, 5.55556e-05, rtol=1e-4, atol=0)].sort_values('sampleID')
sns.boxplot(data=data, x='demux_code', y='relAb')
sns.stripplot(data=data, x='demux_code', y='relAb', color='black')
plt.yscale('log')
plt.legend('')
plt.ylim(1e-05, 1);

In [None]:
# Relative abundance of the wt controls at concentration 1.85185e-05, one box per demux code.
plt.figure(figsize=(20, 10))
# Select the concentration with a relative tolerance rather than exact float equality,
# so the match does not depend on the literal equalling the parsed value bit for bit.
data = wt_bc[np.isclose(wt_bc.conc, 1.85185e-05, rtol=1e-4, atol=0)].sort_values('sampleID')
sns.boxplot(data=data, x='demux_code', y='relAb')
sns.stripplot(data=data, x='demux_code', y='relAb', color='black')
plt.yscale('log')
# Same statement order as the other concentration plots; the trailing ';' on the last
# line keeps the notebook from echoing a return value under the figure.
plt.legend('')
plt.ylim(1e-05, 1);

# Summarizing at Gene level

In [None]:
# Normalise the tissue label ('CC' -> 'cc') and keep only samples that passed the WITS QC.
df['tissue'] = df['tissue'].replace({'CC': 'cc'})
clean_df = df[df['sampleID'].isin(good_samples)]

# Sum barcode counts per gene (ShortName) within each sample.
clean_gene_df = (
    clean_df
    .groupby(['library', 'sampleID', 'ShortName', 'experiment', 'mouse', 'day', 'tissue'])
    ['barcode_cnt']
    .sum()
    .reset_index()
)

In [None]:
# Same gene-level summary, restricted to barcodes that were present in the inoculum.
clean_df_filtered = clean_df[clean_df['barcode'].isin(bc_present)]
clean_gene_df_filtered = (
    clean_df_filtered
    .groupby(['library', 'sampleID', 'ShortName', 'experiment', 'mouse', 'day', 'tissue'])
    ['barcode_cnt']
    .sum()
    .reset_index()
)

# Writing out clean data

In [None]:
# Drop unenriched sample from good samples.
# Filtering (instead of list.remove) keeps this cell safe to re-run and does not raise
# ValueError when 'dnaid2030_3' is not in good_samples.
# NOTE(review): clean_df / clean_gene_df are built from good_samples earlier in the
# notebook, before this removal, so the exported gene counts still contain this sample;
# only clean_meta is affected — confirm this is intended.
good_samples = [sample for sample in good_samples if sample != 'dnaid2030_3']

In [None]:
# Metadata for the retained samples, with the tissue label normalised ('CC' -> 'cc').
clean_meta = metadata.loc[metadata['sampleID'].isin(good_samples)].copy()
clean_meta['tissue'] = clean_meta['tissue'].replace({'CC': 'cc'})
clean_meta

In [None]:
# Export the gene-level counts: all barcodes, and inoculum-filtered barcodes only.
clean_gene_df.to_csv(
    outDir / '21-10-2021-annotated_gene_counts_after_qc.csv'
)
clean_gene_df_filtered.to_csv(
    outDir / '21-10-2021-annotated_gene_counts_filtered_after_qc.csv'
)

In [None]:
# Inspect the gene-level rows of the 'wt-0.0015' control.
clean_gene_df.loc[clean_gene_df['ShortName'] == "wt-0.0015"]