In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append("../../mbarq_analysis")
from quality_control import calculate_correlation

In [None]:
# Input locations for this analysis.
# NOTE(review): both paths are absolute and user-specific; they must be edited
# before the notebook can run on another machine.
# Directory that is globbed for *_mapped.csv / *_unmapped.csv and holds skroon_metadata.csv.
dataDir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/10_21_skroon")
# Headerless table read with pd.read_table to build the `controls` frame.
controls_file = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/controls.txt")

In [None]:
# Count tables found in the data directory (glob order is not sorted).
files = list(dataDir.glob("*_mapped.csv"))
files_unmapped = list(dataDir.glob("*_unmapped.csv"))

# Sample sheet: skip the first line of the file, impose fixed column names,
# and drop every row that has a missing value.
sample_sheet = pd.read_csv(
    dataDir/'skroon_metadata.csv',
    names=['sampleID', 'treatment'],
    skiprows=1,
).dropna()

# The raw sampleID is dash-delimited; split it into its seven components.
metadata_ext = sample_sheet['sampleID'].str.split("-", expand=True)
metadata_ext.columns = ['dnaid', 'demux_code', 'library', 'experiment', 'mouse', 'day', 'tissue']
# A day value of '2' is recoded as 'd0'.
metadata_ext['day'] = metadata_ext['day'].replace('2', 'd0')

# Combine sheet and parsed fields, then rebuild sampleID as "dnaid<dnaid>_<demux_code>".
metadata = pd.concat([sample_sheet, metadata_ext], axis=1)
metadata['sampleID'] = "dnaid" + metadata['dnaid'] + "_" + metadata['demux_code']

In [None]:
# Read every mapped count table and tag its rows with the sample ID, taken
# from the part of the file stem that precedes '_counts'.
mapped_tables = [
    pd.read_csv(csv_path, index_col=0).assign(sampleID=csv_path.stem.split('_counts')[0])
    for csv_path in files
]
# Attach the sample metadata and keep only rows whose sample has a 'mouse' entry.
df = (
    pd.concat(mapped_tables)
    .merge(metadata, how='left', on='sampleID')
    .dropna(subset=['mouse'])
)

In [None]:
# Same loading steps as for the mapped tables, applied to the unmapped files.
unmapped_tables = [
    pd.read_csv(csv_path, index_col=0).assign(sampleID=csv_path.stem.split('_counts')[0])
    for csv_path in files_unmapped
]
df_unmapped = (
    pd.concat(unmapped_tables)
    .merge(metadata, how='left', on='sampleID')
    .dropna(subset=['mouse'])
)

# Control table: no header row, first column used as the index.
controls = pd.read_table(controls_file, header=None, index_col=0)
controls.columns = ['barcode', 'phenotype', 'conc']
# Left merge: every control row is kept, with unmapped counts attached where the barcode matches.
controls = controls.merge(df_unmapped, how='left', on='barcode')

# NOTE: this reassigns `df` from itself, so re-running the cell appends the
# control rows a second time.
df = pd.concat([df, controls])
control_barcodes = controls['barcode'].values

In [None]:
# Independent copy of the rows whose 'mouse' field is 'inoc12'.
inoculum = df.loc[df['mouse'] == 'inoc12'].copy()

In [None]:
# Summary statistics of the raw barcode counts in the inoculum rows.
inoculum['barcode_cnt'].describe()

In [None]:
# Histogram of inoculum barcode counts; the x-axis is limited to [-100, 20000].
fig, ax = plt.subplots(figsize=(6, 6))
inoculum['barcode_cnt'].hist(bins=100, ax=ax)
ax.set_xlim(-100, 20000);

In [None]:
# Number of distinct barcodes with counts > 500 in the inoculum.
# NOTE(review): this comment previously said 1000 while the code used 500 —
# confirm the intended cutoff.
inoculum.loc[inoculum['barcode_cnt'] > 500, 'barcode'].nunique()

In [None]:
def calc_relAb(x):
    """Express every entry of ``x`` as a percentage of the total of ``x``.

    ``x`` must provide ``.sum()`` and support element-wise division and
    multiplication (for example a pandas Series or a NumPy array). The result
    has the same shape as ``x``.
    """
    total = x.sum()
    return x / total * 100

In [None]:
# Relative abundance (%) of each row's count within its own sample.
# Only the numeric 'barcode_cnt' column is selected before transforming, so the
# string 'sampleID' column never enters the division (the previous
# groupby(...).apply over both columns fails with a TypeError on pandas
# versions that pass the grouping column to the applied function).
# The result is a one-column DataFrame aligned with df's index.
relAb = df.groupby('sampleID')[['barcode_cnt']].transform(calc_relAb)

In [None]:
# Percentage of the summed inoculum counts contributed by each row.
inoculum['relAb'] = calc_relAb(inoculum['barcode_cnt'])

In [None]:
# Distribution of relative abundance (%) across the inoculum rows.
inoculum['relAb'].hist(bins=100)

In [None]:
# Summary statistics of the inoculum relative abundance (%).
inoculum['relAb'].describe()

In [None]:
# Barcodes to analyze: those whose inoculum relative abundance exceeds 0.01 %.
bc_present = inoculum.loc[inoculum['relAb'] > 0.01, 'barcode'].values

In [None]:
# `good_samples` is needed from this point on, but its only other definition is
# in a cell further down, so a fresh top-to-bottom run would hit a NameError
# here. Compute it here with the same call and arguments as that later cell.
corr_df, good_samples = calculate_correlation(
    controls,
    concentration_col='conc',
    cnt_col='barcode_cnt',
    phenotype_col='phenotype',
    for_each='sampleID',
    how='log',
    wt_phenotype='wt',
    cutoff=0.6,
)
good_samples

In [None]:
# Long -> wide: one row per barcode, one column per sample (after removing
# exact duplicate rows).
counts_long = df[['barcode', 'barcode_cnt', 'sampleID']].drop_duplicates()
counts_wide = counts_long.pivot(index='barcode', columns='sampleID')

# Normalise each column to percentages of that column's total.
pct_wide = counts_wide.apply(lambda col: col / col.sum() * 100)

# Wide -> long again, keeping only the key columns and the new relAb value.
relAb_df = (
    pct_wide
    .reset_index()
    .melt(id_vars='barcode', value_name='relAb')
)[['barcode', 'sampleID', 'relAb']]

# NOTE: `df` is reassigned from itself; re-running this cell merges relAb in again.
df = df.merge(relAb_df, on=['barcode', 'sampleID'])

In [None]:
# Rows of df that belong to control barcodes with the 'wt' phenotype.
# A previous, immediately-overwritten assignment that additionally filtered on
# `good_samples` was dropped; it had no effect on the result.
# NOTE(review): add `& df.sampleID.isin(good_samples)` to the mask if only
# samples passing the correlation check are wanted here.
is_control = df.barcode.isin(control_barcodes)
wt_bc = df[is_control]
wt_bc = wt_bc[wt_bc.phenotype == 'wt']
wt_bc[['barcode', 'sampleID', 'conc', 'relAb']]

In [None]:
# Distinct concentration values present among the wt control rows.
wt_bc['conc'].unique()

In [None]:
# relAb of wt control rows with conc == 1.5e-03, per demux_code (log y-axis).
conc_subset = wt_bc[wt_bc.conc == 1.50000e-03]
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=conc_subset, x='demux_code', y='relAb', ax=ax)
sns.stripplot(data=conc_subset, x='demux_code', y='relAb', color='black', ax=ax)
ax.set_yscale('log')
ax.legend('')

In [None]:
# relAb of wt control rows with conc == 1.66667e-04, per demux_code (log y-axis).
conc_subset = wt_bc[wt_bc.conc == 1.66667e-04]
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=conc_subset, x='demux_code', y='relAb', ax=ax)
sns.stripplot(data=conc_subset, x='demux_code', y='relAb', color='black', ax=ax)
ax.set_yscale('log')
ax.legend('')

In [None]:
# relAb of wt control rows with conc == 1.85185e-05, per demux_code (log y-axis).
conc_subset = wt_bc[wt_bc.conc == 1.85185e-05]
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=conc_subset, x='demux_code', y='relAb', ax=ax)
sns.stripplot(data=conc_subset, x='demux_code', y='relAb', color='black', ax=ax)
ax.set_yscale('log')
ax.legend('')

In [None]:
# Run the control-barcode correlation check from quality_control once per
# sampleID. The call returns two values, unpacked as corr_df and good_samples.
# NOTE(review): presumably `cutoff=0.6` is the threshold that decides which
# samples land in good_samples — confirm in quality_control.calculate_correlation.
corr_df, good_samples = calculate_correlation(
    controls,
    concentration_col='conc',
    cnt_col='barcode_cnt',
    phenotype_col='phenotype',
    for_each='sampleID',
    how='log',
    wt_phenotype='wt',
    cutoff=0.6,
)

In [None]:
# Display the second value returned by calculate_correlation.
good_samples

In [None]:
# Display the sample metadata table (the whole frame, not a preview).
metadata

In [None]:
# Add the squared correlation as a new 'r2' column (modifies corr_df in place),
# then show only the rows for the 'wt' phenotype.
corr_df['r2'] = corr_df['R'].pow(2)
corr_df.loc[corr_df['phenotype'] == 'wt']

In [None]:
# Control rows whose phenotype is 'wt'.
wt = controls.loc[controls['phenotype'] == 'wt']

In [None]:
# Inspect the wt control rows of one specific sample.
wt.loc[wt['sampleID'] == "dnaid2030_9"]

In [None]:
# Distribution of barcode counts in the unmapped tables.
df_unmapped['barcode_cnt'].hist(bins=100)

In [None]:
# Summary statistics of barcode counts in the unmapped tables.
df_unmapped['barcode_cnt'].describe()