In [None]:
# %load /Users/ansintsova/git_repos/snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Shared display defaults for every table and figure rendered below.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns and print floats as 1,234.57.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: large figures, larger fonts, serif typeface.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

In [None]:
# Show the kernel's working directory; several later cells read files by relative path.
%pwd

In [None]:
# Directory holding the *.gene_summary.txt tables loaded in the next cell.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
dataDir = Path("/Users/ansintsova/git_repos/nguyenb_tnseq/data/mageck_test")

In [None]:
# Gene-level summary tables, one per test run. Paths are built with the
# pathlib "/" operator (it was missing for test4 and test5, which made this
# cell a SyntaxError).
df1 = pd.read_table(dataDir / "test1.gene_summary.txt")  # library 10_1 no negative cntrl
df2 = pd.read_table(dataDir / "test2.gene_summary.txt")  # library 10_1 with negative cntrl
df3 = pd.read_table(dataDir / "test3.gene_summary.txt")  # library 14_2 w/o batch correction
df4 = pd.read_table(dataDir / "test4.gene_summary.txt")  # library 14_2 w batch correction
df5 = pd.read_table(dataDir / "test5.gene_summary.txt")  # library 14_2 with different norm-method (negCntrl)

In [None]:
# Rows of df1 (library 10_1, no negative control) with neg|fdr below 0.05.
df1[df1['neg|fdr'] < 0.05]

In [None]:
# Rows of df2 (library 10_1, with negative control) with neg|fdr below 0.05.
df2[df2['neg|fdr'] < 0.05]

In [None]:
# Space-separated results table; its index plus the log2FoldChange and padj
# columns are renamed to id / neg|lfc / neg|fdr and tagged with source 'deseq'.
df6 = (
    pd.read_csv("library_14_2-results-d1.csv", index_col=0, sep=' ')
    .loc[:, ['log2FoldChange', 'padj']]
    .reset_index()
)
df6.columns = ['id', 'neg|lfc', 'neg|fdr']
df6['source'] = 'deseq'
df6.sample(5)


In [None]:
# NOTE(review): this filter is not the cell's last expression, so under default
# notebook settings its result is computed but not displayed.
df3[df3['neg|fdr'] < 0.05]

# Keep only id, log fold change and FDR, and tag the rows with their source
# (test3 = library 14_2 without batch correction). df3 is rebound to the reduced copy.
df3 = df3[['id', 'neg|lfc', 'neg|fdr']].copy()
df3['source'] = 'NoBatchCorr'

In [None]:
# NOTE(review): this filter is not the cell's last expression, so under default
# notebook settings its result is computed but not displayed.
df4[df4['neg|fdr'] < 0.05]
# After the next two lines df4 only holds id, neg|lfc, neg|fdr and source;
# every other gene_summary column is dropped from this name.
df4 = df4[['id', 'neg|lfc', 'neg|fdr']].copy()
df4['source'] = 'BatchCorr'

In [None]:
# Stack the three result tables row-wise; rows are told apart by the `source` column.
# The original row indices are kept (no ignore_index), so index labels may repeat.
df = pd.concat([df3, df4, df6], axis=0)
df.sample(5)

In [None]:
# Rows from any source with neg|fdr below 0.05.
sig  = df[df['neg|fdr'] < 0.05]

In [None]:
# Number of significant rows per id across the stacked sources.
s = sig.id.value_counts()
# How many ids have exactly three significant rows
# (assumes each id occurs at most once per source — TODO confirm).
s[s==3].shape

In [None]:
# Ids with exactly two significant rows.
s2 = s[s==2].index

In [None]:
# Ids with exactly one significant row.
s[s==1]

In [None]:
# Significant rows for the ids that have exactly two significant rows.
sig[sig.id.isin(s2)]

In [None]:
# All stacked rows for one id, to compare its values across sources.
df[df.id == 'SL1344_3958']

In [None]:
# The same id in the NoBatchCorr table only.
df3[df3.id == "SL1344_3958"]

In [None]:
# Count table for test7, read relative to the working directory.
# Later cells use its sgRNA and Gene columns.
df7 = pd.read_table('test7.normalized.txt')

In [None]:
# Preview the first rows of the test7 table.
df7.head()

In [None]:
# Long-format values for a single sgRNA: sgRNA/Gene move into the index, so
# melt turns every remaining column into one (sampleID, value) row.
x = df7[df7.sgRNA == 'ATCCGCGTCACCGAAAA'].set_index(['sgRNA', 'Gene']).melt(var_name='sampleID')

In [None]:
# Sample annotation table; the next cell joins it on sampleID and groups by its day column.
df8 = pd.read_table('mageck_14_2_batch.txt')
df8

In [None]:
# Median value of the selected sgRNA per day (inner join on sampleID).
x.merge(df8, on='sampleID').groupby('day').value.median()

In [None]:
# log2 ratio of two hard-coded numbers.
# NOTE(review): presumably the per-day medians shown by the previous cell — confirm,
# and consider computing the ratio from that result instead of retyping the values.
np.log2(3271.87/3497.16)

In [None]:
# Gene-level summary for test8, read relative to the working directory.
df9 = pd.read_table('test8.gene_summary.txt')

In [None]:
# The test8 row(s) whose id is pilT.
df9[df9.id == 'pilT']

In [None]:
# test8 hits: neg|fdr < 0.05 and id shorter than 16 characters, ordered by neg|rank.
# NOTE(review): the length cut-off presumably excludes barcode-like ids — confirm.
df9[(df9['neg|fdr'] < 0.05)& (df9.id.str.len() < 16)].sort_values('neg|rank')

In [None]:
# df4 was reduced to id / neg|lfc / neg|fdr / source in an earlier cell, so it
# no longer has a 'neg|rank' column and sorting it raised KeyError on a
# top-to-bottom run. Re-read the full test4 summary for this ranked view.
# Same filter as the test8 view: neg|fdr < 0.05 and id shorter than 16 characters.
df4_summary = pd.read_table(dataDir / "test4.gene_summary.txt")
df4_summary[(df4_summary['neg|fdr'] < 0.05) & (df4_summary.id.str.len() < 16)].sort_values('neg|rank')

In [None]:
# Set of test8 hit ids (neg|fdr < 0.05, id shorter than 16 characters).
a = set(df9[(df9['neg|fdr'] < 0.05)& (df9.id.str.len() < 16)].id.values)

In [None]:
# Set of df4 (BatchCorr) hit ids under the same two filters.
b = set(df4[(df4['neg|fdr'] < 0.05)& (df4.id.str.len() < 16)].id.values)

In [None]:
# Hit ids found in test8 but not in df4.
a-b

In [None]:
# Hit ids found in df4 but not in test8.
b-a

In [None]:


# `batches` was displayed here before any cell had assigned it, which raises
# NameError on a fresh kernel. Load the table in this cell so it runs top-to-bottom.
batches = pd.read_table('mageck_dirty_14_2_batch.txt')
batches

In [None]:
# sgRNA sequences passed as `negCntrl` to test_norm, which keeps only rows
# whose sgRNA is in this list.
negCntrl = ['TACCCAGAGCACACTCA', 'ATCCGCGTCACCGAAAA', 'ACAGAGCTCGGGAGTCT',
       'ACTACAAGACTGGTTAA', 'AGATGCATGACTAGCTA', 'AGAATGACCCGGAGGCT',
       'AGGAAGGCGACGAAATC', 'AGTCATCGATGCTATAT', 'TAAGTCCGGGCTAAGTC',
       'AACAACACGGTAAGCAA', 'TATAACACCCCCGATTC', 'CTACGACAGGGACTTAA',
       'GTGTATAGCAGGAACCC', 'CCGACGACTGATTGTCC', 'TCTCACGCAGCGTTTCG']
# Sample annotation table; test_norm merges it on sampleID and groups by its day column.
batches = pd.read_table('mageck_dirty_14_2_batch.txt')

In [None]:
def test_norm(file, negCntrl, batches, gene='Gene'):
    df10 = pd.read_table(file)
    df10 = df10[df10.sgRNA.isin(negCntrl)]
    df10 = df10.melt(id_vars=['sgRNA', gene], var_name='sampleID').merge(batches, on='sampleID')
    x = df10.groupby(['sgRNA', 'day']).value.median().unstack()
    x['lfc'] = np.log2(x.d1/x.d0)
    return x


In [None]:
# Per-day medians and log2 fold change of the negCntrl sgRNAs in the test8 table,
# annotated with `batches`.
norm_counts = test_norm("test8.normalized.txt", negCntrl, batches)
# Same summary for the test4 table, annotated with mageck_14_2_batch.txt.
batches_clean = pd.read_table('mageck_14_2_batch.txt')
norm_counts_clean = test_norm("test4.normalized.txt", negCntrl, batches_clean)

In [None]:
# Display the test8 summary.
norm_counts

In [None]:
# Same summary for 14_2.dirty.batch.corrected.txt, whose gene column is lower-case 'gene'.
# NOTE(review): batch_counts is not used in any later visible cell.
batch_counts = test_norm("14_2.dirty.batch.corrected.txt", negCntrl, batches, gene='gene')

In [None]:
# Display the test4 summary.
norm_counts_clean

In [None]:
# Same summary for mageck_dirty_counts.txt, whose gene column is lower-case 'gene'.
raw_counts = test_norm("mageck_dirty_counts.txt", negCntrl, batches, gene='gene')

In [None]:
# Median log2 fold change of the negCntrl sgRNAs in mageck_dirty_counts.txt.
raw_counts.lfc.median()

In [None]:
# Median log2 fold change of the negCntrl sgRNAs in the test8 table.
norm_counts.lfc.median()

In [None]:
# Median log2 fold change of the negCntrl sgRNAs in the test4 table.
norm_counts_clean.lfc.median()

In [None]:
# Overlay the log2-fold-change distributions of the three summaries on one set of axes.
lfc_by_label = (
    ('raw', raw_counts),
    ('all norm', norm_counts),
    ('clean_norm', norm_counts_clean),
)
for legend_label, summary in lfc_by_label:
    summary.lfc.hist(bins=10, alpha=0.5, label=legend_label)
plt.legend()