In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path


# Notebook-wide display defaults for seaborn, pandas and matplotlib.
sns.set_context("notebook", font_scale=1.1)

# pandas: show up to 100 rows/columns; render floats with thousands
# separators and two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', '{:,.2f}'.format)

# matplotlib: figure, font and line defaults applied in a single update.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

# Load Data

In [None]:
# Project root; every input/output location below is derived from it.
root = Path("/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq")

# Inputs
controls_file = root.joinpath("data/metadata/controls.txt")
metafile = root.joinpath("scratch/08_21/complete_metadata.tsv")

# Results directory and the two result tables compared in this notebook
outDir = root.joinpath("scratch/08_21/results/nguyenb")
jun_res_file = outDir.joinpath("27_07/26-07-final-results.csv")
nov_res_file = outDir.joinpath("24-11-2021-all-libraries-zscores.csv")

In [None]:
# Read both result tables; the first CSV column is used as the index.
jun_raw = pd.read_csv(jun_res_file, index_col=0)
nov_raw = pd.read_csv(nov_res_file, index_col=0)

# Keep only the columns being compared and give each run's metrics a
# _jun / _nov suffix so they can sit side by side after the merge.
fres = jun_raw[['gene', 'day', 'z-score', 'zscore_padj', 'CI', 'library']].rename(
    columns={'z-score': 'zscore_jun', 'zscore_padj': 'padj_jun', 'CI': 'ci_jun'}
)
sres = nov_raw[['library', 'gene', 'day', 'zscore', 'padj', 'ci']].rename(
    columns={'zscore': 'zscore_nov', 'padj': 'padj_nov', 'ci': 'ci_nov'}
)

# Outer join: a (library, day, gene) row found in only one run is kept, with
# NaN in the other run's columns. No dropna() is applied here.
fdf = pd.merge(fres, sres, how='outer', on=['library', 'day', 'gene'])

# Features by library

- There are more features in the Nov results because I did not apply any filtering step to them.

In [None]:
# Distinct genes per library in the June results.
fres.groupby('library')['gene'].nunique()

In [None]:
# Distinct genes per library in the November results.
sres.groupby('library')['gene'].nunique()

# Significant Features

In [None]:
# Preview the first five rows of the June results.
fres.head(5)

In [None]:
# June: distinct genes with padj_jun < 0.05, per library and day.
fres.loc[fres['padj_jun'] < 0.05].groupby(['library', 'day'])['gene'].nunique()

In [None]:
# November: distinct genes with padj_nov < 0.05, per library and day.
sres.loc[sres['padj_nov'] < 0.05].groupby(['library', 'day'])['gene'].nunique()

In [None]:
# June: distinct genes with padj_jun < 0.05, per day (libraries pooled).
fres.loc[fres['padj_jun'] < 0.05].groupby(['day'])['gene'].nunique()

In [None]:
# November: distinct genes with padj_nov < 0.05, per day (libraries pooled).
sres.loc[sres['padj_nov'] < 0.05].groupby(['day'])['gene'].nunique()

# Compare Z-Score and CI by library

In [None]:
# Single library name kept as a module-level default, plus the list of all
# libraries found in the merged table (one grid row per library further down).
library = 'library_10_2'
libraries = [*fdf['library'].unique()]
nrows = len(libraries)

In [None]:
def compare_results(df, col1, col2, library):
    """Plot `col2` against `col1` for one library, one figure per day.

    `df` is restricted to rows whose `library` column equals `library`. For
    every distinct value in the `day` column a new 4x4-inch figure is opened,
    titled with that day, and a seaborn regression plot of the two columns is
    drawn in black on it.
    """
    subset = df.loc[df['library'] == library].copy()
    for day in subset['day'].unique():
        day_rows = subset.loc[subset['day'] == day]
        plt.figure(figsize=(4, 4))
        plt.title(day)
        sns.regplot(data=day_rows, x=col1, y=col2, color='black')

In [None]:
# Add log2-transformed CI columns for both runs (written onto fdf in place).
for run in ('jun', 'nov'):
    fdf[f'logCI_{run}'] = np.log2(fdf[f'ci_{run}'])

In [None]:
# Grid of June-vs-November log2(CI) regression plots: one row per library,
# one column per day.
col1 = 'logCI_jun'
col2 = 'logCI_nov'

# Select each library by the loop variable `name`. Filtering on the global
# `library` would draw the same library in every row while the titles change.
per_library = [(name, fdf[fdf.library == name].sort_values('day')) for name in libraries]

# Size the grid from the data so a library with many days cannot index past
# the last axis; fall back to a single column if there are no days at all.
ncols = max([lib_df['day'].nunique() for _, lib_df in per_library] + [1])
fig, axes = plt.subplots(len(per_library), ncols,
                         figsize=(5 * ncols, len(per_library) * 4),
                         squeeze=False)  # always 2-D, even for one row/column

for row, (name, lib_df) in enumerate(per_library):
    days = lib_df['day'].dropna().unique()
    for ax, day in zip(axes[row], days):
        paired = lib_df.loc[lib_df.day == day, [col1, col2]].dropna()
        if paired.empty:
            # No gene has a value in both runs for this library/day; leave an
            # annotated empty panel instead of fitting a regression on no data.
            ax.text(0.5, 0.5, 'no paired data', ha='center', va='center',
                    transform=ax.transAxes)
            ax.set(xlabel=col1, ylabel=col2)
        else:
            sns.regplot(data=paired, x=col1, y=col2, color='black', ax=ax)
        ax.set_title(f'{name}_{day}')
    # Hide the unused panels of libraries that have fewer days than columns.
    for ax in axes[row, len(days):]:
        ax.set_visible(False)

plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.5, hspace=0.5)

# Total number of genes identified as hits

In [None]:
# Total distinct genes with padj_jun < 0.05 across all libraries and days.
fdf.loc[fdf['padj_jun'] < 0.05, 'gene'].nunique()

In [None]:
# Total distinct genes with padj_nov < 0.05 across all libraries and days.
fdf.loc[fdf['padj_nov'] < 0.05, 'gene'].nunique()

In [None]:
# Total distinct genes in the merged table (either run).
fdf['gene'].nunique()

# Hit summaries

In [None]:
# Preview the November results before aggregating them. A bare `sres` would
# render the whole (truncated) frame; head() keeps the output small.
sres.head()

In [None]:
# Per (gene, day) summary of the November results across libraries: how many
# libraries contain the gene, median/min/max of z-score and CI, and how many
# times padj_nov fell below 0.05.
num_lib = (
    sres.groupby(['gene', 'day'])
    .agg(
        num_libs_present=('library', 'nunique'),
        zscore_median=('zscore_nov', 'median'),
        zscore_min=('zscore_nov', 'min'),
        zscore_max=('zscore_nov', 'max'),
        ci_median=('ci_nov', 'median'),
        ci_min=('ci_nov', 'min'),
        ci_max=('ci_nov', 'max'),
        num_of_times_was_hit=('padj_nov', lambda padj: (padj < 0.05).sum()),
    )
    .reset_index()
)

# Need to annotate the final results

In [None]:
# List the contents of the metadata directory (IPython `ls` line magic).
%ls ../../../data/metadata

In [None]:
def get_feat_id(x):
    """Extract a feature identifier from a GFF attribute string.

    Only the first ';'-separated field of `x` is used. When `x` contains
    'gene-' or 'cds-', everything after the first '-' of that field is
    returned (e.g. 'ID=cds-ABC.1;Parent=...' -> 'ABC.1'). Otherwise the field
    is returned unchanged, including any 'ID=' prefix.

    Non-string input (NaN, None) is returned as is, so the function can be
    applied to a column that contains missing values.
    """
    # isinstance() rather than `x is np.nan`: the identity test only matches
    # the numpy singleton and lets other NaN objects or None reach the string
    # operations below.
    if not isinstance(x, str):
        return x
    first_field = x.split(';')[0]
    if ('gene-' in x or 'cds-' in x) and '-' in first_field:
        # maxsplit=1 keeps identifiers that themselves contain a hyphen intact.
        return first_field.split('-', 1)[1]
    return first_field
    
def get_gene_name(x):
    """Extract a gene name from a GFF attribute string.

    Resolution order:
      1. `x` contains 'ID=gene' and 'Name=': the value following 'Name='.
      2. `x` contains 'ID=cds' and 'gene=': the value following 'gene='.
      3. `x` contains 'ID=cds' and 'Parent=': the Parent value with everything
         up to its first '-' removed (e.g. 'gene-SL1344_0001' -> 'SL1344_0001').
      4. Otherwise: the first ';'-separated field without a leading 'ID='.

    Non-string input (NaN, None) is returned as is.
    """
    # isinstance() rather than `x is np.nan`, which only matches the numpy
    # singleton and lets other NaN objects or None reach the string operations.
    if not isinstance(x, str):
        return x
    if 'ID=gene' in x and 'Name=' in x:
        return x.split('Name=')[1].split(';')[0]
    if 'ID=cds' in x and 'gene=' in x:
        return x.split('gene=')[1].split(';')[0]
    if 'ID=cds' in x and 'Parent=' in x:
        parent = x.split('Parent=')[1].split(';')[0]
        # maxsplit=1 keeps hyphenated identifiers intact; a Parent value with
        # no hyphen is returned whole instead of raising IndexError.
        return parent.split('-', 1)[1] if '-' in parent else parent
    first_field = x.split(';')[0]
    # Remove the literal 'ID=' prefix. str.strip('ID=') would strip any of the
    # characters 'I', 'D', '=' from both ends ('ID=DNA_I' -> 'NA_').
    if first_field.startswith('ID='):
        return first_field[len('ID='):]
    return first_field

In [None]:
dataDir = "../../../data/metadata"
gff_file = Path(dataDir)/"GCA_000210855.2_ASM21085v2_genomic.gff"
emap_file = Path(dataDir)/"SL1344.emapper.annotations"

# GFF annotation: skip the first 7 lines and name the 9 columns by hand.
gff = pd.read_table(gff_file, skiprows=7, header=None)
gff.columns = ['chr', 'loc', 'feat', 'start', 'end', 'dn', 'strand', 'dn2', 'desc']

# Derive a feature id and a gene name from the attribute column.
gff['feat_id'] = gff['desc'].apply(get_feat_id)
gff['Name'] = gff['desc'].apply(get_gene_name)

# Only CDS features are matched against the eggNOG-mapper annotations.
gene_to_cds = gff[gff.feat == 'CDS'][['feat_id', 'Name', 'start']]

# eggNOG-mapper annotations: skip the first 4 lines; the next line is the header.
emap = pd.read_table(emap_file, skiprows=4)
# Remove comment rows (query starting with '#') before merging. Dropping the
# last three rows of the merged frame by position would depend on the row
# order produced by the outer merge and could silently remove real rows.
# NOTE(review): assumes the three rows previously dropped were the '#'-prefixed
# footer lines of the emapper file - confirm against the file.
emap = emap[~emap['#query'].astype(str).str.startswith('#')]

# Outer join: keep CDS without annotation and annotations without a CDS.
go_map = gene_to_cds.merge(emap, left_on='feat_id', right_on='#query', how='outer')

In [None]:
# Attach annotations to the per-gene summary; inner join on gene == Name.
res_ann = pd.merge(num_lib, go_map, how='inner', left_on='gene', right_on='Name')

In [None]:
# Write the annotated summary table into outDir.
summary_csv = outDir.joinpath('06-11-2021-final-results-summary-annotated.csv')
res_ann.to_csv(summary_csv)

In [None]:
# Attach annotations to the per-library November results; inner join on gene == Name.
sres_ann = pd.merge(sres, go_map, how='inner', left_on='gene', right_on='Name')

In [None]:
# Write the annotated per-library results into outDir.
results_csv = outDir.joinpath('06-11-2021-final-results-annotated.csv')
sres_ann.to_csv(results_csv)