In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# List the contents of the data directory used by the cells below.
%ls ../../../data/09_21

In [None]:
# Input locations: everything is resolved relative to the data directory.
dataDir = "../../../data/09_21/"
gff_file = Path(dataDir, "GCA_000210855.2_ASM21085v2_genomic.gff")
emap_file = Path(dataDir, "SL1344.emapper.annotations")

In [None]:
# Parse the GFF as a headerless tab-separated table, skipping its first
# 7 lines, then label the nine columns.
gff_columns = ['chr', 'loc', 'feat', 'start', 'end', 'dn', 'strand', 'dn2', 'desc']
gff = pd.read_csv(gff_file, sep='\t', header=None, skiprows=7)
gff.columns = gff_columns

In [None]:
# Preview the first rows of the parsed GFF table.
gff.head()

In [None]:
def get_feat_id(x):
    """Extract the feature identifier from a GFF attribute string.

    Only the first ``;``-separated field of ``x`` is inspected for the result.

    Parameters
    ----------
    x : str or missing value
        One value of the GFF attribute column.

    Returns
    -------
    str or the input itself
        * ``x`` unchanged when it is not a string (NaN, None, ...).
        * Everything after the first ``-`` of the first field when ``x``
          contains ``gene-`` or ``cds-`` and the first field has a hyphen,
          e.g. ``ID=gene-ABC_0001;...`` -> ``ABC_0001``.
        * Otherwise the first field as is (any ``ID=`` prefix is kept).
    """
    # An identity test against np.nan misses NaN objects that are not the
    # numpy singleton; any non-string is treated as "nothing to parse".
    if not isinstance(x, str):
        return x
    first_field = x.split(';')[0]
    has_typed_id = 'gene-' in x or 'cds-' in x
    if has_typed_id and '-' in first_field:
        # Split once only, so identifiers that contain '-' are not truncated.
        return first_field.split('-', 1)[1]
    return first_field
    
def _gff_attribute(attributes, key):
    """Return the value of ``key`` in a ``;``-separated ``key=value`` string, or None."""
    for field in attributes.split(';'):
        name, sep, value = field.partition('=')
        if sep and name == key:
            return value
    return None


def get_gene_name(x):
    """Derive a gene name from a GFF attribute string.

    Parameters
    ----------
    x : str or missing value
        One value of the GFF attribute column.

    Returns
    -------
    str or the input itself
        * ``x`` unchanged when it is not a string (NaN, None, ...).
        * For strings containing ``ID=gene``: the ``Name`` attribute.
        * For strings containing ``ID=cds``: the ``gene`` attribute if
          present, otherwise the ``Parent`` attribute with everything up to
          and including its first ``-`` removed.
        * In every other case (including when the attributes above are
          absent): the first field with a leading ``ID=`` removed.
    """
    # An identity test against np.nan misses NaN objects that are not the
    # numpy singleton; any non-string is treated as "nothing to parse".
    if not isinstance(x, str):
        return x
    if 'ID=gene' in x:
        name = _gff_attribute(x, 'Name')
        if name is not None:
            return name
    elif 'ID=cds' in x:
        # Exact key lookup: a plain substring search for 'gene=' would also
        # match inside longer attribute names that end in 'gene='.
        gene = _gff_attribute(x, 'gene')
        if gene is not None:
            return gene
        parent = _gff_attribute(x, 'Parent')
        if parent is not None:
            # Split once only, so parent ids containing '-' are not truncated.
            return parent.split('-', 1)[-1]
    first_field = x.split(';')[0]
    # str.strip('ID=') removes any of the characters I, D, = from BOTH ends
    # (e.g. 'ID=rna-X_D' -> 'rna-X_'); remove the literal prefix instead.
    if first_field.startswith('ID='):
        return first_field[len('ID='):]
    return first_field

In [None]:
# Derive an identifier column and a gene-name column from the raw
# attribute string of every GFF row.
attribute_strings = gff['desc']
gff['feat_id'] = attribute_strings.map(get_feat_id)
gff['Name'] = attribute_strings.map(get_gene_name)

In [None]:
# Show the feature type next to the identifier and name derived from it.
gff[['feat','feat_id', 'Name']]

In [None]:
# Load the emapper annotation table (tab-separated), skipping its first
# 4 lines so that the following line supplies the column names.
emap = pd.read_csv(emap_file, sep='\t', skiprows=4)

In [None]:
# Preview the first rows of the emapper annotation table.
emap.head()

In [None]:
# Keep only CDS rows, with the columns needed to link them to annotations.
is_cds = gff['feat'] == 'CDS'
gene_to_cds = gff.loc[is_cds, ['feat_id', 'Name', 'start']]

In [None]:
# Outer-join the CDS table with the emapper annotations (CDS `feat_id` against
# the emapper `#query` column), then discard the last three rows of the result.
# Building `go_map` from its inputs in a single cell keeps the cell idempotent:
# trimming an already-trimmed `go_map` in a cell of its own removed three more
# rows on every re-run.
# NOTE(review): the three trailing rows are dropped unconditionally; presumably
# they come from non-data lines at the end of the emapper file -- confirm.
go_map = (
    gene_to_cds
    .merge(emap, left_on='feat_id', right_on='#query', how='outer')
    .iloc[:-3]
)

In [None]:
# Rows whose KEGG_Pathway annotation mentions 'ko00010'. The membership test is
# written inline because `term_in_gene` is only defined in a later cell, so
# calling it here raises NameError on a fresh kernel (Restart & Run All).
has_ko00010 = go_map['KEGG_Pathway'].apply(lambda x: isinstance(x, str) and 'ko00010' in x)
go_map[has_ko00010]

In [None]:
def term_in_gene(x, term):
    """Tell whether ``term`` occurs in the annotation value ``x``.

    Parameters
    ----------
    x : str, container or missing value
        One annotation value; for strings this is a substring test.
    term : str
        The term to look for.

    Returns
    -------
    bool
        ``False`` when ``x`` is missing (None or any float NaN), otherwise
        the result of ``term in x``.
    """
    # `x is np.nan` only recognises the numpy singleton; other NaN objects
    # (float('nan'), np.float64('nan')) and None fell through to `term in x`
    # and raised TypeError. NaN is the only float that differs from itself.
    if x is None or (isinstance(x, float) and x != x):
        return False
    return term in x

#def get_pathway_genes(df, ann_column='KEGG_Pathway', pathway='ko00010'):
    

In [None]:
# Results table, indexed by its first column. The location defaults to the
# original absolute path, which only exists on one machine; set the
# TNSEQ_RESULTS_CSV environment variable to point at the file elsewhere.
results_file = Path(os.environ.get(
    'TNSEQ_RESULTS_CSV',
    '/Users/ansintsova/git_repos/nguyenb_tnseq/data/07_06_results/26-07-final-results.csv',
))
results = pd.read_csv(results_file, index_col=0)

In [None]:
# Display the loaded results table.
results

In [None]:
# Attach annotations to the results: inner join of the results `gene` column
# with the `Name` column of the annotation map.
res_ann = pd.merge(
    results,
    go_map,
    how='inner',
    left_on='gene',
    right_on='Name',
)

In [None]:
# List the columns available after joining results with annotations.
res_ann.columns

In [None]:
# Narrow the annotated results to the columns used in the queries below.
t1_columns = ['gene', 'day', 'library', 'start', 'z-score', 'GOs', 'KEGG_Pathway', 'KEGG_Module']
t1 = res_ann[t1_columns]


In [None]:
# Spot-check up to 20 random rows. The fixed seed makes the displayed rows
# reproducible across runs, and capping at len(t1) avoids the ValueError that
# sampling more rows than exist would raise.
t1.sample(n=min(20, len(t1)), random_state=0)

In [None]:
# NOTE(review): this cell was written as an R vector, `c(...)`, which raises
# NameError in Python. It is kept as a Python list; presumably it is a
# reference list of genes to compare the queries below against -- confirm.
reference_genes = ['thiM', 'thiC', 'bioB', 'bioF', 'bioDb', 'ybiV(2)', 'ybiV(1)', 'pdxAa', 'pdxB']
reference_genes

In [None]:
# Unique genes whose KEGG_Pathway annotation contains 'ko00750', in order of
# first appearance after sorting by 'start'.
in_ko00750 = t1['KEGG_Pathway'].apply(term_in_gene, term='ko00750')
t1.loc[in_ko00750].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Module annotation contains 'M00125', in order of
# first appearance after sorting by 'start'.
in_m00125 = t1['KEGG_Module'].apply(term_in_gene, term='M00125')
t1.loc[in_m00125].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Module annotation contains 'M00123', in order of
# first appearance after sorting by 'start'.
in_m00123 = t1['KEGG_Module'].apply(term_in_gene, term='M00123')
t1.loc[in_m00123].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Module annotation contains 'M00127', in order of
# first appearance after sorting by 'start'.
in_m00127 = t1['KEGG_Module'].apply(term_in_gene, term='M00127')
t1.loc[in_m00127].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Pathway annotation contains 'ko01212', in order of
# first appearance after sorting by 'start'.
in_ko01212 = t1['KEGG_Pathway'].apply(term_in_gene, term='ko01212')
t1.loc[in_ko01212].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Pathway annotation contains 'ko00230', in order of
# first appearance after sorting by 'start'.
in_ko00230 = t1['KEGG_Pathway'].apply(term_in_gene, term='ko00230')
t1.loc[in_ko00230].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose KEGG_Pathway annotation contains 'ko00240', in order of
# first appearance after sorting by 'start'.
in_ko00240 = t1['KEGG_Pathway'].apply(term_in_gene, term='ko00240')
t1.loc[in_ko00240].sort_values('start')['gene'].unique()

In [None]:
#t1[t1.KEGG_Pathway.apply(lambda x: False if x is np.nan else 'ko00410' in x)].sort_values('start').gene.unique()

In [None]:
# Unique genes whose KEGG_Pathway annotation contains 'ko01230', in order of
# first appearance after sorting by 'start'.
in_ko01230 = t1['KEGG_Pathway'].apply(term_in_gene, term='ko01230')
t1.loc[in_ko01230].sort_values('start')['gene'].unique()


In [None]:
# Unique genes whose GOs annotation contains 'GO:0043602', in order of first
# appearance after sorting by 'start'.
in_go_0043602 = t1['GOs'].apply(term_in_gene, term='GO:0043602')
t1.loc[in_go_0043602].sort_values('start')['gene'].unique()

In [None]:
# Unique genes whose GOs annotation contains 'GO:0009061', in order of first
# appearance after sorting by 'start'.
in_go_0009061 = t1['GOs'].apply(term_in_gene, term='GO:0009061')
t1.loc[in_go_0009061].sort_values('start')['gene'].unique()

In [None]:
# Save the annotated results as CSV inside the data directory.
annotated_csv = Path(dataDir, 'final_results_annotated.csv')
res_ann.to_csv(annotated_csv)

In [None]:
# All rows recorded for the gene 'nudF'.
is_nudF = t1['gene'] == 'nudF'
t1.loc[is_nudF]

In [None]:
# Collect every distinct 'ko...' term that appears in the comma-separated
# KEGG_Pathway annotations (missing annotations are ignored).
ko_pathways = {
    term
    for annotation in t1.KEGG_Pathway.dropna().unique()
    for term in annotation.split(",")
    if term.startswith('ko')
}

In [None]:
# Number of distinct 'ko' pathway terms collected above.
len(ko_pathways)

In [None]:
# Pathway term used by the example query in the next cell.
p = 'ko00010'

In [None]:
# Unique genes whose KEGG_Pathway annotation contains the term held in `p`.
in_pathway = t1['KEGG_Pathway'].apply(term_in_gene, term=p)
t1.loc[in_pathway, 'gene'].unique()

In [None]:
# Map every pathway term to the list of unique genes annotated with it.
ko_gmt = {
    pathway: list(
        t1.loc[t1['KEGG_Pathway'].apply(term_in_gene, term=pathway), 'gene'].unique()
    )
    for pathway in ko_pathways
}

In [None]:
# Write one tab-separated line per pathway: the pathway id followed by its
# genes. Pathways are written in sorted order so the file is identical between
# runs (ko_gmt is filled while iterating over a set, whose order is not stable).
# NOTE(review): the usual GMT layout has a description field between the set
# name and the genes; none is written here -- confirm what the consumer expects.
gmt_path = Path(dataDir)/'07-10-ko.gmt'
with open(gmt_path, 'w') as fh:
    for ko in sorted(ko_gmt):
        gene_str = "\t".join(ko_gmt[ko])
        fh.write(f"{ko}\t{gene_str}\n")

In [None]:
# Display the pathway -> genes mapping that was written to the GMT file.
ko_gmt

In [None]:
# Display the results table again before plotting it.
results

In [None]:
# Scatter of z-score against log2(CI) for every row of `results`.
# Uses the explicit figure/axes interface and labels both axes so the figure
# can be read on its own; `plt` comes from the import cell at the top.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(results['z-score'], np.log2(results.CI))
ax.set_xlabel('z-score')
ax.set_ylabel('log2(CI)');