In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pathlib import Path
import seaborn as sns
import sys
import plotly.express as px
import yaml
import pyranges as pr 

# Seaborn context first: it rewrites several rcParams that are overridden below.
sns.set_context("notebook", font_scale=1.1)

# Matplotlib defaults shared by every static figure in this notebook.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    "savefig.dpi": 200,
    "figure.autolayout": False,
    "axes.labelsize": 18,
    "axes.titlesize": 20,
    "font.size": 16,
    "lines.linewidth": 2.0,
    "lines.markersize": 8,
    "legend.fontsize": 14,
})

# pandas display: show up to 100 rows/columns, floats with thousands separator and 2 decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:,.2f}".format)

In [None]:
# All project-specific locations are read from config.yaml in the working directory.
with open("config.yaml", "r") as fh:
    config_dict = yaml.safe_load(fh)

# Directories are wrapped in Path so they can be combined with `/` further down.
out_dir, map_dir, analysis_dir, counts_dir = (
    Path(config_dict[key])
    for key in ('out_dir', 'map_dir', 'analysis_dir', 'counts_dir')
)

# Genome annotation (GFF3, loaded as a PyRanges object) and the sample sheet.
gff_file = config_dict['gff_file']
gff = pr.read_gff3(gff_file)
sd = pd.read_csv(config_dict['sample_data_file'])

alphabetClrs = px.colors.qualitative.Alphabet

# Named hex colours reused by the plotly figures below; 'grey' is borrowed
# from plotly's qualitative "Alphabet" palette.
sushi_colors = dict(
    red='#C0504D',
    orange='#F79646',
    medSea='#4BACC6',
    black='#000000',
    dgreen='#00B04E',
    lgreen='#92D050',
    dblue='#366092',
    lblue='#95B3D7',
    grey=alphabetClrs[8],
)

import re


In [None]:
libraries_used = ['library_9_1', 'library_12_1', 'library_10_1', 'library_11_2',
       'library_10_2', 'library_15_1', 'library_14_2', 'library_13_1',
       'library_13_2', 'library_12_2']

In [None]:
sd['name'] = sd['mouse'] + "_" + sd['library'] + "_" + sd['day'] + "_"+ sd['dnaid']

In [None]:
sd = sd.rename(columns={'sampleID':'sample_id'})

# Maps

In [None]:
# One annotated mapping table per library; the library name is the part of the
# file name before ".annotated".
maps = list(map_dir.rglob("*/*annotated.csv"))
map_df = pd.concat([
    pd.read_csv(map_file).assign(library=map_file.stem.split(".annotated")[0])
    for map_file in maps
])

# Per-library totals: distinct barcodes (inserts) and distinct feature IDs (genes).
map_sum = (
    map_df.groupby('library')
    .agg(num_inserts=('barcode', 'nunique'), num_genes=('ID', 'nunique'))
    .reset_index()
)

In [None]:
map_df = map_df[map_df.library.isin(libraries_used)]

In [None]:
map_df['Library'] = map_df['library'].str.replace("library_", '').str.replace('_', '.').astype(float)

In [None]:
# Distribution of insertion sites along chromosome FQ312003.1, stacked by library.
fig = px.histogram(map_df[map_df.chr == 'FQ312003.1'].sort_values('Library'), x='insertion_site', color='Library', nbins=100,
             template='plotly_white', width=1000, height=700, color_discrete_sequence=px.colors.sequential.gray,
             labels={'insertion_site': 'Position, bp'}, log_y=False)

fig.update_layout(bargap=0.1)
# `title_font` is the supported spelling; the legacy `titlefont` shortcut is
# deprecated and rejected by recent plotly releases.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black',
                tickfont=dict(size=24, color='black'), title_font=dict(size=24, color='black'))
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title='Number of insertions',
                tickfont=dict(size=20, color='black'), title_font=dict(size=30, color='black'))

In [None]:
map_gene_summary = map_df.groupby('Name').library.nunique().reset_index()

In [None]:
# How many libraries contain a disruption of each gene (x = number of libraries).
# The former `labels={'insertion_site': ...}` argument was dropped: that column is
# not plotted here, so the mapping had no effect.
fig = px.histogram(map_gene_summary, x='library',
             template='plotly_white', width=900, height=700, color_discrete_sequence=px.colors.sequential.gray,
             log_y=False)

fig.update_layout(bargap=0.1)
# `title_font` is the supported spelling; the legacy `titlefont` shortcut is
# deprecated and rejected by recent plotly releases.
fig.update_xaxes(showline=True, linewidth=1, linecolor='black', title='Libraries containing the gene disruption', tickvals = [1,2,3,4,5,6,7,8,9,10],
                tickfont=dict(size=20, color='black'), title_font=dict(size=24, color='black'))
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', title='Number of genes',
                tickfont=dict(size=24, color='black'), title_font=dict(size=30, color='black'))

## Simulated libraries

In [None]:
from numpy.random import RandomState
import numpy as np
import pyranges as pr
gen_len = 4878012

def simulate_insertions(genome_length, cds, depth=(1000,), iterations=1, seed=42,
                        chromosome="FQ312003.1"):
    """Simulate random insertion libraries and count how many features they hit.

    For every replicate and every library size ``d`` in ``depth``, ``d`` positions
    are drawn uniformly (with replacement) from ``range(genome_length)``,
    de-duplicated, converted to 1-bp intervals on ``chromosome`` and overlapped
    with ``cds`` via ``pr.count_overlaps``.

    Parameters
    ----------
    genome_length : int
        Number of positions an insertion can land on.
    cds : pyranges.PyRanges
        Features to test for overlap (passed straight to ``pr.count_overlaps``).
    depth : iterable of int
        Number of insertions drawn per simulated library.
    iterations : int
        Number of replicates; at most 100000.
    seed : int
        With one replicate it seeds that replicate directly. With several
        replicates it seeds the generator that draws the per-replicate seeds, so
        repeated calls give the same result.
    chromosome : str
        Chromosome name assigned to the simulated intervals; it has to match the
        name used in ``cds`` for overlaps to be found.

    Returns
    -------
    pandas.DataFrame or None
        Long table with ``num_inserts`` (the requested depth, as a string) and
        ``num_genes`` (features with at least one insertion), one row per
        replicate and depth. ``None`` if ``iterations`` exceeds 100000.
    """
    if iterations > 100000:
        print('Max number of iterations allowed is 100000')
        return None
    if iterations > 1:
        # Previously drawn from the unseeded global RNG, which ignored `seed` and
        # made multi-replicate runs irreproducible. replace=False avoids two
        # replicates sharing a seed (and therefore being exact duplicates).
        seeds = RandomState(seed).choice(100000, size=iterations, replace=False)
    else:
        seeds = [seed]
    # Uniform probabilities, built once as an array instead of a fresh
    # genome-sized Python list per depth; the drawn positions are unchanged.
    p = np.full(genome_length, 1 / genome_length)
    df_list = []
    for replicate_seed in seeds:
        grs = {}
        for d in depth:
            # Re-seeded per depth, as before: all depths of a replicate share a seed.
            prng = RandomState(replicate_seed)
            starts = np.unique(prng.choice(genome_length, size=d, p=p))
            ends = starts + 1
            grs[f'{d}'] = pr.from_dict({'Chromosome': [chromosome] * len(starts),
                                        'Start': starts, 'End': ends})
        ovs = pr.count_overlaps(grs, cds).as_df()
        # One row per replicate: number of features hit at each depth.
        df_list.append(pd.DataFrame((ovs[list(grs.keys())] > 0).sum()).T)

    return pd.concat(df_list).melt(var_name='num_inserts', value_name='num_genes')
    


In [None]:
gen_len = 4878012
cds = gff[gff.Feature == 'CDS']
simulated = simulate_insertions(gen_len, cds, [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000], 5)
simulated['num_inserts'] = simulated['num_inserts'].astype(int)

In [None]:
px.scatter(simulated, x='num_inserts', y='num_genes', template='plotly_white',  log_x=True, width=900, height=600)

In [None]:
px.strip(simulated, x='num_inserts', y='num_genes', template='plotly_white',  log_x=True, width=900, height=600)

In [None]:
# `map_df` was restricted to `libraries_used` in an earlier cell, so totals
# computed from it would just repeat X_used / Y_used. Re-read the unfiltered
# mapping files to get the numbers for the full library collection.
map_df_all = pd.concat([pd.read_csv(f, usecols=['barcode', 'ID']) for f in maps])
X_all = map_df_all.barcode.nunique()
Y_all = map_df_all.ID.nunique()

In [None]:
X_used = map_df[map_df.library.isin(libraries_used)].barcode.nunique()
Y_used = map_df[map_df.library.isin(libraries_used)].ID.nunique()

In [None]:
map_sum.to_csv(out_dir/'map_summary.csv')

In [None]:
import plotly.graph_objects as go

# Simulated library complexity (grey) with the observed libraries overlaid.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=simulated.num_inserts, y=simulated.num_genes,
    name='Simulated Libraries',
    mode='markers',
    marker = dict(color='grey', opacity=0.3, size=8, line=dict(color='black', width=1))
))

fig.add_trace(go.Scatter(
    x=[X_all], y=[Y_all],
    name='Full library collection',
    marker= dict(color='red', size=14, symbol= 'cross', line=dict(color='black', width=1))
))

fig.add_trace(go.Scatter(
    x=[X_used], y=[Y_used],
    name='Libraries Used',
    marker= dict(color='blue', size=14, symbol= 'x', line=dict(color='black', width=1))
))

# One point per library from the per-library summary. This trace used to share
# the legend name 'Libraries Used' with the pooled point above.
fig.add_trace(go.Scatter(
    x=map_sum.num_inserts, y=map_sum.num_genes,
    name='Individual libraries',
    mode='markers',
    marker= dict(color='green', size=10, symbol= 'star', line=dict(color='black', width=1))
))

# Layout and axes; the title replaces the tutorial placeholder 'Styled Scatter'.
fig.update_layout(title='Simulated vs. observed library complexity', template='plotly_white',
                  yaxis_zeroline=False, xaxis_zeroline=False, width=800, height=600)
fig.update_xaxes(type='log', title='Total number of unique inserts/barcodes')
fig.update_yaxes( title='Total number of CDS disrupted')


In [None]:
# Merged count tables found under counts_dir. Sorted so that cfiles[0] / cfiles[1]
# below refer to the same files on every run (rglob order is not guaranteed).
# The stray `dd` that raised a NameError here was removed.
cfiles = sorted(counts_dir.rglob("*mbarq_merged_counts.csv"))
cfiles

In [None]:
df = pd.read_csv(cfiles[0]).drop(['barcode'], axis=1)

df2 = pd.read_csv(cfiles[1]).drop(['barcode'], axis=1)

In [None]:
df = df.melt(id_vars=['Name'], var_name='sample_id', value_name='cnt').groupby(['sample_id', 'Name']).cnt.sum().reset_index()
df2 = df2.melt(id_vars=['Name'], var_name='sample_id', value_name='cnt').groupby(['sample_id', 'Name']).cnt.sum().reset_index()

In [None]:
# log2 of counts-per-million (+0.5 pseudocount), normalised within each sample.
# Each table has to be scaled by its OWN per-sample totals: df2 was previously
# divided by df's totals, which pandas aligned by row index and therefore matched
# to unrelated samples.
df['tts'] = np.log2(df['cnt'] / df.groupby('sample_id')['cnt'].transform('sum') * 1000000 + 0.5)
df2['tts'] = np.log2(df2['cnt'] / df2.groupby('sample_id')['cnt'].transform('sum') * 1000000 + 0.5)

In [None]:
df3 = pd.concat([df, df2])

In [None]:
df3.tts.min()

In [None]:
df3 = df3.merge(sd[['sample_id', 'name']], on='sample_id', how='left')

In [None]:
df3.head()

In [None]:
df4 = df3.pivot(index='Name', columns='name', values='tts').fillna(-1)

In [None]:
import seaborn as sns

# Results

In [None]:
# Per-library result tables; the library name is the file name up to "_rra".
result_files = list(analysis_dir.glob("*rra_results.csv"))
df_list = [
    pd.read_csv(result_file).assign(library=result_file.stem.split("_rra")[0])
    for result_file in result_files
]
fdf = pd.concat(df_list)

# A hit needs |LFC| > 1 and FDR < 0.05 in either selection direction.
fdf['hit'] = (fdf.LFC.abs() > 1) & ((fdf.neg_selection_fdr < 0.05) | (fdf.pos_selection_fdr < 0.05))

In [None]:
libraries_used = fdf.library.unique()

In [None]:
libraries_used

In [None]:
# Table 1
table1 = fdf.groupby(['library', 'contrast']).agg({'locus_tag':['nunique'], 'hit':['sum'],
                                          'LFC': ['median']}).reset_index()
table1.columns = ['library', 'day', 'number_genes_analysed', 'number_hits', 'median_LFC']
table1                                          

In [None]:
# Significant rows only: |LFC| > 1 and FDR < 0.05 in either selection direction.
rsig = fdf[(fdf.LFC.abs() > 1)
           & ((fdf.neg_selection_fdr < 0.05) | (fdf.pos_selection_fdr < 0.05))]
# Number of distinct significant genes per library and day.
rsig.groupby(['library', 'contrast']).locus_tag.nunique()

In [None]:
# Get correlations
cor_df = (fdf[['locus_tag', 'contrast', 'LFC', 'library']]
          .pivot(index=['locus_tag', 'contrast'], columns='library')
          .reset_index()
          .set_index('locus_tag')
          .groupby('contrast')
          .corr()
          .reset_index())
df_list = []
for i, g in cor_df.groupby('contrast'):
    df = g.drop(['level_1'], axis=1).set_index(['contrast', 'library'])
    df = (df.mask(np.triu(np.ones(df.shape, dtype=np.bool_)))
          .stack()
          .rename_axis(('contrast', 'lib1', 'lib2'))
          .reset_index()
          .rename(columns={'LFC': 'R'}))
    df_list.append(df)
cor_df = pd.concat(df_list)



In [None]:
# Base font size shared by this and the following plotly figures.
font_size=24
# Distribution of between-library LFC correlations for each day.
fig = px.box(cor_df, x='contrast', y='R', color_discrete_sequence = ['black']*4,
                  labels={"contrast":'', 'R': "Pearson's <i>r</i>"},
                  height=500, width=400,  template='plotly_white')
fig.update_layout(showlegend=False)
# `title_font` is the supported spelling; the legacy `titlefont` shortcut is
# deprecated and rejected by recent plotly releases.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'))
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', range=[0, 1],
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'))

In [None]:
# Example pair of libraries for the LFC-vs-LFC scatter below.
# The original query selected library_10_2 / library_11_2 but then renamed the
# two columns positionally to library_10_1 / library_12_1 (and the plot labels
# say 10-1 / 12-1), so the figure showed different libraries than it claimed.
# The selection now matches the names used downstream, and the column names come
# from the data instead of a positional rename.
# NOTE(review): if 10_2 vs 11_2 was the intended comparison, change the labels
# and column names downstream instead of this list.
example_libraries = ['library_10_1', 'library_12_1']
ex_df = (fdf[fdf.library.isin(example_libraries)][['locus_tag', 'LFC', 'library', 'contrast']]
        .pivot(index=['locus_tag', 'contrast'], columns='library', values='LFC')
        .dropna()                      # keep genes measured in both libraries
        .reset_index()
        .rename_axis(columns=None))

In [None]:
# LFC agreement between the two example libraries, coloured by day, with one OLS
# trend line per day. A first px.scatter(...) call that built an identical but
# unstyled figure and threw it away has been removed.
fig = px.scatter(ex_df, x='library_10_1', y='library_12_1', color='contrast',
                 height=600, width=800,
                 template='plotly_white',
                 labels={'library_10_1': 'LFC(library 10-1)',
                         'library_12_1': 'LFC (library 12-1)'},
                 color_discrete_map={'d1': sushi_colors['red'],
                                     'd2': sushi_colors['dgreen'],
                                     'd3': sushi_colors['dblue'],
                                     'd4': sushi_colors['orange']},
                 category_orders={'contrast': ['d1', 'd2', 'd3', 'd4']}, trendline='ols')

# Marker styling applies to the point traces only.
fig.update_traces(marker=dict(size=14, line=dict(width=1, color='DarkSlateGrey'),
                              opacity=0.9),
                  selector=dict(mode='markers'))
# `title_font` is the supported spelling; the legacy `titlefont` shortcut is
# deprecated and rejected by recent plotly releases.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'), range=[-14,8])
fig.update_yaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'), range=[-14,8])

fig.update_layout(legend=dict(font=dict(size=font_size-2)),
                  legend_title=dict(font=dict(size=font_size)))

# Thicken the trend lines (the traces whose mode is 'lines'). This replaces the
# manual index-collecting loop, which also shadowed the builtin `id`.
fig.update_traces(line_width=6, selector=dict(mode='lines'))
fig

In [None]:
# Directory with the published reference data used for benchmarking. It can be
# overridden with an optional `gt_dir` entry in config.yaml; the previous
# hard-coded cluster path remains the fallback, so existing setups are unaffected.
gt_dir = Path(config_dict.get(
    'gt_dir',
    '/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/ansintsova/Projects_NCCR/hardt/nguyenb/tnseq/scratch/manuscript_data',
))


In [None]:
phenotypes = pd.read_csv(gt_dir/"nguyen_2020/nguyen_2020_inframe_mut_phenotypes.csv")
phenotypes[phenotypes['day'] == 'd4']

In [None]:
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score

# Benchmark the screen results against the in-frame-mutant phenotype table.
# Each phenotype row (gene, day) is matched to every library's result row for the
# same locus_tag and contrast; the left join keeps phenotype rows with no match.
phenotypes = pd.read_csv(gt_dir/"nguyen_2020/nguyen_2020_inframe_mut_phenotypes.csv")
phenotypes = phenotypes.rename({'day': 'contrast'}, axis=1)
phenotypes = phenotypes.merge(fdf, how='left', left_on=['gene', 'contrast'], right_on=['locus_tag', 'contrast'])


# Reference hit: adjusted p < 0.05 and |log2(median)| > 0.6.
# NOTE(review): this is not the same threshold as the screen's `hit` column
# (|LFC| > 1 and FDR < 0.05) -- confirm the 0.6 cut-off is intended.
phenotypes['pheno_hit'] = ((phenotypes['adjusted p value (C.I.)'] <0.05) & (abs(np.log2(phenotypes['median'])) > 0.6)).astype(int)
phenotypes = phenotypes[['locus_tag', 'gene', 'adjusted p value (C.I.)', 'median', 'contrast', 'pheno_hit', 'hit', 'library']]
# Drops phenotype rows without a matching screen result (NaN from the left join)
# as well as any row with a missing value in the kept columns.
phenotypes = phenotypes.dropna()
phenotypes['mbarq_hit'] = phenotypes.hit.astype(int)

# Overall scores, pooling every (gene, day, library) row; pheno_hit is the truth.
metrics = {'mBARq Analysis': (precision_score(phenotypes.pheno_hit, phenotypes.mbarq_hit), 
                     recall_score(phenotypes.pheno_hit, phenotypes.mbarq_hit), 
                     balanced_accuracy_score(phenotypes.pheno_hit, phenotypes.mbarq_hit)), }


# Long format: one row per (Method, Metric) with its Score.
metric_df = (pd.DataFrame(metrics, index=['Precision', 'Recall', 'Balanced Accuracy'])
              .T
            .reset_index()
            .rename({'index':'Method'}, axis=1)
              .melt(id_vars=['Method'], var_name='Metric', value_name='Score'))

In [None]:
phenotypes.groupby(['library', 'contrast']).agg({'gene': ['nunique'], 'pheno_hit': ['sum']}).reset_index()

In [None]:
metric = phenotypes.groupby(['library', 'contrast']).apply(lambda x: pd.Series({'Precision':  precision_score(x.pheno_hit, x.mbarq_hit),
                                                            'Recall': recall_score(x.pheno_hit, x.mbarq_hit)})).reset_index()

In [None]:
metric = metric.melt(id_vars=['library', 'contrast'], var_name='metric', value_name='score')

In [None]:
# Precision and recall per day, one box per metric (spread across libraries).
fig = px.box(metric, x='contrast', y='score', color='metric', color_discrete_sequence = [sushi_colors['dblue'], sushi_colors['lblue']],
                        template='plotly_white', height=400, width=600)

# `title_font` is the supported spelling; the legacy `titlefont` shortcut is
# deprecated and rejected by recent plotly releases.
fig.update_xaxes(showline=True, linewidth=2, linecolor='black',
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'))
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', range=[0, 1.15],
                 tickfont=dict(size=font_size-6, color='black'),
                 title_font=dict(size=font_size, color='black'))

In [None]:
#fdf.to_csv(out_dir/'final-results-all-libraries.csv', index=False)

In [None]:
#rsig.to_csv(out_dir/'final-results-all-libraries-significant-only.csv', index=False)

In [None]:
res_sum = rsig.groupby(['library', 'contrast']).agg({'locus_tag':['nunique']}).reset_index()
res_sum.columns = ['library', 'contrast', 'number_of_hits']       
#res_sum.to_csv(out_dir/'number_of_hits_per_library.csv', index=False)                                   

In [None]:
(rsig.groupby(['contrast', 'locus_tag'])
                          .library.nunique()
                          .reset_index()
                          .rename(columns={'library': 'num_hit'}))

In [None]:
fdf.head()

In [None]:
fdf['padj'] = fdf[['neg_selection_fdr','pos_selection_fdr']].min(axis=1)

In [None]:
fdf.head()

In [None]:
# Gene-level summary per day: number of libraries with a result for the gene,
# median LFC across libraries, and the smallest / largest padj.
num_hit = (fdf.groupby(['contrast', 'locus_tag'])
           .agg({'library':['nunique'], 'LFC':['median'], 'padj':['min', 'max']})
           .reset_index())
# Flatten the two-level columns produced by the dict-of-lists aggregation.
num_hit.columns = ['contrast', 'locus_tag', 'num_lib', 'LFC_median', 'padj_min', 'padj_max']
# Attach the number of libraries in which the gene was significant (from rsig).
num_hit =  (num_hit.merge(rsig.groupby(['contrast', 'locus_tag'])
                          .library.nunique()
                          .reset_index()
                          .rename(columns={'library': 'num_hit'}), 
            on=['contrast', 'locus_tag'], how='outer'))
# Genes absent from rsig get NaN from the outer merge; they were never a hit.
num_hit['num_hit'] = num_hit['num_hit'].fillna(0).astype(int)

In [None]:
fdf[fdf.locus_tag == 'RygC']

In [None]:
num_hit[num_hit.locus_tag == 'RygC']

In [None]:
num_hit.columns = ['day', 'locus_tag', 'number_of_libraries_with_mutant', 'LFC_median', 'padj_min', 'padj_max', 'number_of_times_detected_as_hit']

In [None]:
num_hit = num_hit[['locus_tag', 'day', 'LFC_median', 'number_of_libraries_with_mutant', 'number_of_times_detected_as_hit', 'padj_min', 'padj_max']].sort_values(['day','LFC_median'])

In [None]:
num_hit.shape

In [None]:
final_summary = num_hit.pivot(index=['locus_tag', 'number_of_libraries_with_mutant'], columns='day', 
              values=['LFC_median', 'number_of_times_detected_as_hit', 'padj_min', 'padj_max']).reset_index()

In [None]:
final_summary.head()

In [None]:
# Flatten the (value, day) column pairs created by the pivot into "<value>_<day>".
# The names are derived from the columns themselves instead of a hard-coded
# d1..d4 list, so a missing or extra day can no longer mislabel columns or fail
# with a length mismatch. The index columns carry an empty second level and keep
# their plain name. Columns that are already flat strings (cell re-run) are left
# untouched.
final_summary.columns = [
    '_'.join(str(part) for part in col if part != '') if isinstance(col, tuple) else col
    for col in final_summary.columns
]

In [None]:
final_summary.head()

In [None]:
# Write the gene-level summary without the row index (`index` expects a bool).
final_summary.to_csv(out_dir/'12-09-final-results-gene-level-summary.csv', index=False)

In [None]:
import requests
from time import sleep
import json

def string_function(gene_names, species):
    """Query the STRING (v11.5) enrichment endpoint for a set of genes.

    Parameters
    ----------
    gene_names : iterable of str
        Gene / protein identifiers; sent as one carriage-return separated string.
    species : int or str
        NCBI taxonomy identifier of the organism.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status. The body was previously
        parsed regardless of status, so an error payload could end up in the
        output tables.
    requests.exceptions.Timeout
        If the server does not answer within 60 seconds.
    """
    string_api_url = "https://version-11-5.string-db.org/api"
    output_format = "json"
    method = "enrichment"

    # Endpoint: <base>/<format>/<method>
    request_url = "/".join([string_api_url, output_format, method])

    params = {
        "identifiers": "\r".join(gene_names),  # one identifier per line
        "species": species,                    # species NCBI identifier
        "caller_identity": "test_api",         # app name reported to STRING
    }

    response = requests.post(request_url, data=params, timeout=60)
    response.raise_for_status()
    return response.json()

def get_functional_analysis(df, day, direction, out_dir):
    """Run the STRING enrichment for one day and fitness direction and save it.

    Genes are taken from ``df`` (columns ``LFC_median``, ``day``, ``locus_tag``):
    ``LFC_median < -1`` when ``direction`` is ``'decreased-fitness'``, otherwise
    ``LFC_median > 1``. The enrichment table is written to
    ``out_dir/<day>_<direction>_functional-analysis.csv`` and returned.
    """
    if direction == 'decreased-fitness':
        lfc_filter = '< -1'
    else:
        lfc_filter = '> 1'
    selected = df.query(f"LFC_median {lfc_filter} & day == '{day}'")
    # 99287 is the species identifier passed to STRING for every query here.
    enrichment = pd.DataFrame(string_function(selected.locus_tag.values, 99287))
    enrichment.to_csv(out_dir/f"{day}_{direction}_functional-analysis.csv", index=False)
    return enrichment

# Run the enrichment for days 1-3 in both directions, in the same order as the
# former six copy-pasted calls: down then up for each day.
functional_results = {}
for day in ['d1', 'd2', 'd3']:
    for direction in ['decreased-fitness', 'increased-fitness']:
        functional_results[(day, direction)] = get_functional_analysis(num_hit, day, direction, out_dir)

# `r` keeps referring to the day-1 decreased-fitness table, as before.
r = functional_results[('d1', 'decreased-fitness')]
# Display the last table computed (day 3, increased fitness), as the cell did before.
functional_results[('d3', 'increased-fitness')]

In [None]:
day = 'd1'
direction = 'decreased-fitness'
r = get_functional_analysis(num_hit, day, direction, out_dir)

In [None]:
# (description, term identifier) pairs of enrichment terms of interest.
# The original line did not parse (stray "()" before the first string).
terms = [('Lipopolysaccharide biosynthesis', 'KW-0448'),
         ('O-Antigen nucleotide sugar biosynthesis', 'CL:4794'),
         ('RNA degradation', None),  # TODO: identifier was missing in the original list
         ('AA biosynthetic process and ..', 'CL:1292')]

# General to use: Carbohydrate metabolic process , Lipid metabolic process

In [None]:
r.sort_values('number_of_genes', ascending=False)

In [None]:
r = r[['term', 'inputGenes', 'description']]
r.explode('inputGenes').sample(20)

# KEGG

In [None]:
from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG import Gene
# Standard library packages
import io
import os

# A bit of code that will help us display the PDF output
def PDF(filename):
    """Return an IPython HTML object that embeds ``filename`` in a 700x350 iframe."""
    # `HTML` was never imported anywhere in the notebook, so calling this helper
    # raised a NameError; import it locally from the notebook's own display API.
    from IPython.display import HTML
    return HTML('<iframe src=%s width=700 height=350></iframe>' % filename)

# Some code to return a Pandas dataframe, given tabular text
def to_df(result):
    """Parse tab-separated text without a header row into a DataFrame.

    Columns are numbered 0..n-1 because the text carries no column names.
    """
    buffer = io.StringIO(result)
    return pd.read_csv(buffer, sep="\t", header=None)

In [None]:
with open('ko03110.keg') as handle:
    for record in Gene.parse(handle):
        print(record.name)

In [None]:
result = REST.kegg_list("pathway", 'sey').read()

In [None]:
sey_pathways = to_df(result)
sey_pathways.columns = ['path_id', 'path_desc']

In [None]:
sey_pathways[sey_pathways.path_desc.str.contains("Lipo")]

In [None]:
pathways = ['sey00010', 'sey00020', 'sey00030', 'sey00190', 'sey01230', 'sey00230', 'sey00240',  'sey00061', 'sey00071', 'sey01240',
            'sey02010', 'sey02020', "sey00540", "sey00541", "sey02010", "sey01212", "sey01200"]

brite = ['sey02000', 'sey01005', 'sey03110']

In [None]:
import requests
from lxml import html,etree

def get_genes_for_pathway_v2(pathway="sey01212", db='path'):
    """List the SL1344 gene identifiers linked to a KEGG entry via LinkDB.

    Parameters
    ----------
    pathway : str
        KEGG entry identifier, e.g. ``"sey01212"``.
    db : str
        KEGG database prefix placed in front of ``pathway`` in the LinkDB URL
        (``'path'`` for pathways; the notebook also uses ``'br'``).

    Returns
    -------
    list of str
        The text after ``"sey:"`` of every link on the page that contains
        ``"sey:SL1344"``. Empty if the request fails.

    Notes
    -----
    The old handler caught ``HTTPError``, a name that was never imported, so a
    failing request would have raised NameError instead. ``requests.get`` also
    only raises on a bad status when ``raise_for_status`` is called.
    """
    url = f"https://www.genome.jp/dbget-bin/get_linkdb?-t+genes+{db}:{pathway}"
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Bad pathway name: {pathway} ({err})")
        return []
    page = response.content.decode('utf-8')
    tree = etree.parse(io.StringIO(page), parser=etree.HTMLParser())
    links = [anchor.get('href', '') for anchor in tree.xpath("//a")]
    return [link.split("sey:")[1] for link in links if 'sey:SL1344' in link]

def get_gene_info(locus_tags):
    """Look up the gene symbol and KO number for each locus tag in KEGG.

    Issues one ``REST.kegg_get("sey:<locus_tag>")`` request per tag, so it is
    slow. Tags whose entry does not match the SYMBOL ... ORTHOLOGY pattern are
    left out. Returns a DataFrame with columns ``locus_tag``, ``Name``, ``KO``.
    """
    symbol_and_ko = re.compile(r".*SYMBOL *(\w*).*ORTHOLOGY *(K[0-9]*)")
    rows = []
    for tag in locus_tags:
        # Flatten the entry to one line so the pattern can span several fields.
        entry_text = REST.kegg_get(f"sey:{tag}").read().replace("\n", ' ')
        hit = symbol_and_ko.search(entry_text)
        if not hit:
            continue
        rows.append([tag, hit.group(1), hit.group(2)])
    return pd.DataFrame(rows, columns = ['locus_tag', 'Name', 'KO'])


def get_kegg_info(pathways, db, gff):
    """Build a gene-to-pathway table and annotate it with gene names.

    For every entry in ``pathways`` the linked locus tags are fetched with
    ``get_genes_for_pathway_v2(pathway, db)`` and labelled with the pathway in
    ``KEGG_Pathway``. The stacked table is left-joined on ``locus_tag`` with the
    ``Name`` column of the 'gene' rows of ``gff``.
    """
    per_pathway = [
        pd.DataFrame(get_genes_for_pathway_v2(pathway, db), columns=['locus_tag'])
        .assign(KEGG_Pathway=pathway)
        for pathway in pathways
    ]
    gene_rows = gff[gff.Feature == 'gene']
    gene_names = gene_rows[['Name', 'locus_tag']]
    return pd.concat(per_pathway).merge(gene_names, how='left', on='locus_tag')

In [None]:
def get_genes_for_pathway(pathway):
    """Parse locus tag, gene name and KO number from a KEGG pathway flat file.

    Parameters
    ----------
    pathway : str
        KEGG pathway identifier passed to ``REST.kegg_get``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``locus_tag``, ``Name``, ``KO`` and ``KEGG_Pathway`` (one row
        per line of the entry that matches the pattern). ``None`` if the request
        fails with an HTTP error.
    """
    # Bio.KEGG.REST fetches through urllib, so failed requests surface as
    # urllib.error.HTTPError. The name was previously not imported at all, which
    # turned any failure into a NameError.
    from urllib.error import HTTPError

    # Groups: 1 = locus tag, 2 = gene name / description, 4 = KO number.
    gene_line = re.compile(r"[\w ]*(SL1344_[0-9]*) .([\w -]*)(;|\[)[\w -\[]*KO:(K[0-9]*)")
    try:
        lines = REST.kegg_get(pathway).read().split("\n")
    except HTTPError:
        print("Bad pathway name")
        return None
    ids = []
    for line in lines:
        matches = gene_line.search(line)
        if matches:
            ids.append([matches.group(1), matches.group(2), matches.group(4)])
    return pd.DataFrame(ids, columns=['locus_tag', 'Name', 'KO']).assign(KEGG_Pathway=pathway)

# Fetch and stack the gene table of every pathway using the flat-file parser.
# NOTE(review): `ko_df` is reassigned from get_kegg_info(...) in the following
# cell, so the table built here (one KEGG request per pathway) is not used
# afterwards -- consider dropping this loop.
df_list = []
for pathway in pathways:
    df_list.append(get_genes_for_pathway(pathway))
ko_df = pd.concat(df_list)

In [None]:
# Re-read the annotation as a plain pandas DataFrame (`.as_df()`); get_kegg_info
# filters it by Feature and merges on locus_tag.
# NOTE(review): this rebinds `gff`, which held the PyRanges object loaded in the
# configuration cell. Re-running earlier cells that use `gff` after this point
# will see a DataFrame instead.
gff= pr.read_gff3(Path(config_dict['gff_file'])).as_df()
# Gene-to-pathway table for the selected KEGG pathways.
ko_df = get_kegg_info(pathways,db='path', gff=gff)


In [None]:
ko_df2 = get_kegg_info(brite,db='br', gff=gff)

In [None]:
ko_df = pd.concat([ko_df, ko_df2])

In [None]:
# gene_id is the gene name when it is a single token, otherwise the locus tag.
# Names can be missing (the annotation is left-joined onto the KEGG gene list),
# and the previous lambda called .split() on those NaN values and raised
# AttributeError. With the vectorised string accessor, missing or multi-word
# names simply fall back to the locus tag.
has_single_token_name = ko_df['Name'].str.split().str.len() == 1
ko_df['gene_id'] = ko_df['Name'].where(has_single_token_name, ko_df['locus_tag'])

In [None]:
test = num_hit[num_hit.LFC_median < 0].rename(columns={'locus_tag':'gene_id'}).merge(ko_df, on='gene_id', how='left')

In [None]:
test['KEGG_Pathway'] = test["KEGG_Pathway"].fillna('Other')

In [None]:
ko_df.groupby('KEGG_Pathway').locus_tag.nunique()

In [None]:
test[test.number_of_times_detected_as_hit > 1].groupby('day').gene_id.nunique()

In [None]:
hi_conf = test[(test.LFC_median < - 1) &(test.number_of_times_detected_as_hit > 1)]

In [None]:
test2 = hi_conf.groupby(['day', 'KEGG_Pathway']).gene_id.nunique().reset_index().sort_values('gene_id')
test2 = test2[(test2.gene_id >=4) ]

In [None]:
test

In [None]:
px.bar(test2.sort_values('day'), x='day', y='gene_id', color='KEGG_Pathway', height=800, width=800)

In [None]:
print(REST.kegg_get("sey:recR").read())

In [None]:
with open('test.keg', 'w') as fh:
    fh.write(REST.kegg_get("sey:recO").read())

In [None]:
with open("test.keg") as handle:
    for record in Gene.parse(handle):
        print("%s %s" % (record.entry, record.dblinks))

In [None]:
hi_conf[(hi_conf.day == 'd4') & (hi_conf.KEGG_Pathway == 'Other')]