In [None]:
# %load ../snippets/basic_settings.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import matplotlib

# Seaborn context and pandas display limits used throughout the notebook.
sns.set_context("notebook", font_scale=1.1)
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

# Matplotlib defaults: large figures, serif fonts, no LaTeX text rendering.
plt.rcParams.update({
    "figure.figsize": (16, 12),
    'savefig.dpi': 200,
    'figure.autolayout': False,
    'axes.labelsize': 18,
    'axes.titlesize': 20,
    'font.size': 16,
    'lines.linewidth': 2.0,
    'lines.markersize': 8,
    'legend.fontsize': 14,
    'text.usetex': False,  # True activates latex output in fonts!
    'font.family': "serif",
    'font.serif': "cm",
})

# Show floats with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Biopython KEGG API 

In [None]:
import html
import io
import random

from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas
from Bio.Graphics.ColorSpiral import ColorSpiral

from IPython.display import Image, HTML
import IPython

# A bit of code that will help us display the PDF output
def PDF(filename):
    """Return an HTML iframe (700x350) that displays the PDF at `filename`.

    `filename` may be a string or a path-like object. The value is
    HTML-escaped and quoted so that names containing spaces, quotes or
    angle brackets do not break the generated markup.
    """
    src = html.escape(str(filename), quote=True)
    return HTML('<iframe src="%s" width=700 height=350></iframe>' % src)

# A bit of helper code to shorten long text
def head(text, lines=10):
    """Print the first `lines` lines of `text`.

    A trailing '[...]' marker is printed only when the text was actually
    truncated, so short texts are shown unchanged.
    """
    all_lines = text.split('\n')
    shown = all_lines[:lines]
    if len(all_lines) > lines:
        shown.append('[...]')
    print('\n'.join(shown))

In [None]:
# Global KEGG pathway list: one "<id>\t<description>" record per line.
all_pathways = kegg_list("pathway").read().strip().split('\n')
all_pathways = [c.split("\t") for c in all_pathways]
# Take the text after the last ':' so that ids are handled both with a
# database prefix ("path:map00010") and without one ("map00010").
all_pathways_map = {c[0].split(':')[-1]: c[1] for c in all_pathways}
# Same descriptions keyed by the KO-style id ("map00010" -> "ko00010").
all_pathways_ko = {c[0].split(':')[-1].replace('map', 'ko'): c[1] for c in all_pathways}

In [None]:
scratchDir = Path('/nfs/nas22/fs2202/biol_micro_bioinf_nccr/hardt/nguyenb/tnseq/scratch/10_22')

In [None]:
# pd.DataFrame.from_dict(all_pathways_ko, orient='index').to_csv(scratchDir/'20-10-22-kegg-pathway-list-ko.csv',
#                                                                header=None)

In [None]:
# pd.DataFrame.from_dict(all_pathways_map, orient='index').to_csv(scratchDir/'20-10-22-kegg-pathway-list-map.csv',
#                                                               header=None)

# Get all pathways for an organism 

In [None]:
organism = 'sey'
# Organism-specific pathway list as a two-column table (id, description).
result = pd.read_table(io.StringIO(kegg_list("pathway", organism).read()), header=None)
result.columns = ['KEGG_Pathway', 'Pathway_Description']
# Keep the text after the last ':' so ids with or without a database prefix both work
# (.str.get(1) would silently produce NaN for ids that carry no prefix).
result['KEGG_Pathway'] = result['KEGG_Pathway'].str.split(":").str.get(-1)
# Keep only the text before the first " - " (presumably drops the organism-name suffix).
result['Pathway_Description'] = result['Pathway_Description'].str.split(" - ").str.get(0)
# Combined "<id>:<description>" label, used as the key of path_map.
result['KEGG_Display'] = result['KEGG_Pathway'] + ":" + result['Pathway_Description']
# path_map[column][display label] -> value of that column
path_map = result.set_index('KEGG_Display').to_dict()

In [None]:
path_map['KEGG_Pathway']

In [None]:
# User chose a pathway
pathway = 'sey00010'

# Drawing a map

## Load the result files

In [None]:
data = pd.read_csv(scratchDir/"control_norm_analysis/26-10-22-annotated-results.csv")
# A row counts as a hit when |LFC| > 1 and fdr < 0.05.
data['hit'] = ((abs(data.LFC) > 1) & (data.fdr<0.05))
# Number of hit rows per gene and contrast.
hitSummary = (data.groupby(['Name', 'contrast']).hit.sum()
                  .reset_index()
                  .rename({'hit':'hitSum'}, axis=1))
data = (data.merge(hitSummary, on=['Name','contrast' ], how='outer'))
# Genes with at least one hit row get a '*' appended to their map label.
data['hitStar'] = data['hitSum'].apply(lambda x: '*' if x > 0 else '')
data['NameForMap'] = data['Name'] + data['hitStar'] + " (" + data['LFC_median'].round(2).astype(str) +")"
# One row per gene and contrast. The explicit copy makes data_short an independent
# frame, so the 'hex' column added in a later cell is not written to a slice of `data`.
data_short = data[['Name', 'contrast', 'locus_tag', 'NameForMap', 'LFC_median']].drop_duplicates().copy()

In [None]:

data_short

## Define color map

In [None]:
# Fixed colour scale from -6 to 6; values outside that range are clipped.
norm = matplotlib.colors.Normalize(vmin=-6, vmax=6, clip=True)
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=diverging_cmap)

# Median LFC -> RGBA tuple -> hex string; pure black is swapped for grey.
rgba_values = data_short['LFC_median'].apply(mapper.to_rgba)
hex_values = rgba_values.apply(matplotlib.colors.to_hex)
data_short['hex'] = hex_values.str.replace('#000000', "#7c7d83")

In [None]:
# Contrast (day) whose values are drawn on the map
day = 'd1'
is_selected_day = data_short['contrast'] == day
data_day = data_short.loc[is_selected_day]
# ko_dict[column][locus_tag] -> value of that column for the locus
ko_dict = data_day.set_index('locus_tag').to_dict()

In [None]:
ko_dict['NameForMap']['SL1344_3685']

In [None]:
ko_dict['hex']['SL1344_3685']

In [None]:
ko_dict['hex']

# Load the map KGML

In [None]:
pathwayKGML = KGML_parser.read(kegg_get(pathway, "kgml"))
canvas = KGMLCanvas(pathwayKGML, import_imagemap=True)
fname = f"{pathway}_{day}_map.pdf"
canvas.draw(fname)
IPython.display.IFrame(fname, width=1000, height=1000)

In [None]:
pathwayKGML = KGML_parser.read(kegg_get(pathway, "kgml"))
canvas = KGMLCanvas(pathwayKGML, import_imagemap=True)
# Each gene entry name holds whitespace-separated "<prefix>:<id>" tokens;
# collect the id part of every token into one flat list.
pathGeneNames = [
    token.split(":")[1]
    for entry in pathwayKGML.genes
    for token in entry.name.split()
]
# -> use this for the heatmaps

In [None]:
sorted(pathGeneNames)

In [None]:
pathwayKGML = KGML_parser.read(kegg_get(pathway, "kgml"))
canvas = KGMLCanvas(pathwayKGML, import_imagemap=True)

for element in pathwayKGML.genes:
    color = None
    name = None
    # A node may list several "<prefix>:<locus>" tokens; keep the locus part.
    node_kos = [e.split(":")[1] for e in element.name.split()]
    # The last locus found in ko_dict determines colour and label of the node.
    for ko in node_kos:
        color = ko_dict['hex'].get(ko, color)
        name = ko_dict['NameForMap'].get(ko, name)
    if color is None:
        # No data for any locus of this node: leave its graphics untouched.
        continue
    for graphic in element.graphics:
        graphic.bgcolor = color
        graphic.name = name
fname = f"{pathway}_{day}_map.pdf"
canvas.draw(fname)
IPython.display.IFrame(fname, width=1000, height=1000)

0. Create Naming column:
    Add a star to the name if the gene was a hit in at least one library. A hit is defined as |LFC| > 1 and FDR < 0.05.

For each day:
  For each pathway:

1. Subset to that day + missing genes
2. Define colorscale based on median LFC
3.  
4. Display name as Name*, LFC
5. Save map


In [None]:
data = pd.read_csv(scratchDir/"control_norm_analysis/24-10-22-annotated-results.csv")

# Flag hit rows: large effect (|LFC| > 1) and fdr below 0.05.
large_effect = data['LFC'].abs() > 1
low_fdr = data['fdr'] < 0.05
data['hit'] = large_effect & low_fdr

# Count hit rows per gene and contrast and attach the count to every row.
hitSummary = (data.groupby(['Name', 'contrast'])['hit'].sum()
                  .reset_index()
                  .rename(columns={'hit': 'hitSum'}))
data = data.merge(hitSummary, on=['Name', 'contrast'], how='outer')

# Map label: "<Name>[*] (<median LFC>)", starred when the gene has any hit row.
data['hitStar'] = ['*' if hit_count > 0 else '' for hit_count in data['hitSum']]
lfc_label = data['LFC_median'].round(2).astype(str)
data['NameForMap'] = data['Name'] + data['hitStar'] + " (" + lfc_label + ")"
data['KEGG_Pathway'] = data["KEGG_Pathway"].fillna('-')

In [None]:
all_pathways_ko

In [None]:
day = 'd4'
pathwayName = 'ko00190'

In [None]:
# Rows annotated with the chosen pathway, restricted to the chosen day;
# rows whose contrast is null are kept as well.
pDf = data[(data.KEGG_Pathway.str.contains(pathwayName)) & ((data.contrast == day) | (data.contrast.isnull()))].copy()
#minima = -max(abs(pDf.LFC_median.min()), abs(pDf.LFC_median.max()))
#maxima = max(abs(pDf.LFC_median.min()), abs(pDf.LFC_median.max()))
# Fixed colour scale from -6 to 6 (values outside are clipped) instead of the
# data-driven limits sketched in the two commented-out lines above.
norm = matplotlib.colors.Normalize(vmin=-6, vmax=6, clip=True)
mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=sns.diverging_palette(220, 20, as_cmap=True))
# Median LFC -> RGBA tuple -> hex colour string
pDf['hex'] = pDf.LFC_median.apply(mapper.to_rgba).apply(matplotlib.colors.to_hex)
# '-' is treated as missing; rows without a KEGG_ko annotation are dropped.
ko_map = (pDf[['Name','LFC_median', 'KEGG_ko', 'hex', 'NameForMap']]
          .replace('-', np.nan)
          .dropna(subset=['KEGG_ko'])
          .drop_duplicates())
# KEGG_ko may hold several comma-separated ids: split them into one column per id ...
new_cols = ko_map.KEGG_ko.str.split(",", expand=True)
ko_map = pd.concat([ko_map, new_cols], axis=1)
# ... and melt those columns into a long table with a single 'KO' column.
ko_map = (ko_map.melt(id_vars=['LFC_median', 'NameForMap', 'hex'], value_vars=new_cols.columns, value_name='KO')[['NameForMap', 'LFC_median', 'KO', 'hex']]
           .dropna(subset=['KO']))
# Swap pure black for grey (presumably the colour produced for missing LFC values — confirm).
ko_map['hex'] = ko_map.hex.str.replace('#000000', "#7c7d83")
# ko_dict[column][KO] -> value of that column for the KO id
ko_dict = ko_map.set_index("KO").to_dict()


In [None]:
pathway = KGML_parser.read(kegg_get(pathwayName, "kgml"))
canvas = KGMLCanvas(pathway, import_imagemap=True)

for element in pathway.orthologs:
    color = None
    name = None
    # An ortholog node may list several whitespace-separated KO ids;
    # the last id found in ko_dict determines colour and label of the node.
    node_kos = element.name.split()
    for ko in node_kos:
        color = ko_dict['hex'].get(ko, color)
        name = ko_dict['NameForMap'].get(ko, name)

    for graphic in element.graphics:
        if color is None:
            # No data for this node: white background, original label kept.
            graphic.bgcolor = '#FFFFFF'
        else:
            graphic.bgcolor = color
            graphic.name = name

fname = f"{pathwayName}_{day}_map.pdf"
canvas.draw(fname)
IPython.display.IFrame(fname, width=1000, height=1000)

In [None]:
#pathway = KGML_parser.read(kegg_get("ko00061", "kgml"))
# pathway = KGML_parser.read(kegg_get("ko01130", "kgml"))
# canvas = KGMLCanvas(pathway, import_imagemap=True)
# canvas.draw("fab_map_with_image.pdf")
# PDF("fab_map_with_image.pdf")

In [None]:
# Long KO table (Name, LFC_mean, KO) for day 'd1'; rows with a null day are kept,
# '-' is treated as missing and rows without a KEGG_ko annotation are dropped.
# NOTE(review): this reads `data.day` and `LFC_mean`, while the cells above work with
# `contrast` and `LFC_median` — confirm these columns exist in the loaded results file.
ko_map = data[(data.day == 'd1') | (data.day.isnull())][['Name','LFC_mean', 'KEGG_ko']].replace('-', np.nan).dropna(subset=['KEGG_ko'])
# KEGG_ko may hold several comma-separated ids: one column per id ...
new_cols = ko_map.KEGG_ko.str.split(",", expand=True)
ko_map = pd.concat([ko_map, new_cols], axis=1)
# ... melted into a single 'KO' column; empty slots are dropped.
ko_map = (ko_map.melt(id_vars=['LFC_mean', 'Name'], value_vars=new_cols.columns, value_name='KO')[['Name', 'LFC_mean', 'KO']]
           .dropna(subset=['KO']))

def assign_color(x):
    """Map a log fold change to a fixed hex colour.

    x > 1         -> '#fa5282'
    x < -1        -> '#4ab548'
    -1 <= x <= 1  -> '#faefbe'
    anything else -> '#0daeff' (values that fail every comparison, e.g. NaN)

    The neutral band includes both boundaries, so a value of exactly 1 or -1
    is not coloured like a missing value.
    """
    if x > 1:
        return '#fa5282'
    if x < -1:
        return '#4ab548'
    if -1 <= x <= 1:
        return '#faefbe'
    # Only values that fail every comparison above (NaN) reach this point.
    return '#0daeff'
    
# Colour per row from its mean LFC.
ko_map['hex'] = ko_map['LFC_mean'].map(assign_color)
# Node label: "<Name>, <mean LFC rounded to 2 decimals>"
rounded_lfc = ko_map['LFC_mean'].round(2).astype(str)
ko_map["Name2"] = ko_map['Name'] + ", " + rounded_lfc
# komap_dict[column][KO] -> value of that column ('hex' or 'Name2')
komap_dict = ko_map.set_index('KO')[['hex', 'Name2']].to_dict()

In [None]:
ko_map

In [None]:
def rgb_to_hex(rgb):
    """Convert an iterable of colour channels in [0, 1] to an upper-case hex string.

    Each channel is scaled to 0-255, truncated to an integer and written as
    exactly two hex digits, so small values keep their leading zero
    (e.g. (0, 0.5, 1) -> '#007FFF').
    """
    return '#' + ''.join('{:02X}'.format(int(255 * val)) for val in rgb)

In [None]:
import plotly.express as px
x = sns.diverging_palette(220, 20, as_cmap=False)
rgb_to_hex(x[0])

In [None]:
data.sample(5)

In [None]:
result = kegg_get("sey00061", "image").read()
Image(result)

In [None]:
pathway = KGML_parser.read(kegg_get("sey00540", "kgml"))
canvas = KGMLCanvas(pathway, import_imagemap=True)
color = None
name = None
# for element in pathway.orthologs:
    
#     for ko in element.name.split():
#         color = komap_dict['hex'].get(ko, None)
#         name = komap_dict['Name2'].get(ko, None)
#     if color is not None:
#         for graphic in element.graphics:
#             graphic.bgcolor = color
#             graphic.name = name
canvas.draw("fab_map.pdf")
IPython.display.IFrame("fab_map.pdf", width=1000, height=1000)

In [None]:
# KGMLCanvas draws from a parsed pathway object, so fetch and parse the KGML
# for the pathway id first instead of handing the id string to the canvas.
canvas = KGMLCanvas(KGML_parser.read(kegg_get("sey00540", "kgml")), import_imagemap=True)
canvas.draw("fab_map_new_colours.pdf")
IPython.display.IFrame("fab_map_new_colours.pdf", width=1000, height=800)

In [None]:
def rgb_to_hex(rgb):
    """Convert an iterable of colour channels in [0, 1] to an upper-case hex string.

    Each channel is scaled to 0-255, truncated to an integer and written as
    exactly two hex digits, so small values keep their leading zero
    (e.g. (0, 0.5, 1) -> '#007FFF').
    """
    return '#' + ''.join('{:02X}'.format(int(255 * val)) for val in rgb)


# One arbitrary colour per ortholog entry
colorspiral = ColorSpiral()
colorlist = colorspiral.get_colors(len(pathway.orthologs))

# Paint every graphic of an ortholog entry with that entry's colour
for color, element in zip(colorlist, pathway.orthologs):
    hex_color = rgb_to_hex(color)
    for graphic in element.graphics:
        graphic.bgcolor = hex_color

# Draw the recoloured pathway and show the resulting PDF inline
spiral_pdf = "fab_map_new_colours.pdf"
canvas = KGMLCanvas(pathway, import_imagemap=True)
canvas.draw(spiral_pdf)
IPython.display.IFrame(spiral_pdf, width=800, height=800)
#PDF("fab_map_new_colours.pdf")

In [None]:
canvas

In [None]:
# Use the bacterial diverse environments map
pathway = KGML_parser.read(kegg_get("ko01120", "kgml"))

# Give every graphic of every ortholog entry a random integer width from 1 to 9
for element in pathway.orthologs:
    for graphic in element.graphics:
        graphic.width = random.randrange(1, 10)

# Draw without the background image and show the PDF inline
widths_pdf = "bacteria_mod_widths.pdf"
canvas = KGMLCanvas(pathway, import_imagemap=False)
canvas.draw(widths_pdf)
PDF(widths_pdf)

In [None]:
# Label of the first graphic of every ortholog entry, with leading/trailing dots removed
lps_kos = [orth.graphics[0].name.strip('...') for orth in pathway.orthologs]

In [None]:
test = df[['gene', 'KEGG_ko', 'z-score', 'day']].copy()
# Keep the first ';'-separated id and drop its literal "ko:" prefix.
# str.strip('ko:') would strip any of the characters 'k', 'o', ':' from both ends
# rather than the prefix, and the vectorised .str methods pass missing values
# through instead of raising on them.
test['KEGG_ko'] = (test['KEGG_ko'].str.split(';').str.get(0)
                       .str.replace(r'^ko:', '', regex=True))
td1 = test[test.day == 'd1']

In [None]:
# Median z-score per KO, restricted to the KOs collected in lps_kos. Only the numeric
# 'z-score' column is aggregated; a median over the string columns fails on recent pandas.
td1col = td1[td1.KEGG_ko.isin(lps_kos)].groupby('KEGG_ko')[['z-score']].median()
# Build the colour mapper in this cell so it does not depend on `colors` / `sm`,
# which are only defined in a cell further down the notebook.
zscore_norm = matplotlib.colors.TwoSlopeNorm(vmin=-9, vcenter=0., vmax=2)
zscore_mapper = plt.cm.ScalarMappable(cmap=plt.cm.coolwarm, norm=zscore_norm)
ncolor = [matplotlib.colors.to_hex(zscore_mapper.to_rgba(x)) for x in td1col['z-score'].values]
td1col['col'] = ncolor
# KO id -> hex colour
coldict = td1col['col'].to_dict()

In [None]:
for element in pathway.orthologs:
    for graphic in element.graphics:
        # coldict is keyed by labels that had their dots stripped (see lps_kos),
        # so strip the label the same way before the lookup; nodes without a
        # colour fall back to a near-white background.
        graphic.bgcolor = coldict.get(graphic.name.strip('...'), '#f7f6ff')

In [None]:
canvas = KGMLCanvas(pathway, import_imagemap=True)
canvas.draw("fab_map_new_colours.pdf")
PDF("fab_map_new_colours.pdf")

In [None]:
# Colors
vmin= -9
vmax=2
cmap = plt.cm.coolwarm
from matplotlib import colors
divnorm=colors.TwoSlopeNorm(vmin=vmin, vcenter=0., vmax=vmax)
sm = plt.cm.ScalarMappable(cmap=cmap,norm=divnorm)

In [None]:
ncolor[0]

In [None]:
lps_kos

In [None]:
path2gene['ko01130']

In [None]:
# Same results file as loaded into `data` earlier; built from scratchDir instead of
# repeating the absolute path.
df = pd.read_csv(scratchDir/"control_norm_analysis/26-10-22-annotated-results.csv")

In [None]:
df = df[(df.contrast == 'd1') | (df.contrast.isna())]
df.sample(4)

In [None]:
pathName = 'ko00540'
df["KEGG_Pathway"] = df.KEGG_Pathway.fillna('-')


In [None]:
# Rows annotated with the chosen pathway that also have a library value.
# .copy() makes kegg_df an independent frame, so the 'hit' column assigned to it
# in a following cell is not written to a slice of df.
kegg_df = df[df.KEGG_Pathway.str.contains(pathName) & df.library.notna()].copy()

In [None]:
kegg_df['hit'] = (abs(kegg_df.LFC) > 1) & (kegg_df.fdr < 0.01)

In [None]:
lps_genes

In [None]:
def display_kegg_map(kegg_df, pathwayName, contrast, kegg_col='KEGG_Pathway', gene_id='Name'):
    """Colour the ortholog nodes of a KEGG pathway map by median LFC and save it as a PDF.

    Parameters
    ----------
    kegg_df : pandas.DataFrame
        Results table with the columns 'contrast', 'LFC', 'hit' (set by the caller
        from user-chosen criteria), 'KEGG_ko' (comma-separated KO ids) and the
        columns named by `kegg_col` and `gene_id`. No pathway filtering happens
        here, so the frame should already be restricted to the pathway of interest.
        When there are several libraries per gene, values are aggregated over them.
    pathwayName : str
        KEGG pathway id whose KGML is fetched with `kegg_get(pathwayName, "kgml")`.
    contrast : str
        Contrast to draw. Rows of other contrasts are dropped; rows with a missing
        contrast are kept. Also used in the output file name.
    kegg_col : str
        Pathway annotation column; its missing values are filled with '-'.
    gene_id : str
        Column holding the gene label shown on the map.

    Returns
    -------
    (fname, ko_map) : tuple
        Name of the written PDF and the long KO table
        (columns 'NameForMap', 'lfcSum', 'KO', 'hex') used for colouring.

    Notes
    -----
    Only `pathway.orthologs` entries are recoloured.
    NOTE(review): organism-specific maps (ids such as 'sey00540') may expose their
    nodes as gene entries rather than orthologs, in which case nothing would be
    coloured — confirm against the KGML of the map being drawn.
    """
    # Median LFC ('lfcSum') and number of hit rows ('hitSum') per gene and contrast.
    hitSummary = (kegg_df.groupby([gene_id, 'contrast'])
                  .agg({'LFC': ['median'], 'hit': ['sum']})
                  .reset_index())
    hitSummary.columns = [gene_id, 'contrast', 'lfcSum', 'hitSum']
    kegg_df = kegg_df.merge(hitSummary, on=[gene_id, 'contrast'], how='outer')
    # Map label: "<gene>[*] (<median LFC>)", starred when the gene has any hit row.
    kegg_df['hitStar'] = kegg_df['hitSum'].apply(lambda x: '*' if x > 0 else '')
    kegg_df['NameForMap'] = (kegg_df[gene_id] + kegg_df['hitStar']
                             + " (" + kegg_df['lfcSum'].round(2).astype(str) + ")")
    kegg_df[kegg_col] = kegg_df[kegg_col].fillna('-')
    # Keep the requested contrast (plus rows without a contrast) so that values from
    # other contrasts cannot end up on a map labelled with `contrast`.
    in_contrast = (kegg_df['contrast'] == contrast) | kegg_df['contrast'].isnull()
    pathDf = kegg_df[in_contrast].copy()

    # Fixed colour scale from -6 to 6; values outside that range are clipped.
    norm = matplotlib.colors.Normalize(vmin=-6, vmax=6, clip=True)
    mapper = matplotlib.cm.ScalarMappable(norm=norm, cmap=sns.diverging_palette(220, 20, as_cmap=True))
    pathDf['hex'] = pathDf['lfcSum'].apply(mapper.to_rgba).apply(matplotlib.colors.to_hex)
    # '-' is treated as missing; rows without a KEGG_ko annotation are dropped.
    ko_map = (pathDf[[gene_id, 'lfcSum', 'KEGG_ko', 'hex', 'NameForMap']]
              .replace('-', np.nan)
              .dropna(subset=['KEGG_ko'])
              .drop_duplicates())

    # KEGG_ko may hold several comma-separated ids: one column per id, then melted
    # into a long table with a single 'KO' column.
    new_cols = ko_map['KEGG_ko'].str.split(",", expand=True)
    ko_map = pd.concat([ko_map, new_cols], axis=1)
    ko_map = (ko_map.melt(id_vars=['lfcSum', 'NameForMap', 'hex'], value_vars=new_cols.columns, value_name='KO')[
                  ['NameForMap', 'lfcSum', 'KO', 'hex']]
              .dropna(subset=['KO']))
    # Swap pure black for grey (presumably the colour of missing LFC values — confirm).
    ko_map['hex'] = ko_map.hex.str.replace('#000000', "#7c7d83")
    # ko_dict[column][KO] -> value of that column for the KO id
    ko_dict = ko_map.set_index("KO").to_dict()

    pathway = KGML_parser.read(kegg_get(pathwayName, "kgml"))
    canvas = KGMLCanvas(pathway, import_imagemap=True)

    for element in pathway.orthologs:
        color = None
        name = None
        # A node may list several KO ids; the last one found in ko_dict wins.
        for ko in element.name.split():
            color = ko_dict['hex'].get(ko, color)
            name = ko_dict['NameForMap'].get(ko, name)
        for graphic in element.graphics:
            if color is None:
                # No data for this node: white background, original label kept.
                graphic.bgcolor = '#FFFFFF'
            else:
                graphic.bgcolor = color
                graphic.name = name

    fname = f"{pathwayName}_{contrast}_map.pdf"
    canvas.draw(fname)
    return fname, ko_map


In [None]:
fname, ko_map = display_kegg_map(kegg_df, "sey00540", 'd1', kegg_col='KEGG_Pathway', gene_id='Name')

In [None]:
pathway = KGML_parser.read(kegg_get("sey00540", "kgml"))

In [None]:
for gene in pathway.genes:
    print(gene)

In [None]:
dir(pathway)

In [None]:
[p for p in pathway.orthologs]

In [None]:
fname

In [None]:
PDF(fname)

In [None]:
df[df.Name == 'kdsB']