In [1]:
from collections import defaultdict
import pandas as pd
import tmap as tm
from faerun import Faerun
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from tqdm import tqdm
tqdm.pandas()

# Load data

In [2]:
# Fingerprint table (tab-separated). The merged preview further down shows
# the columns it contributes: a cmp_id key, bit0..bit2047 and a label.
all_data = pd.read_csv(
    "../data/fingerprints/combined_mhfp6.tsv",
    sep="\t",
)

In [3]:
# Bioassay table (tab-separated). Only the SMILES string and the InChIKey
# columns are read; every other column in the file is skipped.
amr_df = pd.read_csv(
    "../data/processed/combined_bioassay_data.tsv",
    sep="\t",
    usecols=["compound_smiles", "compound_inchikey"],
)

In [4]:
# Attach a SMILES string to the fingerprint rows by matching the fingerprint
# key (cmp_id) against the bioassay InChIKey. pd.merge defaults to an inner
# join, so only keys present in both tables survive.
# The right-hand join key duplicates cmp_id after the merge, so it is dropped
# as part of the same expression instead of mutating the frame in place.
combined_df = pd.merge(
    all_data,
    amr_df,
    left_on="cmp_id",
    right_on="compound_inchikey",
).drop(columns=["compound_inchikey"])

# Quick visual check of the merged frame.
combined_df.head(2)

Unnamed: 0,cmp_id,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,...,bit2040,bit2041,bit2042,bit2043,bit2044,bit2045,bit2046,bit2047,label,compound_smiles
0,OOYGSFOGFJDDHP-KMCOLRRFSA-N,53109374,13294028,17313015,13405020,159565048,166773519,112685388,23281340,74257363,...,86693380,23470166,13223091,13893646,72383491,138619944,10981154,123604138,acid-fast,NC[C@H]1O[C@H](O[C@H]2[C@H](O)[C@@H](O[C@H]3O[...
1,XIPHLJFTBFXVBS-UHFFFAOYSA-N,2376200,75861701,8411880,265132626,17183886,48771247,26537035,264241068,149295683,...,21431025,1416250,104553653,280401855,71835899,176069654,91289868,137970302,fungi,C=C(C(=O)c1ccc(F)cc1)c1ccc(Cl)cc1Cl


# Generate the map

In [5]:
# Collect (fingerprint array, label) pairs under the SMILES string they
# belong to; a SMILES seen more than once accumulates several pairs.
fingerprint_dict = defaultdict(list)

mol_smiles = combined_df["compound_smiles"]
mol_labels = combined_df["label"]
# Every column that is not an identifier or the label is kept as
# fingerprint data.
mol_fingerprint_df = combined_df.drop(columns=["compound_smiles", "cmp_id", "label"])

n_molecules = mol_fingerprint_df.shape[0]
for row_key, bits in tqdm(mol_fingerprint_df.iterrows(), total=n_molecules):
    # row_key is the frame's index label, used to look up the matching
    # SMILES and label in the two Series taken from the same frame.
    entry = (bits.to_numpy(), mol_labels[row_key])
    fingerprint_dict[mol_smiles[row_key]].append(entry)

100%|██████████| 77442/77442 [00:09<00:00, 8520.42it/s] 


In [6]:
# 2048 matches the number of bit columns shown in the data preview
# (bit0..bit2047). NOTE(review): the meaning of the second argument (128)
# is not visible here -- check the tmap LSHForest documentation.
lf = tm.LSHForest(2048, 128)

# Three parallel lists, one entry per distinct SMILES string.
fps, labels, activity = [], [], []

for smiles_key, records in tqdm(fingerprint_dict.items()):
    # Only the first (fingerprint, label) pair stored for a SMILES is used;
    # any further pairs under the same key are ignored.
    first_fp, first_label = records[0]
    fps.append(tm.VectorUint(first_fp))
    labels.append(smiles_key)
    activity.append(first_label)

100%|██████████| 77442/77442 [00:57<00:00, 1344.71it/s]


In [7]:
# Hand every collected fingerprint to the LSH forest in a single call, then
# call index() on it.
# NOTE(review): index() presumably has to run after all additions and before
# the forest is used for the layout step -- confirm against the tmap docs.
lf.batch_add(fps)
lf.index()

In [8]:
# Compute the layout from the forest with a default-constructed
# LayoutConfiguration (no fields are changed here).
# Of the five returned values, x and y are used later as point coordinates
# and s and t as the "from"/"to" endpoints of the tree; the fifth is discarded.
cfg = tm.LayoutConfiguration()
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

In [9]:
# Colour palette for the scatter series. Seven hex colours are listed, while
# the activity mapping defined further down has four classes.
# NOTE(review): which of the seven end up on screen depends on how Faerun
# samples a categorical colormap -- verify the rendered legend.
_palette_hex = [
    "#2ecc71",
    "#9b59b6",
    "#ecf0f1",
    "#e74c3c",
    "#e67e22",
    "#f1c40f",
    "#95a5a6",
]
custom_cmap = ListedColormap(_palette_hex, name="custom")

In [10]:
# Plot object used by all following cells: "front" view, coords switched
# off, and #222222 (dark grey) as the clear colour.
f = Faerun(view="front", coords=False, clear_color="#222222")

In [11]:
# Fixed integer code for each activity class. The legend passed to
# f.add_scatter is written against exactly these codes, so the colour data
# must use them unchanged.
activity_mapper = {
    "acid-fast": 0,
    "fungi": 1,
    "gram-negative": 2,
    "gram-positive": 3,
}

# (code, class name) pairs derived from the mapper, in the list-of-tuples
# shape used for legend entries.
activity_labels = [(code, name) for name, code in activity_mapper.items()]

# Encode every molecule's class with the mapper's own code. The codes are
# deliberately not passed through Faerun.create_categories: that helper
# assigns its own numbering, and nothing here guarantees it would match
# activity_mapper, which would silently mislabel the hard-coded legend.
# An activity value missing from the mapper raises KeyError.
activity_data = [activity_mapper[name] for name in activity]

In [24]:
# Point data for the "amr" scatter: layout coordinates, a single colour
# series (hence the one-element list around activity_data) and the SMILES
# strings as per-point labels.
amr_points = {"x": x, "y": y, "c": [activity_data], "labels": labels}

# Legend entries as (code, class name); the codes follow activity_mapper.
amr_legend = [
    (0, "acid-fast"),
    (1, "fungi"),
    (2, "gram-negative"),
    (3, "gram-positive"),
]

f.add_scatter(
    "amr",
    amr_points,
    shader="smoothCircle",
    colormap=[custom_cmap],
    categorical=[True],
    has_legend=True,
    legend_labels=amr_legend,
    legend_title="",
)

In [25]:
# Draw the tree edges (s -> t from the layout step) on top of the "amr"
# scatter, which is named as the point helper for the edge endpoints.
f.add_tree(
    "amrtree",
    {"from": s, "to": t},
    point_helper="amr",
)

In [26]:
# Render the plot under the name "amr" using Faerun's "smiles" template
# (the point labels passed to the scatter are SMILES strings).
f.plot(
    "amr",
    template="smiles",
)