In [1]:
# All source code and ideas are from: https://towardsdatascience.com/neural-network-embeddings-explained-4d028e6f0526
# with some modifications

"""
Note to me: Another formulation of the problem is to fix the number of links but allow a 
variable number of articles (right now both are fixes)

The input of the neural network would be a one hot encoded vector
with each position (dim.) representing wether a link exists.

Each vector represents an article.
"""

%matplotlib inline
import functools
import json
from collections import Counter
from glob import glob
from itertools import chain
from pathlib import Path

import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns
from keras.layers import Dense, Dot, Embedding, Input, Reshape
from keras.models import Model
import numbers
from tensorboard.plugins import projector


articles_path = Path(
    "../../data/ndjson"
)  # "/home/gonzalo/repos/wiki_disease/data/wiki/old/"
plt.style.use("fivethirtyeight")
plt.rcParams["font.size"] = 15

2022-11-07 16:01:09.139676: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-07 16:01:09.238474: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-07 16:01:09.238492: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-07 16:01:09.253981: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-07 16:01:09.695594: W tensorflow/stream_executor/platform/de

In [2]:
class Article:
    def __init__(
        self, title, infobox, wikilinks, external_links, timestamp, text_length
    ):
        self.title = title
        self.infobox = infobox
        self.wikilinks = wikilinks
        self.external_links = external_links
        self.timestamp = timestamp
        self.text_length = text_length

    def __getitem__(self, item):
        return (
            self.title,
            self.infobox,
            self.wikilinks,
            self.external_links,
            self.timestamp,
            self.text_length,
        )[item]

    def __repr__(self):
        return f"<Article: {self.title}>"

    def __format__(self, spec):
        return self.title.__format__(spec)

    def __len__(self):
        return len(self.title)

class ArticleCollection:
    def __init__(self):
        self.articles = []
        # To map titles to index positions
        self.article_index = {}
        # To map index positions to titles
        self.index_article = {}
        self.embedding_weights = None

    def append(self, *article_data):
        self.articles.append(Article(*article_data))

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, item):
        if isinstance(item, numbers.Number):
            return self.articles[int(item)]
        elif isinstance(item, str):
            return self._title_mapping()[item]
        else:
            raise IndexError("Invalid index")

    def __iter__(self):
        return iter(self.articles)

    def _title_mapping(self):
        # Create a mapping (article_index):   article_title -> article_index
        if len(self.article_index) != len(self.articles):
            self.article_index = {
                article.title: idx for idx, article in enumerate(self.articles)
            }
        return self.article_index

    def keys(self):
        return self._title_mapping().keys()

In [3]:
# Load articles from json files
articles = ArticleCollection()

for f in articles_path.glob("*.ndjson"):
    with open(f, "r") as json_file:
        for lines in json_file.readlines():
            articles.append(*json.loads(lines))
print(f"Total articles: {len(articles)}")
articles[55].title

Total articles: 8736


'Arteriovenous malformation'

In [4]:
articles["Hume fracture"]

313

The hypothesis for the project is that articles that are similar will point to similar (or the same) wikipedia links.

Let's run some stats.

In [5]:
from collections import Counter, OrderedDict


def count_items(l):
    """Return ordered dictionary of counts of objects in `l`"""

    # Create a counter object
    counts = Counter(l)

    # Sort by highest count first and place in ordered dictionary
    counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    counts = OrderedDict(counts)

    return counts

In [6]:
# Find set of wikilinks for each article
unique_wikilinks = list(chain(*[list(set(article.wikilinks)) for article in articles]))

wikilink_counts = count_items(unique_wikilinks)
list(wikilink_counts.items())[:10]

[('List of cutaneous conditions', 916),
 ('Category:Rare diseases', 668),
 ('Category:Wikipedia medicine articles ready to translate', 614),
 ('inflammation', 580),
 ('CT scan', 492),
 ('Category:Syndromes', 466),
 ('Micrograph', 447),
 ('infection', 440),
 ('World Health Organization', 440),
 ('fever', 440)]

In [7]:
wikilinks = [link.lower() for link in unique_wikilinks]
print(f"There are {len(set(wikilinks))} unique wikilinks.")

wikilink_counts = count_items(wikilinks)
list(wikilink_counts.items())[:10]

There are 105388 unique wikilinks.


[('list of cutaneous conditions', 916),
 ('inflammation', 680),
 ('category:rare diseases', 668),
 ('magnetic resonance imaging', 635),
 ('category:wikipedia medicine articles ready to translate', 614),
 ('fever', 551),
 ('infection', 527),
 ('ct scan', 508),
 ('micrograph', 508),
 ('cancer', 496)]

We see that the most pointed wikilinks are usually categories within diseases. That's good and kind of helps sustaining the hypothesis.

Some medical conditions (articles) might point to other medical conditions.
Let's run the same ranking again but only using links for medical conditions.

In [8]:
# Find set of wikilinks for each article
unique_wikilinks_articles = list(
    chain(
        *[
            list(set(link for link in article[2] if link in articles.keys()))
            for article in articles
        ]
    )
)

# Count the number of articles linked to by other articles
wikilink_article_counts = count_items(unique_wikilinks_articles)
list(wikilink_article_counts.items())[:10]

[('Depression (mood)', 187),
 ("Parkinson's disease", 175),
 ('Hypoxia (medical)', 142),
 ('HIV/AIDS', 135),
 ('Staphylococcus aureus', 134),
 ("Crohn's disease", 133),
 ("Alzheimer's disease", 125),
 ('Syncope (medicine)', 123),
 ('Down syndrome', 114),
 ('Fever', 111)]

Let's build the same mappings for links. We aren't really interested in links used by few articles so we will use only links pointed by at least 5 articles

In [9]:
MIN_ARTICLES_POINTING = 5
links = [t[0] for t in wikilink_counts.items() if t[1] >= MIN_ARTICLES_POINTING]
link_index = {link: idx for idx, link in enumerate(links)}
index_link = {idx: link for link, idx in link_index.items()}

link_index["neurology"]
index_link[300]
print(f"Total links {len(links)}")

Total links 15625


## Create dataset

We have each of the articles and links encoded as integers and we can now proceed to the next step.

We want to create a NN that given the tuple (article, link) tries to predict wether the link exists and returns ($y$) 0 if it doesn't and 1 if it does.

Let's do the easy part first. Let's obtain all the "positive" tuple. That is, all the connections between articles and links that exist.

In [10]:
pairs = []

for article in articles:
    # Iterate through the links in the article and add them in batch (.extend)
    pairs.extend(
        (articles[article.title], link_index[link.lower()])
        for link in article.wikilinks
        if link.lower() in links
    )

pairs_set = set(pairs)

len(pairs), len(links), len(articles)
# pairs[5000]

(375532, 15625, 8736)

In [11]:
articles[pairs[50][0]], index_link[pairs[50][1]]

(<Article: Recurrent painful ophthalmoplegic neuropathy>,
 'digital subtraction angiography')

Creating negative examples is also easy. We just take a random article and a random link and if the tuple doesn't exist in "pairs" that means that the connection does not exist ($y=0$).

We can implement everything in a generator function to train the NN.

In [12]:
import numpy as np
import random

random.seed(100)


def generate_batch(pairs, n_positive=50, negative_ratio=1.0, classification=False):
    """Generate batches of samples for training"""
    batch_size = n_positive * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))

    # Adjust label based on task
    if classification:
        neg_label = 0
    else:
        neg_label = -1

    # This creates a generator
    while True:
        # randomly choose positive examples
        for idx, (article_id, link_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (article_id, link_id, 1)

        # Add negative examples until reach batch size
        while idx < batch_size:

            # random selection
            random_article = random.randrange(len(articles))
            random_link = random.randrange(len(links))

            # Check to make sure this is not a positive example
            if (random_article, random_link) not in pairs_set:

                # Add to batch and increment index
                batch[idx, :] = (random_article, random_link, neg_label)
                idx += 1

        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {"article": batch[:, 0], "link": batch[:, 1]}, batch[:, 2]

In [13]:
# We ask for 2 positive samples with a negative ratio of 2 for a total of 2 positive 4 negative.
next(generate_batch(pairs, n_positive=2, negative_ratio=2))

({'article': array([7453., 8304., 1519., 5730., 6439., 2863.]),
  'link': array([15592., 13107.,  2504.,  7102., 11992., 11559.])},
 array([-1., -1.,  1., -1., -1., -1.]))

In [14]:
x, y = next(generate_batch(pairs, n_positive=2, negative_ratio=2))


for label, b_idx, l_idx in zip(y, x["article"], x["link"]):
    print(f"Article: {articles[b_idx]:30} Link: {index_link[l_idx]:40} Label: {label}")

Article: Perisylvian syndrome           Link: category:congenital disorders of nervous system Label: -1.0
Article: Epidemic dropsy                Link: mitochondrion                            Label: 1.0
Article: Ring chromosome 14 syndrome    Link: interleukin 2                            Label: -1.0
Article: Glycogen storage disease type IX Link: haemophilus                              Label: -1.0
Article: Lichen spinulosus              Link: retinoic acid                            Label: -1.0
Article: Basal-cell carcinoma           Link: ventilation (physiology)                 Label: -1.0


In [15]:
def article_embedding_model(embedding_size=50, classification=False):
    """Model to embed articles and wikilinks.
    Trained to discern if a link is present in a article.
    Embedding layers will try to 'put' similar articles and links close in the 50-dimensional space.
    """

    # Both inputs are 1-dimensional
    article = Input(name="article", shape=[1])
    link = Input(name="link", shape=[1])

    # Embedding the article (shape will be (None, 1, 50))
    article_embedding = Embedding(
        name="article_embedding",
        input_dim=len(articles),
        output_dim=embedding_size,
    )(article)

    # Embedding the link (shape will be (None, 1, 50))
    link_embedding = Embedding(
        name="link_embedding", input_dim=len(link_index), output_dim=embedding_size
    )(link)

    # Merge the layers with a dot product along the second axis (shape will be (None, 1, 1))
    merged = Dot(name="dot_product", normalize=True, axes=2)(
        [article_embedding, link_embedding]
    )

    # Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape=[1])(merged)

    # If classifcation, add extra layer and loss function is binary cross entropy
    if classification:
        merged = Dense(1, activation="sigmoid")(merged)
        model = Model(inputs=[book, link], outputs=merged)
        model.compile(
            optimizer="Adam", loss="binary_crossentropy", metrics=["accuracy"]
        )

    # Otherwise loss function is mean squared error
    else:
        model = Model(inputs=[article, link], outputs=merged)
        model.compile(optimizer="Adam", loss="mse")

    return model


# Instantiate model and show parameters
model = article_embedding_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 article (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 article_embedding (Embedding)  (None, 1, 50)        436800      ['article[0][0]']                
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        781250      ['link[0][0]']                   
                                                                                              

2022-11-07 16:01:47.047402: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-07 16:01:47.047426: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-07 16:01:47.047442: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gon-laptop-linux): /proc/driver/nvidia/version does not exist
2022-11-07 16:01:47.047585: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
n_positive = 1024

gen = generate_batch(pairs, n_positive, negative_ratio=2)

# Train
h = model.fit_generator(
    gen, epochs=15, steps_per_epoch=len(pairs) // n_positive, verbose=2
)

Epoch 1/15


  h = model.fit_generator(


366/366 - 4s - loss: 0.9726 - 4s/epoch - 12ms/step
Epoch 2/15
366/366 - 4s - loss: 0.7664 - 4s/epoch - 12ms/step
Epoch 3/15
366/366 - 4s - loss: 0.5182 - 4s/epoch - 12ms/step
Epoch 4/15
366/366 - 4s - loss: 0.4703 - 4s/epoch - 11ms/step
Epoch 5/15
366/366 - 4s - loss: 0.4501 - 4s/epoch - 11ms/step
Epoch 6/15
366/366 - 4s - loss: 0.4410 - 4s/epoch - 12ms/step
Epoch 7/15
366/366 - 5s - loss: 0.4371 - 5s/epoch - 13ms/step
Epoch 8/15
366/366 - 5s - loss: 0.4339 - 5s/epoch - 13ms/step
Epoch 9/15
366/366 - 5s - loss: 0.4327 - 5s/epoch - 13ms/step
Epoch 10/15
366/366 - 4s - loss: 0.4310 - 4s/epoch - 12ms/step
Epoch 11/15


In [None]:
def extract_weights(name, model):
    """Extract weights from a neural network model"""

    # Extract weights
    weight_layer = model.get_layer(name)
    weights = weight_layer.get_weights()[0]

    # Normalize
    weights = weights / np.linalg.norm(weights, axis=1).reshape((-1, 1))
    return weights


article_weights = extract_weights("article_embedding", model)
link_weights = extract_weights("link_embedding", model)

We want to calculate similarity (dot product) between pairs of article embeddings because we expected the neural network to have placed similar articles closer together in the 50-dimensional space.

Let's run a simple test, let's check how similar is an article with itself.

In [None]:
print(
    f"Similarity of article with index 0 with itself: {np.dot(article_weights[0], article_weights[0])}"
)

Hmm, that does not make a lot of sense, an article should be similar with itsel. In fact the similarity should be 1.

That's okay, we just need to normalize our embeddings:

$$ \lbrace \frac{W_n}{\lVert W \rVert}_{n} {\rbrace}_{n=0}^{N}$$

In [None]:
# np.allclose(np.linalg.norm(article_weights, axis = 1), np.sqrt(np.sum(article_weights**2, axis=1)))
article_weights = article_weights / np.linalg.norm(article_weights, axis=1).reshape(
    (-1, 1)
)
np.sum(np.square(article_weights[0]))

In [None]:
link_weights = link_weights / np.linalg.norm(link_weights, axis=1).reshape((-1, 1))
np.sum(np.square(link_weights[0]))

In [None]:
print(
    f"Similarity of article with index 0 with itself (after normalization): {np.dot(article_weights[0], article_weights[0])}"
)

That's way better. We can now implement a function to find similar articles.

In [None]:
output_embeddings_path = Path("../../data/embeddings")
output_embeddings_path.mkdir(parents=True, exist_ok=True)


In [None]:
# Save embeddings
np.savetxt(
    output_embeddings_path / Path("article_embedding.tsv"),
    article_weights,
    delimiter="\t",
)
np.savetxt(
    output_embeddings_path / Path("link_embedding.tsv"), link_weights, delimiter="\t"
)

# Save metadata

with open(output_embeddings_path / Path("link_metadata.tsv"), "w", encoding="utf-8") as fout:
    for l in link_index.keys():
        fout.write(str(l))
        fout.write("\n")
        
with open(output_embeddings_path / Path("article_metadata.tsv"), "w", encoding="utf-8") as fout:
    for l in articles.keys():
        fout.write(str(l))
        fout.write("\n")

In [None]:
def find_similar(
    name,
    weights,
    index_name="article",
    n=15,
    least=False,
    return_dist=False,
    plot=False,
):
    """Find n most similar items (or least) to name based on embeddings. Option to also plot the results"""

    # Select index and reverse index
    if index_name == "article":
        index = articles
        rindex = articles
    elif index_name == "link":
        index = link_index
        rindex = index_link

    # Check to make sure `name` is in index
    try:
        # Calculate dot product between book and all others
        dists = np.dot(weights, weights[index[name]])
    except KeyError:
        print(f"{name} Not Found.")
        return

    # Sort distance indexes from smallest to largest
    sorted_dists = np.argsort(dists)

    # Plot results if specified
    if plot:

        # Find furthest and closest items
        furthest = sorted_dists[: (n // 2)]
        closest = sorted_dists[-n - 1 : len(dists) - 1]
        items = [rindex[c] for c in furthest]
        items.extend(rindex[c] for c in closest)

        # Find furthest and closets distances
        distances = [dists[c] for c in furthest]
        distances.extend(dists[c] for c in closest)

        colors = ["r" for _ in range(n // 2)]
        colors.extend("g" for _ in range(n))

        data = pd.DataFrame({"distance": distances}, index=items)

        # Horizontal bar chart
        data["distance"].plot.barh(
            color=colors, figsize=(10, 8), edgecolor="k", linewidth=2
        )
        plt.xlabel("Cosine Similarity")
        plt.axvline(x=0, color="k")

        # Formatting for italicized title
        name_str = f"{index_name.capitalize()}s Most and Least Similar to"
        for word in name.split():
            # Title uses latex for italize
            name_str += " $\it{" + word + "}$"
        plt.title(name_str, x=0.2, size=28, y=1.05)

        return None

    # If specified, find the least similar
    if least:
        # Take the first n from sorted distances
        closest = sorted_dists[:n]

        print(f"{index_name.capitalize()}s furthest from {name}.\n")

    # Otherwise find the most similar
    else:
        # Take the last n sorted distances
        closest = sorted_dists[-n:]

        # Need distances later on
        if return_dist:
            return dists, closest

        print(f"{index_name.capitalize()}s closest to {name}.\n")

    # Need distances later on
    if return_dist:
        return dists, closest

    # Print formatting
    max_width = max([len(rindex[c]) for c in closest])

    # Print the most similar and distances
    for c in reversed(closest):
        print(
            f"{index_name.capitalize()}: {rindex[c]:{max_width + 2}} Similarity: {dists[c]:.{2}}"
        )

In [None]:
find_similar("Fever", article_weights)

In [None]:
find_similar("Sepsis", article_weights)

In [None]:
find_similar("Parkinson's disease", article_weights)

In [None]:
find_similar("Sleep apnea", article_weights)

In [None]:
config = """embeddings {
  metadata_path: "article_metadata.tsv"
  tensor_path: "article_embedding.tsv"
}
"""
with open(output_embeddings_path / Path("projector_config.pbtxt"), "w") as f:
    f.write(config)

In [None]:
tensorboard_dir = str(output_embeddings_path.resolve())


In [None]:
# For some reason I have to run this cell twice because of an error: 
# No dashboards are active for the current data set.
%load_ext tensorboard
%tensorboard --logdir "$tensorboard_dir" --port=6006

In [None]:
# TODO: Can we get a disease from symptons?

custom_weight = (
    link_weights[link_index["fever"]]
    + link_weights[link_index["headache"]]
    + link_weights[link_index["fatigue"]]
)
dists = np.dot(link_weights, custom_weight)
sorted_dists = np.argsort(dists)
closest = sorted_dists[-15 - 1 : len(dists) - 1]
[articles[c].title for c in closest]

In [None]:
custom_weight = (
    article_weights[articles["Fever"]]
    + article_weights[articles["Headache"]]
    + article_weights[articles["Fatigue"]]
)
dists = np.dot(article_weights, custom_weight)
sorted_dists = np.argsort(dists)
closest = sorted_dists[-15 - 1 : len(dists) - 1]
[articles[c].title for c in closest]

In [None]:
article_names = [article.title for article in articles]

In [None]:
df = pd.DataFrame(article_names, columns=["title"])
df[df.title.str.contains("Fever")]