In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import csv as csv
import sklearn as sk
import nltk

import re


First, let's load all the data we have prepared, and embed both the drug descriptions and protein function descriptions.

This step will also include some slight cleaning of the textual data for each drug / protein.

Obtaining the embeddings first is required in order to give "weights" to the edges in the final network.

This will assist in extracting additional, graph-specific features.

In [None]:
def _read_csv_records(csv_path):
    """Read a CSV file with a header row into a list of per-row dicts.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A list with one plain ``dict`` per data row, keyed by the header names.
    """
    # newline="" hands newline handling to the csv module, as its documentation
    # requires, so quoted fields that contain line breaks are parsed correctly.
    with open(csv_path, "r", newline="") as csv_file:
        return [dict(row) for row in csv.DictReader(csv_file, skipinitialspace=True)]


drug_dict_list = _read_csv_records("DPI_enriched.csv")
protein_dict_list = _read_csv_records("PPI_enriched.csv")


In [None]:
# Number of drug records read from DPI_enriched.csv.
len(drug_dict_list)

1677

In [None]:
# Number of protein records read from PPI_enriched.csv.
len(protein_dict_list)

809

Now to do some basic NLP text cleaning before obtaining some embeddings.

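We clean the text as follows: bracketed citations (e.g. `[L41539]`) and double-quoted spans, `{ECO:...}` evidence tags, and `(PubMed:...)` references are removed, and the `FUNCTION: ` label is stripped from the protein function descriptions.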


In [None]:
from nltk.corpus import stopwords
# nltk.download() only fetches the corpus and returns a success flag (a bool),
# so its return value must not be used as the stop-word collection itself.
nltk.download('stopwords')
# The texts being cleaned are English; a set gives fast membership checks.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


USE (Universal Sentence Encoder) is capable of handling punctuation and general sentence structure. We will therefore only clean the text of citations, along with some special tokens, general numbers and other characters.

In [None]:
def remove_citations(text):
    """Strip citation-style markup from a piece of text.

    The input is converted with ``str()`` first, so non-string values are
    accepted. The following are removed, in this order:

    * ``[...]`` bracketed spans and ``"..."`` double-quoted spans,
    * ``{ECO:...}`` tags,
    * ``(PubMed:...)`` references,
    * every space-followed-by-period sequence, e.g. the orphan period left
      behind once a trailing tag has been removed.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string.
    """
    # Raw string literals: the previous non-raw patterns relied on invalid
    # escape sequences (backslash-bracket etc.), which emit a warning on
    # current Python versions. The compiled patterns are unchanged.
    text = re.sub(r'\[(.*?)\]|"(.*?)"', '', str(text))
    text = re.sub(r'\{ECO:(.*?)\}', '', text)
    text = re.sub(r'\(PubMed:(.*?)\)', '', text)
    return text.replace(" .", '')


In [None]:
def remove_special_characters(text):
    """Drop every character that is not an ASCII letter or digit.

    Note that whitespace is removed as well, so words are joined together.

    Args:
        text: String to filter.

    Returns:
        ``text`` with all runs of non-alphanumeric characters deleted.
    """
    non_alphanumeric_run = re.compile('[^A-Za-z0-9]+')
    return non_alphanumeric_run.sub('', text)


In [None]:
def clean_text(text):
    """Clean a single description string.

    Citation markup is removed via ``remove_citations`` and every occurrence
    of the literal ``"FUNCTION: "`` is deleted.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.
    """
    without_citations = remove_citations(text)
    # Stripping special characters (remove_special_characters) is deliberately
    # left disabled here.
    return without_citations.replace("FUNCTION: ", '')


In [None]:
def clean_text_list(text_list):
    """Apply ``clean_text`` to each entry of ``text_list``.

    Args:
        text_list: Iterable of raw texts.

    Returns:
        A new list with the cleaned texts, in the same order.
    """
    return list(map(clean_text, text_list))

In [None]:
def clean_dict_text(dict_list, text_key):
    """Clean the text stored under ``text_key`` in every dict, in place.

    Args:
        dict_list: List of dicts; each must contain ``text_key``.
        text_key: Key whose value is replaced by its cleaned version.

    Returns:
        None. The dicts in ``dict_list`` are modified in place.
    """
    cleaned_text_list = clean_text_list([item[text_key] for item in dict_list])

    # Pair each record with its cleaned text by position. Looking the record up
    # again with dict_list.index(item) costs O(n) per record and matches by
    # equality, so duplicate records could resolve to the wrong position.
    for item, cleaned_text in zip(dict_list, cleaned_text_list):
        item[text_key] = cleaned_text

In [None]:
# Preview the first drug description and the first protein function text.
print(drug_dict_list[0]["description"], protein_dict_list[0]["function"], sep="\n")

Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor.[L41539,L41569] Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches.[L41539] Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end).[A246609] Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications.[A3, L41539] HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi.[A246609] Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.[L41574]
FUNCTION: Serine/threonine-protein

In [None]:
# Clean the free-text field of every drug and protein record in place.
clean_dict_text(dict_list=drug_dict_list, text_key="description")
clean_dict_text(dict_list=protein_dict_list, text_key="function")

In [None]:
# Show the same two sample texts again to inspect them after cleaning.
print(drug_dict_list[0]["description"], protein_dict_list[0]["function"], sep="\n")

Lepirudin is a recombinant hirudin formed by 65 amino acids that acts as a highly specific and direct thrombin inhibitor. Natural hirudin is an endogenous anticoagulant found in _Hirudo medicinalis_ leeches. Lepirudin is produced in yeast cells and is identical to natural hirudin except for the absence of sulfate on the tyrosine residue at position 63 and the substitution of leucine for isoleucine at position 1 (N-terminal end). Lepirudin is used as an anticoagulant in patients with heparin-induced thrombocytopenia (HIT), an immune reaction associated with a high risk of thromboembolic complications. HIT is caused by the expression of immunoglobulin G (IgG) antibodies that bind to the complex formed by heparin and platelet factor 4. This activates endothelial cells and platelets and enhances the formation of thrombi. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.
Serine/threonine-protein kinase that phosphorylates SFPQ/PSF, HNRNPA1 and EIF4E. May play a rol

Now that our text is cleaned up, we can transform the text into embeddings using USE.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Load the Universal Sentence Encoder (large, version 5) from TensorFlow Hub.
use_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [None]:
# Number of drug records that will be embedded.
len(drug_dict_list)

1677

In [None]:
%%time

# One encoder call per record, each on a single-element batch; the per-record
# results are stacked into one array per entity type.
use_drug_vectors = np.array(
    [use_embed([drug["description"]]).numpy() for drug in drug_dict_list]
)
use_protein_vectors = np.array(
    [use_embed([protein["function"]]).numpy() for protein in protein_dict_list]
)

CPU times: user 7min 11s, sys: 7.62 s, total: 7min 18s
Wall time: 4min 44s


In [None]:
# Peek at the embeddings of the first three drugs.
use_drug_vectors[:3]

array([[[ 0.12059877,  0.02534292, -0.01538776, ..., -0.05198244,
         -0.01028922,  0.01146254]],

       [[-0.01686497, -0.00703895, -0.02396241, ..., -0.05406497,
         -0.05575457,  0.0366965 ]],

       [[ 0.0946089 , -0.03260491, -0.01860221, ..., -0.02816161,
         -0.09406241,  0.01205715]]], dtype=float32)

In [None]:
# Shape of a single drug's embedding entry.
use_drug_vectors[0].shape

(1, 512)