In [2]:
import pandas as pd
from tqdm import tqdm
from tqdm import tqdm_notebook
import shutup; shutup.please()

In [3]:
data = pd.read_csv('./cmpd_descriptions.csv')

In [5]:
pd.set_option('max_colwidth', None)
data.head()

Unnamed: 0.1,Unnamed: 0,CID,Description
0,0,1697,"4,5-dianilinophthalimide is phthalimide substituted at the 4- and 5-positions by anilino groups. It has a role as a tyrosine kinase inhibitor and a geroprotector."
1,1,755673,"7-hydroxy-2,3,4,5-tetrahydrobenzofuro[2,3-c]azepin-1-one is a member of benzofurans."
2,2,15160711,Ovalitenin B is a butanone.
3,3,176870,"Erlotinib is a quinazoline compound having a (3-ethynylphenyl)amino group at the 4-position and two 2-methoxyethoxy groups at the 6- and 7-positions. It has a role as an antineoplastic agent, a protein kinase inhibitor and an epidermal growth factor receptor antagonist. It is a member of quinazolines, a terminal acetylenic compound, an aromatic ether and a secondary amino compound."
4,4,656344,"2-[[5-(4-methylphenyl)-1,3,4-oxadiazol-2-yl]thio]-1-(4-phenyl-1-piperazinyl)ethanone is a member of piperazines."


In [6]:
data.shape

(4095, 3)

# Preprocessing of texts for topic modeling by LDA

In [20]:
word_tokenizer = nltk.WordPunctTokenizer()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ivanikova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
stop_words = nltk.corpus.stopwords.words('english')

In [69]:
# adding custom stop words for compound descriptions
text_file = open('./custom_stop_words.txt', 'r')
custom_stop_words = text_file.read().split(',')

In [70]:
custom_stop_words[:5]

['is', ' has', ' it', ' thus', ' therefore']

In [71]:
all_stop_words = stop_words + custom_stop_words

In [None]:
import re

In [25]:
regex = re.compile(r'[a-zA-Z]+')
def words_only(text, regex=regex):
    try:
        return " ".join(regex.findall(text)).lower()
    except:
        return ""

In [26]:
def process_data(data):
    texts = []
    for index, row in tqdm(data.iterrows(), total=len(data)):
        text_lower = words_only(row['Description'])
        tokens = word_tokenizer.tokenize(text_lower)
        tokens = [word for word in tokens if (word not in all_stop_words and not word.isnumeric())]
        texts.append(tokens)
    return texts

In [64]:
texts = process_data(data)

100%|█████████████████████████████████████████████████████████████████████████████| 4095/4095 [00:00<00:00, 7799.98it/s]


In [55]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [56]:
stemmer = SnowballStemmer('english')

In [72]:
for i in range(len(texts)):
    text_stemmed = [stemmer.stem(x) for x in texts[i]]
    texts[i] = text_stemmed

# Topic modeling by LDA 

In [31]:
from gensim.models import *
from gensim import corpora

In [73]:
dictionary = corpora.Dictionary(texts)
print('Original: {}'.format(dictionary))
dictionary.filter_extremes(no_below = 2, keep_n=None)
dictionary.save('descriptions.dict')
print('Filtered: {}'.format(dictionary))
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('descriptions.model', corpus) 

Original: Dictionary<7117 unique tokens: ['anilino', 'dianilinophthalimid', 'geroprotector', 'group', 'inhibitor']...>
Filtered: Dictionary<2880 unique tokens: ['anilino', 'geroprotector', 'group', 'inhibitor', 'kina']...>


In [84]:
lda_model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics = 3, chunksize=50, update_every=1, passes=10, per_word_topics=True)

In [85]:
lda_model.show_topics(num_words=15, formatted = False)

[(0,
  [('agent', 0.037135173),
   ('group', 0.036099948),
   ('role', 0.034600955),
   ('member', 0.031554304),
   ('inhibitor', 0.028787829),
   ('posit', 0.02724963),
   ('compound', 0.021645103),
   ('drug', 0.018227583),
   ('substitut', 0.018017067),
   ('use', 0.015712477),
   ('ec', 0.015512313),
   ('class', 0.013302829),
   ('antagonist', 0.01316792),
   ('amino', 0.010440581),
   ('anti', 0.0092367735)]),
 (1,
  [('acid', 0.1350727),
   ('metabolit', 0.043351624),
   ('role', 0.03286594),
   ('group', 0.028795455),
   ('amino', 0.020485636),
   ('function', 0.020466657),
   ('relat', 0.020207193),
   ('posit', 0.019174732),
   ('conjug', 0.019113919),
   ('ester', 0.014794956),
   ('plant', 0.013659264),
   ('human', 0.012747456),
   ('hydroxi', 0.012588059),
   ('monocarboxyl', 0.010758045),
   ('methyl', 0.010550005)]),
 (2,
  [('member', 0.08025469),
   ('n', 0.054545607),
   ('yl', 0.042841658),
   ('methyl', 0.035115443),
   ('h', 0.029372092),
   ('phenyl', 0.020295324

# Assigning dominant topics to descriptions as clasification labels

In [78]:
def format_topics_sent(ldamodel, corpus, texts):
    sent_topics_df = pd.DataFrame()
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: x[1], reverse=True)
        
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df._append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_topic', 'Perc_Contrib', 'Topic_Keywords']
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.rename(columns={0: "Text"}, inplace=True)
    return sent_topics_df

In [90]:
sent_topics = format_topics_sent(lda_model, corpus, texts)
sent_topics.head()

Unnamed: 0,Dominant_topic,Perc_Contrib,Topic_Keywords,Text
0,0,0.7578,"agent, group, role, member, inhibitor, posit, compound, drug, substitut, use","[dianilinophthalimid, phthalimid, substitut, posit, anilino, group, role, tyrosin, kina, inhibitor, geroprotector]"
1,2,0.8957,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[hydroxi, tetrahydrobenzofuro, c, azepin, one, member, benzofuran]"
2,2,0.7758,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[ovalitenin, b, butanon]"
3,0,0.9758,"agent, group, role, member, inhibitor, posit, compound, drug, substitut, use","[erlotinib, quinazolin, compound, ethynylphenyl, amino, group, posit, two, methoxyethoxi, group, posit, role, antineoplast, agent, protein, kina, inhibitor, epiderm, growth, factor, receptor, antagonist, member, quinazolin, termin, acetylen, compound, aromat, ether, secondari, amino, compound]"
4,2,0.9331,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[methylphenyl, oxadiazol, yl, thio, phenyl, piperazinyl, ethanon, member, piperazin]"


In [95]:
topic_modeling_desc_labels = pd.merge(data, sent_topics, left_index=True, right_index=True)
topic_modeling_desc_labels.Dominant_topic = topic_modeling_desc_labels.Dominant_topic.map({0: 'Drug', 1: 'Metabolite', 2: 'No annotation'})
topic_modeling_desc_labels.head()

Unnamed: 0.1,Unnamed: 0,CID,Description,Dominant_topic,Perc_Contrib,Topic_Keywords,Text
0,0,1697,"4,5-dianilinophthalimide is phthalimide substituted at the 4- and 5-positions by anilino groups. It has a role as a tyrosine kinase inhibitor and a geroprotector.",Drug,0.7578,"agent, group, role, member, inhibitor, posit, compound, drug, substitut, use","[dianilinophthalimid, phthalimid, substitut, posit, anilino, group, role, tyrosin, kina, inhibitor, geroprotector]"
1,1,755673,"7-hydroxy-2,3,4,5-tetrahydrobenzofuro[2,3-c]azepin-1-one is a member of benzofurans.",No annotation,0.8957,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[hydroxi, tetrahydrobenzofuro, c, azepin, one, member, benzofuran]"
2,2,15160711,Ovalitenin B is a butanone.,No annotation,0.7758,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[ovalitenin, b, butanon]"
3,3,176870,"Erlotinib is a quinazoline compound having a (3-ethynylphenyl)amino group at the 4-position and two 2-methoxyethoxy groups at the 6- and 7-positions. It has a role as an antineoplastic agent, a protein kinase inhibitor and an epidermal growth factor receptor antagonist. It is a member of quinazolines, a terminal acetylenic compound, an aromatic ether and a secondary amino compound.",Drug,0.9758,"agent, group, role, member, inhibitor, posit, compound, drug, substitut, use","[erlotinib, quinazolin, compound, ethynylphenyl, amino, group, posit, two, methoxyethoxi, group, posit, role, antineoplast, agent, protein, kina, inhibitor, epiderm, growth, factor, receptor, antagonist, member, quinazolin, termin, acetylen, compound, aromat, ether, secondari, amino, compound]"
4,4,656344,"2-[[5-(4-methylphenyl)-1,3,4-oxadiazol-2-yl]thio]-1-(4-phenyl-1-piperazinyl)ethanone is a member of piperazines.",No annotation,0.9331,"member, n, yl, methyl, h, phenyl, amino, aromat, compound, ethyl","[methylphenyl, oxadiazol, yl, thio, phenyl, piperazinyl, ethanon, member, piperazin]"


In [96]:
topic_modeling_desc_labels.to_csv('./description_classification.csv')