In [2]:
from bs4 import BeautifulSoup
import requests

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer


import scipy.sparse as ssp
import pandas as pd
import json

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from unicodedata import normalize

# Workaround for problems with nltk.download: when this Python's ssl module
# exposes an unverified-context factory, install it as the factory used for
# default HTTPS contexts. ssl modules without that attribute are left as is.
import ssl

if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# Fetch the NLTK resources 'punkt' and 'wordnet' (presumably the data needed by
# word_tokenize and WordNetLemmatizer used further down -- confirm).
# The value of the last call is what the cell displays.
import nltk 
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/monti03/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/monti03/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Open the DG-Miner disease-gene TSV for reading (text mode, default encoding).
# NOTE(review): this handle is not closed anywhere in the visible cells; a
# `with open(...)` block around the code that reads it would avoid the leak.
file_ = open('DG-Miner_miner-disease-gene.tsv')


In [4]:
# Collect the distinct disease identifiers found in the first column of the
# DG-Miner disease-gene TSV.
#
# The file is opened (and closed) inside this cell so that the cell can be
# re-run safely: reading from a shared, already-consumed handle would silently
# produce an empty set.
diseases = set()
with open('DG-Miner_miner-disease-gene.tsv') as tsv_file:
    # The first line is skipped (treated as a header).
    lines = tsv_file.read().split('\n')[1:]

for line in lines:
    if not line:
        # Ignore blank lines (e.g. the one produced by a trailing newline).
        continue
    # First tab-separated field, minus its first 5 characters
    # (presumably a 'MESH:'-style prefix -- confirm against the data file).
    disease_id = line.split('\t')[0][5:]
    diseases.add(disease_id)

In [5]:
# Download the MedGen search page of every disease id and keep the text of its
# 'portlet_content' block as the description of that disease.
#
# Results:
#   data_dict   -- disease id -> scraped text
#   null_values -- ids whose page has no (or an empty) 'portlet_content' div
#   http_null   -- ids whose page could not be downloaded
#   founded / not_f -- running counters of found / missing descriptions
MEDGEN_SEARCH_URL = 'https://www.ncbi.nlm.nih.gov/medgen/'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the loop

null_values = set()
http_null = set()
founded = 0
not_f = 0
data_dict = {}
for row in diseases:
    print(row)
    try:
        # Let requests build (and escape) the query string.
        page = requests.get(MEDGEN_SEARCH_URL, params={'term': row}, timeout=REQUEST_TIMEOUT)
        # BeautifulSoup never returns None, so a failed download has to be
        # detected on the HTTP response itself.
        page.raise_for_status()
    except requests.RequestException:
        http_null.add(row)
        print('{}: page not found'.format(row))
        continue

    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find('div', class_='portlet_content')
    if results is None or results.text == '':
        null_values.add(row)
        not_f += 1
        print('{}: field not found'.format(row))
        continue

    founded += 1
    print('founded: {}\t not_founded: {}\ttotal: {}'.format(founded, not_f, not_f+founded))
    data_dict[row] = results.text
    

founded: 5253	 not_founded: 19	total: 5272
D011038
founded: 5254	 not_founded: 19	total: 5273
D002873
founded: 5255	 not_founded: 19	total: 5274
C563865
founded: 5256	 not_founded: 19	total: 5275
C565311
founded: 5257	 not_founded: 19	total: 5276
D007969
founded: 5258	 not_founded: 19	total: 5277
607447
founded: 5259	 not_founded: 19	total: 5278
C565831
founded: 5260	 not_founded: 19	total: 5279
D001228
founded: 5261	 not_founded: 19	total: 5280
D002177
founded: 5262	 not_founded: 19	total: 5281
D009395
founded: 5263	 not_founded: 19	total: 5282
D017282
founded: 5264	 not_founded: 19	total: 5283
C535441
founded: 5265	 not_founded: 19	total: 5284
D006946
founded: 5266	 not_founded: 19	total: 5285
D012874
founded: 5267	 not_founded: 19	total: 5286
264070
founded: 5268	 not_founded: 19	total: 5287
212050
founded: 5269	 not_founded: 19	total: 5288
C536122
founded: 5270	 not_founded: 19	total: 5289
D000015
founded: 5271	 not_founded: 19	total: 5290
C567753
founded: 5272	 not_founded: 19	tot

In [6]:
# Show the description stored under the 11th key (insertion order) as a sample.
data_dict[list(data_dict)[10]]

'Fanconi anemia (FA) is characterized by physical abnormalities, bone marrow failure, and increased risk for malignancy. Physical abnormalities, present in approximately 75% of affected individuals, include one or more of the following: short stature, abnormal skin pigmentation, skeletal malformations of the upper and lower limbs, microcephaly, and ophthalmic and genitourinary tract anomalies. Progressive bone marrow failure with pancytopenia typically presents in the first decade, often initially with thrombocytopenia or leukopenia. The incidence of acute myeloid leukemia is 13% by age 50 years. Solid tumors – particularly of the head and neck, skin, gastrointestinal tract, and genitourinary tract – are more common in individuals with FA.\xa0[from GeneReviews]'

In [14]:
# Number of disease ids that currently have a description.
len(data_dict)

5645

In [8]:
# Persist the scraped descriptions (disease id -> text) as JSON.
with open('data_description.json', 'w') as description_file:
    description_file.write(json.dumps(data_dict))

In [11]:
# Write the ids without a description, each one preceded by a newline
# (so the file starts with an empty line).
with open('null_description.txt', 'w') as null_file:
    null_file.writelines('\n' + missing_id for missing_id in null_values)

In [15]:
# Write the ids whose page was not retrieved, each one preceded by a newline
# (so the file starts with an empty line).
with open('null_description_page.txt', 'w') as page_file:
    page_file.writelines('\n' + missing_id for missing_id in http_null)

In [6]:
# Reload the scraped descriptions (disease id -> text) from disk.
with open('data_description.json') as description_file:
    data_dict = json.loads(description_file.read())

In [7]:
# Disease ids in lexicographic order.
disease_ids = sorted(data_dict)

In [8]:
# Inspect the last id in sorted order. The recorded output, 'ease(MESH)', is not
# a real identifier -- presumably a header row that went through the same
# 5-character prefix slicing as the data rows (confirm against the TSV).
disease_ids[-1]

'ease(MESH)'

In [9]:
# The scraped text of a disease id falls into one of three cases:
#   1. a complete description of the disease;
#   2. exactly TO_UPDATE_TEXT: several diseases are related to that id, so the
#      descriptions of those diseases still have to be fetched;
#   3. a text containing TO_REMOVE_TEXT: the search returned no result for the
#      id, so the entry has to be removed.
TO_UPDATE_TEXT = '\n\n\nDatabase: \n\nSelect\nBioSystems\nBooks\nClinVar\ndbGaP\nGene\nGTR\nMeSH\nOMIM\nPMC\nPubMed\n\n\n\n\nOption: \n\n\n\n\n\nFind items\n\n\n'

TO_REMOVE_TEXT = '[All Fields])\n\nSearch\n\n\n                See more...\n            \n'

to_update, to_remove = set(), set()

for disease_id in disease_ids:
    description = data_dict[disease_id]
    if description == TO_UPDATE_TEXT:
        # case 2: only the search-form boilerplate was captured
        to_update.add(disease_id)
        continue
    if TO_REMOVE_TEXT in description:
        # case 3: empty search result
        to_remove.add(disease_id)

In [10]:
# Remove the ids flagged in `to_remove`, reporting any id that is not (or no
# longer) present in data_dict.
_MISSING = object()
for disease_id in to_remove:
    if data_dict.pop(disease_id, _MISSING) is _MISSING:
        print('error: {} key not present'.format(disease_id))

# 'ease(MESH)' is not a disease id; drop it if it is there. Using pop() with a
# default keeps the cell re-runnable: an unconditional `del` raises KeyError
# once the key has already been removed.
data_dict.pop('ease(MESH)', None)

In [95]:
# For every id flagged in `to_update`, query MedGen again and replace its entry
# with the titles and descriptions of all the results listed on the page.
# The entry is only replaced when at least one result has a description.
for disease_id in to_update:
    try:
        page = requests.get(
            'https://www.ncbi.nlm.nih.gov/medgen/',
            params={'term': disease_id},
            timeout=30,  # seconds; avoids hanging forever on a stalled connection
        )
        page.raise_for_status()
    except requests.RequestException as error:
        # Leave the entry untouched and move on to the next id.
        print('{}: request failed ({})'.format(disease_id, error))
        continue

    soup = BeautifulSoup(page.content, 'html.parser')

    titles = []
    descriptions = []
    at_least_one_description = False
    for r in soup.find_all(class_='rprt'):
        rslt = r.find(class_='rslt')
        if rslt is None:
            # Unexpected markup: skip this result instead of crashing.
            continue
        title_node = rslt.find(class_='title')
        des_div = rslt.find(class_='concept-def')

        titles.append(title_node.text if title_node is not None else '')
        if des_div is not None:
            descriptions.append(des_div.text)
            at_least_one_description = True
        else:
            descriptions.append('')

    if at_least_one_description:
        # Same layout as before: every piece is preceded by a single space.
        tmp_title = ''.join(' ' + title for title in titles)
        tmp_des = ''.join(' ' + des for des in descriptions)
        data_dict[disease_id] = tmp_title + ' ' + tmp_des

In [101]:
TO_UPDATE_TEXT = '\n\n\nDatabase: \n\nSelect\nBioSystems\nBooks\nClinVar\ndbGaP\nGene\nGTR\nMeSH\nOMIM\nPMC\nPubMed\n\n\n\n\nOption: \n\n\n\n\n\nFind items\n\n\n'

# Snapshot of the keys, so that entries can be deleted while looping.
keys = set(data_dict)

# If none of the diseases associated with an id has a description, the entry
# still holds the boilerplate text: remove it.
for disease_id in keys:
    if data_dict[disease_id] != TO_UPDATE_TEXT:
        continue
    del data_dict[disease_id]
    

In [102]:
# Persist the cleaned descriptions (disease id -> text) as JSON.
with open('data_description_updated.json', 'w') as updated_file:
    updated_file.write(json.dumps(data_dict))

In [103]:
# Store the sorted ids, one per line.
disease_ids = sorted(data_dict)
with open('disease_ids_order.txt', 'w') as order_file:
    order_file.writelines(disease_id + '\n' for disease_id in disease_ids)

In [14]:
# Reload the cleaned descriptions and rebuild the sorted list of ids.
with open('data_description_updated.json') as updated_file:
    data_dict = json.loads(updated_file.read())
disease_ids = sorted(data_dict)

In [15]:
lemmatizer = WordNetLemmatizer()

# Punctuation and symbols that are turned into a blank before tokenizing.
_PUNCTUATION = '"$&#[]-,*:{;}()/|<@>=!?.\n'
_PUNCTUATION_TABLE = str.maketrans(_PUNCTUATION, ' ' * len(_PUNCTUATION))


def normalize_(text):
    """Return the lemmatized, lower-cased, ASCII-only version of ``text``.

    Steps:
      1. NFKD-decompose the text and drop every non-ASCII code point, so that
         accented letters become their base letter and other symbols vanish.
      2. Lower-case and strip the text.
      3. Replace the characters of ``_PUNCTUATION`` with blanks.
      4. Collapse runs of blanks into a single blank.
      5. Tokenize with ``word_tokenize`` and lemmatize every token.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        The lemmas, each one preceded by a single space (so a non-empty
        result starts with a space; an input without tokens gives ``''``).
    """
    text = normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Avoid distinguishing e.g. 'dog' from 'Dog'.
    text = text.lower().strip()

    text = text.translate(_PUNCTUATION_TABLE)

    # str.replace returns a new string, so the result must be re-assigned, and
    # the loop has to run while a double space IS present. The previous
    # version did neither and never terminated on text without double spaces.
    while '  ' in text:
        text = text.replace('  ', ' ')

    lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return ''.join(' ' + lemma for lemma in lemmas)

In [16]:
# Normalize every description (following the order of disease_ids), keeping the
# results in memory and writing them to 'normalized_texts.txt', one per line.
normalized_texts = []
with open('normalized_texts.txt', 'w') as fp:
    for disease_id in disease_ids:
        normalized_texts.append(normalize_(data_dict[disease_id]))
        print(normalized_texts[-1], file=fp)

In [17]:
# Build the TF-IDF matrix of the normalized descriptions: one row per text, in
# the same order as normalized_texts. English stop words are ignored.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(normalized_texts)

In [18]:
# Display the summary of the TF-IDF matrix.
X

<5469x33306 sparse matrix of type '<class 'numpy.float64'>'
	with 229748 stored elements in Compressed Sparse Row format>

In [19]:
# Persist the sparse TF-IDF matrix with scipy.sparse.save_npz.
ssp.save_npz('tf_idf_data', X)

In [20]:
# Reduce the TF-IDF matrix with a truncated SVD, then rescale every row with
# Normalizer (default settings).
#
# TruncatedSVD's default solver is randomized: fixing random_state makes the
# decomposition -- and every file derived from Y -- reproducible across runs.
SVD_RANDOM_STATE = 0

# Number of components kept; the recorded run reports about 0.85 of explained
# variance for this value.
r = 2151
svd = TruncatedSVD(n_components=r, random_state=SVD_RANDOM_STATE)
Y_to_norm = svd.fit_transform(X)
normalizer = Normalizer()
Y = normalizer.fit_transform(Y_to_norm)

# Fraction of the variance retained by the r components.
var_explained = svd.explained_variance_ratio_.sum()
print(var_explained)
    

0.8501041445409021


In [21]:
# Dump the reduced matrix to 'tf_idf_data_SVD'.
# NOTE(review): assuming Y is a NumPy array, tofile() stores only the raw
# values -- the reader has to know the dtype and the shape to load it back.
# Confirm how this file is consumed before changing the format.
Y.tofile('tf_idf_data_SVD')