In [1]:
from Bio.Entrez import efetch
from dicttoxml import dicttoxml
from IPython.display import clear_output
import json
import re

global entry_data

def extract_mesh_terms(raw_mesh_text):
    tags = None
    t = re.sub('<.*?>', ' ', raw_mesh_text)
    t = t.split()
    tags = t
    return tags
    
def query_pubmed(pmid_file):
    with open(pmid_file, "r") as f:
        for pmid in f:
            pmid = pmid.rstrip()
            try:
                handle = efetch(db="pubmed", id=pmid, retmode="xml", rettype="abstract")
            except:
                yield pmid, None
            yield pmid, handle.read()
            
def parse_pubmed_query(raw_text):
    title, abstract, medline = "None", "None", "None"
    raw_text = "".join(raw_text[0].decode("utf-8"))
    title_start_idx, title_end_idx = raw_text.find("<ArticleTitle>") + len("<ArticleTitle>"), raw_text.find("</ArticleTitle>") 
    title = raw_text[title_start_idx:title_end_idx]
    abstract_start_idx, abstract_end_idx = raw_text.find("<AbstractText>") + \
        len("<AbstractText>"), raw_text.find("</AbstractText>") 
    if "<MeshHeadingList>" in raw_text:
        medline_start_idx, medline_end_idx = raw_text.find("<MeshHeadingList>") + len("</MeshHeadingList>") - 1, \
            raw_text.find("</MeshHeadingList>")
        medline = raw_text[medline_start_idx: medline_end_idx]
        medline = extract_mesh_terms(medline)
    abstract = raw_text[abstract_start_idx:abstract_end_idx]
    print(title)
    
    return {"title": title, "abstract": abstract, "medline": medline}
            
def pubmed_query_to_train(pmid_file, train_size=10000):
    global entry_data
    entry_data = dict()
    current_idx = 0
    for pmid, *data in query_pubmed(pmid_file):
        print(current_idx, "/", train_size)
        try:
            data = parse_pubmed_query(data)
        except:
            data = {"title": None, "abstract": None, "medline": None}
        entry_data[pmid] = data
        clear_output(wait=True)
        current_idx += 1
    json_data      = json.dumps(entry_data)
        
    with open("proteomics_output.json", "w+", encoding='utf-8') as out:
        out.write(json_data)

In [2]:
def download_pubmed_data(file_path):
    pubmed_query_to_train('pmid-proteomics-set.txt')
download_pubmed_data("")

9999 / 10000
Integration of multi-omics datasets enables molecular classification of COPD.


In [3]:
import json
from unidecode import unidecode

json_data = None

with open("proteomics_output.json", "r", encoding="utf-8") as f:
    json_data = json.loads(f.read(), strict=False)

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from tqdm import tqdm
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

tokens = []
sentences = []

for pmid in tqdm(json_data.keys()):
    document_abstract = json_data[pmid]["abstract"]
    document_mesh     = json_data[pmid]["medline"]
    document_mesh     = " ".join(document_mesh)
    mesh_tokens       = word_tokenize(document_mesh)
    document_tokens   = word_tokenize(document_abstract)
    tokens.extend(document_tokens)
    tokens.extend(mesh_tokens)
    
for pmid in tqdm(json_data.keys()):
    document_abstract = json_data[pmid]["abstract"]
    sentences.append(document_abstract)

porter_stemmer    = PorterStemmer()
pre_tf_idf_tokens = []

for word in tokens:
    pre_tf_idf_tokens.append(porter_stemmer.stem(word))

tf_idf_vec_smooth = TfidfVectorizer(use_idf=True,  
                        smooth_idf=True,  
                        ngram_range=(1,1), stop_words='english')

X_unigram = tf_idf_vec_smooth.fit_transform(pre_tf_idf_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 10000/10000 [00:11<00:00, 846.54it/s]
100%|██████████| 10000/10000 [00:00<00:00, 1666257.75it/s]


In [7]:
tf_idf_vec_smooth_bigram = TfidfVectorizer(use_idf=True,
                                  smooth_idf=True,
                                  ngram_range=(1,2), stop_words='english')

X_unigram_bigram = tf_idf_vec_smooth_bigram.fit_transform(pre_tf_idf_tokens)

In [8]:
from numpy import count_nonzero

def calculate_sparsity(sparse_matrix):
    sparsity = 1.0 - ( count_nonzero(sparse_matrix) / float(sparse_matrix.size) )
    return sparsity

In [9]:
from nltk.corpus import wordnet
import string
import re
from nltk.stem import WordNetLemmatizer
from unidecode import unidecode

words = set(nltk.corpus.words.words())

def is_english_token(token):
    if not wordnet.synsets(token):
        print(token)
        
        
translator=str.maketrans('','',string.punctuation)
lemmatizer = WordNetLemmatizer()

chemical_translations = {
    "alpha":"a", "beta":"b", "gamma":"g", "delta":"d",
    "epsilon":"e", "zeta":"z", "eta":"e", "theta":"th",
    "iota":"i", "kappa":"k", "lambda":"l", "mu":"m",
    "xi":"x", "pi":"p", "rho":"r", "simga":"s",
    "tau":"t", "phi":"ph", "chi":"kh","psi":"ps",
    "omega":"o"
}
chemical_keys = set(chemical_translations)

def preprocess_sentence(sentence, translator):
    sentence = sentence.lower()
    if 'doctype' in sentence: #flagging badly encoded JSON documents
        return ""
    out = sentence.translate(translator)
    out = re.sub("\d+\.?\d+?", "",  out)
    out = re.sub("[^\u0000-\u05C0\u2100-\u214F]+", "", out)
    
    out = [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(out)]
    out = " ".join(out)
    out = unidecode(out)
    
    return out

In [None]:
preprocessed_sentences = [preprocess_sentence(sentence, translator) for sentence in sentences]
preprocessed_sentences

In [None]:
normalized_sentences = []

for sentence in preprocessed_sentences:
    for key, val in chemical_translations.items():
        sentence = sentence.replace(key, val)
    normalized_sentences.append(sentence)
normalized_sentences

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(ngram_range=(1,1),
                           stop_words='english')

count_data = count_vectorizer.fit_transform(preprocessed_sentences)
 
cv_dataframe=pd.DataFrame(count_data.toarray(), columns=count_vectorizer.get_feature_names())
cv_dataframe.shape

(10000, 45902)

### Probabilistic Threshing - Dictionary Size
    1. Chemical Compound Lists - Not compatiable

In [10]:
cv_dataframe

Unnamed: 0,0h,11,13clysine,14,14bdglucosidic,1a,1a1,1a1b,1a1blight,1acid,...,zyggregator,zygosaccharomyces,zygote,zygotic,zymogen,zymograms,zymographic,zymography,zymoseptoria,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
calculate_sparsity(cv_dataframe.to_numpy())

0.9984680754651214

In [12]:
tf_idf_vec_smooth = TfidfVectorizer(use_idf=True,  
                        smooth_idf=True,  
                        ngram_range=(1,1), stop_words='english')

X_unigram = tf_idf_vec_smooth.fit_transform(preprocessed_sentences)
cv_dataframe_tfidf=pd.DataFrame(X_unigram.toarray(), columns=count_vectorizer.get_feature_names())
cv_dataframe_tfidf.shape

(10000, 45902)

Check overlap of terms in documents [High overlap = TF-IDF, Low overlap = CountVectorizer]
Kneser-Ney Smoothing might be worth looking into
TF-IDF smoothing is just computationally beneficial
    -mlwiki smoothing for language models

In [None]:
chemical_list_data = []

count = 0
for pmid in tqdm(json_data.keys()):
    chemicals = json_data[pmid]["medline"]
    chemical_list_data.append(chemicals)
    if "Proteome" in chemicals:
        count += 1
chemical_list_data

In [13]:
cv_dataframe_tfidf.head()

Unnamed: 0,0h,11,13clysine,14,14bdglucosidic,1a,1a1,1a1b,1a1blight,1acid,...,zyggregator,zygosaccharomyces,zygote,zygotic,zymogen,zymograms,zymographic,zymography,zymoseptoria,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### May want to include as part of Github utilities

In [14]:
from collections import defaultdict
import numpy as np

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    for text in sentences:
        # preprocessing (use tokenizer instead)
        text = text.lower().split()
        # iterate over sentences
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)  # add to vocab
            next_token = text[i+1 : i+1+window_size]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1
    
    # formulate the dictionary into dataframe
    vocab = sorted(vocab) # sort vocab
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

co_occurrence_matrix = co_occurrence(normalized_sentences, 2)

In [15]:
co_occurrence_matrix

Unnamed: 0,'ditiodiproponate,(c),(tm),*3,*4,*8,+-,+-1,+-2,+-3,...,zyggregator,zygosaccharomyces,zygote,zygotic,zymogen,zymograms,zymographc,zymography,zymoseptoria,zz
'ditiodiproponate,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(c),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
(tm),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
*3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
*4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zymograms,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zymographc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zymography,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zymoseptoria,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
tolerance = 5
high_co_occurrence_mask = co_occurrence_matrix.values >= tolerance

In [None]:
for idx, row in enumerate(high_co_occurrence_mask):
    if True in row:
        for idy, value in enumerate(row): 
            print(idx, idy)

In [21]:
with open("vocab.dat", "w") as f:
    for column in co_occurrence_matrix.columns:
        f.write(column + "\n")

Current Work:
The chemical identification list is too expensive to use practically and more “by-hand” methods are easier for normalizing the dataset.
	Specifically, I wrote a translation guide instead of using a greek-aware lemmatizer to help make sure that chemicals shared the same name, e.g. setting the beta and the greek character beta to just be their unicode equivalent of “b”. 
	The unicode encoding also helps normalize the dataset although there is some concern about the removal of information
		This reduces the vocabulary list by about 5000 terms and decreases sparsity
The PUBMED MESH metadata is now ready to be added to the vocabulary and is roughly 10000 terms.
	It is replacing the chemical list which doesn’t contain useful information and to avoid the problem of multiplying duplicate values in the vector e.g. proteome is included in both the MESH and the chemical list
	I am electing to use the encoding we talked about last time where each word is part of the encoding as opposed to the entire term for the highest resolution on the MESH terms even if this may introduce some ambiguity, the method is set up to handle it any of the ways we talked about however so I can change it easily
	
The co-occurrence matrix looks promising thus far and I am finding a lot of values that overlap which will be useful for both LDA and for identifying words that should be bigrams instead of unigrams
	The actual change from bigram to unigram words is a work in progress
	I also need some help selecting a tolerance for when a word should be a unigram versus a bigram, that is to say what number of co-occurrence (proximally, so next to each other) is a good cutoff
I am currently working on calculating the Jaccard Index as well the documents to help determine if the matrices have sufficient overlap at this point for LDA
	Depending on the results of this I should be good to go with the next step