<a href="https://colab.research.google.com/github/Jaimemorillo/Pointwise-ML-Ranking/blob/main/Pointwise_LOINC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
df_glucose = pd.read_csv('loinc_dataset-glucose-in-blood.csv', sep=';')
df_bilirubin = pd.read_csv('loinc_dataset-bilirubin-in-plasma.csv', sep=';')
df_white_blood_cells = pd.read_csv('loinc_dataset-white-blood-cells-count.csv', sep=';')

In [5]:
df = pd.concat([df_glucose, df_bilirubin, df_white_blood_cells])
df = df.reset_index(drop=True)

In [6]:
drop_columns = ['loinc_num', 'component', 'system', 'property']
df = df.drop(drop_columns, axis=1)

In [7]:
df

Unnamed: 0,long_common_name,label_binary,label_3_cat,query
0,C reactive protein [Mass/volume] in Serum or P...,0,0,GLUCOSE IN BLOOD
1,Bicarbonate [Moles/volume] in Blood,0,1,GLUCOSE IN BLOOD
2,Rh [Type] in Blood,0,1,GLUCOSE IN BLOOD
3,Trimethoprim+Sulfamethoxazole [Susceptibility],0,0,GLUCOSE IN BLOOD
4,Bilirubin.total [Mass/volume] in Serum or Plasma,0,0,GLUCOSE IN BLOOD
...,...,...,...,...
196,Monocytes [#/volume] in Blood,1,1,WHITE BLOOD CELLS COUNT
197,Major crossmatch [interpretation],0,0,WHITE BLOOD CELLS COUNT
198,Ampicillin [Susceptibility],0,0,WHITE BLOOD CELLS COUNT
199,Alanine aminotransferase [Enzymatic activity/v...,0,0,WHITE BLOOD CELLS COUNT


In [8]:
class Normalizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer() # Create Stemmer
        self.word_net = WordNetLemmatizer() #Create Lemmatizer
    
    def remove_punct(self, text):
        text = text.lower() # To lower case
        # Replace contractions
        text = re.sub(r"\'s", " is ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "can not ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub('\W', ' ', text) # Replace any character which is not a word character
        text = re.sub('\s+', ' ', text) # Replace any whitespace character
        text = re.sub(' +', ' ', text) # Replace multiple whitespaces
        text = text.strip() # Delete sorrounding whitespaces
        return text
    
    def tokenize(self, text):
        return word_tokenize(text)
    
    def remove_stop_words(self, tokens):
        return [word for word in tokens if not word in self.stop_words] 
    
    def stemming(self, tokens):
        return [self.ps.stem(word) for word in tokens]
    
    def lemmatize(self, tokens):
        return [self.word_net.lemmatize(word) for word in tokens]
    
    def return_sentences(self, tokens):
        return " ".join(tokens)
    
    def clean_all(self, text):
        text = self.remove_punct(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stop_words(tokens)
        tokens = self.lemmatize(tokens)
        text = self.return_sentences(tokens)
        return text

In [9]:
df_final = df.copy()

In [10]:
norm = Normalizer()
df_final['long_common_name'] = df_final['long_common_name'].progress_apply(lambda x: norm.clean_all(x))

  0%|          | 0/201 [00:00<?, ?it/s]

In [11]:
df_final

Unnamed: 0,long_common_name,label_binary,label_3_cat,query
0,c reactive protein mass volume serum plasma,0,0,GLUCOSE IN BLOOD
1,bicarbonate mole volume blood,0,1,GLUCOSE IN BLOOD
2,rh type blood,0,1,GLUCOSE IN BLOOD
3,trimethoprim sulfamethoxazole susceptibility,0,0,GLUCOSE IN BLOOD
4,bilirubin total mass volume serum plasma,0,0,GLUCOSE IN BLOOD
...,...,...,...,...
196,monocyte volume blood,1,1,WHITE BLOOD CELLS COUNT
197,major crossmatch interpretation,0,0,WHITE BLOOD CELLS COUNT
198,ampicillin susceptibility,0,0,WHITE BLOOD CELLS COUNT
199,alanine aminotransferase enzymatic activity vo...,0,0,WHITE BLOOD CELLS COUNT


In [31]:
X = df_final[['long_common_name', 'query']]
y = df_final['label_3_cat']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=9, 
                                                    stratify=df_final[['label_3_cat', 'query']])
print('Shape train: ' + str(X_train.shape))
print('Shape test: ' + str(X_test.shape))

Shape train: (160, 2)
Shape test: (41, 2)


In [32]:
tfidf = TfidfVectorizer(max_df=1.0, min_df=5, use_idf=True, smooth_idf=True, sublinear_tf=True)
X_train_enc = tfidf.fit_transform(X_train['long_common_name'])
X_test_enc = tfidf.transform(X_test['long_common_name'])

X_train_enc = X_train_enc.toarray()
X_test_enc = X_test_enc.toarray()

print('Size of the Vocabulary: ' + str(X_train_enc.shape[1]))
print('Original: ' + X_train.iloc[0])
print('Encoded: ' + str(X_train_enc[0]))

Size of the Vocabulary: 32
long_common_name    Original: bilirubin indirect mass mole volume ...
query                                   Original: BILIRUBIN IN PLASMA
Name: 99, dtype: object
Encoded: [0.         0.         0.         0.         0.         0.48040299
 0.         0.         0.         0.         0.         0.
 0.         0.         0.58475478 0.         0.3296371  0.37192634
 0.25773131 0.         0.         0.         0.         0.25773131
 0.         0.         0.         0.         0.         0.
 0.         0.21779382]


In [33]:
X_train_enc = pd.DataFrame(X_train_enc, index=X_train.index)
X_train_enc['query'] = X_train['query']

In [34]:
X_test_enc = pd.DataFrame(X_test_enc, index=X_test.index)
X_test_enc['query'] = X_test['query']

In [35]:
def query_similarity(documents, query):
  query = norm.clean_all(query)
  query_enc = tfidf.transform([query])

  documents_similarity = cosine_similarity(query_enc, documents)
  documents_similarity = documents_similarity.T

  return documents_similarity

In [36]:
X_train_enc_glucose = X_train_enc[X_train_enc['query']=='GLUCOSE IN BLOOD'].drop('query', axis=1)
X_train_enc_glucose['query_similarity'] = query_similarity(X_train_enc_glucose, 'GLUCOSE IN BLOOD')

X_train_enc_bilirubin = X_train_enc[X_train_enc['query']=='BILIRUBIN IN PLASMA'].drop('query', axis=1)
X_train_enc_bilirubin['query_similarity'] = query_similarity(X_train_enc_bilirubin, 'BILIRUBIN IN PLASMA')

X_train_enc_white = X_train_enc[X_train_enc['query']=='WHITE BLOOD CELLS COUNT'].drop('query', axis=1)
X_train_enc_white['query_similarity'] = query_similarity(X_train_enc_white, 'WHITE BLOOD CELLS COUNT')

In [37]:
X_test_enc_glucose = X_test_enc[X_test_enc['query']=='GLUCOSE IN BLOOD'].drop('query', axis=1)
X_test_enc_glucose['query_similarity'] = query_similarity(X_test_enc_glucose, 'GLUCOSE IN BLOOD')

X_test_enc_bilirubin = X_test_enc[X_test_enc['query']=='BILIRUBIN IN PLASMA'].drop('query', axis=1)
X_test_enc_bilirubin['query_similarity'] = query_similarity(X_test_enc_bilirubin, 'BILIRUBIN IN PLASMA')

X_test_enc_white = X_test_enc[X_test_enc['query']=='WHITE BLOOD CELLS COUNT'].drop('query', axis=1)
X_test_enc_white['query_similarity'] = query_similarity(X_test_enc_white, 'WHITE BLOOD CELLS COUNT')

In [38]:
X_train = pd.concat([X_train_enc_glucose, X_train_enc_bilirubin, X_train_enc_white])
X_train = X_train.reindex(y_train.index)

In [39]:
X_test = pd.concat([X_test_enc_glucose, X_test_enc_bilirubin, X_test_enc_white])
X_test = X_test.reindex(y_test.index)

In [40]:
from sklearn.linear_model import SGDRegressor

clf = SGDRegressor()
clf.fit(X_train.values, y_train)

SGDRegressor()

In [41]:
preds = clf.predict(X_test.values)

In [42]:
results = {
    'real' : y_test,
    'prob' : preds,
    'query': X_test_enc['query']
}

results = pd.DataFrame(results)

In [43]:
results[results['query']=='GLUCOSE IN BLOOD'].sort_values('prob', ascending=False)

Unnamed: 0,real,prob,query
28,2,1.021733,GLUCOSE IN BLOOD
39,1,0.852184,GLUCOSE IN BLOOD
37,1,0.488728,GLUCOSE IN BLOOD
56,1,0.444826,GLUCOSE IN BLOOD
38,0,0.416718,GLUCOSE IN BLOOD
65,0,0.377356,GLUCOSE IN BLOOD
0,0,0.357067,GLUCOSE IN BLOOD
57,0,0.356811,GLUCOSE IN BLOOD
53,0,0.302012,GLUCOSE IN BLOOD
62,0,0.28252,GLUCOSE IN BLOOD


In [44]:
results[results['query']=='BILIRUBIN IN PLASMA'].sort_values('prob', ascending=False)

Unnamed: 0,real,prob,query
108,1,0.622498,BILIRUBIN IN PLASMA
82,1,0.618464,BILIRUBIN IN PLASMA
113,1,0.576966,BILIRUBIN IN PLASMA
123,0,0.488871,BILIRUBIN IN PLASMA
110,0,0.488871,BILIRUBIN IN PLASMA
125,1,0.480484,BILIRUBIN IN PLASMA
78,1,0.472664,BILIRUBIN IN PLASMA
69,0,0.471998,BILIRUBIN IN PLASMA
106,0,0.434235,BILIRUBIN IN PLASMA
128,2,0.28252,BILIRUBIN IN PLASMA


In [45]:
results[results['query']=='WHITE BLOOD CELLS COUNT'].sort_values('prob', ascending=False)

Unnamed: 0,real,prob,query
165,2,1.200275,WHITE BLOOD CELLS COUNT
194,1,1.138905,WHITE BLOOD CELLS COUNT
167,1,0.827763,WHITE BLOOD CELLS COUNT
172,0,0.664615,WHITE BLOOD CELLS COUNT
155,0,0.62189,WHITE BLOOD CELLS COUNT
149,0,0.573997,WHITE BLOOD CELLS COUNT
145,0,0.486113,WHITE BLOOD CELLS COUNT
166,0,0.461413,WHITE BLOOD CELLS COUNT
176,0,0.446283,WHITE BLOOD CELLS COUNT
189,0,0.38462,WHITE BLOOD CELLS COUNT
