<a href="https://colab.research.google.com/github/Jaimemorillo/Pointwise-ML-Ranking/blob/main/Pointwise_LOINC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
# Fetch the NLTK resources used below: the stop-word list, the Punkt tokenizer
# models (needed by word_tokenize) and WordNet (needed by WordNetLemmatizer).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' —
# confirm against the NLTK version in use.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tqdm.notebook import tqdm
# Registers `progress_apply` on pandas objects (used later when cleaning names).
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# One semicolon-separated CSV per query; read them in a fixed order and bind
# each resulting frame to its own name.
df_glucose, df_bilirubin, df_white_blood_cells = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'loinc_dataset-glucose-in-blood.csv',
        'loinc_dataset-bilirubin-in-plasma.csv',
        'loinc_dataset-white-blood-cells-count.csv',
    )
)

In [3]:
# Stack the three per-query frames and relabel the rows 0..n-1 in one step.
df = pd.concat(
    [df_glucose, df_bilirubin, df_white_blood_cells],
    ignore_index=True,
)

In [4]:
drop_columns = ['loinc_num', 'component']

# Remove the columns listed above, then one-hot encode 'system' and 'property'
# (the first level of each is dropped and acts as the implicit baseline).
df = pd.get_dummies(
    df.drop(columns=drop_columns),
    columns=['system', 'property'],
    drop_first=True,
)

In [5]:
# Inspect the combined frame after one-hot encoding.
df

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_Bld^BPU,system_Calculus,system_Dose,system_Isolate,system_Plas,system_Plr fld,...,property_MSCnc,property_NCnc,property_NFr,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,C reactive protein [Mass/volume] in Serum or P...,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bicarbonate [Moles/volume] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Rh [Type] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Trimethoprim+Sulfamethoxazole [Susceptibility],0,0,GLUCOSE IN BLOOD,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Bilirubin.total [Mass/volume] in Serum or Plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,Monocytes [#/volume] in Blood,1,1,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
197,Major crossmatch [interpretation],0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,Ampicillin [Susceptibility],0,0,WHITE BLOOD CELLS COUNT,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
199,Alanine aminotransferase [Enzymatic activity/v...,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
class Normalizer:
    """Normalize free text into a cleaned, space-separated string.

    ``clean_all`` chains the individual steps: lower-casing and punctuation
    removal, tokenization, English stop-word removal, lemmatization and
    re-joining of the tokens with single spaces. The other methods expose
    each step on its own.
    """

    # One or more consecutive non-word characters. ``\W`` already matches
    # whitespace, so a single pass both strips punctuation and collapses
    # repeated blanks. A raw string is required: '\W' in a plain literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    _NON_WORD_RUN = re.compile(r'\W+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer()  # Stemmer, used by ``stemming``
        self.word_net = WordNetLemmatizer()  # Lemmatizer, used by ``lemmatize``

    def remove_punct(self, text):
        """Lower-case ``text`` and replace each run of non-word characters by one space.

        Surrounding whitespace is stripped from the result. Underscores count
        as word characters and are therefore kept.
        """
        return self._NON_WORD_RUN.sub(' ', text.lower()).strip()

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stop_words(self, tokens):
        """Return the tokens that are not in ``self.stop_words`` (order preserved)."""
        return [word for word in tokens if word not in self.stop_words]

    def stemming(self, tokens):
        """Return the Porter stem of every token."""
        return [self.ps.stem(word) for word in tokens]

    def lemmatize(self, tokens):
        """Return the WordNet lemma of every token."""
        return [self.word_net.lemmatize(word) for word in tokens]

    def return_sentences(self, tokens):
        """Join ``tokens`` back into one string separated by single spaces."""
        return " ".join(tokens)

    def clean_all(self, text):
        """Run the full pipeline on ``text`` and return the cleaned string.

        Steps: ``remove_punct`` -> ``tokenize`` -> ``remove_stop_words`` ->
        ``lemmatize`` -> ``return_sentences``. Stemming is not applied here.
        """
        text = self.remove_punct(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stop_words(tokens)
        tokens = self.lemmatize(tokens)
        return self.return_sentences(tokens)

In [7]:
# Work on a copy so that modifying 'long_common_name' below leaves `df` untouched.
df_final = df.copy()

In [8]:
norm = Normalizer()
# Clean every LOINC name in place; the bound method is handed to
# progress_apply directly, which reports progress while iterating.
df_final['long_common_name'] = (
    df_final['long_common_name'].progress_apply(norm.clean_all)
)

  0%|          | 0/201 [00:00<?, ?it/s]

In [9]:
# Inspect the frame after the names have been normalized.
df_final

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_Bld^BPU,system_Calculus,system_Dose,system_Isolate,system_Plas,system_Plr fld,...,property_MSCnc,property_NCnc,property_NFr,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,c reactive protein mass volume serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,bicarbonate mole volume blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,rh type blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,trimethoprim sulfamethoxazole susceptibility,0,0,GLUCOSE IN BLOOD,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,bilirubin total mass volume serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,monocyte volume blood,1,1,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
197,major crossmatch interpretation,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,ampicillin susceptibility,0,0,WHITE BLOOD CELLS COUNT,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
199,alanine aminotransferase enzymatic activity vo...,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Features: everything except the two label columns. Target: the 3-level label.
X = df_final.drop(columns=['label_binary', 'label_3_cat'])
y = df_final['label_3_cat']

# Stratify on the (label, query) pair so both are represented in each split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=df_final[['label_3_cat', 'query']],
)
print('Shape train: ' + str(X_train.shape))
print('Shape test: ' + str(X_test.shape))

Shape train: (160, 31)
Shape test: (41, 31)


In [11]:
tfidf = TfidfVectorizer(
    max_df=1.0,
    min_df=5,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Fit on the training names only; the test names reuse the fitted vectorizer.
train_tfidf = tfidf.fit_transform(X_train['long_common_name'])
test_tfidf = tfidf.transform(X_test['long_common_name'])

# Wrap the sparse matrices in DataFrames that carry the row index of each split.
X_train_enc = pd.DataFrame.sparse.from_spmatrix(train_tfidf, index=X_train.index)
X_test_enc = pd.DataFrame.sparse.from_spmatrix(test_tfidf, index=X_test.index)

print('Size of the Vocabulary: ' + str(X_train_enc.shape[1]))
print('Original: ' + X_train.iloc[0, 0])
print('Encoded: ' + str(X_train_enc.iloc[0].to_numpy()))

Size of the Vocabulary: 33
Original: carbon dioxide total mole volume serum plasma
Encoded: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.54019715 0.         0.
 0.         0.54019715 0.         0.         0.         0.
 0.         0.         0.         0.3192227  0.22697311 0.
 0.         0.         0.         0.22697311 0.         0.41682726
 0.         0.         0.19414527]


In [12]:
# Append the TF-IDF columns to the feature frames (index-aligned join).
# NOTE(review): X_train / X_test are rebound in place, so this cell is
# order-sensitive and is meant to run once per fresh split.
X_train = X_train.join(X_train_enc)
# Copy the query text onto the encoded frame so it can be filtered per query later.
X_train_enc['query'] = X_train['query']
X_test = X_test.join(X_test_enc)
X_test_enc['query'] = X_test['query']

In [13]:
def query_similarity(documents, query):
    """Cosine similarity between a text query and TF-IDF encoded documents.

    The query is cleaned with the module-level ``norm`` and vectorized with
    the module-level, already fitted ``tfidf`` before being compared with
    every row of ``documents``.

    Parameters
    ----------
    documents : array-like or DataFrame of TF-IDF features, one row per
        document (presumably produced by the same ``tfidf`` — verify against
        the caller).
    query : str
        Raw query text.

    Returns
    -------
    The transposed ``cosine_similarity(query_vector, documents)`` result,
    i.e. one row per document and a single column.
    """
    cleaned_query = norm.clean_all(query)
    query_vector = tfidf.transform([cleaned_query])
    # cosine_similarity yields a single row for the query; transpose it so
    # the result lines up with the rows of `documents`.
    return cosine_similarity(query_vector, documents).T

In [14]:
# For each query, take its training rows (without the 'query' column) and add
# the cosine similarity between the query text and every document.
train_enc_by_query = {}
for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT'):
    query_rows = X_train_enc[X_train_enc['query'] == query_text].drop('query', axis=1)
    query_rows['query_similarity'] = query_similarity(query_rows, query_text)
    train_enc_by_query[query_text] = query_rows

X_train_enc_glucose = train_enc_by_query['GLUCOSE IN BLOOD']
X_train_enc_bilirubin = train_enc_by_query['BILIRUBIN IN PLASMA']
X_train_enc_white = train_enc_by_query['WHITE BLOOD CELLS COUNT']

In [15]:
# Same per-query similarity feature as for the training rows, computed on the
# held-out rows with the vectorizer fitted on the training names.
test_enc_by_query = {}
for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT'):
    query_rows = X_test_enc[X_test_enc['query'] == query_text].drop('query', axis=1)
    query_rows['query_similarity'] = query_similarity(query_rows, query_text)
    test_enc_by_query[query_text] = query_rows

X_test_enc_glucose = test_enc_by_query['GLUCOSE IN BLOOD']
X_test_enc_bilirubin = test_enc_by_query['BILIRUBIN IN PLASMA']
X_test_enc_white = test_enc_by_query['WHITE BLOOD CELLS COUNT']

In [16]:
# Stack the per-query frames back into one encoded frame; it now carries the
# 'query_similarity' column and no longer the 'query' column.
X_train_enc = pd.concat([X_train_enc_glucose, X_train_enc_bilirubin, X_train_enc_white])
# Index-aligned join adds the similarity feature to the model inputs.
# NOTE(review): rebinding X_train makes this cell order-sensitive; run it once.
X_train = X_train.join(X_train_enc['query_similarity'])

In [17]:
# Stack the per-query frames back into one encoded frame; it now carries the
# 'query_similarity' column and no longer the 'query' column.
X_test_enc = pd.concat([X_test_enc_glucose, X_test_enc_bilirubin, X_test_enc_white])
# Index-aligned join adds the similarity feature to the held-out inputs.
# NOTE(review): rebinding X_test makes this cell order-sensitive; run it once.
X_test = X_test.join(X_test_enc['query_similarity'])

# Regression

In [23]:
# One fold per training row, i.e. leave-one-out cross-validation.
reg = ElasticNetCV(cv=len(X_train), random_state=1)
# The raw text columns are not model inputs; everything else is passed as a plain array.
train_features = X_train.drop(columns=['long_common_name', 'query']).values
reg.fit(train_features, y_train)

ElasticNetCV(cv=160, random_state=1)

In [24]:
# Predict on the held-out rows, dropping the same text columns that were
# excluded when fitting `reg`.
preds = reg.predict(X_test.drop(['long_common_name', 'query'], axis=1).values)

In [25]:
# One row per held-out document: cleaned name, true label, the regressor's
# prediction (stored under 'prob' and used as the ranking score) and the query.
results = pd.DataFrame({
    'name': X_test['long_common_name'],
    'real': y_test,
    'prob': preds,
    'query': X_test['query'],
})

In [26]:
# Ranking for the glucose query: held-out documents, highest predicted score first.
results[results['query']=='GLUCOSE IN BLOOD'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
12,glucose mole volume urine,2,1.565272,GLUCOSE IN BLOOD
22,leukocyte volume blood,1,1.040647,GLUCOSE IN BLOOD
1,bicarbonate mole volume blood,1,0.83551,GLUCOSE IN BLOOD
29,calcium ionized mole volume blood,1,0.662734,GLUCOSE IN BLOOD
0,c reactive protein mass volume serum plasma,0,0.285524,GLUCOSE IN BLOOD
49,carcinoembryonic ag mass volume serum plasma,0,0.285524,GLUCOSE IN BLOOD
30,albumin mass volume serum plasma,0,0.285524,GLUCOSE IN BLOOD
13,amylase enzymatic activity volume serum plasma,0,0.281917,GLUCOSE IN BLOOD
42,tyrosine aminotransferase mass volume plasma,0,0.212596,GLUCOSE IN BLOOD
24,body temperature,0,0.208989,GLUCOSE IN BLOOD


In [27]:
# Ranking for the bilirubin query: held-out documents, highest predicted score first.
results[results['query']=='BILIRUBIN IN PLASMA'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
104,bilirubin total mass volume synovial fluid,2,1.115379,BILIRUBIN IN PLASMA
100,cortisol mass volume serum plasma,1,0.784485,BILIRUBIN IN PLASMA
78,c reactive protein mass volume serum plasma hi...,1,0.621091,BILIRUBIN IN PLASMA
120,fasting glucose mass mole volume serum plasma,1,0.619122,BILIRUBIN IN PLASMA
107,indirect antiglobulin test complement specific...,1,0.583463,BILIRUBIN IN PLASMA
119,calcium mole volume corrected albumin serum pl...,1,0.568095,BILIRUBIN IN PLASMA
129,aspartate aminotransferase enzymatic activity ...,1,0.525442,BILIRUBIN IN PLASMA
73,methicillin resistant staphylococcus aureus pr...,0,0.208989,BILIRUBIN IN PLASMA
77,body temperature,0,0.208989,BILIRUBIN IN PLASMA
83,rh type blood,0,0.10805,BILIRUBIN IN PLASMA


In [28]:
# Ranking for the white-blood-cells query: held-out documents, highest predicted score first.
results[results['query']=='WHITE BLOOD CELLS COUNT'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
148,leukocyte volume blood,2,1.820093,WHITE BLOOD CELLS COUNT
152,bicarbonate mole volume blood,1,1.41505,WHITE BLOOD CELLS COUNT
150,calcium ionized mole volume blood,1,1.080345,WHITE BLOOD CELLS COUNT
172,blood group antibody screen presence serum plasma,0,0.97525,WHITE BLOOD CELLS COUNT
156,blood product unit id,0,0.75362,WHITE BLOOD CELLS COUNT
161,chloride mole volume serum plasma,0,0.290077,WHITE BLOOD CELLS COUNT
174,cortisol mass volume serum plasma,0,0.285524,WHITE BLOOD CELLS COUNT
151,amylase enzymatic activity volume serum plasma,0,0.281917,WHITE BLOOD CELLS COUNT
178,alkaline phosphatase enzymatic activity volume...,0,0.281917,WHITE BLOOD CELLS COUNT
189,tyrosine aminotransferase mass volume plasma,0,0.212596,WHITE BLOOD CELLS COUNT


# Classification

In [32]:
# cv=14 because the least populated class in y has only 14 members, which
# caps the number of stratified folds.
clf = LogisticRegressionCV(cv=14, random_state=1, max_iter=1000)
# The raw text columns are not model inputs; everything else is passed as a plain array.
train_features = X_train.drop(columns=['long_common_name', 'query']).values
clf.fit(train_features, y_train)

LogisticRegressionCV(cv=14, max_iter=1000, random_state=1)

In [33]:
# Predict on the held-out rows, dropping the same text columns that were
# excluded when fitting `clf`.
# NOTE(review): `predict` returns hard class labels, so the 'prob' column built
# from `preds` holds labels rather than probabilities, and sorting by it leaves
# ties within each class. `clf.predict_proba` would give a finer-grained
# ranking score — confirm which is intended.
preds = clf.predict(X_test.drop(['long_common_name', 'query'], axis=1).values)

In [34]:
# One row per held-out document: cleaned name, true label, the classifier's
# prediction (stored under 'prob' and used as the sort key) and the query.
results = pd.DataFrame({
    'name': X_test['long_common_name'],
    'real': y_test,
    'prob': preds,
    'query': X_test['query'],
})

In [35]:
# Ranking for the glucose query: held-out documents sorted by the value in 'prob', descending.
results[results['query']=='GLUCOSE IN BLOOD'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
12,glucose mole volume urine,2,2,GLUCOSE IN BLOOD
29,calcium ionized mole volume blood,1,1,GLUCOSE IN BLOOD
1,bicarbonate mole volume blood,1,1,GLUCOSE IN BLOOD
22,leukocyte volume blood,1,1,GLUCOSE IN BLOOD
3,trimethoprim sulfamethoxazole susceptibility,0,0,GLUCOSE IN BLOOD
11,ampicillin susceptibility,0,0,GLUCOSE IN BLOOD
0,c reactive protein mass volume serum plasma,0,0,GLUCOSE IN BLOOD
49,carcinoembryonic ag mass volume serum plasma,0,0,GLUCOSE IN BLOOD
42,tyrosine aminotransferase mass volume plasma,0,0,GLUCOSE IN BLOOD
24,body temperature,0,0,GLUCOSE IN BLOOD


In [36]:
# Ranking for the bilirubin query: held-out documents sorted by the value in 'prob', descending.
results[results['query']=='BILIRUBIN IN PLASMA'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
104,bilirubin total mass volume synovial fluid,2,2,BILIRUBIN IN PLASMA
100,cortisol mass volume serum plasma,1,1,BILIRUBIN IN PLASMA
129,aspartate aminotransferase enzymatic activity ...,1,1,BILIRUBIN IN PLASMA
78,c reactive protein mass volume serum plasma hi...,1,1,BILIRUBIN IN PLASMA
120,fasting glucose mass mole volume serum plasma,1,0,BILIRUBIN IN PLASMA
89,abo group type blood blood product unit,0,0,BILIRUBIN IN PLASMA
85,nitrofurantoin susceptibility,0,0,BILIRUBIN IN PLASMA
73,methicillin resistant staphylococcus aureus pr...,0,0,BILIRUBIN IN PLASMA
79,cefazolin susceptibility,0,0,BILIRUBIN IN PLASMA
77,body temperature,0,0,BILIRUBIN IN PLASMA


In [37]:
# Ranking for the white-blood-cells query: held-out documents sorted by the value in 'prob', descending.
results[results['query']=='WHITE BLOOD CELLS COUNT'].sort_values('prob', ascending=False)

Unnamed: 0,name,real,prob,query
148,leukocyte volume blood,2,1,WHITE BLOOD CELLS COUNT
150,calcium ionized mole volume blood,1,1,WHITE BLOOD CELLS COUNT
172,blood group antibody screen presence serum plasma,0,1,WHITE BLOOD CELLS COUNT
152,bicarbonate mole volume blood,1,1,WHITE BLOOD CELLS COUNT
163,cobalamin vitamin b12 mass volume serum,0,0,WHITE BLOOD CELLS COUNT
156,blood product unit id,0,0,WHITE BLOOD CELLS COUNT
144,methicillin resistant staphylococcus aureus pr...,0,0,WHITE BLOOD CELLS COUNT
151,amylase enzymatic activity volume serum plasma,0,0,WHITE BLOOD CELLS COUNT
141,cefazolin susceptibility,0,0,WHITE BLOOD CELLS COUNT
189,tyrosine aminotransferase mass volume plasma,0,0,WHITE BLOOD CELLS COUNT
