<a href="https://colab.research.google.com/github/Jaimemorillo/Pointwise-ML-Ranking/blob/main/Pointwise_LOINC_binary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the three per-query LOINC exports; all of them are ';'-separated.
df_glucose, df_bilirubin, df_white_blood_cells = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'loinc_dataset-glucose-in-blood.csv',
        'loinc_dataset-bilirubin-in-plasma.csv',
        'loinc_dataset-white-blood-cells-count.csv',
    )
)

In [5]:
# Stack the three query datasets into one frame with a fresh 0..n-1 index.
df = pd.concat(
    [df_glucose, df_bilirubin, df_white_blood_cells],
    ignore_index=True,
)

In [6]:
# Columns removed before the categorical encoding.
drop_columns = ['loinc_num', 'component']

# One-hot encode `system` and `property`; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    df.drop(columns=drop_columns),
    columns=['system', 'property'],
    drop_first=True,
)

In [7]:
# Bare last expression: renders the combined frame as the cell output.
df

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_Bld^BPU,system_Calculus,system_Dose,system_Isolate,system_Plas,system_Plr fld,...,property_MSCnc,property_NCnc,property_NFr,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,C reactive protein [Mass/volume] in Serum or P...,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bicarbonate [Moles/volume] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Rh [Type] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Trimethoprim+Sulfamethoxazole [Susceptibility],0,0,GLUCOSE IN BLOOD,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Bilirubin.total [Mass/volume] in Serum or Plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,Monocytes [#/volume] in Blood,1,1,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
197,Major crossmatch [interpretation],0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,Ampicillin [Susceptibility],0,0,WHITE BLOOD CELLS COUNT,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
199,Alanine aminotransferase [Enzymatic activity/v...,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
class Normalizer:
    """Text normalization helpers built on NLTK.

    `clean_all` is the full pipeline: lower-case and strip punctuation,
    tokenize, drop English stop words, lemmatize, and join the tokens back
    into a single space-separated string. `stemming` is available as a
    separate step but is not part of `clean_all`.
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))  # set => O(1) membership tests
        self.ps = PorterStemmer()  # Stemmer used by `stemming`
        self.word_net = WordNetLemmatizer()  # Lemmatizer used by `lemmatize`

    def remove_punct(self, text):
        """Lower-case `text`, replace non-word characters with spaces and collapse whitespace.

        Returns the cleaned string with no leading/trailing whitespace.
        """
        text = text.lower()
        # Raw strings: '\W' and '\s' are not valid Python string escapes, so
        # non-raw literals trigger a SyntaxWarning on recent Python versions.
        text = re.sub(r'\W', ' ', text)   # every non-word character becomes a space
        text = re.sub(r'\s+', ' ', text)  # collapse any whitespace run (spaces included) to one space
        return text.strip()  # delete surrounding whitespace

    def tokenize(self, text):
        """Split `text` into a list of tokens with NLTK's `word_tokenize`."""
        return word_tokenize(text)

    def remove_stop_words(self, tokens):
        """Return the tokens that are not in `self.stop_words`, order preserved."""
        return [word for word in tokens if word not in self.stop_words]

    def stemming(self, tokens):
        """Return the Porter stem of every token."""
        return [self.ps.stem(word) for word in tokens]

    def lemmatize(self, tokens):
        """Return the WordNet lemma of every token."""
        return [self.word_net.lemmatize(word) for word in tokens]

    def return_sentences(self, tokens):
        """Join `tokens` into one space-separated string."""
        return " ".join(tokens)

    def clean_all(self, text):
        """Run the full normalization pipeline on `text` and return the cleaned string."""
        text = self.remove_punct(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stop_words(tokens)
        tokens = self.lemmatize(tokens)
        return self.return_sentences(tokens)

In [9]:
# Work on an independent (deep) copy so `df` keeps the un-normalized text.
df_final = df.copy(deep=True)

In [10]:
# `norm` is reused later to clean the query strings with the same pipeline.
norm = Normalizer()

# Normalize every LOINC long name; progress_apply is `apply` with a tqdm bar.
cleaned_names = df_final['long_common_name'].progress_apply(norm.clean_all)
df_final['long_common_name'] = cleaned_names

  0%|          | 0/201 [00:00<?, ?it/s]

In [11]:
# Bare last expression: renders the frame with the normalized `long_common_name`.
df_final

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_Bld^BPU,system_Calculus,system_Dose,system_Isolate,system_Plas,system_Plr fld,...,property_MSCnc,property_NCnc,property_NFr,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,c reactive protein mass volume serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,bicarbonate mole volume blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,rh type blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,trimethoprim sulfamethoxazole susceptibility,0,0,GLUCOSE IN BLOOD,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,bilirubin total mass volume serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,monocyte volume blood,1,1,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
197,major crossmatch interpretation,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,ampicillin susceptibility,0,0,WHITE BLOOD CELLS COUNT,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
199,alanine aminotransferase enzymatic activity vo...,0,0,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Features are everything except the two label columns; the target is the binary label.
y = df_final['label_binary']
X = df_final.drop(columns=['label_binary', 'label_3_cat'])

# Stratify jointly on the label and the query columns.
stratify_keys = df_final[['label_binary', 'query']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
    stratify=stratify_keys,
)
print(f'Shape train: {X_train.shape}')
print(f'Shape test: {X_test.shape}')

Shape train: (160, 31)
Shape test: (41, 31)


In [30]:
# Fit the vocabulary on the training names only, then reuse it for the test names.
# min_df=5 keeps only terms that occur in at least five training documents.
tfidf = TfidfVectorizer(
    max_df=1.0,
    min_df=5,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
train_matrix = tfidf.fit_transform(X_train['long_common_name'])
test_matrix = tfidf.transform(X_test['long_common_name'])

# Wrap the sparse matrices in DataFrames that share the index of the source rows.
X_train_enc = pd.DataFrame.sparse.from_spmatrix(train_matrix, index=X_train.index)
X_test_enc = pd.DataFrame.sparse.from_spmatrix(test_matrix, index=X_test.index)

vocabulary_size = X_train_enc.shape[1]
first_name = X_train.iat[0, 0]
first_encoding = X_train_enc.iloc[0].to_numpy()
print(f'Size of the Vocabulary: {vocabulary_size}')
print(f'Original: {first_name}')
print(f'Encoded: {first_encoding}')

Size of the Vocabulary: 31
Original: bilirubin total mass volume serum plasma
Encoded: [0.         0.         0.         0.         0.         0.57313172
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.36716805
 0.         0.296426   0.         0.         0.         0.
 0.296426   0.         0.         0.54980834 0.         0.
 0.24224305]


In [31]:
# Append the TF-IDF term columns to the feature frames (index-aligned left join),
# then tag every encoded row with its query string.
# Order matters: each join must run before `query` is added to the encoded
# frame, otherwise both sides of the join would carry a `query` column.
X_train = X_train.join(X_train_enc, how='left')
X_train_enc = X_train_enc.assign(query=X_train['query'])

X_test = X_test.join(X_test_enc, how='left')
X_test_enc = X_test_enc.assign(query=X_test['query'])

In [32]:
def query_similarity(documents, query):
  """Cosine similarity between a query string and every encoded document.

  The query is cleaned with the module-level `norm` and vectorized with the
  module-level fitted `tfidf`, so both must exist before this is called.

  Args:
    documents: TF-IDF encoded documents, one row per document, with the same
      columns as the `tfidf` vocabulary.
    query: raw query text.

  Returns:
    The similarity matrix transposed so that there is one row per document
    and a single column for the query.
  """
  cleaned_query = norm.clean_all(query)
  query_vector = tfidf.transform([cleaned_query])
  return cosine_similarity(query_vector, documents).T

In [33]:
# For each query: keep only its encoded training rows, drop the `query` tag and
# add the cosine similarity between every row and the query text.
train_parts = {}
for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT'):
    query_rows = X_train_enc['query'] == query_text
    part = X_train_enc[query_rows].drop(columns='query')
    part['query_similarity'] = query_similarity(part, query_text)
    train_parts[query_text] = part

X_train_enc_glucose = train_parts['GLUCOSE IN BLOOD']
X_train_enc_bilirubin = train_parts['BILIRUBIN IN PLASMA']
X_train_enc_white = train_parts['WHITE BLOOD CELLS COUNT']

In [34]:
# For each query: keep only its encoded test rows, drop the `query` tag and
# add the cosine similarity between every row and the query text.
test_parts = {}
for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT'):
    query_rows = X_test_enc['query'] == query_text
    part = X_test_enc[query_rows].drop(columns='query')
    part['query_similarity'] = query_similarity(part, query_text)
    test_parts[query_text] = part

X_test_enc_glucose = test_parts['GLUCOSE IN BLOOD']
X_test_enc_bilirubin = test_parts['BILIRUBIN IN PLASMA']
X_test_enc_white = test_parts['WHITE BLOOD CELLS COUNT']

In [35]:
# Re-stack the per-query encoded frames, then copy the similarity score onto
# the training features (join aligns on the row index).
X_train_enc = pd.concat(
    [X_train_enc_glucose, X_train_enc_bilirubin, X_train_enc_white],
    axis=0,
)
train_similarity = X_train_enc['query_similarity']
X_train = X_train.join(train_similarity)

In [36]:
# Re-stack the per-query encoded frames, then copy the similarity score onto
# the test features (join aligns on the row index).
X_test_enc = pd.concat(
    [X_test_enc_glucose, X_test_enc_bilirubin, X_test_enc_white],
    axis=0,
)
test_similarity = X_test_enc['query_similarity']
X_test = X_test.join(test_similarity)

# Classification

In [43]:
# The two text columns are not numeric features; everything else goes to the model.
train_features = X_train.drop(columns=['long_common_name', 'query']).values

# Logistic regression with 5-fold cross-validation over the regularization strength.
clf = LogisticRegressionCV(cv=5, random_state=9, max_iter=1000)
clf.fit(train_features, y_train)

LogisticRegressionCV(cv=5, max_iter=1000, random_state=9)

In [44]:
# Class probabilities for the test rows, using the same feature columns as in training.
test_features = X_test.drop(columns=['long_common_name', 'query']).values
preds = clf.predict_proba(test_features)

In [45]:
# One row per test document: cleaned name, true label, predicted
# positive-class probability (second column of `preds`) and the query.
results = pd.DataFrame({
    'name': X_test['long_common_name'],
    'real': y_test,
    'prob': preds[:, 1],
    'query': X_test['query'],
})

In [46]:
# Test documents for the glucose query, highest predicted positive-class probability first.
results.loc[results['query'].eq('GLUCOSE IN BLOOD')].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
12,glucose mole volume urine,1,0.628341,GLUCOSE IN BLOOD
62,calcium bilirubinate total stone,0,0.485309,GLUCOSE IN BLOOD
61,calcium mass volume serum plasma,0,0.004211,GLUCOSE IN BLOOD
52,lymphocyte 100 leukocyte blood,0,0.002737,GLUCOSE IN BLOOD
59,bilirubin indirect mass mole volume serum plasma,0,0.00153,GLUCOSE IN BLOOD
57,albumin globulin mass ratio serum plasma,0,0.000608,GLUCOSE IN BLOOD
40,alkaline phosphatase enzymatic activity volume...,0,0.000356,GLUCOSE IN BLOOD
54,calcium ionized mole volume serum plasma,0,0.000288,GLUCOSE IN BLOOD
2,rh type blood,0,0.000113,GLUCOSE IN BLOOD
65,calcium mole volume corrected albumin serum pl...,0,5.6e-05,GLUCOSE IN BLOOD


In [47]:
# Test documents for the bilirubin query, highest predicted positive-class probability first.
results.loc[results['query'].eq('BILIRUBIN IN PLASMA')].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
99,bilirubin indirect mass mole volume serum plasma,1,0.807358,BILIRUBIN IN PLASMA
100,cortisol mass volume serum plasma,0,0.651707,BILIRUBIN IN PLASMA
132,bilirubin total presence unspecified specimen,1,0.047265,BILIRUBIN IN PLASMA
125,chloride mole volume serum plasma,0,0.040555,BILIRUBIN IN PLASMA
130,cholesterol hdl mass volume serum plasma,0,0.013281,BILIRUBIN IN PLASMA
91,albumin globulin mass ratio serum plasma,0,0.005182,BILIRUBIN IN PLASMA
126,protein c mass volume plasma,0,0.003156,BILIRUBIN IN PLASMA
97,glucose mass volume serum plasma blood,0,0.003006,BILIRUBIN IN PLASMA
124,amylase enzymatic activity volume serum plasma,0,0.002144,BILIRUBIN IN PLASMA
112,cobalamin vitamin b12 mass volume serum,0,0.001626,BILIRUBIN IN PLASMA


In [48]:
# Test documents for the white-blood-cells query, highest predicted positive-class probability first.
results.loc[results['query'].eq('WHITE BLOOD CELLS COUNT')].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
165,lymphocyte 100 leukocyte blood,1,0.541484,WHITE BLOOD CELLS COUNT
174,cortisol mass volume serum plasma,0,0.073556,WHITE BLOOD CELLS COUNT
152,bicarbonate mole volume blood,0,0.025513,WHITE BLOOD CELLS COUNT
176,bilirubin indirect mass volume serum plasma,0,0.008202,WHITE BLOOD CELLS COUNT
159,choriogonadotropin beta subunit mole volume se...,0,0.000456,WHITE BLOOD CELLS COUNT
178,alkaline phosphatase enzymatic activity volume...,0,0.000356,WHITE BLOOD CELLS COUNT
181,calcium ionized mole volume serum plasma,0,0.000288,WHITE BLOOD CELLS COUNT
195,abo rh group type blood,0,0.000253,WHITE BLOOD CELLS COUNT
140,aspartate aminotransferase enzymatic activity ...,0,0.000189,WHITE BLOOD CELLS COUNT
135,calcium mole volume corrected albumin serum pl...,0,5.6e-05,WHITE BLOOD CELLS COUNT
