<a href="https://colab.research.google.com/github/Jaimemorillo/Pointwise-ML-Ranking/blob/main/Pointwise_LOINC_binary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNetCV, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tqdm.notebook import tqdm
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Load the three semicolon-separated LOINC CSV files, in the order
# glucose / bilirubin / white blood cells.
df_glucose, df_bilirubin, df_white_blood_cells = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'loinc_dataset-glucose-in-blood.csv',
        'loinc_dataset-bilirubin-in-plasma.csv',
        'loinc_dataset-white-blood-cells-count.csv',
    )
)

In [3]:
# Stack the three frames; ignore_index=True discards the per-file row labels
# and gives the combined frame a fresh 0..n-1 index.
df = pd.concat(
    [df_glucose, df_bilirubin, df_white_blood_cells],
    ignore_index=True,
)

In [4]:
# Columns that are not used as model features.
drop_columns = ['loinc_num', 'component']

# Remove them, then one-hot encode the two categorical columns
# (drop_first=True omits the first level of each).
# NOTE: this rebinds `df`; re-running the cell without re-running the cells
# above fails because the dropped columns no longer exist.
df = (
    df.drop(columns=drop_columns)
      .pipe(pd.get_dummies, columns=['system', 'property'], drop_first=True)
)

In [5]:
# Display the combined, one-hot encoded frame as the cell output.
df

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_BldA,system_BldC,system_BldV,system_Bld^BPU,system_Calculus,system_Dose,...,property_NCnc,property_NFr,property_NFr.DF,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,C reactive protein [Mass/volume] in Serum or P...,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Bicarbonate [Moles/volume] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Rh [Type] in Blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Trimethoprim+Sulfamethoxazole [Susceptibility],0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Bilirubin.total [Mass/volume] in Serum or Plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,Smudge cells/Leukocytes [Pure number fraction]...,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
242,Mononuclear cells/100 leukocytes in Blood by M...,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
243,Unidentified cells/100 leukocytes in Blood by ...,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
244,Lymphoma cells/100 leukocytes in Blood by Manu...,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [6]:
class Normalizer:
    """Text normalizer applied to document names and queries.

    ``clean_all`` runs the full pipeline: lower-case and strip punctuation,
    tokenize, drop English stop words, Porter-stem, and join the tokens back
    into a single space-separated string. ``lemmatize`` is available as a
    separate step but is not part of ``clean_all``.
    """

    # One or more consecutive non-word characters. Whitespace is itself a
    # non-word character, so a single substitution both removes punctuation
    # and collapses repeated whitespace. Written as a raw string so that
    # ``\W`` is not parsed as an (invalid) string escape sequence.
    _NON_WORD_RUN = re.compile(r'\W+')

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.ps = PorterStemmer()  # Create Stemmer
        self.word_net = WordNetLemmatizer()  # Create Lemmatizer

    def remove_punct(self, text):
        """Lower-case ``text`` and replace each run of non-word characters
        with one space, without leading or trailing whitespace."""
        return self._NON_WORD_RUN.sub(' ', text.lower()).strip()

    def tokenize(self, text):
        """Split ``text`` into a list of tokens with ``word_tokenize``."""
        return word_tokenize(text)

    def remove_stop_words(self, tokens):
        """Return the tokens that are not in ``self.stop_words``."""
        return [word for word in tokens if word not in self.stop_words]

    def stemming(self, tokens):
        """Return the Porter stem of every token."""
        return [self.ps.stem(word) for word in tokens]

    def lemmatize(self, tokens):
        """Return the WordNet lemma of every token."""
        return [self.word_net.lemmatize(word) for word in tokens]

    def return_sentences(self, tokens):
        """Join tokens into one string separated by single spaces."""
        return " ".join(tokens)

    def clean_all(self, text):
        """Run the whole normalization pipeline on ``text`` and return the
        cleaned, stemmed string."""
        text = self.remove_punct(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stop_words(tokens)
        tokens = self.stemming(tokens)
        text = self.return_sentences(tokens)
        return text

In [7]:
# Work on a copy so the text normalization below leaves `df` untouched.
df_final = df.copy()

In [8]:
# Clean every document name with the full Normalizer pipeline.
# progress_apply is the tqdm-enabled variant of Series.apply.
norm = Normalizer()
df_final['long_common_name'] = df_final['long_common_name'].progress_apply(norm.clean_all)

  0%|          | 0/246 [00:00<?, ?it/s]

In [9]:
# Display the frame after `long_common_name` has been normalized.
df_final

Unnamed: 0,long_common_name,label_binary,label_3_cat,query,system_BldA,system_BldC,system_BldV,system_Bld^BPU,system_Calculus,system_Dose,...,property_NCnc,property_NFr,property_NFr.DF,property_Num,property_PrThr,property_SCnc,property_Susc,property_Temp,property_Type,property_VFr
0,c reactiv protein mass volum serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,bicarbon mole volum blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,rh type blood,0,1,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,trimethoprim sulfamethoxazol suscept,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,bilirubin total mass volum serum plasma,0,0,GLUCOSE IN BLOOD,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,smudg cell leukocyt pure number fraction blood...,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
242,mononuclear cell 100 leukocyt blood manual count,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
243,unidentifi cell 100 leukocyt blood manual count,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
244,lymphoma cell 100 leukocyt blood manual count,1,2,WHITE BLOOD CELLS COUNT,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [30]:
# Target is the binary relevance label; both label columns are removed from
# the feature frame.
y = df_final['label_binary']
X = df_final.drop(columns=['label_binary', 'label_3_cat'])

# Stratify on (label, query) jointly so both splits keep the same mix of
# label/query combinations.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=df_final[['label_binary', 'query']],
)
print(f'Shape train: {X_train.shape}')
print(f'Shape test: {X_test.shape}')

Shape train: (196, 35)
Shape test: (50, 35)


In [31]:
# Fit the TF-IDF vocabulary on the training names only and reuse it for the
# test names, so nothing from the test split influences the encoding.
# min_df=5 keeps only terms that occur in at least 5 training documents.
tfidf = TfidfVectorizer(max_df=1.0, min_df=5, use_idf=True, smooth_idf=True, sublinear_tf=True)
X_train_enc = tfidf.fit_transform(X_train['long_common_name'])
X_test_enc = tfidf.transform(X_test['long_common_name'])

# Wrap the sparse matrices in DataFrames that carry the original row index,
# so the encodings can later be joined back onto X_train / X_test.
X_train_enc = pd.DataFrame.sparse.from_spmatrix(X_train_enc, index=X_train.index)
X_test_enc = pd.DataFrame.sparse.from_spmatrix(X_test_enc, index=X_test.index)

print('Size of the Vocabulary: ' + str(X_train_enc.shape[1]))
# Select the name by column label rather than by position, so the preview
# does not depend on `long_common_name` being the first column of X_train.
print('Original: ' + X_train['long_common_name'].iloc[0])
print('Encoded: ' + str(X_train_enc.iloc[0].to_numpy()))

Size of the Vocabulary: 37
Original: fast glucos mass mole volum serum plasma
Encoded: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.57165401 0.         0.         0.         0.         0.
 0.39340222 0.47487241 0.32859409 0.         0.         0.
 0.         0.32859409 0.         0.         0.         0.
 0.2774787 ]


In [32]:
# Append the TF-IDF columns to the feature frames. join() aligns on the row
# index, which the encoded frames share with X_train / X_test.
# NOTE: this cell rebinds its inputs; re-running it on its own makes the
# column sets overlap and join() will fail.
X_train = X_train.join(X_train_enc)
# Copy the query text onto the encoded frames so they can be filtered per query.
X_train_enc['query'] = X_train['query']
X_test = X_test.join(X_test_enc)
X_test_enc['query'] = X_test['query']

In [33]:
def query_similarity(documents, query, vectorizer=None, normalizer=None):
    """Cosine similarity between a text query and already-encoded documents.

    Parameters
    ----------
    documents : array-like or DataFrame, one row per document
        Document vectors; expected to come from the same vectorizer that
        encodes the query so the columns line up.
    query : str
        Raw query text. It is normalized before being vectorized.
    vectorizer : object with ``transform``, optional
        Fitted vectorizer used to encode the query. Defaults to the
        notebook-level ``tfidf``.
    normalizer : object with ``clean_all``, optional
        Text normalizer applied to the query. Defaults to the notebook-level
        ``norm``.

    Returns
    -------
    2-D array with one row per document and a single column holding that
    document's similarity to the query.
    """
    # Fall back to the notebook globals only when nothing is passed in, so
    # existing two-argument calls behave exactly as before.
    if vectorizer is None:
        vectorizer = tfidf
    if normalizer is None:
        normalizer = norm

    query_enc = vectorizer.transform([normalizer.clean_all(query)])

    # cosine_similarity yields one row for the single query; transpose it
    # into a column with one entry per document.
    return cosine_similarity(query_enc, documents).T

In [34]:
# For each query: keep the training rows belonging to it, drop the text
# `query` column so only the encoded columns remain, and add the similarity
# of every remaining row to that query as `query_similarity`.
X_train_enc_glucose, X_train_enc_bilirubin, X_train_enc_white = (
    X_train_enc.loc[X_train_enc['query'] == query_text]
    .drop(columns='query')
    .assign(query_similarity=lambda docs: query_similarity(docs, query_text))
    for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT')
)

In [35]:
# Same per-query treatment for the test rows: filter by query, drop the text
# `query` column, and add each row's similarity to its query.
X_test_enc_glucose, X_test_enc_bilirubin, X_test_enc_white = (
    X_test_enc.loc[X_test_enc['query'] == query_text]
    .drop(columns='query')
    .assign(query_similarity=lambda docs: query_similarity(docs, query_text))
    for query_text in ('GLUCOSE IN BLOOD', 'BILIRUBIN IN PLASMA', 'WHITE BLOOD CELLS COUNT')
)

In [36]:
# Stack the per-query frames back into a single encoded training frame.
X_train_enc = pd.concat([X_train_enc_glucose, X_train_enc_bilirubin, X_train_enc_white])
# join() aligns on the row index, so each training row receives its own
# similarity score even though the concatenated frame is grouped by query.
X_train = X_train.join(X_train_enc['query_similarity'])

In [37]:
# Stack the per-query frames back into a single encoded test frame.
X_test_enc = pd.concat([X_test_enc_glucose, X_test_enc_bilirubin, X_test_enc_white])
# join() aligns on the row index, so each test row receives its own
# similarity score even though the concatenated frame is grouped by query.
X_test = X_test.join(X_test_enc['query_similarity'])

# Classification

In [39]:
# Logistic regression with built-in cross-validation (26 folds).
# The raw-text columns are excluded; the remaining columns are passed as a
# plain array.
clf = LogisticRegressionCV(cv=26, random_state=1, max_iter=1000)
clf.fit(X_train.drop(columns=['long_common_name', 'query']).to_numpy(), y_train)

LogisticRegressionCV(cv=26, max_iter=1000, random_state=1)

In [40]:
# Class probabilities for the test rows, using the same feature columns as
# in training (raw-text columns removed).
preds = clf.predict_proba(
    X_test.drop(columns=['long_common_name', 'query']).to_numpy()
)

In [41]:
# One row per test document: its name, true label, predicted probability of
# the positive class (second column of `preds`), and the query it belongs to.
results = pd.DataFrame({
    'name': X_test['long_common_name'],
    'real': y_test,
    'prob': preds[:, 1],
    'query': X_test['query'],
})

In [42]:
# Ranking for the glucose query: highest predicted probability first.
results.loc[results['query'] == 'GLUCOSE IN BLOOD'].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
68,glucos mass volum blood,1,0.920873,GLUCOSE IN BLOOD
33,monocyt volum blood,0,0.695019,GLUCOSE IN BLOOD
12,glucos mole volum urin,1,0.575596,GLUCOSE IN BLOOD
19,glucos mole volum pleural fluid,1,0.281913,GLUCOSE IN BLOOD
80,lymphoma cell 100 leukocyt blood manual count,0,0.141773,GLUCOSE IN BLOOD
79,unidentifi cell 100 leukocyt blood manual count,0,0.141773,GLUCOSE IN BLOOD
44,bilirubin total presenc unspecifi specimen,0,0.121672,GLUCOSE IN BLOOD
29,calcium ioniz mole volum blood,0,0.073285,GLUCOSE IN BLOOD
17,blood product disposit type,0,0.02478,GLUCOSE IN BLOOD
43,c reactiv protein mass volum serum plasma high...,0,0.021089,GLUCOSE IN BLOOD


In [43]:
# Ranking for the bilirubin query: highest predicted probability first.
results.loc[results['query'] == 'BILIRUBIN IN PLASMA'].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
157,bilirubin conjug bilirubin total serum plasma,1,0.940259,BILIRUBIN IN PLASMA
110,bilirubin indirect mass volum serum plasma,1,0.685835,BILIRUBIN IN PLASMA
163,malign cell 100 leukocyt blood manual count,0,0.064612,BILIRUBIN IN PLASMA
153,glucos mass volum blood pre meal,0,0.05684,BILIRUBIN IN PLASMA
149,glucos mass volum venou blood,0,0.050573,BILIRUBIN IN PLASMA
137,alkalin phosphatas enzymat activ volum serum p...,0,0.032821,BILIRUBIN IN PLASMA
88,methicillin resist staphylococcu aureu presenc...,0,0.029233,BILIRUBIN IN PLASMA
83,alanin aminotransferas enzymat activ volum ser...,0,0.025035,BILIRUBIN IN PLASMA
95,calcium ioniz mole volum blood,0,0.018039,BILIRUBIN IN PLASMA
92,bodi temperatur,0,0.016785,BILIRUBIN IN PLASMA


In [44]:
# Ranking for the white-blood-cells query: highest predicted probability first.
results.loc[results['query'] == 'WHITE BLOOD CELLS COUNT'].sort_values(by='prob', ascending=False)

Unnamed: 0,name,real,prob,query
244,lymphoma cell 100 leukocyt blood manual count,1,0.84138,WHITE BLOOD CELLS COUNT
243,unidentifi cell 100 leukocyt blood manual count,1,0.84138,WHITE BLOOD CELLS COUNT
235,glucos mass volum blood pre meal,0,0.147812,WHITE BLOOD CELLS COUNT
234,glucos mole volum capillari blood,0,0.112766,WHITE BLOOD CELLS COUNT
239,bilirubin conjug bilirubin total serum plasma,0,0.086039,WHITE BLOOD CELLS COUNT
238,bilirubin direct bilirubin total serum plasma,0,0.086039,WHITE BLOOD CELLS COUNT
182,bicarbon mole volum blood,0,0.081442,WHITE BLOOD CELLS COUNT
214,bilirubin direct mass volum serum plasma,0,0.061401,WHITE BLOOD CELLS COUNT
236,bilirubin conjug indirect mass volum serum plasma,0,0.056528,WHITE BLOOD CELLS COUNT
203,glucos mole volum serum plasma 3 hour post 100...,0,0.046908,WHITE BLOOD CELLS COUNT
