In [1]:
from tqdm import tqdm

# Maths modules
import numpy as np
import pandas as pd
import tensorflow as tf
import random
import re

from pandas import read_excel
from sklearn.model_selection import train_test_split

2022-10-02 23:40:58.472400: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Input workbooks (paths are relative to the notebook's working directory).
file_mapping = 'ZHAW_DATA/Mapping_table_hospital_KIS_LIS.xlsx'
file_roche_LOINC_db = 'ZHAW_DATA/Roche_LOINC_Database_filter.xlsm'
file_lab_codes = 'ZHAW_DATA/Example_Data_Laboratory_Codes_eng.xlsx'

df = read_excel(file_roche_LOINC_db, engine='openpyxl')

# Normalise the long names before they are converted to 'category' later on:
# lower-case them, then replace every character that is not a digit, an ASCII
# letter or a space with a space.
# This is done with the vectorised `.str` accessor instead of a row-by-row
# `iterrows()` loop: same result per value, but much faster, independent of
# the index being unique, and missing names stay missing instead of making
# `re.sub` raise a TypeError.
df['LOINC Long Name'] = (
    df['LOINC Long Name']
    .str.lower()
    .str.replace("[^0-9a-zA-Z ]", ' ', regex=True)
)

df.head(2)

Unnamed: 0,LOINCID,Publication Version ID,Model,Vendor Analyte Code,Vendor Analyte Name,Vendor Specimen Description,Vendor Result Description,LOINC Term,LOINC Long Name,Component,System,LOINC Version ID,Unit
0,30,RMD_v1.0,cobas 6800,89342-0^Babesia^LN,Babesia qual. 480 Tests cobas 6800/8800 IVD,whole blood,Ord,89342-0,babesia sp 18s rrna presence in blood by naa...,Babesia sp 18S rRNA,Bld,2.68,Ord
1,40,RMD_v1.0,cobas 8800,89342-0^Babesia^LN,Babesia qual. 480 Tests cobas 6800/8800 IVD,whole blood,Ord,89342-0,babesia sp 18s rrna presence in blood by naa...,Babesia sp 18S rRNA,Bld,2.68,Ord


In [3]:
# Work on an independent copy of the two columns that are needed, so the
# assignments below never write into a view of `df`. The columns are selected
# with a list: a tuple is also how MultiIndex keys are spelled and is
# therefore ambiguous as a column selector.
data_df = df[['LOINC Term', 'LOINC Long Name']].copy()

# First, change the type of the specified columns from object to 'string' and
# to 'category'. The category dtype assigns an integer code to each unique
# value.
# `astype` with a mapping is used rather than `data_df.loc[:, col] = ...`:
# since pandas 2.0 a `.loc[:, col]` assignment writes into the existing column
# and keeps its old dtype, so the conversion would silently not happen and the
# `.cat` accessor below would fail.
data_df = data_df.astype({'LOINC Term': 'string', 'LOINC Long Name': 'category'})

# Second, store the integer category code of every long name; this is the
# class label used for training.
data_df['LOINC Code LN'] = data_df['LOINC Long Name'].cat.codes

print(data_df.head(2))
print(data_df.dtypes)

  LOINC Term                                    LOINC Long Name  LOINC Code LN
0    89342-0  babesia sp 18s rrna  presence  in blood by naa...             69
1    89342-0  babesia sp 18s rrna  presence  in blood by naa...             69
LOINC Term           string
LOINC Long Name    category
LOINC Code LN         int16
dtype: object


In [4]:
# Bert Tokenizers
from transformers import BertTokenizerFast

BERT_MODEL = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL, do_lower_case=True)

# Tokenise every long name exactly once and keep the full encoding. The three
# model inputs are then read from the same encoding instead of running the
# tokenizer over the whole corpus three times.
# NOTE(review): padding="max_length" pads every sequence to the model maximum
# (the shapes printed below are (n, 512)); if the names are short, passing an
# explicit, smaller max_length would cut training cost considerably.
encodings = [
    tokenizer(ln, padding="max_length", truncation=True)
    for ln in tqdm(data_df['LOINC Long Name'])
]

input_ids = np.asarray([enc["input_ids"] for enc in encodings])
attention_mask = np.asarray([enc["attention_mask"] for enc in encodings])
token_type_ids = np.asarray([enc["token_type_ids"] for enc in encodings])

print(input_ids.shape)
print(attention_mask.shape)
print(token_type_ids.shape)


100%|██████████| 7005/7005 [00:01<00:00, 4816.58it/s]
100%|██████████| 7005/7005 [00:01<00:00, 4632.21it/s]
100%|██████████| 7005/7005 [00:01<00:00, 5085.42it/s]


(7005, 512)
(7005, 512)
(7005, 512)


In [5]:
# Per-class summary: for every label code, the count / unique / top / freq of
# the remaining columns. Used to eyeball how many rows each class has.
data_df.groupby(by='LOINC Code LN').describe()

Unnamed: 0_level_0,LOINC Term,LOINC Term,LOINC Term,LOINC Term,LOINC Long Name,LOINC Long Name,LOINC Long Name,LOINC Long Name
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
LOINC Code LN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,17,1,93495-0,17,17,1,2 ethylidene 1 5 dimethyl 3 3 diphenylpyrrolid...,17
1,16,1,77752-4,16,16,1,2 ethylidene 1 5 dimethyl 3 3 diphenylpyrrolid...,16
2,36,1,83070-3,36,36,1,25 hydroxyvitamin d3 25 hydroxyvitamin d2 mas...,36
3,36,1,83071-1,36,36,1,25 hydroxyvitamin d3 25 hydroxyvitamin d2 mol...,36
4,3,1,19321-9,3,3,1,6 monoacetylmorphine 6 mam presence in uri...,3
...,...,...,...,...,...,...,...,...
506,15,1,39797-6,15,15,1,vancomycin moles volume in serum or plasma ...,15
507,2,1,68324-3,2,2,1,von willebrand factor vwf activity actual no...,2
508,4,1,74857-4,4,4,1,west nile virus rna presence in serum or pla...,4
509,4,1,91080-2,4,4,1,zika virus igg ab presence in serum or plasm...,4


In [6]:
# Hold out 20% of the rows as a test set. All five arrays are passed to one
# call so they are shuffled with the same permutation and stay row-aligned;
# the fixed random_state makes the split repeatable.
(
    texts_train, texts_test,
    input_ids_train, input_ids_test,
    attention_mask_train, attention_mask_test,
    token_type_ids_train, token_type_ids_test,
    labels_train, labels_test,
) = train_test_split(
    data_df['LOINC Long Name'].values,
    input_ids,
    attention_mask,
    token_type_ids,
    data_df['LOINC Code LN'].values,
    test_size=0.2,
    random_state=42,
)

# Print the shape of every split, one per line, train before test.
print(
    *(
        part.shape
        for part in (
            texts_train, texts_test,
            input_ids_train, input_ids_test,
            attention_mask_train, attention_mask_test,
            token_type_ids_train, token_type_ids_test,
            labels_train, labels_test,
        )
    ),
    sep="\n",
)

(5604,)
(1401,)
(5604, 512)
(1401, 512)
(5604, 512)
(1401, 512)
(5604, 512)
(1401, 512)
(5604,)
(1401,)


In [7]:
from transformers import TFBertForSequenceClassification, AdamW
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

# Size of the classification head. Derived from the encoded labels instead of
# a hard-coded number so it always matches the data: the labels are integer
# codes starting at 0, so the head needs (largest code + 1) outputs for every
# label to be a valid index into the logits.
NUM_LABELS = int(data_df['LOINC Code LN'].max()) + 1

# Define NN model
print("Defining model...")
model = TFBertForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=NUM_LABELS
)

# Learning rate: value recommended by the Bert team
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

# compile NN network
# The labels are integer class ids and the model returns raw logits, hence the
# sparse categorical loss with from_logits=True.
print("Compiling model...")
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
)

Defining model...


2022-10-02 23:41:21.382447: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Compiling model...


In [8]:

# fit NN model
# 20% of the training rows are held back by Keras for validation; training
# stops early when the validation loss has not improved for 2 epochs.
# `workers` / `use_multiprocessing` are intentionally not passed: they only
# apply to generator / Sequence inputs, have no effect on in-memory NumPy
# arrays, and are no longer accepted by newer Keras releases.
print("Fitting model...")
model.fit(
    [input_ids_train, attention_mask_train, token_type_ids_train],
    labels_train,
    epochs=4,
    batch_size=64,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=2),
    ],
)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

Fitting model...
Epoch 1/4
 3/71 [>.............................] - ETA: 3:01:26 - loss: 6.2695

In [None]:
# Get predictions
y_pred = model.predict([input_ids_test, attention_mask_test, token_type_ids_test])

# Class probabilities for every test row: one column per class.
y_pred_proba = tf.nn.softmax(y_pred.logits, axis=-1).numpy()
# Predicted class = most probable column. (The earlier version compared only
# columns 0 and 1, which is valid for a two-class model but not for this
# many-class classifier.)
y_pred_label = y_pred_proba.argmax(axis=1)


# Evaluate the model
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)

print("Confusion Matrix : ")
print(confusion_matrix(labels_test, y_pred_label))

# Binary ROC AUC / average precision are not defined for a multi-class target
# with a single score column, so multi-class metrics are reported instead.
print("Accuracy : ", round(accuracy_score(labels_test, y_pred_label), 3))

# Top-5 accuracy: the true class is among the five most probable classes.
top5_classes = np.argsort(y_pred_proba, axis=1)[:, -5:]
top5_hits = (top5_classes == np.asarray(labels_test)[:, None]).any(axis=1)
print("Top-5 accuracy : ", round(float(top5_hits.mean()), 3))

In [None]:

# Template: two-class text classification with BERT.
# NOTE(review): this cell reads `df["text"]` and `df["target"]`. The frame
# displayed earlier in this notebook does not list such columns, so the cell
# presumably belongs to a different dataset -- confirm before running it here.

# Sample data for development
TEXT_SAMPLE_SIZE = 10000  # <= 0 for all

# Sample data
if TEXT_SAMPLE_SIZE > 0:
    # Same number of rows per class, computed once instead of once per group.
    rows_per_class = int(TEXT_SAMPLE_SIZE / df["target"].nunique())
    df = df.groupby("target", group_keys=False).apply(
        # min(): a class with fewer rows than requested is taken in full
        # instead of making sample() raise.
        lambda x: x.sample(n=min(len(x), rows_per_class), random_state=42)
    ).reset_index(drop=True)



# Bert Tokenizers
from transformers import BertTokenizerFast

BERT_MODEL = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(BERT_MODEL, do_lower_case=True)

# Tokenise each text once and read all three model inputs from that encoding.
encodings = [
    tokenizer(sent, padding="max_length", truncation=True)
    for sent in tqdm(df.text)
]
input_ids = np.asarray([enc["input_ids"] for enc in encodings])
attention_mask = np.asarray([enc["attention_mask"] for enc in encodings])
token_type_ids = np.asarray([enc["token_type_ids"] for enc in encodings])

from sklearn.model_selection import train_test_split


# Train-test split (stratified, so both parts keep the class proportions)
(
    texts_train,
    texts_test,
    input_ids_train,
    input_ids_test,
    attention_mask_train,
    attention_mask_test,
    token_type_ids_train,
    token_type_ids_test,
    labels_train,
    labels_test,
) = train_test_split(
    df.text.values,
    input_ids,
    attention_mask,
    token_type_ids,
    df.target.values,
    test_size=0.2,
    stratify=df.target.values,
    random_state=42,
)


from transformers import TFBertForSequenceClassification
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import SparseCategoricalAccuracy


# Define NN model
print("Defining model...")
model = TFBertForSequenceClassification.from_pretrained(
    BERT_MODEL, num_labels=2
)

# compile NN network
# The head emits two raw logits per row and the labels are integer class ids,
# so the loss/metric are the sparse categorical ones on logits. A binary
# cross-entropy would expect a single probability per row instead.
print("Compiling model...")
model.compile(
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=Adam(learning_rate=2e-5), # Value recommended by the Bert team
    metrics=[SparseCategoricalAccuracy()],
)

# fit NN model
# `workers` / `use_multiprocessing` are not passed: they only apply to
# generator / Sequence inputs, not to in-memory NumPy arrays.
print("Fitting model...")
model.fit(
    [input_ids_train, attention_mask_train, token_type_ids_train],
    labels_train,
    epochs=10,
    batch_size=8,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=2),
    ],
)

# summary() prints the table itself and returns None.
model.summary()


# Get predictions
y_pred = model.predict([input_ids_test, attention_mask_test, token_type_ids_test])
# Softmax is computed once; column 1 is the probability of class 1.
class_probs = tf.nn.softmax(y_pred.logits, axis=-1).numpy()
y_pred_proba = [float(p[1]) for p in class_probs]
y_pred_label = [0 if p[0] > p[1] else 1 for p in class_probs]


# Evaluate the model
from sklearn.metrics import (
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)

print("Confusion Matrix : ")
print(confusion_matrix(labels_test, y_pred_label))

print("ROC AUC score : ", round(roc_auc_score(labels_test, y_pred_proba), 3))

print("Average Precision score : ", round(average_precision_score(labels_test, y_pred_proba), 3))


In [None]:
# Scratch idea: look up the kind of quantity that a measurement unit implies.
'g/L'


# Unit -> quantity wording. Incomplete: only a few example units are listed,
# further units still have to be added.
# Named `unit_to_property` rather than `dict` so the builtin is not shadowed.
unit_to_property = {
    'pmol/L': 'Moles/volume',
    'ng/mL': 'Mass/volume',
    'g/L': 'Mass/volume',
}

unit_to_property.get('g/L')  # => 'Mass/volume'

# Presumably the looked-up wording is meant to be combined with the local test
# name to build the text that is matched -- TODO confirm the intended format.
'abgeleitetes fibrinogen' + 'mass volume'