# Identifying Entities in Healthcare Data

 Workspace set up: Import and Install useful packages

In [1]:
!pip install pycrf
!pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

model = spacy.load("en_core_web_sm")



In [2]:
import pandas as pd
import numpy as np
from IPython.display import display


In [3]:
# Create a function to process the file and return a sentence list
def preprocess_inputfile(input_file):
    i_file = open(input_file, 'r')
    file_name = i_file.readlines()
    i_file.close()

    output_list = []

    full_sentence = ""

    for each_word in file_name:
        each_word = each_word.strip()
        if each_word == "":
            output_list.append(full_sentence) # To append the complete sentence to the output list
            full_sentence = "" # For new sentence start
        else:
            if full_sentence:
                full_sentence += " " + each_word
            else:
                full_sentence = each_word

    return output_list

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sukht\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
train_sent = preprocess_inputfile('train_sent')
train_label = preprocess_inputfile('train_label')
test_sent = preprocess_inputfile('test_sent')
test_label = preprocess_inputfile('test_label')

# Data Preprocessing

The dataset provided is in the form of one word per line. Let's understand the format of data below:

Suppose there are x words in a sentence, then there will be x continuous lines with one word in each line.

Further, the two sentences are separated by empty lines. The labels for the data follow the same format. We need to pre-process the data to recover the complete sentences and their labels.

In [6]:
# Print first five sentences from the processed dataset
for each_item in range(5):
    print(f"Sentence {each_item+1} is: {train_sent[each_item]}")
    print(f"Label {each_item+1} is: {train_label[each_item]}")
    print("*"*100)

Sentence 1 is: All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Label 1 is: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 2 is: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Label 2 is: O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 3 is: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Label 3 is: O O O O O O O O O O O O O O O
****************************************************************************************************
Sentence 4 is: The `` corrected '' ce

In [7]:
# Reading the train and test sentences and labels
with open('train_sent', 'r') as train_sent_file:
  train_words = train_sent_file.readlines()

with open('train_label', 'r') as train_labels_file:
  train_labels_by_word = train_labels_file.readlines()

with open('test_sent', 'r') as test_sent_file:
  test_words = test_sent_file.readlines()

with open('test_label', 'r') as test_labels_file:
  test_labels_by_word = test_labels_file.readlines()

In [8]:
# Sanity check to see that the number of tokens and no. of corresponding labels match.
print("Count of tokens in training set\n","No. of words: ",len(train_words),"\nNo. of labels: ",len(train_labels_by_word))
print("\n\nCount of tokens in test set\n","No. of words: ",len(test_words),"\nNo. of labels: ",len(test_labels_by_word))

Count of tokens in training set
 No. of words:  48501 
No. of labels:  48501


Count of tokens in test set
 No. of words:  19674 
No. of labels:  19674


In [9]:
def createDFfromFile(file_name, df_col_name):

  import re
  file = open(file_name, "r")
  wordlist = []
  sent = []
  for word in file:
    if word == '\n':
      # sentence complete store previous sent
      full_sent = ' '.join(wordlist)
      sent.append(full_sent)
      full_sent=''
      wordlist = []
    else:
      w = word.replace('\n','')
      wordlist.append(w)

  df = pd.DataFrame({df_col_name:sent})
  return df


In [10]:
#Create Train_sentences df
df_train_sent = createDFfromFile( 'train_sent', df_col_name='Sentence #')
df_train_sent['index'] = df_train_sent.index
print(df_train_sent)

                                             Sentence #  index
0     All live births > or = 23 weeks at the Univers...      0
1     The total cesarean rate was 14.4 % ( 344 of 23...      1
2     Abnormal presentation was the most common indi...      2
3     The `` corrected '' cesarean rate ( maternal-f...      3
4     Arrest of dilation was the most common indicat...      4
...                                                 ...    ...
2594  Special report : comparative efficacy of diffe...   2594
2595  Special report : pressure-reducing support sur...   2595
2596  External counterpulsation for treatment of chr...   2596
2597  Intra-articular hyaluronan injections for trea...   2597
2598               Pneumococcal vaccine : a second look   2598

[2599 rows x 2 columns]


In [11]:
#Similarly create for train_label, test_sent, test_label df
df_train_label = createDFfromFile( 'train_label', df_col_name='Sentence #')
df_train_label['index'] = df_train_label.index
print(df_train_label)

                                             Sentence #  index
0     O O O O O O O O O O O O O O O O O O O O O O O ...      0
1     O O O O O O O O O O O O O O O O O O O O O O O O O      1
2                         O O O O O O O O O O O O O O O      2
3     O O O O O O O O O O O O O O O O O O O O O O O ...      3
4           O O O O O O O O O O O O O O O O O O O O O O      4
...                                                 ...    ...
2594                  O O O O O O O O O T T T O O O O D   2594
2595                O O O T T T O O O O O O D D O O O O   2595
2596                                  T T O O O D D D D   2596
2597                                T T T O O O D O O O   2597
2598                                        D T O O O O   2598

[2599 rows x 2 columns]


In [12]:
df_test_sent = createDFfromFile( 'test_sent', df_col_name='Sentence #')
df_test_sent['index'] = df_test_sent.index
print(df_test_sent)

                                             Sentence #  index
0     Furthermore , when all deliveries were analyze...      0
1     As the ambient temperature increases , there i...      1
2     The daily high temperature ranged from 71 to 1...      2
3     There was a significant correlation between th...      3
4     Fluctuations in ambient temperature are invers...      4
...                                                 ...    ...
1051  Reduction of vasoreactivity and thrombogenicit...   1051
1052  Effects of ultrasound energy on total peripher...   1052
1053  High-dose chemotherapy with autologous stem-ce...   1053
1054  `` Tandem '' high-dose chemoradiotherapy with ...   1054
1055  Intravenous immune globulin for recurrent spon...   1055

[1056 rows x 2 columns]


In [13]:
df_test_label = createDFfromFile( 'test_label', df_col_name='Sentence #')
df_test_label['index'] = df_test_label.index
print(df_test_label)

                                             Sentence #  index
0     O O O O O O O O O O O O O O O O O O O O O O O ...      0
1                 O O O O O O O O O O O O O O O O O O O      1
2       O O O O O O O O O O O O O O O O O O O O O O O O      2
3     O O O O O O O O O O O O O O O O O O O O O O O ...      3
4                                 O O O O O O O O O O O      4
...                                                 ...    ...
1051                          O O D O D O T T O O O T T   1051
1052                    O O T T O D D D D O O O O O O O   1052
1053                                T T T T T T O D D D   1053
1054              T T T T T T T T T O O O O O O O O D D   1054
1055                                      T T T O D D D   1055

[1056 rows x 2 columns]


In [14]:
# Merge the two to form one dataset of all train data
df_train_data = pd.merge(df_train_sent, df_train_label, on="index")
print(df_train_data)

                                           Sentence #_x  index  \
0     All live births > or = 23 weeks at the Univers...      0   
1     The total cesarean rate was 14.4 % ( 344 of 23...      1   
2     Abnormal presentation was the most common indi...      2   
3     The `` corrected '' cesarean rate ( maternal-f...      3   
4     Arrest of dilation was the most common indicat...      4   
...                                                 ...    ...   
2594  Special report : comparative efficacy of diffe...   2594   
2595  Special report : pressure-reducing support sur...   2595   
2596  External counterpulsation for treatment of chr...   2596   
2597  Intra-articular hyaluronan injections for trea...   2597   
2598               Pneumococcal vaccine : a second look   2598   

                                           Sentence #_y  
0     O O O O O O O O O O O O O O O O O O O O O O O ...  
1     O O O O O O O O O O O O O O O O O O O O O O O O O  
2                         O O O O

In [15]:
# Merge the two to form one dataset of all test data
df_test_data = pd.merge(df_test_sent, df_test_label, on="index")
print(df_test_data)

                                           Sentence #_x  index  \
0     Furthermore , when all deliveries were analyze...      0   
1     As the ambient temperature increases , there i...      1   
2     The daily high temperature ranged from 71 to 1...      2   
3     There was a significant correlation between th...      3   
4     Fluctuations in ambient temperature are invers...      4   
...                                                 ...    ...   
1051  Reduction of vasoreactivity and thrombogenicit...   1051   
1052  Effects of ultrasound energy on total peripher...   1052   
1053  High-dose chemotherapy with autologous stem-ce...   1053   
1054  `` Tandem '' high-dose chemoradiotherapy with ...   1054   
1055  Intravenous immune globulin for recurrent spon...   1055   

                                           Sentence #_y  
0     O O O O O O O O O O O O O O O O O O O O O O O ...  
1                 O O O O O O O O O O O O O O O O O O O  
2       O O O O O O O O O O O O O

In [16]:
print("First five training sentences and their labels:\n")
for i in range(5):
    print(train_sent[i],"\n",train_label[i],"\n")
    print("*"*100)

First five training sentences and their labels:

All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status ) 
 O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 

****************************************************************************************************
The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 ) 
 O O O O O O O O O O O O O O O O O O O O O O O O O 

****************************************************************************************************
Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 ) 
 O O O O O O O O O O O O O O O 

****************************************************************************************************
The `` corrected '' cesarean rate ( maternal-fetal medici

In [17]:
print("First five training sentences and their labels:\n")
for i in range(5):
    print(test_sent[i],"\n",test_label[i],"\n")
    print("*"*100)

First five training sentences and their labels:

Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 ) 
 O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 

****************************************************************************************************
As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration 
 O O O O O O O O O O O O O O O O O O O 

****************************************************************************************************
The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period 
 O O O O O O O O O O O O O O O O O O O O O O O O 

****************************************************************************************************
There was a s

### Count the number of sentences in the processed train and test dataset

In [18]:
print("No. sentences in processed train dataset is: ", len(train_sent))
print("No. sentences in processed test dataset is: ", len(test_sent))

No. sentences in processed train dataset is:  2599
No. sentences in processed test dataset is:  1056


### Count the number of lines of labels in the processed train and test dataset.

In [19]:
print("No. sentences in processed train dataset is: ", len(train_label))
print("No. sentences in processed test dataset is: ", len(test_label))

No. sentences in processed train dataset is:  2599
No. sentences in processed test dataset is:  1056


Concept Identification- We will first explore what are the various concepts present in the dataset. For this, we will use PoS Tagging.

We will identify all the words from the corpus that have a tag of NOUN or PROPN (nouns) and prepare a dictionary of their counts. We will then output the top 25 most frequently discussed concepts in the entire corpus.

The key thing to check is that we are using both test and train sentences. Note that this is okay because we are using a pre-trained model and applying directly on our data. This is an exploratory analysis on the complete data. Since we are not training anything, there is no point is discarding information in test data

### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

# Creating a combined dataset from training and test sentences, since this is an Exploratory analysis.
combined = train_sent + test_sent
print("Number of sentences in combined dataset (training + test): {}".format(len(combined)))

In [20]:
# Creating a list of tokens which have PoS tag of 'NOUN' or 'PROPN'
noun_propn = []         # Initiating list for nouns and proper nouns
pos_tag = []            # initiating list for corresponding PoS tags.
combined = train_sent + test_sent
print("Number of sentences in combined dataset (training + test): {}".format(len(combined)))
for sent in combined:
    for token in model(sent):
        if token.pos_ in ['NOUN', 'PROPN']:
           noun_propn.append(token.text)
           pos_tag.append(token.pos_)
print("No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': {}".format(len(noun_propn)))

Number of sentences in combined dataset (training + test): 3655
No. of tokens in combined dataset with PoS tag of 'NOUN' or 'PROPN': 24376


### Print the top 25 most common tokens with NOUN or PROPN PoS tags

In [21]:
noun_pos = pd.DataFrame({"NOUN_PROPN":noun_propn,"POS_tag":pos_tag})
print("Top 25 comon tokens with PoS tag of 'NOUN' or 'PROPN' \n")
print(noun_pos["NOUN_PROPN"].value_counts().head(25))

Top 25 comon tokens with PoS tag of 'NOUN' or 'PROPN' 

NOUN_PROPN
patients        492
treatment       281
%               247
cancer          200
therapy         175
study           154
disease         142
cell            140
lung            116
group            94
chemotherapy     88
gene             87
effects          85
results          79
women            77
use              74
TO_SEE           74
surgery          71
cases            71
risk             71
analysis         70
rate             67
response         66
survival         65
children         64
Name: count, dtype: int64


# Defining features for CRF

In [22]:
 # Let's define the features to get the feature value for one word.

def getFeaturesForOneWord(sentence, pos, pos_tags):
  word = sentence[pos]

  features = [
    'word.lower=' + word.lower(), # serves as word id
    'word[-3:]=' + word[-3:],     # last three characters
    'word[-2:]=' + word[-2:],     # last two characters
    'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),  # is the word a number
    'word.startsWithCapital=%s' % word[0].isupper(), # is the word starting with a capital letter
    'word.pos=' + pos_tags[pos]
  ]

  #Use the previous word also while defining features
  if(pos > 0):
    prev_word = sentence[pos-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower(),
    'prev_word.isupper=%s' % prev_word.isupper(),
    'prev_word.isdigit=%s' % prev_word.isdigit(),
    'prev_word.startsWithCapital=%s' % prev_word[0].isupper(),
    'prev_word.pos=' + pos_tags[pos-1]
  ])
  # Mark the begining and the end words of a sentence correctly in the form of features.
  else:
    features.append('BEG') # feature to track begin of sentence

  if(pos == len(sentence)-1):
    features.append('END') # feature to track end of sentence

  return features

## Getting the features

### Write a code/function to get the features for a sentence

In [23]:
# Function to get features for a sentence.
def getFeaturesForOneSentence(sentence):

    # We need to get the pos_tags to be passed to the function
    processed_sent = model(sentence)
    postags = []

    for each_token in processed_sent:
        postags.append(each_token.pos_)

    sentence_list = sentence.split()
    return [getFeaturesForOneWord(sentence_list, pos, postags) for pos in range(len(sentence_list))]

### Write a code/function to get the labels of a sentence

In [24]:
# Function to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
  return labels.split()

# Define input and target variables

### Define the features values for each sentence as input variable for CRF model in test and the train dataset

In [25]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sent]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sent]

### Define the labels as the target variable for test and the train dataset

In [26]:
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_label]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_label]

# Build the CRF Model

In [27]:
pip install -U 'scikit-learn<0.24

Note: you may need to restart the kernel to use updated packages.


The system cannot find the file specified.


In [28]:
# Build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=100)
try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)

# Evaluation

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [29]:
Y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [30]:
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
print(f"F1 score is: {round(f1_score,4)}")

F1 score is: 0.9053


An f1-score of nearly 91% is fairly decent.



In [31]:
# Print the orginal labels and predicted labels for the sentence  in test data, which is at index value 25.
id = 25
print("Sentence:",test_sent[id])
print("Orig Labels:", Y_test[id])
print("Pred Labels:", Y_pred[id])

Sentence: CONCLUSIONS : A complete genomic screen in families affected with late-onset AD identified 4 regions of interest after follow-up
Orig Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


# Identifying Diseases and Treatments using Custom NER



We now use the CRF model's prediction to prepare a record of diseases identified in the corpus and treatments used for the diseases.

Create the logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset.

In [32]:
D_n_T_DF = pd.DataFrame([], columns=["Disease", "Treatments"])
D_n_T_DF

Unnamed: 0,Disease,Treatments


In [33]:
for i in range(len(Y_pred)):

  p_val = Y_pred[i]
  dise = ""
  treat = ""

  DTO = set(p_val)
  if "D" in DTO and "T" in DTO:

    for j in range(len(p_val)):
      if p_val[j] == 'D':
        dise += test_sent[i].split()[j] + " "
      elif p_val[j] == 'T':
        treat += test_sent[i].split()[j] + " "

    dise = dise.strip()
    treat = treat.strip()

    present = D_n_T_DF.loc[(D_n_T_DF.Disease == dise), ["Disease"]]
    if present.size:
      treatment_df = D_n_T_DF.loc[(D_n_T_DF.Disease == dise), ["Treatments"]]
      treatment = treatment_df.values.tolist()
      treatment.extend([treat])
      D_n_T_DF.loc[(D_n_T_DF.Disease == dise), ["Treatments"]] = [[treatment]]

    else:
      D_n_T_DF = pd.concat([D_n_T_DF, pd.DataFrame([[dise, treat]], columns=D_n_T_DF.columns )])   
    D_n_T_DF = D_n_T_DF.set_index(np.arange(D_n_T_DF.shape[0]))

In [34]:
D_n_T_DF

Unnamed: 0,Disease,Treatments
0,hereditary retinoblastoma,radiotherapy
1,myocardial infarction,"warfarin with 80 mg aspirin , or 1 mg warfarin..."
2,unstable angina,roxithromycin
3,coronary-artery disease,Antichlamydial antibiotics
4,primary pulmonary hypertension ( PPH ),fenfluramines
...,...,...
96,temporomandibular joint arthropathy,arthroscopic treatment
97,severe secondary peritonitis,Surgical management
98,hepatic metastases from colorectal cancer,Hepatic arterial infusion of chemotherapy afte...
99,chronic renal failure,Epoetin


### Predict the treatment for the disease name: 'hereditary retinoblastoma'


In [35]:
dise = 'hereditary retinoblastoma'
D_n_T_DF.loc[(D_n_T_DF.Disease == dise), ["Disease", "Treatments"]]

Unnamed: 0,Disease,Treatments
0,hereditary retinoblastoma,radiotherapy


Here we can see the the treatment for the disease name 'hereditary retinoblastoma' is radiotherapy.