In [1]:
# !pip install pycrf
# !pip install sklearn-crfsuite

import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from collections import Counter

model = spacy.load("en_core_web_sm")

In [2]:
# Load the raw *test* sentences and their corresponding labels as single strings
# (note: these are the test files, not the training files) for a quick look at the format
with open('test_sent', 'r') as file:
    sent_doc = file.read()
with open('test_label', 'r') as file:
    label_doc = file.read()

In [3]:
sent_doc

"Furthermore\n,\nwhen\nall\ndeliveries\nwere\nanalyzed\n,\nregardless\nof\nrisk\nstatus\nbut\nlimited\nto\ngestational\nage\n>\nor\n=\n36\nweeks\n,\nthe\nrates\ndid\nnot\nchange\n(\n12.6\n%\n,\n280\nof\n2214\n;\nprimary\n9.2\n%\n,\n183\nof\n1994\n)\n\nAs\nthe\nambient\ntemperature\nincreases\n,\nthere\nis\nan\nincrease\nin\ninsensible\nfluid\nloss\nand\nthe\npotential\nfor\ndehydration\n\nThe\ndaily\nhigh\ntemperature\nranged\nfrom\n71\nto\n104\ndegrees\nF\nand\nAFI\nvalues\nranged\nfrom\n1.7\nto\n24.7\ncm\nduring\nthe\nstudy\nperiod\n\nThere\nwas\na\nsignificant\ncorrelation\nbetween\nthe\n2-\n,\n3-\n,\nand\n4-day\nmean\ntemperature\nand\nAFI\n,\nwith\nthe\n4-day\nmean\nbeing\nthe\nmost\nsignificant\n(\nr\n=\n0.31\n,\np\n&\n#\n60\n;\n0.001\n)\n\nFluctuations\nin\nambient\ntemperature\nare\ninversely\ncorrelated\nto\nchanges\nin\nAFI\n\nThis\nstudy\ntested\nthe\nhypothesis\nthat\nto\nreduce\nthe\nrate\nof\nmacrosomic\ninfants\nin\ngestational\ndiabetes\ncases\n,\ngood\nglycemic\ncontro

In [4]:
len(sent_doc)

121020

In [5]:
len(label_doc)

38292

In [6]:
# Function to convert the lines into sentences
#
# Read each line from the input file and append it to form a sentence
# The output will return a python list of sentences
#
def make_sentences_from_words(filename):
    """Rebuild sentences from a file that stores one token per line.

    Every line is stripped of leading/trailing whitespace. A line that is
    empty after stripping marks a sentence boundary: the tokens collected
    since the previous boundary are joined with single spaces and stored as
    one element of the result (an empty string if no tokens were collected).

    Args:
        filename: Path of the token-per-line text file to read.

    Returns:
        A list of sentence strings in file order.
    """
    # All completed sentences, in the order they appear in the file
    sentence_list = []

    # Tokens of the sentence currently being assembled
    current_words = []

    # The context manager guarantees the file is closed even if reading fails
    with open(filename, 'r') as file_pointer:
        for line in file_pointer:
            # Remove leading and trailing whitespace (including the newline)
            word = line.strip()
            if word == "":
                # Blank line: the current sentence is complete
                sentence_list.append(" ".join(current_words))
                current_words = []
            else:
                current_words.append(word)

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be silently dropped.
    if current_words:
        sentence_list.append(" ".join(current_words))

    return sentence_list

In [7]:
# Let's test the function
print(make_sentences_from_words("test_sent"))

['Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )', 'As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration', 'The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period', 'There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )', 'Fluctuations in ambient temperature are inversely correlated to changes in AFI', 'This study tested the hypothesis that to reduce the rate of macrosomic infants in gestational diabetes cases , good glycemic control should be initiated before 34 completed gestational weeks', "In the `` early '' and `` late '' groups , mean gestational age at the beginning of tre

In [8]:
# Convert each token-per-line file into a list of sentence strings.
# The files are read in this order: train sentences, train labels,
# test sentences, test labels.
train_sentences, train_labels, test_sentences, test_labels = (
    make_sentences_from_words(split_file)
    for split_file in ("train_sent", "train_label", "test_sent", "test_label")
)

In [9]:
print(train_sentences[1])
print(train_labels[1])

The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
O O O O O O O O O O O O O O O O O O O O O O O O O


In [10]:
print(train_sentences[1])
print(train_labels[1])

The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
O O O O O O O O O O O O O O O O O O O O O O O O O


In [11]:
# Number of sentences in the processed train and test datasets
train_sentence_count = len(train_sentences)
test_sentence_count = len(test_sentences)
print("The number of training sentences are ", train_sentence_count)
print("The number of test sentences are ", test_sentence_count)
# Typo in the printed message fixed ("teo tal" -> "total")
print("The total number of sentences are ", train_sentence_count + test_sentence_count)
# Size of the test split relative to the train split, as a ratio and as a percentage
print("Ratio of train:test sentences is 1:", test_sentence_count/train_sentence_count)
print("The percentage of test sentences over train is ", round(test_sentence_count*100/train_sentence_count, 2), "%")

The number of training sentences are  2599
The number of test sentences are  1056
The teo tal number of sentences are  3655
Ratio of train:test sentences is 1: 0.40631011927664484
The percentage of test sentences over train is  40.63 %


In [12]:
# Number of label sequences in the processed train and test datasets
train_label_count = len(train_labels)
test_label_count = len(test_labels)
print("The number of training labels are ", train_label_count)
print("The number of test labels are ", test_label_count)
# Typo in the printed message fixed ("teo tal" -> "total")
print("The total number of labels are ", train_label_count + test_label_count)
# Size of the test split relative to the train split, as a ratio and as a percentage
print("Ratio of train:test labels is 1:", test_label_count/train_label_count)
print("The percentage of test labels over train is ", round(test_label_count*100/train_label_count, 2), "%")

The number of training labels are  2599
The number of test labels are  1056
The teo tal number of labels are  3655
Ratio of train:test labels is 1: 0.40631011927664484
The percentage of test labels over train is  40.63 %


In [13]:
# Count how often each token text is tagged NOUN or PROPN by the spaCy model
# across the training sentences; tokens with any other POS tag are ignored.

pos_noun_list = {}   # token text -> number of times it was tagged NOUN
pos_propn_list = {}  # token text -> number of times it was tagged PROPN

# Route each POS tag of interest to the dictionary that tallies it
tally_for_pos = {"NOUN": pos_noun_list, "PROPN": pos_propn_list}

# Run every training sentence through the spaCy pipeline and inspect its tokens
for sentence in train_sentences:
    for token in model(sentence):
        tally = tally_for_pos.get(token.pos_)
        if tally is None:
            # Not a NOUN / PROPN token
            continue
        # Add 1 to the running count of this token's text
        tally[token.text] = tally.get(token.text, 0) + 1

In [14]:
pos_noun_list

{'births': 1,
 'weeks': 27,
 'delivery': 18,
 'route': 2,
 'indication': 3,
 'age': 34,
 'parity': 1,
 'practice': 22,
 'group': 63,
 'risk': 49,
 'status': 14,
 'rate': 46,
 '%': 161,
 'presentation': 6,
 'medicine': 29,
 'patients': 354,
 'Arrest': 1,
 'dilation': 1,
 'subgroups': 1,
 'rates': 22,
 'care': 47,
 'hospitals': 8,
 'community': 11,
 'groups': 23,
 'trimester': 4,
 'fluid': 5,
 'index': 11,
 'temperature': 5,
 'increases': 6,
 'decrease': 3,
 'period': 22,
 'heat': 2,
 'women': 48,
 'singleton': 1,
 'pregnancies': 7,
 'gestation': 4,
 'testing': 15,
 'determinations': 2,
 'ambient': 1,
 'area': 8,
 'day': 27,
 'mean': 1,
 'test': 15,
 'date': 2,
 'rank': 1,
 'Correlation': 3,
 'relationship': 9,
 'account': 4,
 'measure': 3,
 'well': 1,
 'being': 2,
 'study': 103,
 'population': 23,
 'diabetes': 23,
 'screening': 10,
 'clinic': 4,
 'year': 27,
 'treatment': 195,
 'week': 6,
 'protocol': 4,
 'control': 29,
 'characteristics': 14,
 'differences': 13,
 'mode': 3,
 'scores': 

In [15]:
pos_propn_list

{'University': 6,
 'Vermont': 1,
 'AFI': 5,
 'June': 2,
 'August': 5,
 'Spearman': 1,
 'Hadassah': 1,
 'Medical': 5,
 'Center': 3,
 'Apgar': 1,
 'Edward': 2,
 'Patau': 2,
 'atresia': 1,
 'New': 6,
 'York': 4,
 'Hospital': 4,
 'Cornell': 1,
 'Student': 1,
 'Chi': 2,
 '-': 8,
 'square': 1,
 'preeclampsia': 12,
 'eclampsia': 1,
 'NICUs': 1,
 'Inter': 1,
 'Omphalocele': 1,
 'Exstrophy': 1,
 'Imperforate': 1,
 'OEIS': 4,
 'omphalocele': 1,
 'hypoplasia': 2,
 'Nonimmune': 1,
 'hematoma': 1,
 'Chronic': 2,
 'Progressive': 1,
 'External': 1,
 'Ophthalmoplegia': 1,
 'Kearns': 1,
 'Sayre': 1,
 'Alzheimer': 7,
 'presenilin': 2,
 'N=280': 1,
 'National': 8,
 'Insititute': 1,
 'Neurological': 1,
 'Disorders': 2,
 'Stroke': 2,
 'Disease': 5,
 'Related': 1,
 'Association': 6,
 'DNA': 6,
 'MAIN': 6,
 'n=54': 2,
 '.03': 1,
 'D12S1042': 1,
 'MLS': 1,
 'BRCA1': 6,
 'MAJOR': 1,
 'MEASURES': 6,
 'exon': 2,
 'HPC1': 4,
 'Cancer': 6,
 'Data': 1,
 'Base': 1,
 'serum': 6,
 'P=.01': 1,
 'MS': 5,
 'HLA': 4,
 'MB

In [16]:
# Wrap the NOUN / PROPN frequency dictionaries in Counter objects
# so the most frequent tokens can be queried with most_common()
noun_counter = Counter(pos_noun_list)
propn_counter = Counter(pos_propn_list)

In [17]:
# Display the top 25 common nouns
noun_counter.most_common(25)

[('patients', 354),
 ('treatment', 195),
 ('%', 161),
 ('cancer', 135),
 ('therapy', 120),
 ('disease', 105),
 ('study', 103),
 ('cell', 99),
 ('lung', 86),
 ('chemotherapy', 65),
 ('group', 63),
 ('effects', 61),
 ('gene', 54),
 ('results', 54),
 ('use', 53),
 ('surgery', 51),
 ('survival', 50),
 ('risk', 49),
 ('women', 48),
 ('care', 47),
 ('analysis', 47),
 ('children', 47),
 ('activity', 47),
 ('rate', 46),
 ('cases', 46)]

In [18]:
# Display the top 25 most common proper nouns (tokens tagged PROPN)
propn_counter.most_common(25)

[('TO_SEE', 54),
 ('HIV', 23),
 ('METHODS', 19),
 ('A', 16),
 ('B', 16),
 ('CO2', 16),
 ('C', 15),
 ('II', 14),
 ('nsclc', 14),
 ('preeclampsia', 12),
 ('Study', 12),
 ('Group', 12),
 ('G', 11),
 ('CONCLUSION', 11),
 ('Use', 11),
 ('mg', 10),
 ('L.', 10),
 ('S', 10),
 ('fatty', 10),
 ('American', 10),
 ('AIDS', 10),
 ('International', 9),
 ('ng', 9),
 ('PAI-1', 9),
 ('mL', 9)]

In [19]:
# Let's define the features to get the feature value of one word.
# We will use custom NER processing using custom feature functions

def getFeaturesOfOneWord(sentence, pos):
  """Build the list of string features for the word at index ``pos``.

  Args:
    sentence: Sequence of word strings making up one sentence.
    pos: Index of the word whose features are wanted.

  Returns:
    A list of ``name=value`` strings describing the word itself, followed by
    either the features of the previous word or the marker ``'BEG'`` when
    there is no previous word, and finally the marker ``'END'`` when the word
    is the last one of the sentence.
  """
  token = sentence[pos]

  # Features derived from the word itself
  features = [
    'word.lower={}'.format(token.lower()),                    # serves as word id
    'word[-3:]={}'.format(token[-3:]),                        # last three characters
    'word[-2:]={}'.format(token[-2:]),                        # last two characters
    'word.isupper={}'.format(token.isupper()),                # all uppercase?
    'word.isdigit={}'.format(token.isdigit()),                # a number?
    'word.startsWithCapital={}'.format(token[0].isupper()),   # starts with a capital?
  ]

  if pos <= 0:
    # No previous word: flag the beginning of the sentence
    features.append('BEG')
  else:
    # Context features taken from the previous word
    previous = sentence[pos - 1]
    features += [
      'prev_word.lower={}'.format(previous.lower()),
      'prev_word.isupper={}'.format(previous.isupper()),
      'prev_word.isdigit={}'.format(previous.isdigit()),
      'prev_word.startsWithCapital={}'.format(previous[0].isupper()),
    ]

  # Flag the last word of the sentence
  if pos + 1 == len(sentence):
    features.append('END')

  return features


In [20]:
# Write a code to get features of a sentence.

def getFeaturesOfOneSentence(sentence):
  """Return one feature list per whitespace-separated word of ``sentence``.

  Args:
    sentence: A sentence given as a single string.

  Returns:
    A list with, for every word in order, the feature list produced by
    ``getFeaturesOfOneWord``.
  """
  tokens = sentence.split()

  sentence_features = []
  for index in range(len(tokens)):
    sentence_features.append(getFeaturesOfOneWord(tokens, index))
  return sentence_features

In [21]:
# Test the sentence function to view the output, i.e. features of its words
print(getFeaturesOfOneSentence("The rain in Spain falls mainly on the plains"))

[['word.lower=the', 'word[-3:]=The', 'word[-2:]=he', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=True', 'BEG'], ['word.lower=rain', 'word[-3:]=ain', 'word[-2:]=in', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=False', 'prev_word.lower=the', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'prev_word.startsWithCapital=True'], ['word.lower=in', 'word[-3:]=in', 'word[-2:]=in', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=False', 'prev_word.lower=rain', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'prev_word.startsWithCapital=False'], ['word.lower=spain', 'word[-3:]=ain', 'word[-2:]=in', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=True', 'prev_word.lower=in', 'prev_word.isupper=False', 'prev_word.isdigit=False', 'prev_word.startsWithCapital=False'], ['word.lower=falls', 'word[-3:]=lls', 'word[-2:]=ls', 'word.isupper=False', 'word.isdigit=False', 'word.startsWithCapital=False', 'prev_

In [22]:
# Write a code to get the labels for a sentence.
def getLabelsInListOfOneSentence(labels):
  """Split a whitespace-separated label string into a list of labels.

  Args:
    labels: The labels of one sentence as a single string.

  Returns:
    A list containing one label string per whitespace-separated item.
  """
  label_list = labels.split()
  return label_list

In [23]:
# Feature sequences (one list of per-word feature lists per sentence)
X_train = list(map(getFeaturesOfOneSentence, train_sentences))
X_test = list(map(getFeaturesOfOneSentence, test_sentences))

In [24]:
# Label sequences (one list of labels per sentence)
Y_train = list(map(getLabelsInListOfOneSentence, train_labels))
Y_test = list(map(getLabelsInListOfOneSentence, test_labels))

In [25]:
# Build the CRF model.
# Only max_iterations is set explicitly (100); every other hyper-parameter is
# left at the sklearn_crfsuite default. The model is fitted on the per-sentence
# feature sequences (X_train) and their label sequences (Y_train).

crf = sklearn_crfsuite.CRF(max_iterations=100)
crf.fit(X_train, Y_train)


CRF(max_iterations=100)

In [26]:
# Get the predicted values from the model by passing in the test values
Y_pred = crf.predict(X_test)


In [27]:
# Calculate the f1 score using the test data
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

0.9042560946986944

In [28]:
# We will check the predictions of the model looking at some of the test sentences
# (the index is called sample_index rather than `id` so the built-in id() is not shadowed)
sample_index = 1
print("Sentence:", test_sentences[sample_index])
print("Original Labels:", Y_test[sample_index])
print("Predicted Labels:", Y_pred[sample_index])

Sentence: As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration
Original Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [29]:
# Inspect a test sentence that contains disease ('D') labels
# (the index is called sample_index rather than `id` so the built-in id() is not shadowed)
sample_index = 699
print("Sentence:", test_sentences[sample_index])
print("Original Labels:", Y_test[sample_index])
print("Predicted Labels:", Y_pred[sample_index])

Sentence: Serologic evidence for mother-to-child transmission of Kaposi sarcoma-associated herpesvirus infection
Original Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'D', 'D', 'D', 'D']
Predicted Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'D', 'D', 'D', 'D']


In [30]:
# Dictionary mapping each predicted disease string to the list of treatment
# strings predicted in the same sentence(s)
diseases_and_treatments = {}

# Walk through the test sentences together with their predicted label sequences
for sentence, predicted_labels in zip(test_sentences, Y_pred):
  # Split the sentence once, instead of once per label inside the inner loop
  words = sentence.split()

  #
  # The understanding here is that the disease and the treatments are in the same sentence.
  # All words labelled 'D' in a sentence are merged into one disease string and
  # all words labelled 'T' into one treatment string; 'O' words are ignored.
  #
  disease = " ".join(word for word, label in zip(words, predicted_labels) if label == 'D')
  treatment = " ".join(word for word, label in zip(words, predicted_labels) if label == 'T')

  # Record the pair only when the sentence has both a disease and a treatment.
  # setdefault creates the list for a new disease and returns the existing
  # list for a disease that has been seen before.
  if disease != "" and treatment != "":
    diseases_and_treatments.setdefault(disease, []).append(treatment)

In [31]:
search_item = 'hereditary retinoblastoma'
# .get avoids a KeyError when no treatment was extracted for the searched disease
treatments = diseases_and_treatments.get(search_item, [])
if treatments:
    # Separate multiple treatments with ", " so they are not fused into one word
    print("Treatment(s) for '{0}' is '{1}'".format(search_item, ", ".join(treatments)), end = "")
else:
    print("No treatment found for '{0}'".format(search_item), end = "")

Treatment(s) for 'hereditary retinoblastoma' is 'radiotherapy'