In [None]:
import spacy
import textwrap
import warnings
import json
import sklearn_crfsuite

import pandas as pd

from tqdm import tqdm
from sklearn_crfsuite import metrics

# Silence every warning for the rest of the session. NOTE(review): this also hides
# pandas warnings (e.g. SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings("ignore")
# spaCy pipeline shared by the later cells, which read POS tags and lemmas from it.
model = spacy.load("en_core_web_sm")

In [None]:
def read_file(file_name):
    """Read a one-token-per-line file and rebuild its sentences.

    Every non-empty line is treated as one token; an empty line marks the end
    of the current sentence. Tokens of a sentence are joined with single spaces.

    A short summary (line count, token count, sentence count, first sentence)
    is printed as a side effect.

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    list of str
        One space-joined string per sentence, in file order. Consecutive blank
        lines yield empty strings, so files that share the same blank-line
        layout stay aligned with each other.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.readlines()

    sentences = []
    tokens = []          # tokens of the sentence currently being assembled
    word_count = 0

    for line in content:
        word = line.strip('\n')
        if word == "":
            sentences.append(" ".join(tokens).rstrip(" "))
            tokens = []
        else:
            word_count += 1
            tokens.append(word)

    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the last sentence would be silently dropped.
    if tokens:
        sentences.append(" ".join(tokens).rstrip(" "))

    print("Items in File       : ", len(content))
    print("Number of Words     : ", word_count)
    print("Number of Sentences : ", len(sentences))

    prefix = "First Sentence      :  "
    wrapper = textwrap.TextWrapper(initial_indent = prefix, width = 150, subsequent_indent = ' '*len(prefix))
    # Guard the preview so an empty file reports zero sentences instead of raising IndexError.
    if sentences:
        print(wrapper.fill(sentences[0]))
    else:
        print(prefix.rstrip())

    return sentences

In [None]:
# Load the tokenised sentences of each split; read_file prints a summary per file.
print("Training Sentences\n------------------")
train_sentences = read_file(r"train_sent")

# The leading "\n\n" reproduces the blank separator lines between the two reports.
print("\n\nTesting Sentences\n------------------")
test_sentences = read_file(r"test_sent")

Training Sentences
------------------
Items in File       :  48501
Number of Words     :  45902
Number of Sentences :  2599
First Sentence      :  All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery
                       route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )


Testing Sentences
------------------
Items in File       :  19674
Number of Words     :  18618
Number of Sentences :  1056
First Sentence      :  Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks ,
                       the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )


In [None]:
# Load the label files; they use the same one-item-per-line layout as the
# sentence files, so each returned string is the label sequence of one sentence.
print("Training Labels\n--------------")
train_labels = read_file(r"train_label")

# The leading "\n\n" reproduces the blank separator lines between the two reports.
print("\n\nTesting Labels\n--------------")
test_labels = read_file(r"test_label")

Training Labels
--------------
Items in File       :  48501
Number of Words     :  45902
Number of Sentences :  2599
First Sentence      :  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


Testing Labels
--------------
Items in File       :  19674
Number of Words     :  18618
Number of Sentences :  1056
First Sentence      :  O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


In [None]:
def get_pos_tags(reviews, labels, tag = ""):
    """Build token-level columns (sentence id, POS, lemma, text, label).

    Each sentence is split on whitespace and every word is passed through the
    spaCy pipeline `model` on its own; each spaCy token produced for a word
    inherits that word's label.

    Parameters
    ----------
    reviews : iterable of str
        Space-separated sentences.
    labels : iterable of str
        Space-separated label sequences, one per sentence, one label per word.
    tag : str, optional
        Prefix for the generated sentence ids ("<tag>1", "<tag>2", ...).

    Returns
    -------
    tuple of five lists
        (sentence, pos, lemma, text, label), all of equal length, one entry
        per spaCy token.

    Raises
    ------
    ValueError
        If the number of sentences and label sequences differ, or a sentence
        has a different number of words than labels. Previously `zip`
        truncated silently, which shifted labels onto the wrong tokens.
    """
    reviews = list(reviews)
    labels = list(labels)
    if len(reviews) != len(labels):
        raise ValueError(
            "Got %d sentences but %d label sequences" % (len(reviews), len(labels))
        )

    sentence = []
    pos = []
    lemma = []
    text = []
    label = []

    # `total` lets tqdm render a real progress bar; zip() has no length of its own.
    paired = tqdm(zip(reviews, labels), total=len(reviews))
    for i, (review, review_labels) in enumerate(paired, start=1):
        words = review.split()
        word_labels = review_labels.split()
        if len(words) != len(word_labels):
            raise ValueError(
                "Sentence %s%d has %d words but %d labels"
                % (tag, i, len(words), len(word_labels))
            )

        sentence_id = tag + str(i)
        for word, word_label in zip(words, word_labels):
            # NOTE(review): each word is parsed in isolation, so spaCy sees no
            # sentence context when assigning POS tags, and a word that spaCy
            # splits into several tokens yields several rows with one label.
            for tok in model(word):
                sentence.append(sentence_id)
                pos.append(tok.pos_)
                lemma.append(tok.lemma_)
                text.append(tok.text)
                label.append(word_label)

    return sentence, pos, lemma, text, label

# Tag the training split and collect the token-level columns into a frame.
print("Training Sentences\n------------------")
train_sentence, train_pos, train_lemma, train_text, train_label = get_pos_tags(
    train_sentences, train_labels, "train_"
)
train_frequency_df = pd.DataFrame({
    'sentence': train_sentence,
    'text': train_text,
    'lemma': train_lemma,
    'pos': train_pos,
    'label': train_label,
})

# Same for the test split; the leading "\n\n" reproduces the blank separator lines.
print("\n\nTesting Sentences\n------------------")
test_sentence, test_pos, test_lemma, test_text, test_label = get_pos_tags(
    test_sentences, test_labels, "test_"
)
test_frequency_df = pd.DataFrame({
    'sentence': test_sentence,
    'text': test_text,
    'lemma': test_lemma,
    'pos': test_pos,
    'label': test_label,
})

# Stack both splits into one frame; the "train_"/"test_" sentence ids keep them distinguishable.
frequency_df = pd.concat([train_frequency_df.copy(), test_frequency_df.copy()], axis=0)

Training Sentences
------------------


2599it [06:23,  6.78it/s]




Testing Sentences
------------------


1056it [02:29,  7.07it/s]


In [None]:
# 25 most frequent surface forms among tokens tagged NOUN or PROPN (train + test).
frequency_df.loc[frequency_df['pos'].isin(['NOUN', 'PROPN']), 'text'].value_counts().head(25)

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           162
disease         143
cell            140
lung            116
clinical         95
group            94
chemotherapy     88
gene             88
effects          85
results          79
use              78
women            77
cases            71
surgery          71
risk             71
analysis         70
rate             67
associated       67
response         66
survival         65
Name: text, dtype: int64

In [None]:
# 25 most frequent lemmas among tokens tagged NOUN or PROPN (train + test).
frequency_df.loc[frequency_df['pos'].isin(['NOUN', 'PROPN']), 'lemma'].value_counts().head(25)

patient         502
treatment       316
%               247
study           240
cancer          226
cell            203
therapy         182
effect          182
disease         164
group           145
case            131
lung            120
gene            112
rate            103
clinical         95
result           92
trial            91
conclusion       89
woman            89
chemotherapy     88
analysis         86
use              82
protein          82
response         81
year             79
Name: lemma, dtype: int64

In [None]:
def _token_features(prefix, word, postag):
    """Return the feature strings describing one token, each name prefixed with `prefix`."""
    return [
        prefix + 'word.lower=' + word.lower(),
        # Suffix features: the last three / two characters of the token.
        prefix + 'word[-3]=' + word[-3:],
        prefix + 'word[-2]=' + word[-2:],
        prefix + 'word.islower=%s' % word.islower(),
        prefix + 'word.isupper=%s' % word.isupper(),
        prefix + 'word.istitle=%s' % word.istitle(),
        prefix + 'word.isdigit=%s' % word.isdigit(),
        prefix + 'postag=' + postag,
        # The feature name says "pronoun" but the test is NOUN / PROPN;
        # the name is kept unchanged because it is part of the feature vocabulary.
        prefix + 'postag.isnounpronoun=%s' % (postag in ['NOUN','PROPN']),
    ]


def getFeaturesForOneWord(word_details, pos):
    """Build the CRF feature list for the token at position `pos` of a sentence.

    Parameters
    ----------
    word_details : pandas.Series
        One `(text, pos_tag)` pair per token of the sentence, in order.
        Tokens are addressed by position, so any index is accepted and the
        Series is left untouched (the previous version reset its index in place).
    pos : int
        Zero-based position of the token to describe.

    Returns
    -------
    list of str
        A bias feature, the token's own features, then the previous token's
        features (or 'BEG' for the first token) and the next token's features
        (or 'END' for the last token).
    """
    word, postag = word_details.iloc[pos][0], word_details.iloc[pos][1]

    features = ['bias=' + "1.0"]
    features.extend(_token_features('', word, postag))

    if (pos > 0):
        previous = word_details.iloc[pos-1]
        features.extend(_token_features('prev_', previous[0], previous[1]))
    else:
        features.append('BEG')

    if (pos < len(word_details) - 1):
        following = word_details.iloc[pos+1]
        features.extend(_token_features('next_', following[0], following[1]))
    else:
        features.append('END')

    return features

In [None]:
def get_word_details(item):
    """Return the `(text, pos)` pair of one token record (e.g. a dataframe row)."""
    token_text = item["text"]
    token_pos = item["pos"]
    return token_text, token_pos

def getFeaturesForOneSentence(sentence_id):
    """Return one feature list per token of the sentence `sentence_id`.

    The tokens are the rows of the global `frequency_df` whose "sentence"
    column equals `sentence_id`, in frame order.
    """
    in_sentence = frequency_df["sentence"] == sentence_id
    words_for_features = frequency_df[in_sentence].apply(get_word_details, axis=1)

    sentence_features = []
    for pos in range(len(words_for_features)):
        sentence_features.append(getFeaturesForOneWord(words_for_features, pos))
    return sentence_features

In [None]:
# Show the first training sentence followed by the feature list of each of its tokens.
features = getFeaturesForOneSentence("train_1")

prefix = "01 Sentence : "
wrapper = textwrap.TextWrapper(width = 150, initial_indent = prefix, subsequent_indent = ' ' * len(prefix))
print(wrapper.fill(train_sentences[0]))
print('\n')

# One wrapped block per token, numbered from 01.
for i, feature in enumerate(features, start=1):
    prefix = "{:02d}".format(i) + " Word     : "
    wrapper = textwrap.TextWrapper(width = 150, initial_indent = prefix, subsequent_indent = ' ' * len(prefix))
    print(wrapper.fill(str(feature)))

01 Sentence : All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route ,
              indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )


01 Word     : ['bias=1.0', 'word.lower=all', 'word[-3]=', 'word[-2]=A', 'word.islower=False', 'word.isupper=False', 'word.istitle=True',
              'word.isdigit=False', 'postag=DET', 'postag.isnounpronoun=False', 'BEG', 'next_word.lower=live', 'next_word[-3]=l', 'next_word[-2]=li',
              'next_word.islower=True', 'next_word.isupper=False', 'next_word.istitle=False', 'next_word.isdigit=False', 'next_postag=VERB',
              'next_postag.isnounpronoun=False']
02 Word     : ['bias=1.0', 'word.lower=live', 'word[-3]=l', 'word[-2]=li', 'word.islower=True', 'word.isupper=False', 'word.istitle=False',
              'word.isdigit=False', 'postag=VERB', 'postag.isnounpronoun=False', 'prev_word.lower=all', 'prev_word[-3]=',

In [None]:
def getLabelsForOneSentence(sentence_id):
    """Return the "label" column (a pandas Series) for the rows of the global
    `frequency_df` whose "sentence" column equals `sentence_id`."""
    in_sentence = frequency_df["sentence"] == sentence_id
    return frequency_df.loc[in_sentence, "label"]

In [None]:
# Print the label sequence of the first training sentence, wrapped like the feature demo.
labels = getLabelsForOneSentence("train_1")

prefix = "01 Labels  : "
wrapper = textwrap.TextWrapper(
    width = 150,
    initial_indent = prefix,
    subsequent_indent = ' ' * len(prefix),
)
print(wrapper.fill(" ".join(labels)))

01 Labels  : O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O


In [None]:
# Feature sequences per sentence. Sentence ids are 1-based ("train_1", "train_2", ...).
print("Training Sentences\n------------------")
X_train = [
    getFeaturesForOneSentence("train_{}".format(n))
    for n in tqdm(range(1, len(train_sentences) + 1))
]

# The leading "\n\n" reproduces the blank separator lines between the two reports.
print("\n\nTesting Sentences\n------------------")
X_test = [
    getFeaturesForOneSentence("test_{}".format(n))
    for n in tqdm(range(1, len(test_sentences) + 1))
]

Training Sentences
------------------


100%|██████████| 2599/2599 [00:28<00:00, 90.50it/s] 




Testing Sentences
------------------


100%|██████████| 1056/1056 [00:11<00:00, 93.51it/s] 


In [None]:
# Label sequences per sentence, in the same 1-based sentence-id order as the features.
print("Training Labels\n------------------")
Y_train = [
    getLabelsForOneSentence("train_{}".format(n))
    for n in tqdm(range(1, len(train_labels) + 1))
]

# The leading "\n\n" reproduces the blank separator lines between the two reports.
print("\n\nTesting Labels\n------------------")
Y_test = [
    getLabelsForOneSentence("test_{}".format(n))
    for n in tqdm(range(1, len(test_labels) + 1))
]

Training Labels
------------------


100%|██████████| 2599/2599 [00:21<00:00, 118.30it/s]




Testing Labels
------------------


100%|██████████| 1056/1056 [00:09<00:00, 114.25it/s]


In [None]:
# Linear-chain CRF. c1 / c2 are the L1 / L2 regularisation coefficients
# (see the sklearn-crfsuite CRF documentation).
crf = sklearn_crfsuite.CRF(
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
crf.fit(X_train, Y_train)

CRF(all_possible_transitions=True, c1=0.1, c2=0.1, keep_tempfiles=None,
    max_iterations=100)

In [None]:
# Predict one label sequence per test sentence, in the same order as X_test.
Y_pred = crf.predict(X_test)

In [None]:
# Token-level F1 over the flattened label sequences, weighted by label support.
# NOTE(review): the weighted average includes the 'O' label, which the label
# previews suggest is very common - confirm whether an entity-only score is wanted.
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
f1_percent = round(f1_score*100, 2)
print('Predicted F1-Score : {0} % '.format(f1_percent))

Predicted F1-Score : 92.01 % 


In [None]:
def get_labels_as_array(labels):
    """Flatten a sequence of per-sentence label sequences into one flat list."""
    return [single_label for sentence_labels in labels for single_label in sentence_labels]

# Attach the flattened predictions as a new column; pandas raises if the number
# of predicted labels differs from the number of token rows.
test_frequency_df["pred_label"] = get_labels_as_array(Y_pred)
# head() defaults to the first five rows.
test_frequency_df.head()

Unnamed: 0,sentence,text,lemma,pos,label,pred_label
0,test_1,Furthermore,furthermore,ADV,O,O
1,test_1,",",",",PUNCT,O,O
2,test_1,when,when,ADV,O,O
3,test_1,all,all,DET,O,O
4,test_1,deliveries,delivery,NOUN,O,O


In [None]:
# Keep only the tokens the CRF tagged as an entity (anything other than 'O').
new_df = test_frequency_df[test_frequency_df.pred_label != 'O']

# Maps a predicted disease phrase to the treatment phrase(s) predicted in the
# same sentence; several treatments for one disease are joined with '/'.
med_dict = {}

# One group per sentence, visited in order of first appearance (sort=False).
for sentence_id, sentence_rows in new_df.groupby('sentence', sort=False):
    disease_tokens = sentence_rows.loc[sentence_rows.pred_label == 'D', 'text']
    treatment_tokens = sentence_rows.loc[sentence_rows.pred_label == 'T', 'text']

    # Only sentences that contain both a disease and a treatment form a pair.
    if disease_tokens.empty or treatment_tokens.empty:
        continue

    disease_single = " ".join(disease_tokens)
    treatment_single = " ".join(treatment_tokens)

    # Membership is checked against med_dict itself. The earlier check used a
    # list that was never filled, so a disease seen in a second sentence
    # silently overwrote the treatment recorded for the first one.
    if disease_single not in med_dict:
        med_dict[disease_single] = treatment_single
    elif treatment_single not in med_dict[disease_single].split('/'):
        med_dict[disease_single] = med_dict[disease_single] + '/' + treatment_single

print(json.dumps(dict(sorted(med_dict.items())), indent = 4))

{
    "AOM drug - resistant S. pneumoniae": "Amoxicillin remains the antibiotic of choice",
    "Barrett 's esophagus": "Acid suppression therapy",
    "Cranial nerve injuries": "persistent conduction blocks",
    "Eisenmenger 's syndrome": "laparoscopic cholecystectomy",
    "Parkinson 's disease": "Microelectrode - guided posteroventral pallidotomy",
    "Pneumocystis carinii pneumonia": "trimethoprim - sulfamethoxazole",
    "Spontaneous splenic rupture": "granulocyte colony - stimulating factor ( G - CSF )",
    "abdominal pain": "thoracic paravertebral block ( tpvb )",
    "acute carbon monoxide poisoning": "Hyperbaric or normobaric oxygen",
    "acute cerebral ischemia": "Antiplatelet therapy",
    "acute migraine treatment": "Sumatriptan",
    "acute myocardial infarction": "thrombolytic treatment",
    "acute occlusion of the middle cerebral artery": "thrombolytic therapy",
    "advanced non -- small - cell lung cancer": "paclitaxel plus carboplatin ( pc ) vinorelbine plus cisp

In [None]:
# Tag a free-text query with the trained CRF and look up treatments for the
# disease tokens it finds.
input_sent = 'hereditary retinoblastoma'

input_text = []
input_pos = []
input_label = []

# Parse each whitespace-separated word on its own, exactly as get_pos_tags did
# for the training data, so the POS features fed to the CRF are produced the
# same way at training and at prediction time.
for raw_word in input_sent.split():
    for word in model(raw_word):
        input_text.append(word.text)
        input_pos.append(word.pos_)
        # Placeholder only: the feature functions read 'text' and 'pos', never 'label'.
        input_label.append('D')

details_sent = pd.DataFrame({'text':input_text, 'pos':input_pos,'label':input_label})
words_for_features = details_sent.apply(get_word_details, axis=1)

# One feature list per spaCy token (not per whitespace word), so features,
# predicted tags and token texts always line up.
test_sent = [getFeaturesForOneWord(words_for_features, i) for i in range(len(words_for_features))]
predicted_tags = crf.predict([test_sent])[0]

disease_tokens = []
treatments = []   # ordered, de-duplicated treatment strings

for token_text, tag in zip(input_text, predicted_tags):
    if tag != 'D':
        continue
    disease_tokens.append(token_text)

    # Look up the single token and the disease phrase built so far. Both keys
    # coincide for the first disease token, which used to list the same
    # treatment twice; the membership test prevents that.
    for key in (token_text, " ".join(disease_tokens)):
        found = med_dict.get(key)
        if found is not None and found not in treatments:
            treatments.append(found)

disease = " ".join(disease_tokens)
treatment = ", ".join(treatments) if treatments else 'Not Available'

print('Identified Disease   :', disease)
print('Identified Treatment :', treatment)

Identified Disease   : retinoblastoma
Identified Treatment : radiotherapy, radiotherapy
