Libraries and Packages

In [1]:
!pip install pycrf
!pip install sklearn-crfsuite

Collecting pycrf
  Downloading pycrf-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: pycrf
  Building wheel for pycrf (setup.py) ... [?25l[?25hdone
  Created wheel for pycrf: filename=pycrf-0.0.1-py3-none-any.whl size=1897 sha256=565c02f35491e28cfe082abff8d9cebba06358d8a3f9b92b43d1a6d62779b8c1
  Stored in directory: /root/.cache/pip/wheels/0b/68/37/a457e156cfd6174ed28c9c8cb76f18eeb559b760d84c0a22eb
Successfully built pycrf
Installing collected packages: pycrf
Successfully installed pycrf-0.0.1
Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 9.4 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [2]:
# Library Import
import pandas as pd
import re
import spacy
import warnings
warnings.filterwarnings('ignore')

# Import model and metrics
from sklearn_crfsuite import CRF, scorers, metrics

Extraire la phrase des mots : Il aide à extraire le mot basé sur le séparateur pour former la phrase

In [3]:
# Extract sentence from words
def content_extract(file_path='',sep='\t'):
    try:
        with open (file_path,'r',encoding='utf-8') as text:
            if text.mode  == 'r':
                content = text.readlines()
        sentence = []
        final_sentence=''
        for c in content:
            content_word = c.strip('\n')
            if content_word == '':
                #Une fois qu'il est mis en correspondance avec le séparateur, il ajoute la chaîne concaténée extraite précédente en tant que phrase
# phrase_finale = re.sub('(?<=[\(]) | (?=[%\',)])','', phrase_finale)
                sentence.append(final_sentence.strip(' '))

                #Initialize for next sentence
                final_sentence=''
            else:
                # Jusqu'à ce que la boucle identifie le séparateur, elle concatène la chaîne
                final_sentence+=content_word+' '
        print('Total identified value: ',len(sentence),'\n')
        print('Sample display value:\n',sentence[:5])
        return sentence
    except FileNotFoundError:
        print('Check and provide proper file path')

In [4]:
# Une classe pour récupérer les détails des phrases du dataframe
class sentencedetail(object):
    def __init__(self, data):
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, l) for w, p, l in zip(s["word"].values.tolist(), s["pos"].values.tolist(),s["label"].values.tolist())]
        self.grouped = self.data.groupby("sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [5]:
# Feature set
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[0]': word[0],
        'word[-1]': word[-1],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag':postag,
        'postag_isnounpronoun': postag in ['NOUN','PROPN'],
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word[0]': word1[0],
            '-1:word[-1]': word1[-1],
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:postag_isnounpronoun': postag1 in ['NOUN','PROPN']
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:postag_isnounpronoun': postag1 in ['NOUN','PROPN']
        })
    else:
        features['EOS'] = True

    return features

In [6]:
# Définissez une fonction pour extraire les caractéristiques d'une phrase.
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [7]:
# Définissez une fonction pour obtenir les étiquettes d'une phrase.
def sent2labels(sent):
    return [label for token, postag, label in sent]

In [8]:
# Extraction de Train sentence à partir de la data
train_sent = content_extract(file_path='train_sent',sep='\n')

Total identified value:  2599 

Sample display value:
 ['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )', 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )', 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )', "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )", "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )"]


In [9]:
# Train label extraction from dataset
train_label = content_extract(file_path='train_label',sep='\n')

Total identified value:  2599 

Sample display value:
 ['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O']


In [10]:
# Test sentence extraction from dataset
test_sent = content_extract(file_path='test_sent',sep='\n')

Total identified value:  1056 

Sample display value:
 ['Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )', 'As the ambient temperature increases , there is an increase in insensible fluid loss and the potential for dehydration', 'The daily high temperature ranged from 71 to 104 degrees F and AFI values ranged from 1.7 to 24.7 cm during the study period', 'There was a significant correlation between the 2- , 3- , and 4-day mean temperature and AFI , with the 4-day mean being the most significant ( r = 0.31 , p & # 60 ; 0.001 )', 'Fluctuations in ambient temperature are inversely correlated to changes in AFI']


In [11]:
# Test label extraction from dataset
test_label = content_extract(file_path='test_label',sep='\n')

Total identified value:  1056 

Sample display value:
 ['O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O', 'O O O O O O O O O O O']


In [12]:
# Import spacy small library pour trouver des entités liées à la médecine
nlp= spacy.load("en_core_web_sm")

In [13]:
# Dataframe of POS tagging,Lemma word and Label for Train and test sentence
train_df = pd.DataFrame(columns=['sentence','word','lemma','pos','label'])
test_df = pd.DataFrame(columns=['sentence','word','lemma','pos','label'])

In [14]:
#train datframe

i=0 #Sentence count
j=0 #Iteration count

for sent,label in zip(train_sent,train_label):
    i+=1
    for s,l in zip(sent.split(),label.split()):
        doc = nlp(s)
        for tok in doc:
            train_df.loc[j,['sentence','word','lemma','pos','label']] = [i,tok.text,tok.lemma_,tok.pos_,l]
            j+=1

In [15]:
#test datframe

i=0 #Sentence count
j=0 #Iteration count

for sent,label in zip(test_sent,test_label):
    i+=1
    for s,l in zip(sent.split(),label.split()):
        doc = nlp(s)
        for tok in doc:
            test_df.loc[j,['sentence','word','lemma','pos','label']] = [i,tok.text,tok.lemma_,tok.pos_,l]
            j+=1

In [16]:
# Word et la fréquence pour le mot qui contient NOUN or PROPN as POS tagging
freq_df = pd.DataFrame()
freq_df = pd.concat((train_df,test_df),axis=0)

In [17]:
# Indice de réinitialisation
freq_df.reset_index(inplace=True,drop=True)

In [18]:
# Top 25 most frequency values for Train and Test related dataset words
freq_df[(freq_df['pos'] == 'NOUN') | ((freq_df['pos'] == 'PROPN'))]['word'].value_counts()[:25]

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           162
disease         143
cell            140
lung            116
clinical         95
group            94
chemotherapy     88
gene             88
effects          85
results          79
use              78
women            77
risk             71
surgery          71
cases            71
analysis         70
associated       67
rate             67
response         66
survival         65
Name: word, dtype: int64

In [19]:
# Top 25 most frequency values for Train and Test related lemma words
freq_df[(freq_df['pos'] == 'NOUN') | ((freq_df['pos'] == 'PROPN'))]['lemma'].value_counts()[:25]

patient         502
treatment       316
%               247
study           240
cancer          226
cell            203
effect          182
therapy         182
disease         164
group           145
case            131
lung            120
gene            112
rate            103
clinical         95
result           92
trial            91
woman            89
conclusion       89
chemotherapy     88
analysis         86
protein          82
use              82
response         81
year             79
Name: lemma, dtype: int64

In [20]:
train_df.head(5)

Unnamed: 0,sentence,word,lemma,pos,label
0,1,All,all,DET,O
1,1,live,live,VERB,O
2,1,births,birth,NOUN,O
3,1,>,>,X,O
4,1,or,or,CCONJ,O


In [21]:
test_df.head(5)

Unnamed: 0,sentence,word,lemma,pos,label
0,1,Furthermore,furthermore,ADV,O
1,1,",",",",PUNCT,O
2,1,when,when,ADV,O
3,1,all,all,DET,O
4,1,deliveries,delivery,NOUN,O


In [22]:
# Récupérer la vue détaillée de  sentence for train set
train_sent_obj = sentencedetail(train_df)
train_sent_detail = train_sent_obj.sentences

In [23]:
# Afficher une sentence detail view for train set
train_sent_detail[0]

[('All', 'DET', 'O'),
 ('live', 'VERB', 'O'),
 ('births', 'NOUN', 'O'),
 ('>', 'X', 'O'),
 ('or', 'CCONJ', 'O'),
 ('=', 'X', 'O'),
 ('23', 'NUM', 'O'),
 ('weeks', 'NOUN', 'O'),
 ('at', 'ADP', 'O'),
 ('the', 'DET', 'O'),
 ('University', 'NOUN', 'O'),
 ('of', 'ADP', 'O'),
 ('Vermont', 'PROPN', 'O'),
 ('in', 'ADP', 'O'),
 ('1995', 'NUM', 'O'),
 ('(', 'PUNCT', 'O'),
 ('n', 'X', 'O'),
 ('=', 'X', 'O'),
 ('2395', 'NUM', 'O'),
 (')', 'PUNCT', 'O'),
 ('were', 'AUX', 'O'),
 ('retrospectively', 'ADV', 'O'),
 ('analyzed', 'VERB', 'O'),
 ('for', 'ADP', 'O'),
 ('delivery', 'NOUN', 'O'),
 ('route', 'PROPN', 'O'),
 (',', 'PUNCT', 'O'),
 ('indication', 'NOUN', 'O'),
 ('for', 'ADP', 'O'),
 ('cesarean', 'PROPN', 'O'),
 (',', 'PUNCT', 'O'),
 ('gestational', 'ADJ', 'O'),
 ('age', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('parity', 'PROPN', 'O'),
 (',', 'PUNCT', 'O'),
 ('and', 'CCONJ', 'O'),
 ('practice', 'NOUN', 'O'),
 ('group', 'NOUN', 'O'),
 ('(', 'PUNCT', 'O'),
 ('to', 'ADP', 'O'),
 ('reflect', 'VERB', 'O'

In [24]:
# Fetch detail view of sentence for train set
test_sent_obj = sentencedetail(test_df)
test_sent_detail = test_sent_obj.sentences

In [25]:
# Display one sentence detail view for train set
test_sent_detail[0]

[('Furthermore', 'ADV', 'O'),
 (',', 'PUNCT', 'O'),
 ('when', 'ADV', 'O'),
 ('all', 'DET', 'O'),
 ('deliveries', 'NOUN', 'O'),
 ('were', 'AUX', 'O'),
 ('analyzed', 'VERB', 'O'),
 (',', 'PUNCT', 'O'),
 ('regardless', 'ADV', 'O'),
 ('of', 'ADP', 'O'),
 ('risk', 'NOUN', 'O'),
 ('status', 'PROPN', 'O'),
 ('but', 'CCONJ', 'O'),
 ('limited', 'PROPN', 'O'),
 ('to', 'ADP', 'O'),
 ('gestational', 'ADJ', 'O'),
 ('age', 'NOUN', 'O'),
 ('>', 'X', 'O'),
 ('or', 'CCONJ', 'O'),
 ('=', 'X', 'O'),
 ('36', 'NUM', 'O'),
 ('weeks', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('the', 'DET', 'O'),
 ('rates', 'NOUN', 'O'),
 ('did', 'AUX', 'O'),
 ('not', 'PART', 'O'),
 ('change', 'NOUN', 'O'),
 ('(', 'PUNCT', 'O'),
 ('12.6', 'NUM', 'O'),
 ('%', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('280', 'NUM', 'O'),
 ('of', 'ADP', 'O'),
 ('2214', 'NUM', 'O'),
 (';', 'PUNCT', 'O'),
 ('primary', 'ADJ', 'O'),
 ('9.2', 'NUM', 'O'),
 ('%', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('183', 'NUM', 'O'),
 ('of', 'ADP', 'O'),
 ('1994', 'NUM', '

In [26]:
# Prepare X-train and X-test by extracting features from train and test dataset
X_train = [sent2features(s) for s in train_sent_detail]
X_test = [sent2features(s) for s in test_sent_detail]

In [27]:
# Prepare y-train and y-test by extracting labels from train and test dataset
y_train = [sent2labels(l) for l in train_sent_detail]
y_test = [sent2labels(l) for l in test_sent_detail]

In [28]:
# Build the CRF model.
crf = CRF(max_iterations=100, c1=1.0, c2=0.01, all_possible_transitions=False)

In [29]:
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)

In [30]:
y_pred = crf.predict(X_test)
f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted')
print('Predicted F1-score for Medical Entity Dataset is: {0} % '.format(round(f1_score*100,2)))

Predicted F1-score for Medical Entity Dataset is: 91.7 % 


In [31]:
# Taken out predicted label from the model
pred_label=[]
for i in y_pred:
    pred_label.extend(i)

In [32]:
# Loaded into test dataframe
test_df['label_predicted'] = pred_label

In [33]:
# Visualise top 5 data
test_df.head(5)

Unnamed: 0,sentence,word,lemma,pos,label,label_predicted
0,1,Furthermore,furthermore,ADV,O,O
1,1,",",",",PUNCT,O,O
2,1,when,when,ADV,O,O
3,1,all,all,DET,O,O
4,1,deliveries,delivery,NOUN,O,O


In [34]:
# Preparing dictionary by keeping Disease as unique Key element and Treatment as value element
new_df =test_df[(test_df['label_predicted'] != 'O')]
new_df.set_index('sentence',inplace=True)
disease=[]
treatment=[]
sentence=[]
med_dict = {}
for i in new_df.index.unique():
    try:
        val = new_df.loc[i,'label_predicted'].unique()
        if len(val) == 2:
            disease_val = new_df[new_df['label_predicted'] == 'D'].loc[i,'word']
            treatment_val = new_df[new_df['label_predicted'] == 'T'].loc[i,'word']
            disease_single = disease_val if type(disease_val) == str else " ".join(disease_val)
            treatment_single = treatment_val if type(treatment_val) == str else " ".join(treatment_val)
            if disease_single not in disease:
                med_dict[disease_single] = treatment_single
            else:
                print('Entered')
                med_dict[disease_single] = med_dict.get(disease_single)+'/'+treatment_single
    except AttributeError:
        pass

In [35]:
print(med_dict)

{'diabetes cases': 'good glycemic control', 'nonimmune hydrops fetalis': 'Trisomy', 'retinoblastoma': 'radiotherapy', 'epilepsy': 'Methylphenidate', 'myocardial infarction': 'warfarin with 80 mg aspirin , or 1 mg warfarin with 80 mg aspirin', 'unstable angina or non - Q - wave myocardial infarction': 'roxithromycin', 'coronary - artery disease': 'Antichlamydial antibiotics', 'primary pulmonary hypertension ( PPH )': 'fenfluramines', 'essential hypertension': 'moxonidine', 'foot infection': 'G - CSF treatment', 'hemorrhagic stroke': 'double - bolus alteplase infusion of alteplase ( P=0.24', 'cardiac disease': 'fenfluramine - phentermine', 'rheumatoid arthritis': 'arthrodesis', "early Parkinson 's disease": 'Ropinirole monotherapy', 'sore throat': 'Antibiotics prescribed', 'female stress urinary incontinence': 'surgical treatment', 'corpal gastritis': 'gastric acid secretion', 'preeclampsia ( proteinuric hypertension )': 'intrauterine insemination with donor sperm versus intrauterine ins

In [36]:
#Predict treatment withthe help of dictionary
d=[]
disease=''
test_sent=[]
treatment=''

input_sent = 'Rahul suffering from advanced rectal cancer'
m = spacy.load('en_core_web_sm')
doc = m(input_sent)
for i in doc:
    d.append((i.text,i.pos_,'D'))
test_sent.append(sent2features(d))
for i,tag in enumerate(crf.predict(test_sent)[0]):
    if tag == 'D':
        tr = input_sent.split()[i]
        disease += tr
        if tr in med_dict:
            treatment += ''+med_dict.get(tr)
if len(treatment) == 0:
    treatment='None'
print('Identified Disease: ',disease)
print('Identified Treatment: ', treatment)

Identified Disease:  rectalcancer
Identified Treatment:  Matrix metalloproteinase inhibitors


In [39]:
from sklearn.metrics import  classification_report
from sklearn.preprocessing import MultiLabelBinarizer
y_test=MultiLabelBinarizer().fit_transform(y_test)
y_pred=MultiLabelBinarizer().fit_transform(y_pred)
cr=classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.83      0.98      0.90       759
           1       1.00      1.00      1.00      1056

   micro avg       0.92      0.99      0.95      1815
   macro avg       0.91      0.99      0.95      1815
weighted avg       0.93      0.99      0.96      1815
 samples avg       0.93      0.99      0.95      1815

