In [1]:
import pandas as pd
import glob
import errno
import nltk
from itertools import chain
from itertools import groupby
from operator import itemgetter
import re

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
import numpy as np
import random
random.seed(42)
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Data Creation

In [2]:
path = 'Combined/*.csv'

files = glob.glob(path)
train=[]
names=[]
for name in files:
    try:
        df=pd.read_csv(name,index_col=0)
        df['w']=df['w'].replace({'\n':'#n'})
        names.extend(df[df['label']=='TEST_NAME'].w.values)
        df['pos']=df['w'].apply(lambda x:nltk.pos_tag([str(x)])[0][1])
        tuple1 = [tuple(x) for x in df.values]
        train.append(tuple1)
        
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise


In [3]:
import json

with open('crf_module_vocab.json') as fp:
    crf_module_vocab=json.load( fp)

FileNotFoundError: [Errno 2] No such file or directory: 'crf_module_vocab.json'

In [104]:
def is_name(word):
    if any([ word.upper().strip().endswith(i) for i in crf_module_vocab['suffix']]):
           return True
    elif any([ word.upper().strip().startswith(i) for i in crf_module_vocab['starts']]):
           return True
    elif any([i in word.upper().strip() for i in crf_module_vocab['sub_names']]):
           return True
    
    else:
           return False
    

In [33]:
def is_unit(word):
#     if word in units_list:
#         return True
    if word.upper().strip().endswith('/L'):
        return True
    elif re.match(r'(^(10)\s?\^\s?[1-9]\s?(/[Uu]?[lL])?$)|(^(10)?\s?~?\s?\d/[Uu]?[Ll]$)',word):
        return True  
    else:
        return False

In [105]:
range_list=['-','>','<','-—','—-','=','–','Up to 15', 'Up']
def is_range(word):
    if word in range_list:
        return True
    elif re.match(r'^\d*\.?\d*\s?[-|—|>|<|=|-—|–|-]\s?\d+\.?\d*$',word):
        return True
    else:
        return False

In [106]:

def is_range_cat(word):
    if word in range_category:
        return True
    
    else:
        return False

In [494]:
# def is_unit(word):
#     return any([True if word in unit  else False for unit in units_list ])

In [107]:
def word2features(sent, i):
    word = str(sent[i][0])
    #print(type(word))
    postag = sent[i][2]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
         'postag[:2]': postag[:2],
         'word[+3:]': word[+3:],
             'word[+2:]': word[+2:],
      #   'is_unit()': is_unit(word),
         'is_range_cat()': is_range_cat(word),
        'is_name()': is_name(word),
        #'length':len(word)
        
    }
    if i > 0:
        word1 = str(sent[i-1][0])
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
           '-1:is_name()': is_name(word1),
           # '-1:is_range_cat()': is_range_cat(word1),
          #  '-1:length':len(word1)
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = str(sent[i+1][0])
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
             '+1:word.isdigit()': word1.isdigit(),
               '+1:is_name()': is_name(word1),
          #  '+1:is_range_cat()': is_range_cat(word1),
          #  '+1:length':len(word1)
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label, postag  in sent]

# def sent2tokens(sent):
#     return [token for token, label, postag in sent]

In [108]:
X = [sent2features(s) for s in train]
y = [sent2labels(s) for s in train]


In [109]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
   all_possible_states=True,
   all_possible_transitions=True,
    max_iterations=1000,
    
   
)

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 19.8 µs


In [37]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [110]:
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=10)

In [111]:
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

                     precision    recall  f1-score   support

            NEWLINE       1.00      1.00      1.00      3223
                  O       0.93      0.95      0.94     18155
          TEST_NAME       0.94      0.89      0.91      6963
         TEST_RANGE       0.96      0.94      0.95      8416
TEST_RANGE_CATEGORY       0.87      0.91      0.89       484
          TEST_UNIT       0.93      0.91      0.92      3255
         TEST_VALUE       0.90      0.91      0.90      4270

           accuracy                           0.94     44766
          macro avg       0.93      0.93      0.93     44766
       weighted avg       0.94      0.94      0.94     44766



In [96]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33,shuffle=False, random_state=42)

In [103]:
crf.fit(X_train, y_train)


CRF(algorithm='ap', all_possible_states=True, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=1000,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [20]:
crf

CRF(algorithm='ap', all_possible_states=True, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=1000,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [104]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9224468446474375

In [112]:

crf.fit(X, y)

# crf1.fit(X, y)

# crf2.fit(X, y)

# crf3.fit(X, y)


CRF(algorithm='ap', all_possible_states=True, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=1000,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [33]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9956021800071991

In [76]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf2.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9762245901662712

In [77]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf3.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9826138750195242

In [120]:
from collections import Counter

In [121]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(50))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])



Top positive:
375.728427 NEWLINE  word.lower():#n
375.728427 NEWLINE  word[-3:]:#n
375.728427 NEWLINE  word[-2:]:#n
300.489408 TEST_UNIT word[-2:]:dL
297.469833 TEST_VALUE +1:word.lower():1hpf
252.056174 O        bias
233.256501 TEST_UNIT -1:word.lower():/
223.111533 TEST_RANGE +1:word.lower():–
216.054682 TEST_UNIT word[-2:]:/L
214.271245 O        BOS
213.555587 TEST_NAME is_name()
201.607587 TEST_VALUE +1:word.lower():negative
198.444418 O        -1:word.lower():*
193.976410 TEST_RANGE +1:word.lower():-
188.120328 TEST_VALUE +1:word.lower():420
185.171425 TEST_VALUE -1:word.lower():male
185.072107 TEST_NAME +1:word.lower():pct
176.448998 TEST_NAME +1:word.lower():low
172.746509 TEST_RANGE -1:word.lower():>
172.434603 TEST_NAME -1:word.lower():(calc)
169.919898 TEST_NAME -1:word.lower():.
169.633028 TEST_UNIT word[-2:]:uL
168.376331 TEST_VALUE -1:word.lower():erythrocytes
167.123851 TEST_NAME -1:word.lower():trisomy
166.136445 O        +1:word.lower():000-
163.186758 TEST_RANGE +1:wor

In [113]:
from datetime import datetime
now=datetime.now()

In [114]:
now=now.strftime("%Y-%m-%d")

In [115]:
import pickle
filename = now+'_crf_model.sav'
pickle.dump(crf, open(filename, 'wb'))

In [143]:
import pickle
filename = '2020-05-04_crf_model.sav'
crf=pickle.load(open(filename, 'rb'))

In [144]:
def test_word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
          'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
         'postag[:2]': postag[:2],
         'word[+3:]': word[+3:],
             'word[+2:]': word[+2:],
     #    'is_unit()': is_unit(word),
          'is_range_cat()': is_range_cat(word),
         'is_name()': is_name(word)
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
        '-1:is_name()': is_name(word1)
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
           '+1:word.isdigit()': word1.isdigit(),
            '+1:is_name()': is_name(word1)
        })
    else:
        features['EOS'] = True

    return features


def test_sent2features(sent):
    return [test_word2features(sent, i) for i in range(len(sent))]


# def sent2tokens(sent):
#     return [token for token, label, postag in sent]

# Mapping to LOINCID

In [117]:
loinc =pd.read_csv('LoincTableCore.csv')

In [118]:
loinc=loinc[['LOINC_NUM','COMPONENT']]

In [119]:
loinc=dict(zip(loinc.COMPONENT, loinc.LOINC_NUM))

In [120]:
def post_process(results):
    for record in results:
        if 'TEST_NAME' in record:

            name=re.sub(r'((10)\s?\^\s?[1-9]\s?(/[Uu]?[lL])?)|((10)?\s?~?\s?\d/[Uu]?[Ll]$)|(\d*/[a-zA-Z]?[Ll])|(\d*\.?\d*\s?[-|—|>|<|=|-—|–|-]\s?\d+\.?\d*)', '', record['TEST_NAME'])
            name=name.strip(':')
            if (not re.match('^\d+\.*\d*$',name)):
                record['TEST_NAME']=name
            else: record['TEST_NAME']= ''
            if name in loinc.keys():
                record['LOINC_ID']=loinc[name]

        if 'TEST_RANGE' in record:
            ranges=re.findall(r'\d*\.?\d*\s?/?\d*\s?[-|—|=|-—|–|-]\s?\d*\.?\d+/?\d*|[Uu][Pp]\s?[Tt][Oo0]\s?\d+|greater\s?than\s?\d*\.?\d+/?\d*|less\s?than\s?\d*\.?\d+/?\d*|[<|>]\s?\d*\.?\d+/?\d*', record['TEST_RANGE'],re.IGNORECASE)
          
            if (ranges):
                record['TEST_RANGE']=ranges[0]
            else:
                record['TEST_RANGE']=''
        if 'TEST_VALUE' in record:

            values=re.findall(r'\w*\.?\w+\s?[a-zA-Z]*',record['TEST_VALUE'])
            if (values):

                record['TEST_VALUE']=values[0]
    return results
        
    

In [135]:
tests=['TEST_UNIT:', 'TEST_VALUE:', 'TEST_RANGE:','TEST_RANGE_CATEGORY:']
from itertools import groupby
from operator import itemgetter
def extract_entities(doc):
        import re
        chars = crf_module_vocab['chars']
        for i in chars:
            doc = re.sub(i, str(" " + i + " "), doc)
        doc=re.sub('\\n', str(" #n "), doc)
        doc = re.sub("\(", " (", doc)
        doc = re.sub("\)", ") ", doc)
        doc = re.sub("\\xa0", " ", doc)
        
        v_x = [i for i in doc.split(' ') if (i)]
        pos = [nltk.pos_tag([x])[0][1] for x in v_x]

        test_x = list(zip(v_x, pos))
        
        X = [test_sent2features(s) for s in [test_x]]
        
        y = crf.predict(X)[0]
       
        
        res = ' '
        for k, g in groupby(zip(y, v_x), itemgetter(0)):
            res = res + k + ': ' + ' '.join(i for i in list(list(zip(*g))[1])) + ' '
        
        
        
        lines = ["NEWLINE:" + i for i in res.split("NEWLINE:")]
        results=[]
        print(lines)
        for newline in lines:
            if 'TEST_NAME' in newline:
                result = ["TEST_NAME:" + i for i in newline.split("TEST_NAME:")][1:]

                print(result)
                for i in result:
                    lab={}
                    if any(c in i for c in tests):
                        lab['TEST_NAME']=i.split('TEST_NAME: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_UNIT' in i:
                            lab['TEST_UNIT']=i.split('TEST_UNIT: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_RANGE:' in i:
                            pattern=i.split('TEST_RANGE: ')[1].split(' TEST_')[0].split(' O: ')
                            lab['TEST_RANGE']=pattern[0]

                            if (re.match('^\d*\.?\d+$',lab['TEST_RANGE'])):
                                if len(pattern)>1:
                                    O=pattern[1]
                                    partial=re.findall('^[-|=|~|—|—|-|–|-]\s?\d+',O)
                                    if len(partial)>0:
                                        lab['TEST_RANGE']=lab['TEST_RANGE']+partial[0]



                            elif (re.match('^\d*\.?\d+\s?[-|=|—|—|-|–|-]$',lab['TEST_RANGE'])):
                                if len(i.split('TEST_UNIT:'))>2:
                                    unit_pattern=i.split('TEST_RANGE: ')[1].split(' TEST_UNIT: ')
                                    if len(unit_pattern)>1:
                                        partial=re.findall('^\s?\d*\.?\d+',unit_pattern[1])
                                        if len(partial)>0:
                                            lab['TEST_RANGE']=lab['TEST_RANGE']+partial[0]
                                

                        if 'TEST_VALUE' in i:
                            lab['TEST_VALUE']=i.split('TEST_VALUE: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_RANGE_CATEGORY' in i:
                            lab['TEST_RANGE_CATEGORY']=i.split('TEST_RANGE_CATEGORY: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        results.append(lab)
        #print(results)
        processed_results=post_process(results)
        return(processed_results)
    

In [121]:
tests=['TEST_UNIT:', 'TEST_VALUE:', 'TEST_RANGE:','TEST_RANGE_CATEGORY:']
from itertools import groupby
from operator import itemgetter
def extract_entities(doc):
        import re
        chars = crf_module_vocab['chars']
        for i in chars:
            doc = re.sub(i, str(" " + i + " "), doc)
        doc=re.sub('\\n', str(" #n "), doc)
        doc = re.sub("\(", " (", doc)
        doc = re.sub("\)", ") ", doc)
        doc = re.sub("\\xa0", " ", doc)
        
        v_x = [i for i in doc.split(' ') if (i)]
        pos = [nltk.pos_tag([x])[0][1] for x in v_x]

        test_x = list(zip(v_x, pos))
        
        X = [test_sent2features(s) for s in [test_x]]
        
        y = crf.predict(X)[0]
        for i in range(1,len(y)):
            if y[i]=='NEWLINE':
                if y[i-1]=='TEST_NAME' and y[i+1]=='TEST_NAME':
                    y[i]=='SPLITLINE'
        
        res = ' '
        for k, g in groupby(zip(y, v_x), itemgetter(0)):
            res = res + k + ': ' + ' '.join(i for i in list(list(zip(*g))[1])) + ' '
        
        res=re.sub("NEWLINE: #n",'',res)
        
        print(res)
        
        lines = ["SPLITLINE:" + i for i in res.split("SPLITLINE:")]
        results=[]
        
        for newline in lines:
            if 'TEST_NAME' in newline:
                result = ["TEST_NAME:" + i for i in newline.split("TEST_NAME:")][1:]

                
                for i in result:
                    lab={}
                    if any(c in i for c in tests):
                        lab['TEST_NAME']=i.split('TEST_NAME: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_UNIT' in i:
                            lab['TEST_UNIT']=i.split('TEST_UNIT: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_RANGE:' in i:
                            pattern=i.split('TEST_RANGE: ')[1].split(' TEST_')[0].split(' O: ')
                            lab['TEST_RANGE']=pattern[0]

                            if (re.match('^\d*\.?\d+$',lab['TEST_RANGE'])):
                                if len(pattern)>1:
                                    O=pattern[1]
                                    partial=re.findall('^[-|=|~|—|—|-|–|-]\s?\d+',O)
                                    if len(partial)>0:
                                        lab['TEST_RANGE']=lab['TEST_RANGE']+partial[0]



                            elif (re.match('^\d*\.?\d+\s?[-|=|—|—|-|–|-]$',lab['TEST_RANGE'])):
                                if len(i.split('TEST_UNIT:'))>2:
                                    unit_pattern=i.split('TEST_RANGE: ')[1].split(' TEST_UNIT: ')
                                    if len(unit_pattern)>1:
                                        partial=re.findall('^\s?\d*\.?\d+',unit_pattern[1])
                                        if len(partial)>0:
                                            lab['TEST_RANGE']=lab['TEST_RANGE']+partial[0]
                                

                        if 'TEST_VALUE' in i:
                            lab['TEST_VALUE']=i.split('TEST_VALUE: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        if 'TEST_RANGE_CATEGORY' in i:
                            lab['TEST_RANGE_CATEGORY']=i.split('TEST_RANGE_CATEGORY: ')[1].split(' TEST_')[0].split(' O: ')[0]
                        results.append(lab)
        #print(results)
        processed_results=post_process(results)
        return(processed_results)
    

In [125]:
doc='25 Name :Mr. ANWAR ALI SAEED MUHAMMAD Lab No :LAB5394796 Validated :12-01-202001:59 Age/Sex: 54Y M Visit No :IN433700 Reported : 12-01-202009:24 MR No :UA0100000578466 Referred By:Dr. Abid Ali Anwaar Printed On:13-01-202013:52 Laboratory Report Biochemistry Test Description Result Units Reference Range Lipid Profile,Serum Sampled:11-01-202017:07 LDL 2.37 mmol/L Optimal: Less than 2.6 Methodology:Bichromatic End Point Near OptimallAbove Optimal: 2.6-3.3 Borderline High: 3.4.4.1 High: 4.1-4.9 Very High:More than 4.9 Triglycerides,Serum 2.17 mmolL Normal: <1.70 Methodology: Enzymatic, end point Borderline Hih:1.70.2.25 High:2.26.5.64 Very High:>5.65 Cholesterol 4.20 mmolL Desirable: <5.2 Methocology: Cholesterol Oxidase Borderline High: 5.2-6.2 High:>6.2 HDL. 1.19 mmolL Low HDL:<1.04 Methodology: Direct HDL,PEGME High HDL: >1.55 Perf.center-NuC Royal Sampled:11-01-202017:07 Vitamin D(25-Hydroxy Cholecalciferol), 10.85 ng/ml Deficiency: <20 Serum Insufficiency:20-30 Sufficiency: 30-100 Methodology:Chemituminisence. . Toxicity: >100 Perf.center-NMC Royal Sampled:11-01-202017:07 Vitamin B12, Serum 337 pgiml 211.911 Methodology: Chemiluminescence Perf.Center-NvC Royal Sampled:11-01-202017:07 SGPT(ALT), Serum 51 UIL 0-63 Methodology:IFCC Perf.Center.NMC Royal Sampled:11-01-202017:07 Uric Acid, Serum 5.8 mgldl 3.5-7.2 Methodology: Uricase (1 mgldL-59.48 umol/L) Perf.Center-NMC Royal Electrolyte Profile(Na, K,CI), Serum Sampled:11-01-202017:07 Bicarbonate, Serum 29.1 mmol/L 21-32 Methodology: Enzymatic Cl 105 mmolL 98-107 Methodology: ISE.Indirect Pase 1of2'

In [128]:
doc='Verizon E 11:40 PM 95% Touch to resume FaceTime 01:52 8 C #n . #n t1 0 u Twt inormston pano! #n Pitint N e Rim J ester Ordering h kbn S ve eMoRE MD o h 10121986 Qinic hform lion Pron h.0b/Grn ABOUT THIS SCREEN Pano TU 5 #n M tend AE EDo: 32 Physicn Partners.D M REC40134 test no diornostic tvhwarnek #n GestRionolA e: 16w d30d n emlbbod . mitu Addtbnal Reports 706 - 208305 and dxent DNA 10 Ce0 #n tenu Wei e 19705 ReportDte 1101/2018 Enormites The #n Cchccton r 55104152N sptdfk chromosome #n Reterence D 6520216 - 2 - N Samples Colected 10/222013 NOT tel withcertinty aleusisatete teih to.. condtions crcr P . #n Aceisenh ID R184512 Samples Recehed 10/25/2010 red proMdE A #n Gr eID Moue Blood o.Abw rhk resur ooe notu 2104159 unuffected letur #n FINAL RESULTS SUMMARY #n Reur Fetol Sax Fetal Froction #n LOW RISK Male 9.1% 0 #n RESULT DETAILS: ANEUPLOIDIES #n Condfition tested Result Risk Before Test Risk After Test #n TriS0my21 Low Risk 1/518 < 1/10.000 #n Trisomy 18 Low Risk 1/1549 < 1/10,000 #n Trisomy13 Low Risk 1/4766 < 1/10.000 #n MonosomyX Low Risk 1/568 < 1/10.000 #n Tripoidy Low Risk #n RESULT DETAILS:MICRO DELETIONS #n Condition tested Result Risk Before Test Risk After Test #n 22q112 dclction syndrome Low Risk 1/2.000 1/9.000 #n 1Ed ,c nsoihtln ostxk audm 2Eed onnderal re oonl se dlor renenl popbrn pplabh Rtwrn huy 1 o y opaatesrea54o hD a n u pb d14 117825wo n0re A 1Obnet cw #n 21597 37 a portd p h r and wv bw a tenal sseih uskd n thiscky on o a ne , nor ea 1 ko d dulon pblhed ud Morinena Qnc s42017 11Woc r a eo wncorppntesrewsponhP e #n Qrrcnol 2o1sM 212013.1s1ndr. pp d pvo A w nwvrb n o ode o #n sabro D FF.inom ÉÏ dl 2 hr neter mwy rot refect the ol Wor na prntn knrn Erord fndn erovbnnhiton w not hdudedh th na nment #n a h btros 6 0 0 #n A : 2 s D 0 D 0 2 ulc D ~ #n F THE OPDENC PROADCR ASQUESTONS OR WENESTO DICUSS TIE RE CATS EASE CONTACTUS AT 50249 001A1a MpT ertk canb #n ¿Ú 22 2 1 : #n < #n'

In [131]:
doc='LabCorp\n \n Specimen ID: Acct #: 17452095 Phone: (800)539-6119 Rte: 00\n Control ID: Walk-In Lab, LLC\n VART verified\n \n  \n \n 169 W Augusta Lane\n SLIDEbitbA Z045€ x ILE XL UTIL IL XX LITITI1\n \n  \n \n Patient Details Specimen Details Physician Details\n DOB: Date collected: Ordering: J BHAN\n Age(y/m/d): Date entered: Referring:\n Gender: SSN: Date reported: ID: 1841295896\n Patient ID: NPI: 1841295896\n General Comments & Additional Information\n Alternate Control Number: Alternate Patient ID: Not Provided\n Total Volume: Not Provided Fasting: Yes\n \n Ordered Items\n \n CBC With Differential/Platelet; Comp. Metabolic Panel (14); Lipid Panel w/ Chol/HDL Ratio; Testosterone,Free and Total; Hemoglobin\n Aic; DHEA-Sulfate; TSH; Estradiol; Prostate-Specific Ag, Serum; C-Reactive Protein, Cardiac; Homocyst(e)ine, Plasma; Fibrinogen\n Activity; Venipuncture\n \n  \n \n CBC With Differential/Platelet\n \n  \n \n  \n \n  \n \n  \n \n  \n \n  \n \n  \n \n WBC 7.4 x1lOE3/uL Beth — 1058 0\n RBC 5.62 xlOE6/uL 4.14 - 5.80 0\n Hemoglobin 16.8 g/dL L2.6 — Lasd 0\n Hematocrit 48.4 & 37.5 — 51.0 0\n CV 86 £L Fo = OF 0\n GH Cig eg 26:6 — 3350 0\n GHC 34.7 g/dL SI et = Bbs i 0\n RDW de BewG & Zee = 15.4 0\n Platelets 303 xLOE3/uL LO — 879 0\n eutrophils 47 & 0\n Lymphs 46 & 0\n onocytes 6 & 0\n Eos 1 & 0\n Basos 0 & 0\n eutrophils (Absolute) 3.4 x10E3/uL ck afl, * shel) 0\n Lymphs. (Absolute) 3.4 High x1lOE3/uL DOF = Bal 0\n onocytes (Absolute) 0.5 x10E3/uL O.L = 6.9 0\n Eos (Absolute) OT x10E3/uL 0.0 - 0.4 0\n Baso (Absolute) 0.0 Xx10E3/uL O20 = O.2 0\n mmnature Granulocytes 0 & 0\n mmature Grans (Abs) 0.0 x10E3/uL 0.0 - 0.1 0\n Comp. Metabolic Panel (14)\n Glucose, Serum o7 mg/dL 65 - 99 0\n BUN 18 mg/dL 6 - 24 0\n Creatinine, Serum A. LB mg/dL Ow FCS dei 0\n eGFR If NonAfricn Am F2 mL/min/1.73 >59\n eGFR If Africn Am 83 mL/min/1.73 >59\n Date Issued: 03/04/16 1622 ET FINAL REPORT Page 1 off\n This document contains private and confidential health information protected by state and federal law. © 1995-2016 Laboratory Corporation of America®Holdings\n \n If you have received this document in error, please call 800-762-4344 All Rights Reserved- Enterprise Report Version: 1.00\n x10E3/uL\n 3.4 - 10.8\n x10E6/uL\n 12.6 - 17.7\n % \n MCV fL 79 - 97\n MCH 29.9 pg 26.6 - 33.0\n MCHC 31.5 - 35.7\n 13.8 % 12.3 - 15.4\n x10E3/uL 150 - 379\n Neutrophils % \n Monocytes %\n Neutrophils (Absolute) 1.4 - 7.0\n x10E3/uL 0.7 - 3.1\n Monocytes (Absolute) 0.1 - 0.9\n 0.1 \n 0.0 - 0.2\n Immature Granulocytes %\n Immature Grans (Abs) \n 97\n 1.18 0.76 - 1.27\n 72'

In [134]:
data='Hemoglobin 12 11.0-16.0 9/dL RBC 3.3 3.5-5.50 10 6/UL HCT 36 37.0-50.0 % MC 83 82-95 1 8 27-31 MCH 2 D9 MCHC 33 32.0-36.0 9/dL RDW-CV 12 11.5-14.5 % RDW-SD 44 35-56 6.7 4.5-11 10^3雹 WBC NEU% 60 40-70 30 20-45 % LYM % 8 2.10 % MON6 2 1-6 % EOS% 0-2 9% BAS% 0 1.5-4.0 10^3 L LYM# 2 2.0-7.5 10 3/UL GRA# 4.7 150-450 10~3/UL 256 PLT 2 Up to 15 mm/hr ESR Digitally signed by D:Cameron Cordara GNU Public Key:E44311F4 Testid:B165AAF4'

In [137]:
data='Name A/c Status Lab No. Ref By : Age: Gender: Report Status Reported Received Collected P : : : : : : : Final 50 Years 29/5/2018 1:52:00PM 29/5/2018 2:00:51PM 139653126 Female Dr. UNKNWON DUMMY 29/5/2018 5:04:45PM Test Name Results Units Bio. Ref. Interval URINE EXAMINATION, ROUTINE; URINE, R/E (Automated Strip Test, Microscopy) Physical Colour Pale Yellow Pale yellow Specific Gravity 1.010 1.001 - 1.030 pH 5.5 5.0 - 8.0 Chemical Proteins Nil Nil Glucose Nil Nil Ketones Nil Nil Bilirubin Nil Nil Urobilinogen Normal Normal Leucocyte Esterase Negative Negative Nitrite Negative Negative Microscopy R.B.C. Negative Negative Pus Cells Negative 0-5 WBC / hpf Epithelial Cells Few Few Casts Nil Nil /lpf Crystals Nil Nil Others Nil -'

In [232]:
doc='IHEMATOLOGY CBC W/ Differential, w Platelet \n WBC 4.9 4.0-11.0 k mm3 RBC 5.71 4.30-6.00 m/mm3 Hemoglobin 17.0 13.0-18.0 q1dL Hematocrit 52.7 40.0-53.0 % MCV 92.3 78.0-100.0 fL MCH 29.8 27.0-34.0 pg MCHC 3 3 31.0-37.0 gldL Platelet Count 201 130-450 k/imm3 38.0-49.0 fL 口 D w SO) 45 RD W(CV) 13 11.0-15.0 % MPV 11.0 7.5-14.0 fL Seg mented Neutrophils 48.0* % Lymphocytes 37.5. % M onocytes 9.8 % Eosinophils 2 Basophils 0.6 % Absolute Neutrophil 2.36 1.60-9.30 k uL Absolute Lymphocyte 1.84 0.60-5.50 k/uL Absolute Monocyte 0.48 0.10-1.60 K/uL Absolute Eosinophi 0.14 0.00-0.70 k/uL Absolute Basophil 0.03 0.00-0.20 k/uL I m mature Granulocytes 1.2 % Absolute Immature Granulocytes 0.06 0.00-0.10 K/uL NRBC RE,Nucleated Red Blood Cell Percent 0.0 0.0-1.0 % *Seq mented Automated Diff Neutrophils:'

In [160]:
doc="Hematology\nTest Description\nResult Units Reference Range\nComplete Blood Count, Whole Blood Sampled:18-01-202010:21\nHaemoglobin:\n15.9 gm/dL 13-17M\nMethodology:SLS Method 14-22 Birth 11.5-16.51m 11.0-14.03m-6yr 11.5-15.56-12yr\nWBC Count, Total:\n8.4 X109L 4-10 x109/L\nMethodology: Fluorescent Flow Cytometry 10-26birth 5-191m 6-183-6m 6-161yr 5-152-6yrs 5-136-12y\nRBC Count\n5.8 X1012L 4.5-5.5X1012/L\nMethodology:DC Sheath Flow Detection 5-6 birth 3-5.41m 3.9-5.3 3m-12y\nPlatelets:\n257 X109L 150-410X109L 1\nMethodology: DC Sheath Flow Detection 100.450 birth 200-5001m 210-6502m 200-5503m-1y 200-490 2-6y 170-4506-12y ..\nHaematocrit(PCV):\n47 % 40-50%\nMethodology: Automated Impedence Technology 45-75 birth 33-531m 30-40 3m-6y 35-456-12y\nMean Corpuscular Volume(MCV):\n81 fL 83-101fL\nMethodology: Calculated 100-120 Birth 92-1161m 68-843m-1y 75-872-6y 77-956-12y"



In [156]:
doc

'Hematology\nTest Description\nResult Units Reference Range\nComplete Blood Count, Whole Blood Sampled:18-01-202010:21\nHaemoglobin:\n15.9 gm/dL 13-17M\nMethodology:SLS Method 14-22 Birth 11.5-16.51m 11.0-14.03m-6yr 11.5-15.56-12yr\nWBC Count, Total:\n8.4 X109L 4-10 x109/L\nMethodology: Fluorescent Flow Cytometry 10-26birth 5-191m 6-183-6m 6-161yr 5-152-6yrs 5-136-12y\nRBC Count\n5.8 X1012L 4.5-5.5 X1012/L\nMethodology:DC Sheath Flow Detection 5-6 birth 3-5.41m 3.9-5.3 3m-12y\nPlatelets:\n257 X109L 150-410X109L 1\nMethodology: DC Sheath Flow Detection 100.450 birth 200-5001m 210-6502m 200-5503m-1y 200-490 2-6y 170-4506-12y ..\nHaematocrit(PCV):\n47 % 40-50%\nMethodology: Automated Impedence Technology 45-75 birth 33-531m 30-40 3m-6y 35-456-12y\nMean Corpuscular Volume(MCV):\n81 fL 83-101fL\nMethodology: Calculated 100-120 Birth 92-1161m 68-843m-1y 75-872-6y 77-956-12y'

In [157]:
doc1=' '.join(doc.split('\n'))

In [161]:

results=extract_entities(doc)


 O: Hematology  O: Test Description  O: Result Units Reference Range  O: Complete Blood Count, Whole Blood Sampled:18 - 01 - 202010:21  TEST_NAME: Haemoglobin:  TEST_VALUE: 15.9 TEST_UNIT: gm/dL TEST_RANGE: 13 - 17M  O: Methodology:SLS Method 14 - 22 Birth TEST_RANGE: 11.5 - 16.51m 11.0 - 14.03m - 6yr 11.5 - 15.56 - 12yr  TEST_NAME: WBC Count, Total:  TEST_VALUE: 8.4 TEST_UNIT: X109L TEST_RANGE: 4 - 10 TEST_UNIT: x109/L  O: Methodology: Fluorescent Flow Cytometry TEST_RANGE: 10 - 26birth 5 - 191m 6 - 183 - 6m 6 - 161yr 5 - 152 - 6yrs 5 - 136 - 12y  TEST_NAME: RBC Count  TEST_VALUE: 5.8 TEST_UNIT: X1012L TEST_RANGE: 4.5 O: - TEST_UNIT: 5.5X1012/L  O: Methodology:DC Sheath Flow Detection 5 - 6 birth 3 - 5.41m TEST_RANGE: 3.9 - 5.3 3m - 12y  TEST_NAME: Platelets:  TEST_VALUE: 257 TEST_UNIT: X109L TEST_RANGE: 150 O: - 410X109L 1  O: Methodology: DC Sheath Flow Detection 100.450 birth TEST_RANGE: 200 - 5001m 210 - 6502m 200 - 5503m - 1y 200 - 490 2 - 6y 170 - 4506 O: - 12y ..  TEST_NAME: Ha

In [162]:
results

[{'TEST_NAME': 'Haemoglobin: ',
  'TEST_UNIT': 'gm/dL',
  'TEST_RANGE': '13 - 17',
  'TEST_VALUE': '15.9'},
 {'TEST_NAME': 'WBC Count, Total: ',
  'TEST_UNIT': 'X109L',
  'TEST_RANGE': '4 - 10',
  'TEST_VALUE': '8.4'},
 {'TEST_NAME': 'RBC Count ',
  'TEST_UNIT': 'X1012L',
  'TEST_RANGE': '',
  'TEST_VALUE': '5.8'},
 {'TEST_NAME': 'Platelets: ',
  'TEST_UNIT': 'X109L',
  'TEST_RANGE': '150- 410',
  'TEST_VALUE': '257'},
 {'TEST_NAME': 'Haematocrit (PCV)',
  'TEST_UNIT': '%',
  'TEST_RANGE': '40 - 50',
  'TEST_VALUE': '47'},
 {'TEST_NAME': 'Mean Corpuscular Volume (MCV)',
  'TEST_UNIT': 'fL',
  'TEST_RANGE': '83 - 101',
  'TEST_VALUE': '81'}]

In [None]:
less than greater than  <1  <1/10   up to 15  

In [151]:
results

[{'TEST_NAME': 'WBC', 'TEST_UNIT': 'x1lOE3/uL', 'TEST_VALUE': '7.4'},
 {'TEST_NAME': 'RBC',
  'TEST_UNIT': 'xlOE6/uL',
  'TEST_RANGE': '4.14 - 5.80',
  'TEST_VALUE': '5.62'},
 {'TEST_NAME': 'Hemoglobin', 'TEST_UNIT': 'g/dL ', 'TEST_VALUE': '16.8'},
 {'TEST_NAME': 'Hematocrit', 'TEST_VALUE': '48.4'},
 {'TEST_NAME': 'CV', 'TEST_VALUE': '86'},
 {'TEST_NAME': 'GHC', 'TEST_UNIT': 'g/dL', 'TEST_VALUE': '34.7'},
 {'TEST_NAME': 'RDW', 'TEST_RANGE': ''},
 {'TEST_NAME': 'Platelets', 'TEST_UNIT': 'xLOE3/uL', 'TEST_VALUE': '303'},
 {'TEST_NAME': 'eutrophils', 'TEST_UNIT': '& 0 ', 'TEST_VALUE': '47'},
 {'TEST_NAME': 'Lymphs', 'TEST_UNIT': '& 0 ', 'TEST_VALUE': '46'},
 {'TEST_NAME': 'onocytes', 'TEST_UNIT': '& 0 ', 'TEST_VALUE': '6'},
 {'TEST_NAME': 'Eos', 'TEST_UNIT': '& 0 ', 'TEST_VALUE': '1'},
 {'TEST_NAME': 'Basos', 'TEST_UNIT': '& 0 ', 'TEST_VALUE': '0'},
 {'TEST_NAME': 'eutrophils (Absolute)',
  'TEST_UNIT': 'x10E3/uL',
  'TEST_VALUE': '3.4'},
 {'TEST_NAME': 'Lymphs. (Absolute)',
  'TEST_UNIT'

In [329]:
results

[]

In [27]:
doc

'w Verizon LE 11:40PM 95% 4 Touch to resume FaceTime 01:52 0 my.natera.com C Patent Inlormation Opano! Test Inlorm.ton Paticnt N me Ri J Mester Ordering Phnkian SyveRer McREe MD Dbte of hh 1012/1986 Qinlc Inlormstion: Propoth-Ob/cyn M tern AE atEDD: 32 ABOUT THS SCREEN Pinpran ha Physicin Partners.Dr. Gsutional Ase: M UEGA0134 test. not diosnoste ievloes renekl 16 wecs/o days n the maternolbbod whichssmicureo Additbnol Rcoorts M tern Weicht 19701bs 706-32083035 and plxcnts DNA to dctomhe thec Colletin t 551B418-2-N Report Date: 1101/2018 specific chromosome abnormaitia The Reference ID 6520216-2-N Samples Colected 10/22/2018 NOT tdl withcertinty afttusisafete Acesuonhg 1D. RT1B-4632 S mples Received: 1025/201B tcsts for the condions odered by the C r FleID proVCET. A bw rik resut doc nct tu 2104188 Mother Blood unoRected Iets FINAL RESULTS SUMMARY Rasult Fctal Sex Fetol Fraction LO WRISK Male 9.19% W) RESULT DETAILS: ANEUPLOIDIES Condition tested Resut Risk Before Test? Risk After Test

In [None]:
doc=''

In [91]:
from striprtf.striprtf import rtf_to_text
f = open('NMC Anwar Biochemistry (a).rtf', 'r')
data = f.readlines()
f.close()

data=' '.join([rtf_to_text(i) for i in data])
print(data)


         25)
 Name :Mr. ANWAR ALI SAEED MUHAMMAD Lab No :LAB5394796 Validated :12-01-202001:59
 Age/Sex: 54YIM Visit No :IN433700 Reported :12-01-202009:24
 MR No :UA0100000578466 Referred By:Dr. Abid Ali Anwaar Printed On:13-01-202013:52 Laboratory Report Biochemistry
 Test Description Result Units Reference Range
 Lipid Profile,Serum Sampled:11-01-202017:07
 LDL 2.37 mmollL Optimal: Less than 2.6 Near Optimall Above Optimal:
 Methodology: Bichromatic End Point 2.6-32 Bordcrline High: 3.4.4.1 High:4.1.4.9 Very High:More than 4.9
 Triglycerides,Serum 2.17 mmol ¶ù  Normal: <1.70 Borderline High:1.70-2.25
 Methodology: Enzymatic, end point High: 2.26-5.64 . Very High:>5.65
 Cholesterol 4.20 mmol/L Desirable: <5.2 Borderline High:5.2-6.2
 Methodology: Cholesterol Oxidase High:>6.2
 HDL. 1.19 mmol/L Low HDL: <1.04 High HDL:>1.55
 Methodology: Direct HDL,PEGME
 Perf.Center.NuC Royal Sampled:11-01-202017:07
 Vitamin D(25-Hydroxy Cholecalciferol), 10.85 ng/ml Deficiency:<20 Insufficiency: 20-

In [92]:
data

'         25)\n Name :Mr. ANWAR ALI SAEED MUHAMMAD Lab No :LAB5394796 Validated :12-01-202001:59\n Age/Sex: 54YIM Visit No :IN433700 Reported :12-01-202009:24\n MR No :UA0100000578466 Referred By:Dr. Abid Ali Anwaar Printed On:13-01-202013:52 Laboratory Report Biochemistry\n Test Description Result Units Reference Range\n Lipid Profile,Serum Sampled:11-01-202017:07\n LDL 2.37 mmollL Optimal: Less than 2.6 Near Optimall Above Optimal:\n Methodology: Bichromatic End Point 2.6-32 Bordcrline High: 3.4.4.1 High:4.1.4.9 Very High:More than 4.9\n Triglycerides,Serum 2.17 mmol ¶ù \xa0Normal: <1.70 Borderline High:1.70-2.25\n Methodology: Enzymatic, end point High: 2.26-5.64 . Very High:>5.65\n Cholesterol 4.20 mmol/L Desirable: <5.2 Borderline High:5.2-6.2\n Methodology: Cholesterol Oxidase High:>6.2\n HDL. 1.19 mmol/L Low HDL: <1.04 High HDL:>1.55\n Methodology: Direct HDL,PEGME\n Perf.Center.NuC Royal Sampled:11-01-202017:07\n Vitamin D(25-Hydroxy Cholecalciferol), 10.85 ng/ml Deficiency:<20

In [26]:
for i in chars:
    data = re.sub(i, str(" " + i + " "), data)
data=re.sub('\\n', str(" #n "), data)
data = re.sub("\(", " (", data)
data = re.sub("\)", ") ", data)
print(data)


         25)  #n  Name :Mr. ANWAR ALI SAEED MUHAMMAD Lab No :LAB5394796 Validated :12   -   01   -   202001:59 #n  Age/Sex: 54YIM Visit No :IN433700 Reported :12   -   01   -   202009:24 #n  MR No :UA0100000578466 Referred By:Dr. Abid Ali Anwaar Printed On:13   -   01   -   202013:52 Laboratory Report Biochemistry #n  Test Description Result Units Reference Range #n  Lipid Profile,Serum Sampled:11   -   01   -   202017:07 #n  LDL 2.37 mmollL Optimal: Less than 2.6 Near Optimall Above Optimal: #n  Methodology: Bichromatic End Point 2.6   -   32 Bordcrline High: 3.4.4.1 High:4.1.4.9 Very High:More than 4.9 #n  Triglycerides,Serum 2.17 mmol ¶ù  Normal:  < 1.70 Borderline High:1.70   -   2.25 #n  Methodology: Enzymatic, end point High: 2.26   -   5.64 . Very High: > 5.65 #n  Cholesterol 4.20 mmol/L Desirable:  < 5.2 Borderline High:5.2   -   6.2 #n  Methodology: Cholesterol Oxidase High: > 6.2 #n  HDL. 1.19 mmol/L Low HDL:  < 1.04 High HDL: > 1.55 #n  Methodology: Direct HDL,PEGME #n  Perf

In [128]:
words = ' '.join([re.sub('NEWLINE','#n',i) for i in data.split()])

In [129]:
words

"Verizon E 11:40 PM 95% Touch to resume FaceTime 01:52 8 C #n . #n t1 0 u Twt inormston pano! #n Pitint N e Rim J ester Ordering h kbn S ve eMoRE MD o h 10121986 Qinic hform lion Pron h.0b/Grn ABOUT THIS SCREEN Pano TU 5 #n M tend AE EDo: 32 Physicn Partners.D M REC40134 test no diornostic tvhwarnek #n GestRionolA e: 16w d30d n emlbbod . mitu Addtbnal Reports 706 - 208305 and dxent DNA 10 Ce0 #n tenu Wei e 19705 ReportDte 1101/2018 Enormites The #n Cchccton r 55104152N sptdfk chromosome #n Reterence D 6520216 - 2 - N Samples Colected 10/222013 NOT tel withcertinty aleusisatete teih to.. condtions crcr P . #n Aceisenh ID R184512 Samples Recehed 10/25/2010 red proMdE A #n Gr eID Moue Blood o.Abw rhk resur ooe notu 2104159 unuffected letur #n FINAL RESULTS SUMMARY #n Reur Fetol Sax Fetal Froction #n LOW RISK Male 9.1% 0 #n RESULT DETAILS: ANEUPLOIDIES #n Condfition tested' Result Risk Before Test' Risk After Test' #n TriS0my21 Low Risk 1/518 < 1/10.000 #n Trisomy 18 Low Risk 1/1549 < 1/10

In [26]:
results

[{'TEST_NAME': 'ANEUPLOIDIES', 'TEST_RANGE': '518 < 1'},
 {'TEST_NAME': 'Trisomy', 'TEST_RANGE': '< 1', 'TEST_VALUE': '18'},
 {'TEST_NAME': 'Trisomy', 'TEST_RANGE': '568 < 1', 'TEST_VALUE': '13'},
 {'TEST_NAME': 'dciction syndrome', 'TEST_VALUE': '450.2'}]

In [431]:
# import PyPDF2

# pdfFileObject = open(r"Blood Test.pdf", 'rb')

# pdfReader = PyPDF2.PdfFileReader(pdfFileObject)

# print(" No. Of Pages :", pdfReader.numPages)

# pageObject = pdfReader.getPage(0)

# print(pageObject.extractText())

# pdfFileObject.close()

 No. Of Pages : 4
 1Blood Test Report   Fasting 
Normal values  Observation 
Complete Blood Count 
(CBC) Fasting not essential  
RBC (Erythrocytes) No M-4.5-6.4 
F-4.0-5.4 Mil. / c. mm If less: anemia 
Haemoglobin No M-14-18 
F-12-16.4 Gm / 
100 ml 
 PCV (RBC) No M-42-52 
F-37-47 %  MCV (Mean corpucells volume) 
No 78-94 Fl. Cu Type of anemia 
MCH No 27-32   
MCHC No 32-38 Gms/dl 
%       
WBV (Leucocytes) No 4000-
11000 Per c. mm If less-susceptibility to 
infection 
    If very high in Lacs-
Leukemia 
Differential WBC count 
No  %  
Neutrophils  60-75  If more-acute infection 
Lymphocytes  20-30  If more-chronic infection 
Monocytes  2-8  If more-T.B. Typhoid, 
urinary infection Esoinophils  1-6  If more-allergy, cough, cold, 
asthma, and worms. 
Basophils  0-1  Led poisoning, Leukemia 
Abnormal cells     
     
Platelets No 150000-
450000 Cu. Mm 
If less-bleeding disorder, dengue,  Peripheral smear No    
Morphology of:     
RBC No Normochromic /Hypochromic / Anisocytosis 
Observati



In [165]:
s='Haemoglobin: '

In [167]:
s.strip(':')

'Haemoglobin: '

In [173]:
re.sub(':\s*$','','Haemoglobin: ')

'Haemoglobin'