In [4]:
import boto3
import pandas as pd
import time

In [5]:
def iterate_all_bucket_items(s3BucketName):
    """
    Iterates over all objects in a given s3 bucket
    
    :param s3BucketName: Name of s3 bucket
    :return: Dictionary of metadata for an object    
    """
    try: 
        s3Client = boto3.client('s3')
        paginator = s3Client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=s3BucketName)
        bucketItems = []
        for page in page_iterator:
            if page['KeyCount'] > 0:
                for item in page['Contents']:
                    bucketItems.append(item['Key'])
    except Exception as exception:
        raise exception
    return bucketItems

    
def start_job(s3BucketName, docName):
    """ 
    Starts the asynchronous detection of text in a given document.  
    
    :param s3BucketName: Name of s3 bucket
    :param docName: Name of the documnet in s3 bucket input folder
    :return: JobId
    """
    try: 
        txtClient = boto3.client('textract')    
        response = txtClient.start_document_text_detection(DocumentLocation={
            'S3Object': {
            'Bucket': s3BucketName,
            'Name': docName}
        })
    except Exception as exception:
        raise exception
    return response['JobId']  
    

def check_job_status(jobId):
    """ 
    Checks the status of a started job by the "start_job" funtion
    
    :param jobId: id of the job started 
    :return: status of the job
    """
    textractClient = boto3.client('textract')
    response = textractClient.get_document_text_detection(JobId=jobId)  
    print("Job status: {}".format(response['JobStatus']))
    while(response['JobStatus'] == 'IN_PROGRESS'):
        time.sleep(10)
        response = textractClient.get_document_text_detection(JobId=jobId)
        print("Job status: {}".format(response['JobStatus']))
    return response['JobStatus']


def get_job_results(jobId):
    """ 
    Gets the results for an Amazon Textract asynchronous operation
    
    :param jobId: Id of the job started 
    :return: Status of the job
    """
    docPages = []
    try:
        textractClient = boto3.client('textract')
        response = textractClient.get_document_text_detection(JobId=jobId)  
        docPages.append(response)
        print("Extracted results of page {} recieved!".format(len(docPages)))
        nextToken = None
        if('NextToken' in response):   #Amazon Textract returns a pagination token in the response if there are more pages to retrieve
            nextToken = response['NextToken']
        while(nextToken):
            response = textractClient.get_document_text_detection(JobId=jobId, NextToken=nextToken)
            docPages.append(response)
            print("Extracted results of page {} recieved!".format(len(docPages)))
            nextToken = None
            if('NextToken' in response): 
                nextToken = response['NextToken'] 
    except Exception as exception:  
        raise exception
    else:    
        return docPages

def get_medical_entities(extractedText):
        """
        Detects entities in a medical text extracted by Amazon Textract

        :param extractedText: Medical text to inspect
        :return: List of entities along with their confidence scores
        """
        try:
            comprehendClient = boto3.client(service_name='comprehendmedical', region_name='ap-southeast-2')  #region_name: Asia Pacific (Sydney)
            medical_entities = comprehendClient.detect_entities(Text = extractedText)
            medicalDF = pd.DataFrame()
            for entity in medical_entities['Entities']:
                data = pd.DataFrame([(entity['Text'], entity['Category'], entity['Type'], entity['Score'])], columns = 'Text Catetory Type Text_score'.split())               
                if ('Attributes' in entity and entity['Attributes']):
                    for ent in entity['Attributes']:
                        if (ent['Type'] == 'DOSAGE'):
                            data['Dosage']= ent['Text']
                            data['Dosage_score'] = ent['Score']
                        elif(ent['Type'] == 'FREQUENCY'):
                            data['Frequency']= ent['Text']
                            data['Frequency_score']= ent['Score']             
                medicalDF = medicalDF.append(data)   
        except Exception as exception:  
            raise exception
        return medicalDF

In [6]:
s3BucketName = "YourS3BuckName"   
bucketItems = iterate_all_bucket_items(s3BucketName)
bucketItems.remove('Input/')
allMedicalDf = pd.DataFrame()
extractedTextDF = pd.DataFrame()

for docName in bucketItems:
    wrdCnt = 0
    jobId = start_job(s3BucketName, docName)
    print("Job '{}' started!".format(jobId))
    if(check_job_status(jobId) == 'SUCCEEDED'):
        # Results are returned in an array of Block objects. A Block object contains information that's detected about items  
        results = get_job_results(jobId)     
        extractedText = ''
        txt = ''
        for resultPage in results:
            for item in resultPage['Blocks']:
                if item['BlockType'] == "WORD":    # "LINE"
                    wrdCnt+=1
                    extractedText += ''.join(item['Text'])+'\n'
                    txt += ''.join(item['Text'])+' '
                    textDF = pd.DataFrame([(item['Text'], item['Confidence'])], columns = 'Text Confidence_score'.split())
                    extractedTextDF = extractedTextDF.append(textDF)    
        print("Total number of words extracted for document '{}' is {}".format(docName.replace('Input/', '') , wrdCnt))
        with open('Final text', "a") as outputfile:
            outputfile.write('Detected text by AWS: '+ item['Text'] + '\n')
            outputfile.write('Confidence score for detected word by AWS:'+ str(item['Confidence']) + '\n')
            outputfile.write('-------------------------------' + '\n') 
            outputfile.write(txt)  
            outputfile.write('-------------------------------'+ '\n')    
        medicalDf = get_medical_entities(extractedText)
        allMedicalDf = allMedicalDf.append(medicalDf)
        print('**************************************************************') 
        
extractedTextDF.to_excel(r'AmazonTextract.xlsx',sheet_name='Amazon Textract', index = False)        
allMedicalDf.to_excel(r'MedicalComprehend.xlsx',sheet_name='Amazon Comprehend Medical', index = False)

Job '8b8418c4b50b2bd5f58a4664cba868edb1e8502b192f765873c60d7675e18f6b' started!
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Extracted results of page 1 recieved!
Extracted results of page 2 recieved!
Total number of words extracted for document 'SamplePresc.pdf' is 1257
**************************************************************


In [31]:
extractedTextDF.head(10)

Unnamed: 0,Text,Confidence_score
0,UR No:,59.240063
0,1234,93.278328
0,S,80.190369
0,6,24.406532
0,Family,99.82119
0,Name:,91.690201
0,Smith,99.836319
0,Given,99.886299
0,Names:,99.425896
0,Mary,99.996964


In [32]:
allMedicalDf.head()

Unnamed: 0,Text,Catetory,Type,Text_score,Dosage,Dosage_score,Frequency,Frequency_score
0,Smith,PROTECTED_HEALTH_INFORMATION,NAME,0.998437,,,,
0,Mary,PROTECTED_HEALTH_INFORMATION,NAME,0.999152,,,,
0,01/01/2000,PROTECTED_HEALTH_INFORMATION,DATE,0.9998,,,,
0,surname,PROTECTED_HEALTH_INFORMATION,NAME,0.203387,,,,
0,Smith,PROTECTED_HEALTH_INFORMATION,NAME,0.996283,,,,


In [33]:
allMedicalDf.tail()

Unnamed: 0,Text,Catetory,Type,Text_score,Dosage,Dosage_score,Frequency,Frequency_score
0,JAN,PROTECTED_HEALTH_INFORMATION,NAME,0.89548,,,,
0,JAN,PROTECTED_HEALTH_INFORMATION,NAME,0.7745,,,,
0,17/01,PROTECTED_HEALTH_INFORMATION,DATE,0.944878,,,,
0,Iron,MEDICATION,GENERIC_NAME,0.926809,one\ntablet,0.324295,daily,0.789387
0,Green,PROTECTED_HEALTH_INFORMATION,NAME,0.409777,,,,


In [34]:
allMedicalDf[~allMedicalDf['Frequency'].isna()]

Unnamed: 0,Text,Catetory,Type,Text_score,Dosage,Dosage_score,Frequency,Frequency_score
0,enoxaparin,MEDICATION,GENERIC_NAME,0.999731,20mg,0.973891,daily,0.875107
0,Iron,MEDICATION,GENERIC_NAME,0.926809,one\ntablet,0.324295,daily,0.789387
