In [1]:
%load_ext autoreload
%autoreload 2

from tabula import read_pdf
import re
import pandas as pd

In [2]:
# Path (relative to the notebook's working directory) of the PDF report to parse.
pdf_name = '../sample-data/B1705_2000604586.pdf'

In [3]:
# Read the PDF through tabula in JSON form. Later cells iterate this result as
# entries with a 'data' list of rows, each row a list of cells carrying
# 'text', 'top' and 'height'.
# NOTE(review): guess=False presumably turns off tabula's automatic area
# detection — confirm against the tabula-py docs for the installed version.
json_obj = read_pdf(pdf_name, output_format='json', guess=False)

In [4]:
# Header labels that extract_metadata searches for (as "<label>:") in the
# extracted PDF text; the order here is the order of the resulting fields.
AUTO_EXTRACT_META_DATA = [
    'Case ID',
    'Collected',
    'Received',
    'Clinic No',
    'Owner',
    'Lab Number',
    'Species',
    'Reported',
    'Sex',
    'Age',
    'Animal ID',
]

In [5]:
def get_obj_with_text(json_obj, text):
    """Return the first cell whose 'text' value equals ``text`` exactly.

    ``json_obj`` is iterated as a sequence of mappings, each holding a
    'data' list of rows; every row is a list of cell mappings with a 'text'
    key. Cells are visited in order (entry, then row, then cell).

    Returns the matching cell mapping itself, or ``None`` when nothing matches.
    """
    # Lazy flattening: nothing past the first match is ever touched.
    cells = (
        cell
        for entry in json_obj
        for row in entry['data']
        for cell in row
    )
    return next((cell for cell in cells if cell['text'] == text), None)


def get_obj_with_text_contains(json_obj, text):
    """Return the first cell whose 'text' value contains ``text`` as a substring.

    ``json_obj`` is iterated as a sequence of mappings, each holding a
    'data' list of rows; every row is a list of cell mappings with a 'text'
    key. Cells are visited in order (entry, then row, then cell).

    Returns the matching cell mapping itself, or ``None`` when nothing matches.
    """
    # Lazy flattening: nothing past the first match is ever touched.
    cells = (
        cell
        for entry in json_obj
        for row in entry['data']
        for cell in row
    )
    return next((cell for cell in cells if text in cell['text']), None)

                
def extract_label_data(label, target_string):
    """Return the first whitespace-free token that follows ``"<label>:"``.

    Args:
        label: Literal label text to look for (without the trailing colon).
        target_string: Text to search.

    Returns:
        The captured token, including any spaces between the colon and the
        token (callers strip it), or ``None`` when the label is not followed
        by a token.
    """
    # Raw f-string keeps ``\S`` a regex class rather than an invalid string
    # escape; re.escape makes the label match literally even if it contains
    # regex metacharacters such as '.' or '('.
    match = re.search(rf'{re.escape(label)}:( *\S+)', target_string)
    return match.group(1) if match else None


def extract_metadata(json_obj):
    """Collect the header fields listed in AUTO_EXTRACT_META_DATA.

    For each label, the first cell containing ``"<label>:"`` is located and
    the value is taken from that cell's text.

    Args:
        json_obj: Extracted PDF content, in the structure accepted by
            get_obj_with_text_contains.

    Returns:
        dict mapping every label to its stripped string value, or to ``None``
        when the label (or a value after it) is not present in the document.
    """
    output = {}
    for key in AUTO_EXTRACT_META_DATA:
        cell = get_obj_with_text_contains(json_obj, f'{key}:')
        if cell is None:
            # Label absent from this report: record the gap rather than
            # failing with a TypeError on None['text'].
            output[key] = None
            continue

        text = cell['text']
        if key == 'Animal ID':
            # Animal ID keeps everything after the label, not just the first
            # token, so it is handled by plain replacement.
            output[key] = text.replace("Animal ID:", "").strip()
        else:
            value = extract_label_data(key, text)
            # extract_label_data returns None when no token follows the label.
            output[key] = value.strip() if value is not None else None
    return output


def parse_value(val):
    """Convert a result cell to a float where possible.

    The whole value is tried first; if that fails and the value is a string,
    its first whitespace-separated token is tried (so ``'6.6 L'`` becomes
    ``6.6``, dropping the trailing flag).

    Args:
        val: Raw cell value, usually a string.

    Returns:
        A float when the value or its first token is numeric; otherwise
        ``val`` unchanged (including ``None``, empty strings and free text).
    """
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric, non-string inputs.
        pass

    if not isinstance(val, str):
        return val

    # split() with no argument ignores leading/repeated whitespace and tabs,
    # unlike split(' '), which yields empty tokens for them.
    tokens = val.split()
    if not tokens:
        return val

    try:
        return float(tokens[0])
    except ValueError:
        return val

In [6]:
# Pull the header fields into a dict, then wrap them in a one-row frame.
# index=[0] is needed because the dict values are scalars, not sequences.
output = extract_metadata(json_obj)
meta_df = pd.DataFrame(output, index=[0])

In [7]:
# The table area begins at the bottom edge (top + height) of the cell whose
# text is exactly 'Range'.
o = get_obj_with_text(json_obj, 'Range')
if o is None:
    # Fail with a clear message instead of a TypeError on None['top'].
    raise ValueError("Could not find a cell with text 'Range' in the PDF")
table_start = o['top'] + o['height']
table_start

199.30999979019165

In [8]:
# Record the 'top' coordinate of the first cell mentioning 'BLOOD SMEAR'.
# NOTE(review): blood_smear_top is not read by any later cell visible in this
# notebook — confirm whether it is still needed.
o = get_obj_with_text_contains(json_obj, 'BLOOD SMEAR')
blood_smear_top = o['top']


In [9]:
# Find where the table ends: the first row (below table_start) whose first
# cell sits more than 20 'top' units under the previous row's first cell.
# A gap directly before a row containing 'RBC' is ignored.
table_end = None
for i in json_obj:
    objs = i['data']
    # Walk consecutive row pairs; zip stops one short of the end on its own.
    for prev_row, next_row in zip(objs, objs[1:]):
        prev = prev_row[0]
        after = next_row[0]

        diff = after['top'] - prev['top']
        if diff > 20 and after['top'] > table_start and 'RBC' not in after['text']:
            print(f"Found space between {after['text']} and {prev['text']} of {diff}")
            table_end = after['top']
            break
    if table_end is not None:
        # The inner break only leaves the row loop; stop scanning further
        # entries so a later one cannot overwrite the boundary just found.
        break

if table_end is None:
    # Without this, the next cell would fail with a NameError on table_end.
    raise ValueError('Could not find the gap that marks the end of the table')
            


Found space between This report has been automatically generated based on accepted physiological limits for the species and criteria and >250 x 10^9/L of 23.379999999999995


In [10]:
# Re-read the PDF as a DataFrame, limited to the band between table_start and
# table_end; header=None keeps the first extracted row as data, not as a header.
df = read_pdf(
    pdf_name,
    area=(table_start, 0, table_end, 1000),
    guess=False,
    output_format='dataframe',
    pandas_options={'header': None},
)

In [11]:
# Keep only the columns labelled 0 and 1 (.loc slices are label-based and
# include the end label).
df = df.loc[:, 0:1]

In [12]:
# Display the two retained columns.
df

Unnamed: 0,0,1
0,RBC,6.6 L
1,HAEMOGLOBIN,75 L
2,HAEMATOCRIT,0.24 L
3,RETICULOCYTE %,0.0
4,RETICULOCYTE ABS,0
5,MCV,36
6,MCH,11
7,MCHC,313
8,WBC,2.2 L
9,NEUTROPHILS%,72


In [13]:
# Convert column 1 element by element with parse_value; entries it cannot
# convert are left as they are.
df[1] = df[1].map(parse_value)

In [14]:
# Turn the one-row metadata frame into one row per field, with the field name
# moved out of the index into a column, then label the two columns 0 and 1.
df_meta = meta_df.T.reset_index()
df_meta.columns = [0, 1]

In [15]:
# Stack the metadata rows on top of the existing rows and renumber the index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
df = pd.concat([df_meta, df]).reset_index(drop=True)

In [16]:
# Index label of the first row whose column 0 is exactly 'BLOOD SMEAR'.
blood_smear_index = df.index[df[0].eq('BLOOD SMEAR')].tolist()[0]

In [17]:
# Join column 1 of every row from blood_smear_index to the end into one
# comma-separated string.
blood_smear_str = ', '.join(df[1].iloc[blood_smear_index:].tolist())

In [18]:
# Drop every row from blood_smear_index to the end of the frame.
df = df.drop(df.index[blood_smear_index:])

In [19]:
# Add a single 'BLOOD SMEAR EXAMINATION' row holding the joined text.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
df = pd.concat(
    [df, pd.DataFrame([{0: 'BLOOD SMEAR EXAMINATION', 1: blood_smear_str}])],
    ignore_index=True,
)

In [20]:
# Pivot to wide form: transpose, discard the transposed label row, renumber
# the index, and use the labels from column 0 as the column headers.
df_T = df.T.iloc[1:].reset_index(drop=True)
df_T.columns = df[0]

In [21]:
# Preview the wide-format result.
df_T.head()

Unnamed: 0,Case ID,Collected,Received,Clinic No,Owner,Lab Number,Species,Reported,Sex,Age,...,MONOCYTES%,MONOCYTES,EOSINOPHILS%,EOSINOPHILS,BASOPHILS%,BASOPHILS,PROTEIN PLASMA,FIBRINOGEN,PLASMA APPEARANCE,BLOOD SMEAR EXAMINATION
0,,27/06/2018,27/06/2018,B1705,CCRG,2000604586,OVINE,28/06/2018,,,...,0,0,0,0,0,0,37,1,Normal,"Red cell and white cell, morphology normal., P..."
