In [12]:
from tika import parser
from parse import parse
from pprint import pprint
 
def do_simple_parsing(clean_lines, filter_list):
    order_details = {}
    for text_filter in filter_list:
        for line in clean_lines:
            result = parse(text_filter, line)
            if result:
                order_details.update(result.__dict__["named"])
    return order_details
 
def get_elements_in_between(starts_with, ends_with, clean_lines):
    for line in clean_lines:
        if starts_with == line:
            start_index = clean_lines.index(line) + 1
        elif ends_with in line:
            end_index = clean_lines.index(line)
    elements_in_between = clean_lines[start_index: end_index]
    return elements_in_between
 
def get_place_details(clean_lines):
    starts_with = "Ordered from:"
    ends_with = "Item Name Quantity Price"
    address_list = get_elements_in_between(starts_with, ends_with, clean_lines)
    place_details = {}
    place_details["place_name"] = address_list[0]
    place_details["place_address"] = " ".join(address_list[1:])
    return(place_details)
 
def get_order_item_details(clean_lines):
    starts_with = "Item Name Quantity Price"
    ends_with = "Item Total:"
    items = get_elements_in_between(starts_with, ends_with, clean_lines)
    items_processed = []
    for item in items:
        item_details = {}
        item_splitted = item.split("₹")
        item_details["name"] = " ".join(item_splitted[0].strip().split(" ")[:-1])
        item_details["price"] = item_splitted[-1].strip()
        item_details["quantity"] = item_splitted[0].strip().split(" ")[-1]
        items_processed.append(item_details)
    return(items_processed)
 
def get_recepient_address(clean_lines):
    starts_with = "Delivery To:"
    ends_with = "Disclaimer:"
    recepient_address_list = get_elements_in_between(starts_with, ends_with, clean_lines)
    recepient_address = " ".join(recepient_address_list)
    return(recepient_address)
 
def parse_swiggy_bill(filename):
    parsed = parser.from_file(filename)
    splitted = parsed["content"].split("\n")
    clean_lines = [line for line in splitted if len(line) != 0]
    filter_list = ["Order No: {order_no}", "Order placed at: {order_date}, {order_time} {order_am_pm}",
               'Order Status: {order_status}', 'Item Total: ₹  {order_total}', 'GST: ₹  {gst}',
               'Order Packing Charges: ₹  {packing_charge}', 'Delivery Charges: ₹  {delivery_charge}']
   
    parsed_data = {
    'order_details' : do_simple_parsing(clean_lines, filter_list),
    'place_details' : get_place_details(clean_lines),
    'ordered_item_details' : get_order_item_details(clean_lines),
    'recepient_address' : get_recepient_address(clean_lines)
    }
    return parsed_data
 
if __name__ == "__main__":
#     pprint(parse_swiggy_bill("pdf_files/40126795233_8665c1f3-29fc-4eda-a479-96ab47ac4d6b.pdf"))
    pprint(parse_swiggy_bill("swiggy-order-41110023402.pdf"))

{'order_details': {'delivery_charge': '40',
                   'order_am_pm': 'AM',
                   'order_date': '17/05/2019',
                   'order_no': '#41110023402',
                   'order_status': 'Delivered',
                   'order_time': '12:57',
                   'order_total': '140',
                   'packing_charge': '10'},
 'ordered_item_details': [{'name': 'Chocolate Xoverloaded Waffle',
                           'price': '140',
                           'quantity': '1'},
                          {'name': '-', 'price': '0', 'quantity': 'Banana'}],
 'place_details': {'place_address': '#105, 1st A Cross, Jyoti Niwas College '
                                    'Road, Kormangala 5th block',
                   'place_name': 'XO Belgian Waffle'},
 'recepient_address': 'Karan Raj Pradhan 3rd Floor, 161, Behind Supreme '
                      'Agencies Building, 5th Cross Rd, Vinayaka Nagar, Wilson '
                      'Garden, Bengaluru, Karnataka 560027, 

In [3]:
import os

os.listdir("pdf_files")

['40126795233_8665c1f3-29fc-4eda-a479-96ab47ac4d6b.pdf',
 '41356117094_866e401b-cbc8-41a5-b4cf-21fad4da5be5.pdf',
 '37096133700_336fc5f6-7846-4a37-919d-611420716cf0.pdf',
 '41418314131_5b81cbc1-01cb-4137-8bfe-f863298f45b4.pdf',
 '41789356811_946f3512-3756-4e97-b9ad-8ddb675e1741.pdf',
 '36946068426_15d54aee-1761-4de7-b521-2ac6112fad8a.pdf',
 '40487698403_f972cf0c-3297-4748-a6ab-6dbe3ca98c02.pdf',
 '41137853274_5589b424-1a20-4ccb-af2b-61eeeb217845.pdf',
 '41959810929_41b735f6-4072-471f-9e86-3c88b9744ccd.pdf',
 '36397759082_7ab2c27d-fdf0-4ec0-b0f1-8a180693a91d.pdf',
 '42373671178_09bf28bf-0a0d-46f7-aba7-c47e45681dfc.pdf',
 '36263755890_929e4ebd-cfc1-47a7-98e7-76168cd3121a.pdf',
 '41754132318_27226cc6-7b51-4fc9-a150-bbfad5f890d0.pdf',
 '42063793755_8108a62e-ca52-45f2-93c6-d3166c3f2a24.pdf',
 '42102775973_559f979e-c3aa-4410-aeea-d91777bc1065.pdf',
 '40680134503_e515f813-f8e6-4784-b1b2-7ca3840f5fa8.pdf',
 '40373019154_b26f7365-a486-4ee4-aaf1-84441008f1fc.pdf',
 '39973265154_309cb6ba-ef5a-400