In [24]:
#!pip install langchain OpenAI

In [26]:
from langchain.llms import OpenAI
import json

# 1) Convert Text to Structured Data

* Making sure it generates correct data (use asserts to test all of this)
* Making sure you handle edge cases (ex: blank fields, fields not in correct datatype, dollar sign in total, phone number larger than 10 digits)
* Language model returning text or invalid json (if not using methos used in class)

In [17]:
openai_api_key = ''

In [3]:
receipt_text = """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: ************1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

In [47]:
def generate_response(input_text):
  llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0, openai_api_key=openai_api_key)
  return llm(input_text)

In [54]:
prompt = '''fill all info into this json data structure, and correct all the spelling
example: """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: ************1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

should get:
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(808) 555-1234",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}
'''
structure = '''{
  "ReceiptInfo": {
    "merchant": "(string value)",
    "address": "(string value)",
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)",
    "total": "(float value)",
    "receiptDate": "(string value)",
    "receiptTime": "(string value)",
    "ITEMS": [
      {
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)"
      }, ...
    ]
  }
}
'''

In [55]:
entities = generate_response(prompt + receipt_text + structure)

In [57]:
print(entities)

{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(808) 555-1234",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}


In [27]:
type(json.loads(entities))

dict

In [28]:
import json

def find_span(text, entity_text):
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text

def convert_to_prodigy_spans(receipt_text, entities):
    text_vals = []
    entities = json.loads(entities)
    prodigy_data = []
    receipt_info = entities["ReceiptInfo"]

    for label, entity_text in [
        ("MERCHANT", receipt_info["merchant"]),
        ("ADDRESS", receipt_info["address"]),
        ("CITY", receipt_info["city"]),
        ("STATE", receipt_info["state"]),
        ("PHONE", receipt_info["phoneNumber"]),
        ("TAX", str(receipt_info["tax"])),
        ("TOTAL", str(receipt_info["total"])),
        ("DATE", receipt_info["receiptDate"])
    ]:
        span, text = find_span(receipt_text, entity_text)
        text_vals.append(text)
        if span:
            start, end = span
            prodigy_data.append({"start": start, "end": end, "label": label})

    # Process item-level entities
    for item in receipt_info["ITEMS"]:
        for label, entity_text in [
            ("ITEM_DESC", item["description"]),
            ("QTY", str(item["quantity"])),
            ("UNIT_PRICE", str(item["unitPrice"])),
            ("TOTAL_PRICE", str(item["totalPrice"])),
            ("DISCOUNT", str(item.get("discountAmount", "")))  # Discount might not always be present
        ]:
            if entity_text:  # Check if the entity text is not empty
                span = find_span(receipt_text, entity_text)
                if span:
                    start, end = span
                    prodigy_data.append({"start": start, "end": end, "label": label})

    return prodigy_data, text_vals

# Example usage


prodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)
print(json.dumps(prodigy_spans_true, indent=2))


[
  {
    "start": 0,
    "end": 13,
    "label": "MERCHANT"
  },
  {
    "start": 14,
    "end": 25,
    "label": "ADDRESS"
  },
  {
    "start": 26,
    "end": 32,
    "label": "CITY"
  },
  {
    "start": 34,
    "end": 36,
    "label": "STATE"
  },
  {
    "start": 43,
    "end": 57,
    "label": "PHONE"
  },
  {
    "start": 252,
    "end": 256,
    "label": "TAX"
  },
  {
    "start": 264,
    "end": 269,
    "label": "TOTAL"
  },
  {
    "start": 86,
    "end": 96,
    "label": "DATE"
  },
  {
    "start": [
      140,
      153
    ],
    "end": "APPLES (1 lb)",
    "label": "ITEM_DESC"
  },
  {
    "start": [
      15,
      16
    ],
    "end": "2",
    "label": "QTY"
  },
  {
    "start": [
      154,
      158
    ],
    "end": "2.99",
    "label": "UNIT_PRICE"
  },
  {
    "start": [
      161,
      165
    ],
    "end": "5.98",
    "label": "TOTAL_PRICE"
  },
  {
    "start": [
      173,
      176
    ],
    "end": "1.0",
    "label": "DISCOUNT"
  },
  {
    "start": [


# 2) Identify category for vendor

In [None]:
def validateInput():
    pass

In [None]:
def convertToEmbeddings():
    pass

In [None]:
def searchInDatabase():
    # create database and ask ChatGPT to dream
    # ex: "please gnerate 10 item descriptions that should be similar to what would be seen in a recipt. Here are examples"
    pass

In [None]:
def takeMajority():
    pass

In [None]:
categories = ["Grocery/Supermarkets", "Restaurants/Food Services", "Clothing/Apparel", "Health/Beauty", "Electronics/Appliances", "Home/Garden", "Entertainment/Leisure"]

def getVendorCategory(listOfItems, Title):
    #validateInput()
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 3) Same thing as 2 but you have to define the categories

* Use title of item plus something else (ex: category of vendor)

In [None]:
categories = []

def getVendorCategory(listOfItems, Title):
    #validateInput()
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 4) Create tests in python

* Functions that just test one test and shows that tests passed/failed
* At the end shows how many passed and how many failed

- Example:

     - handleVendor.py
     - all test functions tested in testHandleCategory.py (test all the functions in hangleVendor.py) asserts at the end of each test function

     - Fixtures in test file: testing all of the things that are needed for the code to run

- For part 2, test for:
    - If 7 categories, one of the 7 categories and one of the 7 categories
    - Edge cases (ex: error in formatting, must be string in list of possible categories)