In [146]:
#!pip install langchain OpenAI

In [168]:
from langchain.llms import OpenAI
import json
from jsonschema import validate
import os
import pandas as pd
from transformers import AutoConfig, AutoFeatureExtractor
import torch
import time

# 1) Convert Text to Structured Data

* Making sure it generates correct data (use asserts to test all of this)
* Making sure you handle edge cases (ex: blank fields, fields not in correct datatype, dollar sign in total, phone number larger than 10 digits)
* Language model returning text or invalid json (if not using methos used in class)

In [172]:
openai_api_key = ''

In [164]:
def generate_response(input_text):
  llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0, openai_api_key=openai_api_key, max_tokens=800)
  return llm(input_text)

In [150]:
# TODO: edit these for our project
def find_span(text, entity_text):
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text

def convert_to_prodigy_spans(receipt_text, entities):
    text_vals = []
    entities = json.loads(entities)
    prodigy_data = []
    receipt_info = entities["ReceiptInfo"]

    for label, entity_text in [
        ("MERCHANT", receipt_info["merchant"]),
        ("ADDRESS", receipt_info["address"]),
        ("CITY", receipt_info["city"]),
        ("STATE", receipt_info["state"]),
        ("PHONE", receipt_info["phoneNumber"]),
        ("TAX", str(receipt_info["tax"])),
        ("TOTAL", str(receipt_info["total"])),
        ("DATE", receipt_info["receiptDate"])
    ]:
        span, text = find_span(receipt_text, entity_text)
        text_vals.append(text)
        if span:
            start, end = span
            prodigy_data.append({"start": start, "end": end, "label": label})

    # Process item-level entities
    for item in receipt_info["ITEMS"]:
        for label, entity_text in [
            ("ITEM_DESC", item["description"]),
            ("QTY", str(item["quantity"])),
            ("UNIT_PRICE", str(item["unitPrice"])),
            ("TOTAL_PRICE", str(item["totalPrice"])),
            ("DISCOUNT", str(item.get("discountAmount", "")))  # Discount might not always be present
        ]:
            if entity_text:  # Check if the entity text is not empty
                span = find_span(receipt_text, entity_text)
                if span:
                    start, end = span
                    prodigy_data.append({"start": start, "end": end, "label": label})

    return prodigy_data, text_vals

In [151]:
prompt = '''Please analyze the provided receipt and extract relevant information to fill in the following structured format:

Merchant Name
Address (split into street address, city, and state)
Phone Number (masked for privacy as '(xxx) xxx-xxxx')
Tax Amount (in dollars)
Total Amount (in dollars)
Date of the Receipt
Time of the Receipt (if available)
List of Items, for each item include:
Description
Quantity
Unit Price
Total Price
Discount Amount (if any)
Remember to check for any discounts or special offers applied to the items and reflect these in the item details. Ensure the total amount reflects any discounts or taxes applied. If the receipt has a membership number or any other sensitive personal information, please omit it for privacy reasons.


example: """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: **1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

from example should get:
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(xxx) xxx-xxxx",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}
'''

structure = '''
{
  "ReceiptInfo": {
    "merchant": "(string value)",
    "address": "(string value)",
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)",
    "total": "(float value)",
    "receiptDate": "(string value)",
    "receiptTime": "(string value)",
    "ITEMS": [
      {
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)"
      }, ...
    ]
  }
}
'''

In [152]:
prompt = '''Please analyze the provided receipt and extract relevant information to fill in the following structured format:
{
  "ReceiptInfo": {
    "merchant": "(string value)",
    "address": "(string value)", (split into street address, city, and state)
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)", (in dollars)
    "total": "(float value)", (in dollars)
    "receiptDate": "(string value)",
    "receiptTime": "(string value)", (if available)
    "ITEMS": [
      {
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)" if any
      }, ...
    ]
  }
}
Remember to check for any discounts or special offers applied to the items and reflect these in the item details. Make sure to end the json object and make sure it's in json format.


example: """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: **1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

from example should get:
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(xxx) xxx-xxxx",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}
'''


In [153]:
def read_text_files(folder_path):
    text_list = []

    if not os.path.isdir(folder_path):
        print("Invalid folder path.")
        return None

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            with open(file_path, 'r') as file:
                file_content = file.read()
                text_list.append(file_content)  # Append file content as a string to the list
                
    return text_list


In [187]:
folder_path = './receipts/text'
resulting_receipts = read_text_files(folder_path)
file_path = f'./entities.json'
entitiesList = []
files_processed = 0
for receipt_text in resulting_receipts: 
    entities = generate_response(prompt + receipt_text)
    z3print(entities)
    jsonObject = json.loads(entities)
    entitiesList.append(jsonObject)
    files_processed += 1
    '''
    if files_processed % 3 == 0:
        print("Pausing for 1 minute... for poor people")
        time.sleep(60)
    '''
with open(file_path, 'w') as file:
    json.dump(entitiesList, file, indent=4)
    




{
  "ReceiptInfo": {
    "merchant": "Halal Gyro Kabob House",
    "address": "240 EAST DELAWARE AVENUE",
    "city": "NEWARK",
    "state": "DE",
    "phoneNumber": "(443) 993-7029",
    "tax": 0,
    "total": 16.09,
    "receiptDate": "06-Jul-2023",
    "receiptTime": "7:57:49P",
    "ITEMS": [
      {
        "description": "#18. Lamb Salad",
        "quantity": 1,
        "unitPrice": 13.99,
        "totalPrice": 13.99,
        "discountAmount": 0
      }
    ]
  }
}

{
  "ReceiptInfo": {
    "merchant": "WHOLE FOODS MARKET",
    "address": "388 Kamakee St Ste 100",
    "city": "Honolulu",
    "state": "HI",
    "phoneNumber": "(808) 379-1800",
    "tax": 1.69,
    "total": 37.46,
    "receiptDate": "08/02/2023",
    "receiptTime": "05:03 PM",
    "ITEMS": [
      {
        "description": "365WFM OG ITALIAN BAG",
        "quantity": 1,
        "unitPrice": 4.19,
        "totalPrice": 4.19,
        "discountAmount": 0
      },
      {
        "description": "365WFM BREADED CALAMARI


{
  "ReceiptInfo": {
    "merchant": "Sam's Club",
    "address": "Self Checkout",
    "city": "Unknown",
    "state": "Unknown",
    "phoneNumber": "(808) 945-9841",
    "tax": 3.32,
    "total": 74.20,
    "receiptDate": "05/13/23",
    "receiptTime": "19:17",
    "ITEMS": [
      {
        "description": "EZPEELSHRINF",
        "quantity": 1,
        "unitPrice": 16.98,
        "totalPrice": 16.98,
        "discountAmount": 0
      },
      {
        "description": "DC 24PK CANF",
        "quantity": 1,
        "unitPrice": 11.58,
        "totalPrice": 11.58,
        "discountAmount": 0
      },
      {
        "description": "HI DEPOSIT F",
        "quantity": 1,
        "unitPrice": 1.20,
        "totalPrice": 1.20,
        "discountAmount": 0
      },
      {
        "description": "HI HANDLINGF",
        "quantity": 1,
        "unitPrice": 0.24,
        "totalPrice": 0.24,
        "discountAmount": 0
      },
      {
        "description": "MIXED NUTS F",
        "quantity": 1,

In [155]:
# Example NER Usage
prodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)
print(json.dumps(prodigy_spans_true, indent=2))

JSONDecodeError: Expecting value: line 32 column 22 (char 768)

# 2) Identify category for vendor

In [None]:
def validateInput(entities):
    schema = {
        "type": "object",
        "properties": {
            "ReceiptInfo": {
                "type": "object",
                "properties": {
                    "merchant": {"type": "string"},
                    "address": {"type": "string"},
                    "city": {"type": "string"},
                    "state": {"type": "string"},
                    "phoneNumber": {"type": "string"},
                    "tax": {"type": "number"},
                    "total": {"type": "number"},
                    "receiptDate": {"type": "string"},
                    "ITEMS": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "description": {"type": "string"},
                                "quantity": {"type": "number"},
                                "unitPrice": {"type": "number"},
                                "totalPrice": {"type": "number"},
                                "discountAmount": {"type": "number"}
                            },
                        },
                    },
                },
            },
        },
    }

    validate(instance=json.loads(entities), schema=schema)

In [None]:
def generate_embeddings(word):
    # Generate embeddings for the word
    inputs = bge(word, return_tensors="pt")
    embeddings = bge.model(**inputs)
    return embeddings


def convertToEmbeddings(categoryColumnDF):

    model_name = "BAAI/bge-large-en"  
    config = AutoConfig.from_pretrained(model_name)
    bge = AutoFeatureExtractor.from_pretrained(model_name, config=config)

    # Apply the 'generate_embeddings' function to create embeddings for each word in the column
    categoryColumnDF['embeddings'] = categoryColumnDF['word_column'].apply(lambda x: generate_embeddings(x))

    # The 'embeddings' column now contains embeddings for each word
    print(categoryColumnDF['embeddings'])

In [None]:
def searchInDatabase():
    # create database and ask ChatGPT to dream
    # ex: "please generate 10 item descriptions that should be similar to what would be seen in a recipt. Here are examples"
    pass

In [None]:
def takeMajority():
    pass

In [None]:
categories = ["Grocery/Supermarkets", "Restaurants/Food Services", "Clothing/Apparel", "Health/Beauty", "Electronics/Appliances", "Home/Garden", "Entertainment/Leisure"]

def getVendorCategory(listOfItems, Title):
    #validateInput()
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 3) Same thing as 2 but you have to define the categories for the ingredients


* Use title of item plus something else (ex: category of vendor)

In [None]:
categories = []

def getVendorCategory(jsonObject, listOfItems, Title):
    #validateInput(jsonObject)
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 4) Create tests in python

* Functions that just test one test and shows that tests passed/failed
* At the end shows how many passed and how many failed

- Example:

     - handleVendor.py
     - all test functions tested in testHandleCategory.py (test all the functions in hangleVendor.py) asserts at the end of each test function

     - Fixtures in test file: testing all of the things that are needed for the code to run

- For part 2, test for:
    - If 7 categories, one of the 7 categories and one of the 7 categories
    - Edge cases (ex: error in formatting, must be string in list of possible categories)