In [1]:
#!pip install langchain OpenAI

In [2]:
from langchain.llms import OpenAI
import json
from jsonschema import validate
import os
import pandas as pd
import torch
import time
import numpy as np
from transformers import BertModel, BertTokenizer
from sklearn.neighbors import KNeighborsClassifier

# 1) Convert Text to Structured Data

* Making sure it generates correct data (use asserts to test all of this)
* Making sure you handle edge cases (ex: blank fields, fields not in correct datatype, dollar sign in total, phone number larger than 10 digits)
* Language model returning text or invalid json (if not using methos used in class)

In [3]:
openai_api_key = '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'

In [4]:
def generate_response(input_text):
  llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0, openai_api_key=openai_api_key, max_tokens=800)
  return llm(input_text)

In [5]:
# TODO: edit these for our project
def find_span(text, entity_text):
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text

def convert_to_prodigy_spans(receipt_text, entities):
    text_vals = []
    entities = json.loads(entities)
    prodigy_data = []
    receipt_info = entities["ReceiptInfo"]

    for label, entity_text in [
        ("MERCHANT", receipt_info["merchant"]),
        ("ADDRESS", receipt_info["address"]),
        ("CITY", receipt_info["city"]),
        ("STATE", receipt_info["state"]),
        ("PHONE", receipt_info["phoneNumber"]),
        ("TAX", str(receipt_info["tax"])),
        ("TOTAL", str(receipt_info["total"])),
        ("DATE", receipt_info["receiptDate"])
    ]:
        span, text = find_span(receipt_text, entity_text)
        text_vals.append(text)
        if span:
            start, end = span
            prodigy_data.append({"start": start, "end": end, "label": label})

    # Process item-level entities
    for item in receipt_info["ITEMS"]:
        for label, entity_text in [
            ("ITEM_DESC", item["description"]),
            ("QTY", str(item["quantity"])),
            ("UNIT_PRICE", str(item["unitPrice"])),
            ("TOTAL_PRICE", str(item["totalPrice"])),
            ("DISCOUNT", str(item.get("discountAmount", "")))  # Discount might not always be present
        ]:
            if entity_text:  # Check if the entity text is not empty
                span = find_span(receipt_text, entity_text)
                if span:
                    start, end = span
                    prodigy_data.append({"start": start, "end": end, "label": label})

    return prodigy_data, text_vals

In [6]:
prompt = '''Please analyze the provided receipt and extract relevant information to fill in the following structured format:

Merchant Name
Address (split into street address, city, and state)
Phone Number (masked for privacy as '(xxx) xxx-xxxx')
Tax Amount (in dollars)
Total Amount (in dollars)
Date of the Receipt
Time of the Receipt (if available)
List of Items, for each item include:
Description
Quantity
Unit Price
Total Price
Discount Amount (if any)
Remember to check for any discounts or special offers applied to the items and reflect these in the item details. Ensure the total amount reflects any discounts or taxes applied. If the receipt has a membership number or any other sensitive personal information, please omit it for privacy reasons.


example: """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: **1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

from example should get:
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(xxx) xxx-xxxx",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}
'''

structure = '''
{
  "ReceiptInfo": {
    "merchant": "(string value)",
    "address": "(string value)",
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)",
    "total": "(float value)",
    "receiptDate": "(string value)",
    "receiptTime": "(string value)",
    "ITEMS": [
      {
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)"
      }, ...
    ]
  }
}
'''

In [7]:
prompt = '''Please analyze the provided receipt and extract relevant information to fill in the following structured format:
{
  "ReceiptInfo": {
    "merchant": "(string value)",
    "address": "(string value)", (split into street address, city, and state)
    "city": "(string value)",
    "state": "(string value)",
    "phoneNumber": "(string value)",
    "tax": "(float value)", (in dollars)
    "total": "(float value)", (in dollars)
    "receiptDate": "(string value)",
    "receiptTime": "(string value)", (if available)
    "ITEMS": [
      {
        "description": "(string value)",
        "quantity": "(integer value)",
        "unitPrice": "(float value)",
        "totalPrice": "(float value)",
        "discountAmount": "(float value)" if any
      }, ...
    ]
  }
}
Remember to check for any discounts or special offers applied to the items and reflect these in the item details. Make sure to end the json object and make sure it's in json format.


example: """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: **1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

from example should get:
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(xxx) xxx-xxxx",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "receiptTime": "Transaction ID: 5769009",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59,
        "discountAmount": 0
      }
    ]
  }
}
'''


In [8]:
def read_text_files(folder_path):
    text_list = []

    if not os.path.isdir(folder_path):
        print("Invalid folder path.")
        return None

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if os.path.isfile(file_path) and filename.endswith('.txt'):
            with open(file_path, 'r') as file:
                file_content = file.read()
                text_list.append(file_content)  # Append file content as a string to the list
                
    return text_list


In [11]:
folder_path = './receipts/text'
resulting_receipts = read_text_files(folder_path)
file_path = f'./entities.json'
entitiesList = []
files_processed = 0
for receipt_text in resulting_receipts: 
    entities = generate_response(prompt + receipt_text)
    #print(entities)
    jsonObject = json.loads(entities)
    entitiesList.append(jsonObject)
    files_processed += 1
    
    if files_processed % 3 == 0:
        print("Pausing for 1 minute... for poor people")
        time.sleep(60)
    
with open(file_path, 'w') as file:
    json.dump(entitiesList, file, indent=4)
    



Pausing for 1 minute... for poor people
Pausing for 1 minute... for poor people
Pausing for 1 minute... for poor people
Pausing for 1 minute... for poor people


In [12]:
'''
# Example NER Usage
prodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)
print(json.dumps(prodigy_spans_true, indent=2))
'''

'\n# Example NER Usage\nprodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)\nprint(json.dumps(prodigy_spans_true, indent=2))\n'

# 2) Identify category for vendor

In [13]:
def validateInput(entities):
    schema = {
        "type": "object",
        "properties": {
            "ReceiptInfo": {
                "type": "object",
                "properties": {
                    "merchant": {"type": "string"},
                    "address": {"type": "string"},
                    "city": {"type": "string"},
                    "state": {"type": "string"},
                    "phoneNumber": {"type": "string"},
                    "tax": {"type": "number"},
                    "total": {"type": "number"},
                    "receiptDate": {"type": "string"},
                    "ITEMS": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "description": {"type": "string"},
                                "quantity": {"type": "number"},
                                "unitPrice": {"type": "number"},
                                "totalPrice": {"type": "number"},
                                "discountAmount": {"type": "number"}
                            },
                        },
                    },
                },
            },
        },
    }

    validate(instance=json.loads(entities), schema=schema)

In [14]:
model_name = "BAAI/bge-large-en"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [27]:
def generate_embeddings(word):
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Mean pooling of token embeddings
    return embeddings
    
# Function to convert words in a DataFrame column to embeddings
def convert_to_embeddings_df(df):
    embeddings = [generate_embeddings(x) for x in df["Items"]] 
    dfs = []
    for embedding in embeddings:
        dfs.append(pd.DataFrame(embedding))
    return pd.concat(dfs)
        

In [16]:
# Converting word databases to a embedding database
clothing_db = pd.read_csv("clothing_datebase.csv")
clothing_df = convert_to_embeddings_df(clothing_db)
clothing_df['Category'] = 'Clothing'

electronics_db = pd.read_csv("electronics_datebase.csv")
electronics_df = convert_to_embeddings_df(clothing_db)
electronics_df['Category'] = 'Electronics'

entertainment_db = pd.read_csv("entertainment_datebase.csv")
entertainment_df = convert_to_embeddings_df(entertainment_db)
entertainment_df['Category'] = 'Entertainment'

foodService_db = pd.read_csv("food_service_datebase.csv")
foodService_df = convert_to_embeddings_df(foodService_db)
foodService_df['Category'] = 'Food Service'

grocery_db = pd.read_csv("grocery_datebase.csv")
grocery_df = convert_to_embeddings_df(grocery_db)
grocery_df['Category'] = 'Grocery'

healthBeauty_db = pd.read_csv("health_beauty_datebase.csv")
healthBeauty_df = convert_to_embeddings_df(healthBeauty_db)
healthBeauty_df['Category'] = 'Health Beauty'

homeGarden_db = pd.read_csv("home_garden_datebase.csv")
homeGarden_df = convert_to_embeddings_df(homeGarden_db)
homeGarden_df['Category'] = 'Home Garden'

embeddedDatabase = pd.concat([clothing_df, electronics_df, entertainment_df, foodService_df, grocery_df, healthBeauty_df, homeGarden_df], axis=0)
embeddedDatabase.to_csv("embeddedDatabase.csv")


In [17]:
def searchInDatabase():
    pass

In [18]:
def getEmbeddedDatabase():
    filePath = f'./embeddedDatabase.csv'
    df = pd.read_csv(filePath)
    df = df.drop('Unnamed: 0', axis=1)
    
    # Creating variables from database values
    X = df.drop('Category', axis=1)
    y = df['Category']
    
    
    return X, y

In [None]:
def getReceiptTestData():
    # Read and parse the JSON file
    with open('./entities.json', 'r') as file:
        data = json.load(file)
        
    # Initialize variables
    entry_number = 0
    current_merchant = None

    # Initialize a list to store the processed entries as dictionaries
    processed_entries = []

    # Iterate through the data
    for entry in data:
        entry_number += 1  # Increment the entry number
        merchant = entry["ReceiptInfo"]["merchant"]
        items = entry["ReceiptInfo"]["ITEMS"]

        # Remove "number+space" occurrences in the descriptions and combine them
        cleaned_descriptions = []
        for item in items:
            description = item.get('description', 'No Description')
            cleaned_description = ' '.join(word for word in description.split() if not word.isdigit())
            cleaned_descriptions.append(cleaned_description)

        # Combine descriptions with spaces
        combined_descriptions = ' '.join(cleaned_descriptions)

        # Remove "UNKNOWN," "<UNKNOWN>," and "unknown" from the merchant field
        merchant = merchant.replace("UNKNOWN", "").replace("<UNKNOWN>", "").replace("unknown", "").replace("<>", "")

        # Remove "UNKNOWN," "<UNKNOWN>," and "unknown" from the combined_descriptions field
        combined_descriptions = combined_descriptions.replace("UNKNOWN", "").replace("<UNKNOWN>", "").replace("unknown", "").replace("<>", "")

        # Create a dictionary for the current entry
        entry_dict = {
            "entry_number": entry_number,
            "merchant": merchant,
            "combined_descriptions": combined_descriptions
        }

        # Append the dictionary to the list
        processed_entries.append(entry_dict)

    merchants = []
    descriptions = []

    for entry in processed_entries:
        merchant = entry["merchant"]
        description = entry["combined_descriptions"]
        merchants.append(merchant)
        descriptions.append(description)
        
    entities_df = pd.DataFrame({
    'Merchants': merchants, 
    'Descriptions': descriptions
    })
    entities_df.to_csv('entities_database.csv', index=0)


In [86]:
def KNN():
    X_train, y_train = getEmbeddedDatabase()
    
    clf = KNeighborsClassifier(n_neighbors=1)
    clf.fit(X_train, y_train)
    
    testindDB = pd.read_csv("entities_database.csv")
    testindDB['Items'] = testindDB['Merchants'].str.cat(testindDB['Descriptions'], sep='')
    receiptEmbeddings = convert_to_embeddings_df(testindDB)
    X_test = receiptEmbeddings.values
    
    return (clf.predict(X_test))
     
#print(type(KNN()))
testindDB = pd.read_csv("entities_database.csv")
merchants = testindDB['Merchants'].to_frame()
results = pd.DataFrame(KNN(), columns=['KNN Prediction'])  # Assuming the array has two columns
result_df = pd.concat([merchants, results], axis=1)
result_df



Unnamed: 0,Merchants,KNN Prediction
0,Halal Gyro Kabob House,Food Service
1,WHOLE FOODS MARKET,Food Service
2,LEAHI HEALTH MANOA,Grocery
3,Longs Drugs,Health Beauty
4,Kozo Sushi,Grocery
5,McDonald's Restaurant #14616,Food Service
6,Longs Drugs,Home Garden
7,H MART,Food Service
8,Longs,Health Beauty
9,Sam's Club,Grocery


In [20]:
categories = ["Grocery/Supermarkets", "Restaurants/Food Services", "Clothing/Apparel", "Health/Beauty", "Electronics/Appliances", "Home/Garden", "Entertainment/Leisure"]

def getVendorCategory(listOfItems, Title):
    #validateInput()
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 3) Same thing as 2 but you have to define the categories for the ingredients


* Use title of item plus something else (ex: category of vendor)

In [21]:
categories = []

def getVendorCategory(jsonObject, listOfItems, Title):
    #validateInput(jsonObject)
    #convertToEmbeddings()
    #seachInDatabase()
    #takeMajority()
    pass

# 4) Create tests in python

* Functions that just test one test and shows that tests passed/failed
* At the end shows how many passed and how many failed

- Example:

     - handleVendor.py
     - all test functions tested in testHandleCategory.py (test all the functions in hangleVendor.py) asserts at the end of each test function

     - Fixtures in test file: testing all of the things that are needed for the code to run

- For part 2, test for:
    - If 7 categories, one of the 7 categories and one of the 7 categories
    - Edge cases (ex: error in formatting, must be string in list of possible categories)