# Homework 1

### Installing packages

In [7]:
!pip install langchain_google_genai


Collecting langchain_google_genai
  Downloading langchain_google_genai-4.2.0-py3-none-any.whl.metadata (2.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-genai<2.0.0,>=1.56.0 (from langchain_google_genai)
  Downloading google_genai-1.59.0-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting google-auth<3.0.0,>=2.47.0 (from google-auth[requests]<3.0.0,>=2.47.0->google-genai<2.0.0,>=1.56.0->langchain_google_genai)
  Downloading google_auth-2.47.0-py3-none-any.whl.metadata (6.4 kB)
Downloading langchain_google_genai-4.2.0-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_genai-1.59.0-py3-none-any.whl (

## Setup your API key

To run the following cell, your API key must be stored in a Colab Secret named `VERTEX_API_KEY`.


1.   Look for the key icon on the left panel of your colab.
2.   Under `Name`, create `VERTEX_API_KEY`.
3. Copy your key to `Value`.




In [2]:
from google.colab import userdata
# Read the API key from the Colab secret named VERTEX_API_KEY so the key itself
# is never written into the notebook source.
# NOTE(review): the cell that builds `llm` calls userdata.get again instead of
# using this variable.
GEMINI_VERTEX_API_KEY = userdata.get('VERTEX_API_KEY')

## Downloading receipts.zip
The code below downloads and unzips receipts.zip from Google Drive. receipts.zip contains all images from the Fusion folder on BlackBoard.


In [11]:
import gdown

# Google Drive ID of the shared receipts.zip archive.
file_id = "1oe2FZd3ZTO7nrDqjCafNvxicl08oF8JF"

# Download endpoint for that file ID.
download_url = "https://drive.google.com/uc?id=" + file_id

# Save the archive as receipts.zip in the working directory. The call is the
# last expression of the cell, so its return value is what the cell displays.
gdown.download(download_url, "receipts.zip", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1oe2FZd3ZTO7nrDqjCafNvxicl08oF8JF
To: /content/receipts.zip
100%|██████████| 1.61M/1.61M [00:00<00:00, 98.9MB/s]


'receipts.zip'

In [12]:
!unzip receipts.zip

Archive:  receipts.zip
  inflating: receipt1.jpg            
  inflating: __MACOSX/._receipt1.jpg  
  inflating: receipt2.jpg            
  inflating: __MACOSX/._receipt2.jpg  
  inflating: receipt3.jpg            
  inflating: __MACOSX/._receipt3.jpg  
  inflating: receipt4.jpg            
  inflating: __MACOSX/._receipt4.jpg  
  inflating: receipt5.jpg            
  inflating: __MACOSX/._receipt5.jpg  
  inflating: receipt6.jpg            
  inflating: __MACOSX/._receipt6.jpg  
  inflating: receipt7.jpg            
  inflating: __MACOSX/._receipt7.jpg  


## 1. Helper functions

We need two functions:
* `image_to_base64` converts your jpg image into a Base64-encoded string (text drawn from a 64-character alphabet, which makes the image easy to transfer via an API).
* `get_image_data_url` takes your jpg image, converts it into a Base64 string, and constructs a data URL suitable as input for a Gemini API call.

In [4]:
import base64
import mimetypes

# Helper function to read and encode image
def image_to_base64(img_path):
    """Return the contents of the file at ``img_path`` as a Base64 string.

    The file is opened in binary mode and read in full; the Base64 bytes are
    then decoded as UTF-8 so the result is a ``str``.
    """
    with open(img_path, "rb") as img_file:
        raw_bytes = img_file.read()
    return str(base64.b64encode(raw_bytes), 'utf-8')

# Helper function to encode local file to Base64 Data URL
def get_image_data_url(image_path):
    """Build a ``data:<mime>;base64,<payload>`` URL for the file at ``image_path``.

    The MIME type is guessed from the file extension; when no type can be
    guessed, ``image/png`` is used. The payload is the Base64 text returned
    by ``image_to_base64``.
    """
    guessed_type = mimetypes.guess_type(image_path)[0]
    mime_type = "image/png" if guessed_type is None else guessed_type

    payload = image_to_base64(image_path)

    return "data:" + mime_type + ";base64," + payload

In [8]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Ensure VERTEX_API_KEY is set in Colab secrets before running this cell.
vertex_api_key = userdata.get('VERTEX_API_KEY')

# Shared chat model used by every chain in this notebook.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=vertex_api_key,
    temperature=0,
)

Display the jpg images. Alternatively, open the folder icon on the left panel to see the images.

In [10]:
from IPython.display import HTML, display
import glob, os

# Every .jpg in the working directory, in sorted order.
image_paths = sorted(glob.glob("*.jpg"))

# One vertical column (image + file name) per receipt. Each image is embedded
# as Base64 directly in the <img> tag.
cards = []
for path in image_paths:
    encoded = image_to_base64(path)
    label = os.path.basename(path)  # show just the name, not the path
    cards.append(f'''
    <div style="display: flex; flex-direction: column; align-items: center;">
        <img src="data:image/jpeg;base64,{encoded}" style="height: 300px; border: 1px solid #ddd; margin-bottom: 5px;"/>
        <span style="font-family: monospace; font-size: 14px;">{label}</span>
    </div>
    ''')

# Wrap the columns in a flex container that wraps onto new rows.
html_content = (
    '<div style="display: flex; flex-wrap: wrap; gap: 20px;">'
    + "".join(cards)
    + '</div>'
)

display(HTML(html_content))

## 2. Image input to Gemini
Unlike text, an image must be converted into a Base64-encoded string and then formatted as a data URL before it is passed to the language model. This makes image input easy to transfer through the API.

You can find more about the appropriate format for image-type data at this [link](https://docs.langchain.com/oss/python/langchain/messages).



In [13]:
from langchain_core.prompts import ChatPromptTemplate

# Multimodal prompt: a free-form question, the receipt image as a data URL, and
# an instruction to wrap the answer in ----begin---- / ----end---- markers so
# that later prompts can tell the receipts apart.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant.You analyze receipt content without missing"),
    ("human", [
        {"type": "text", "text": "{question}"},
        {"type": "image_url", "image_url": {"url": "{image_url}"}},
        {"type": "text", "text": "add a line of ----begin---- indicate the beginning of the response and a create a new line ----end---- at the end  of the response"}
    ]),
])

chain = prompt | llm

# receipt1.jpg ... receipt7.jpg are the images extracted from receipts.zip.
NUM_RECEIPTS = 7

receipt_sections = []
for i in range(1, NUM_RECEIPTS + 1):
    # Relative path: the archive is downloaded and unzipped into the working
    # directory, so do not assume that directory is /content.
    image_path = f"receipt{i}.jpg"
    image_data_url = get_image_data_url(image_path)
    response = chain.invoke({
        "question": "What is in this picture?",
        "image_url": image_data_url
    })
    # The transcript layout is kept exactly as before because the later prompts
    # were run against it: the first receipt has no label, and every later one
    # is prefixed with "receipt session N" directly followed by the answer.
    if i == 1:
        receipt_sections.append(response.content)
    else:
        receipt_sections.append(f"receipt session {i}" + response.content)

# One receipt per line-separated section.
receipt_content = "\n".join(receipt_sections)

print(receipt_content)

----begin----
This picture is a receipt from a grocery or supermarket, likely in Hong Kong given the currency and language.

Here's a breakdown of the content:

**Items Purchased:**

*   **084213 韭菜豬肉雲吞20粒裝 (Chive Pork Wonton 20 pcs)**
    *   Quantity: 1
    *   Price: $24.90
    *   Discount: -$12.40
*   **084213 韭菜豬肉雲吞20粒裝 (Chive Pork Wonton 20 pcs)**
    *   Quantity: 1
    *   Price: $24.90
    *   Discount: -$12.40
*   **084213 韭菜豬肉雲吞20粒裝 (Chive Pork Wonton 20 pcs)**
    *   Quantity: 1
    *   Price: $24.90
    *   Discount: -$12.40
*   **395092 IF100% COCONUT WATER**
    *   Quantity: 2
    *   Price: $57.80
    *   Discount (Buy 2 Save $12.8): -$12.80
*   **044228 玉芒 (Mango)**
    *   Quantity: 3
    *   Price: $29.70
    *   Discount (Buy 3 Save $10.8): -$10.80
*   **490948 Fresh綜合莓汁 (Fresh Mixed Berry Juice)**
    *   Quantity: 2
    *   Price: $35.80
    *   Discount (Buy 2 Save $3.9): -$3.90
*   **126894 雀巢脫脂高鈣牛奶飲品 (Nestle Skimmed High Calcium Milk Drink)**
    *   Quantit

## 3. Evaluation Code

* Make sure your LLM returns a single float as the answer, stored in `query1_answer` and `query2_answer`.
* Run the following code blocks: if they do not raise any error, your chain design is correct. Otherwise, please check your chain design.

* Do not modify `query_1_costs` and `query_2_costs`

In [14]:
def test_query(answer, ground_truth_costs):
    # Convert string to float if necessary
    if isinstance(answer, str):
        answer = float(answer)

    # Calculate the ground truth sum once for clarity
    expected_total = sum(ground_truth_costs)

    # Check if the answer is within +/- $2 of the expected total
    assert abs(answer - expected_total) <= 2

In [15]:
# Second-pass prompt: takes an earlier model answer through the
# {payment_without_discount} placeholder and asks for a bare summed value.
prompt3_human_texts = [
    "Here are the receipts' payment without content for each receipt: {payment_without_discount}",
    "return only the sumed string value without the $ notation,",
]

prompt3 = ChatPromptTemplate.from_messages([
    ("system", "You are a result interpretation assistant. You recognize the receipt information from each receipt and sum it up, return with a simple string sum value"),
    ("human", [{"type": "text", "text": text} for text in prompt3_human_texts]),
])

# Chain that turns a verbose answer into a single numeric string.
result_trans = prompt3 | llm


Run the following code block to evaluate query 1:
> How much money did I spend in total for these bills?

In [16]:
query_1_costs = [394.7, 316.1, 140.8, 514.0, 102.3, 190.8, 315.6] # do not modify this

# Prompt for query 1: the extracted receipt transcript is passed in as text
# through {receipt_text_input}, and the model is asked for the summed total.
prompt1 = ChatPromptTemplate.from_messages([
    ("system", "You are a cost saving assistant. You analyze receipt text and return the total amount spent as a float. Make sure the output is a single float number."),
    ("human", [
        {"type": "text", "text": "Here are the receipts' content extracted from the images: {receipt_text_input}"},
         {"type": "text", "text": "I separate the session through ----begin---- and ----end---- which denote the beginning and end of a receipt session respectively"},
         {"type": "text", "text": "Try to recognize and use the key feature with total payment amount or total paid for submission，notice that each session will have 1 total amount spended"},
        {"type": "text", "text": "sum the total amount spent from each extracted receipts session. Return the string summed value which could be converted into float through float function, only remain the total sum value without the calculation process."}

    ]),
])



full_query1_pipeline = prompt1 | llm
query1_answer_raw_message = full_query1_pipeline.invoke({"receipt_text_input":receipt_content})
# Hand only the answer text to the second pass. Formatting the message object
# itself into the template would insert its string form rather than just the
# answer.
result_transformed=result_trans.invoke({"payment_without_discount":query1_answer_raw_message.content})
# float() raises ValueError if the model returned anything but a bare number.
query1_answer=float(result_transformed.content)
print(query1_answer)

1974.3


In [17]:
# Raises AssertionError unless query1_answer is within $2 of sum(query_1_costs).
test_query(query1_answer, query_1_costs)

In [18]:
# Reference total for query 1, shown for comparison with the printed query1_answer.
sum(query_1_costs)

1974.3

Run the following code block to evaluate query 2:
> How much would I have had to pay without the discount?

In [19]:
# Reference amounts for query 2 (cost without discounts); their sum is what
# test_query compares query2_answer against.
query_2_costs = [480.20, 392.20, 160.10, 590.80, 107.70, 221.20, 396.00] # do not modify this

In [26]:

# Prompt for query 2: receives the discount-focused summary through
# {receipt_text_input} and asks for the pre-discount amount per receipt session.
prompt2_human_texts = [
    "Here are the receipts' discount related content extracted from the images: {receipt_text_input}",
    "I separate the session through ----begin---- and ----end---- which denote the beginning and end of a receipt session respectively",
    "You check both the detail of item listed and the discount part if have. Then you compare whether there are one discount being added twice ",
    "Don not recognize as discount if the sign start with -, checking with the item include the keyword like discount,%off and other similar meaning expression.",
    "add up and check the amount to pay without the discount for each receipt session. Don't add back one discount twice",
]

prompt2 = ChatPromptTemplate.from_messages([
    ("system", "You are a discount helpful assistant. You prudent review every item in each receipt session and calculate the amount to pay without the discount"),
    ("human", [{"type": "text", "text": text} for text in prompt2_human_texts]),
])

full_query2_pipeline = prompt2 | llm

# Step 1: have the model reduce the full transcript to the discount-related
# lines and the total payment of each session.
discount_request = f"Please acrroding to the {receipt_content}, keep the item and part related to discount, and the total payment for each session. This response will be used for payment without discount calculation, so please be careful and utilize the keyward for recognition and extraction. The session are separated in a --begin-- --end-- pattern, please follow this pattern to generate response"
discount_info = llm.invoke(discount_request)

# Step 2: compute the per-session pre-discount amounts from that summary.
query2_answer_raw_message = full_query2_pipeline.invoke(
    {"receipt_text_input": discount_info.content}
)

# Last expression of the cell: show the model's per-session breakdown.
query2_answer_raw_message.content

'Here is the amount to pay without the discount for each receipt session:\n\n**Receipt Session 1:**\n*   Total Payment: $394.70\n*   Discounts: $12.40 + $12.40 + $12.40 + $12.80 + $10.80 + $3.90 + $20.78 = $85.48\n*   Amount to pay without discount: $394.70 + $85.48 = **$480.18**\n\n**Receipt Session 2:**\n*   Total Payment: $316.10\n*   Discounts: $9.80 + $6.00 + $5.90 + $5.80 + $2.00 + $10.00 + $20.00 + $16.59 = $76.09\n*   Amount to pay without discount: $316.10 + $76.09 = **$392.19**\n\n**Receipt Session 3:**\n*   Total Payment: $140.80\n*   Discounts: $4.00 + $7.80 + $7.42 = $19.22\n*   Amount to pay without discount: $140.80 + $19.22 = **$160.02**\n\n**Receipt Session 4:**\n*   Total Payment: $514.00\n*   Discounts: $4.00 + $7.80 + $6.38 + $28.53 + $30.00 = $76.71\n*   Amount to pay without discount: $514.00 + $76.71 = **$590.71**\n\n**Receipt Session 5:**\n*   Total Payment: $102.30\n*   Discounts: $5.39\n*   Amount to pay without discount: $102.30 + $5.39 = **$107.69**\n\n**Rec

In [27]:
# Second-pass prompt for query 2: condenses the per-session breakdown into a
# single summed value. The wording is identical to prompt3.
prompt4 = ChatPromptTemplate.from_messages([
    ("system", "You are a result interpretation assistant. You recognize the receipt information from each receipt and sum it up, return with a simple string sum value"),
    ("human", [
        {"type": "text", "text": "Here are the receipts' payment without content for each receipt: {payment_without_discount}"},
        {"type": "text", "text": "return only the sumed string value without the $ notation,"}

    ]),
])
result_trans2 = prompt4|llm
# Hand only the answer text to the second pass. Formatting the message object
# itself into the template would insert its string form rather than just the
# breakdown.
result_transformed = result_trans2.invoke({"payment_without_discount":query2_answer_raw_message.content})
# float() raises ValueError if the model returned anything but a bare number.
query2_answer=float(result_transformed.content)

In [30]:
# Raises AssertionError unless query2_answer is within $2 of sum(query_2_costs).
test_query(query2_answer, query_2_costs)

In [31]:
# Display the final query 2 total as the cell's output.
query2_answer

2347.86