In [10]:
from Gemini import generatePromptsDictionary
import io
from Fullcontext_main import retrieveCompanyYearReports
from google import genai
client = genai.Client()

companyYearReports = retrieveCompanyYearReports("Airlines", "QuantasAirways", "2024")
prompts = {}

# One record per report, so that every upload handle and its prompts remain
# reachable after the loop. `uploaded_doc` and `prompts` are rebound on each
# iteration, so on their own they only ever describe the LAST report.
uploaded_reports = []

for doc in companyYearReports:
    print(f"Now prompting document: {doc.company_name} {doc.period} {doc.topic} {doc.mimetype} {doc.counter} ")

    # The report content is held in memory, so wrap it in a file-like object
    # and pass the MIME type explicitly alongside it.
    doc_io = io.BytesIO(doc.file_value)
    uploaded_doc = client.files.upload(
        file=doc_io,
        config=dict(mime_type=doc.mimetype),
    )
    print(f"Uploaded doc: {uploaded_doc.name} with MIME type: {uploaded_doc.mime_type}")

    prompts = generatePromptsDictionary(doc)

    uploaded_reports.append({
        "doc": doc,
        "uploaded_doc": uploaded_doc,
        "prompts": prompts,
    })

# NOTE(review): cells that read `uploaded_doc` / `prompts` directly still see
# only the last report processed above. Iterate over `uploaded_reports` to
# build requests for every report.


Download 100 of 2024.pdf application/pdf
Download 100 of 2024.pdf application/pdf
Now prompting document: QuantasAirways 2024 Topic.ESG application/pdf 1 
Uploaded doc: files/oizl47yxa8bt with MIME type: application/pdf
Now prompting document: QuantasAirways 2024 Topic.FINANCIAL application/pdf 1 
Uploaded doc: files/vk1gmtbztotd with MIME type: application/pdf


In [35]:
from Gemini import IndicatorExtraction

# Assemble one batch request per indicator prompt. Each request carries the
# prompt text plus a reference to the file behind `uploaded_doc`, and asks for
# a JSON response shaped by the IndicatorExtraction schema with thoughts
# included in the output.
requests_data = []

for indicator_key, prompt_text in prompts.items():
    prompt_part = {"text": prompt_text}
    document_part = {
        "file_data": {
            "file_uri": uploaded_doc.uri,
            "mime_type": uploaded_doc.mime_type,
        }
    }
    generation_config = {
        "thinking_config": {
            "include_thoughts": True,
            "thinking_budget": -1,
        },
        "response_mime_type": "application/json",
        "response_json_schema": IndicatorExtraction.model_json_schema(),
    }

    # `key` is what ties a line of the batch output back to its indicator.
    requests_data.append({
        "key": indicator_key,
        "request": {
            "contents": [{"parts": [prompt_part, document_part]}],
            "generationConfig": generation_config,
        },
    })

print(requests_data)

[{'key': 'environmental_ex', 'request': {'contents': [{'parts': [{'text': '"Extract the following metric from the provided document:\n      -Money (OpEx, CapEx) used for environmental purposes (EU Taxonomy aligned CapEx and OpEx starting in 2024) of the reporting company QuantasAirways for the year 2024.\n      \n      Metric-specific instructions:\n      OpEx, CapEx used for environmental purposes (Taxonomy aligned). Specifically NOT money invested in carbon credits and offsets, those are already covered in the carbon_credits_offsets indicator. This value may also be specified as total expenses for environmental purposes.\n      From the year 2024 on, you can expect report to include figures of invested CapEx and OpEx that are aligned with climate friendly target according to the EU taxonomy. Prior to this year, general figures for investment in environment causes are sufficient. You should look for tables describing the Operational Expenditure and the Capital Expanditure in the reque

In [36]:
import json

json_file_path = 'batch_requests_with_pdfs.json'

# Serialise the requests as JSONL: one JSON object per line.
print(f"\nCreating JSONL file: {json_file_path}")
with open(json_file_path, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(req) + '\n' for req in requests_data)

print(f"Uploading JSONL file: {json_file_path}")
batch_input_file = client.files.upload(file=json_file_path)
print(f"Uploaded JSONL file: {batch_input_file.name}")

print("\nCreating batch job...")
try:
    batch_job_from_file = client.batches.create(
        model="gemini-2.5-flash",
        src=batch_input_file.name,
        config={
            'display_name': 'quantas_batch_file_with_thinking_corrected_with_json_schema',
        }
    )
except Exception as e:
    print(e)
    # Not every exception type carries a `.message` attribute; reading it
    # unconditionally would raise AttributeError and hide the real error.
    error_message = getattr(e, "message", None)
    if error_message is not None:
        print(error_message)
else:
    # Report success only when creation actually succeeded. Otherwise
    # `batch_job_from_file` is either undefined or left over from an earlier run.
    print(f"Created batch job from file: {batch_job_from_file.name}")
    print("You can now monitor the job status using its name.")


Creating JSONL file: batch_requests_with_pdfs.json
Uploading JSONL file: batch_requests_with_pdfs.json
Uploaded JSONL file: files/2ab9n0303b8d

Creating batch job...
Created batch job from file: batches/11wqncexbtisoa2rrbvc9tqrtcm4qb1wxv25
You can now monitor the job status using its name.


In [6]:
import time

# Use the name of the job you want to check
# e.g., inline_batch_job.name from the previous step
job_name0 = "batches/68ohlbuc7numz7ketcvkwizg4dh8povs8eoa"  # (e.g. 'batches/your-batch-id'); not used in this cell
job_name = "batches/zi9hlczk1uwzljrevytef2qwqw41djv4wfnq"  # (e.g. 'batches/your-batch-id')

# States after which the job will not change any more, so polling can stop.
completed_states = {
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED',
    'JOB_STATE_EXPIRED',
}

batch_job_from_file = client.batches.get(name=job_name)

while batch_job_from_file.state.name not in completed_states:
    print(f"Current state: {batch_job_from_file.state.name}")
    time.sleep(30)  # Wait for 30 seconds before polling again
    # Re-fetch the job on every pass. The object fetched before the loop is
    # never updated by this code, so without this call the loop condition
    # could never change and the loop would never terminate.
    batch_job_from_file = client.batches.get(name=job_name)

print(f"Job finished with state: {batch_job_from_file.state.name}")
if batch_job_from_file.state.name == 'JOB_STATE_FAILED':
    print(f"Error: {batch_job_from_file.error}")

Job finished with state: JOB_STATE_SUCCEEDED


In [40]:
# States in which a batch job has finished and therefore reports an end time.
completed_states = {
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED',
    'JOB_STATE_EXPIRED',
}
timestamp_format = '%Y-%m-%d %H:%M:%S'

print("Listing recent batch jobs:\n")

# The listing does not include inlined responses, so jobs without an output
# file are fetched individually below to find out how many responses they hold.
batches = client.batches.list(config={'page_size': 10})

for job in batches.page:
    print(f"Job Name: {job.name}")
    print(f"  - Display Name: {job.display_name}")
    print(f"  - State: {job.state.name}")
    print(f"  - Create Time: {job.create_time.strftime(timestamp_format)}")
    if job.state.name in completed_states:
        print(f"  - End Time: {job.end_time.strftime(timestamp_format)}")

    destination = job.dest
    if destination is not None and destination.file_name:
        # File-based job: results were written to an output file.
        print(f"  - Type: File-based (Output: {destination.file_name})")
    elif destination is not None:
        # No output file: look the job up individually for inline responses.
        detailed_job = client.batches.get(name=job.name)
        if detailed_job.inlined_responses:
            print("  - Type: Inline ({} responses)".format(len(detailed_job.inlined_responses)))

    print("-" * 20)

Listing recent batch jobs:

Job Name: batches/11wqncexbtisoa2rrbvc9tqrtcm4qb1wxv25
  - Display Name: quantas_batch_file_with_thinking_corrected_with_json_schema
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-12 18:31:09
  - End Time: 2025-09-12 18:39:49
  - Type: File-based (Output: files/batch-11wqncexbtisoa2rrbvc9tqrtcm4qb1wxv25)
--------------------
Job Name: batches/i9m070qk0ebqez5f3my485jreq4r6jrhut3x
  - Display Name: quantas_batch_file_with_thinking_corrected
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-12 18:25:13
  - End Time: 2025-09-12 18:39:55
  - Type: File-based (Output: files/batch-i9m070qk0ebqez5f3my485jreq4r6jrhut3x)
--------------------
Job Name: batches/5dfzbzpknv9dkwjdy3sx1w9g21u409g92aoj
  - Display Name: my-batch-job-with-file-and-thoughts-thinking_config
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-12 16:59:15
  - End Time: 2025-09-12 17:00:40
  - Type: File-based (Output: files/batch-5dfzbzpknv9dkwjdy3sx1w9g21u409g92aoj)
----------

In [41]:
import json

batch_job = client.batches.get(name="batches/11wqncexbtisoa2rrbvc9tqrtcm4qb1wxv25")

if batch_job.state.name != 'JOB_STATE_SUCCEEDED':
    print(f"Job did not succeed. Final state: {batch_job.state.name}")
else:
    # A succeeded file-based job stores its results in a separate file.
    result_file_name = batch_job.dest.file_name
    print(f"Results are in file: {result_file_name}")

    print("\nDownloading and parsing result file content...")
    file_content_bytes = client.files.download(file=result_file_name)
    file_content = file_content_bytes.decode('utf-8')

    # The result file is JSONL: decode each non-empty line on its own and
    # pretty-print it, followed by a separator.
    for raw_line in filter(None, file_content.splitlines()):
        parsed_response = json.loads(raw_line)
        print(json.dumps(parsed_response, indent=2))
        print("-" * 20)

Results are in file: files/batch-11wqncexbtisoa2rrbvc9tqrtcm4qb1wxv25

Downloading and parsing result file content...
{
  "key": "environmental_ex",
  "response": {
    "modelVersion": "gemini-2.5-flash",
    "candidates": [
      {
        "content": {
          "parts": [
            {
              "thought": true,
              "text": "**QuantasAirways 2024 Environmental Expenditure Analysis**\n\nMy task is to locate and quantify \"Money (OpEx, CapEx) used for environmental purposes (EU Taxonomy aligned CapEx and OpEx starting in 2024)\" for Qantas Airways' 2024 reporting. I've focused my search on environmental investments, CapEx, OpEx, and importantly, EU Taxonomy alignment.\n\nI began by searching the document for key terms like \"environmental purposes,\" \"CapEx,\" \"OpEx,\" and \"taxonomy.\" I systematically reviewed the document, meticulously noting any mention of environmental spending or investments. The instructions are specific: exclude carbon credits and offsets, as th