In [6]:
from Gemini import generatePromptsDictionary
import io
from Fullcontext_main import retrieveCompanyYearReports
from google import genai
client = genai.Client()

companyYearReports = retrieveCompanyYearReports("Airlines", "QuantasAirways", "2024")

for doc in companyYearReports:
    print(f"Now prompting document: {doc.company_name} {doc.period} {doc.topic} {doc.mimetype} {doc.counter} ")

    doc_io = io.BytesIO(doc.file_value)
    uploaded_doc =  client.files.upload(
        file=doc_io,
        config=dict(
            mime_type=doc.mimetype)
    )
    print(f"Uploaded doc: {uploaded_doc.name} with MIME type: {uploaded_doc.mime_type}")

    prompts = generatePromptsDictionary(doc)


Download 100 of 2024.pdf application/pdf
Download 100 of 2024.pdf application/pdf
Now prompting document: QuantasAirways 2024 Topic.ESG application/pdf 1 
Uploaded doc: files/txx1gol65y36 with MIME type: application/pdf
Now prompting document: QuantasAirways 2024 Topic.FINANCIAL application/pdf 1 
Uploaded doc: files/oyvoeiwpj5os with MIME type: application/pdf


In [7]:
from Gemini import IndicatorExtraction

requests_data = []

for indicatorID in prompts:
    request = {
        "key": indicatorID,
        "request": {
            "contents": [{
                "parts": [
                    {"text": prompts[indicatorID]},
                    {"file_data": {"file_uri": uploaded_doc.uri, "mime_type": uploaded_doc.mime_type}}
                ]
            }]
        },
        "generation_config": {
            "thinking_config": {
                  "include_thoughts": True,
            },
            "response_mime_type": "application/json",
            "response_schema": IndicatorExtraction.model_json_schema()
        }
    }
    requests_data.append(request)

print(requests_data)

[{'key': 'environmental_ex', 'request': {'contents': [{'parts': [{'text': '"Extract the following Information from the provided document:\n      -Money (OpEx, CapEx) used for environmental purposes (EU Taxonomy aligned CapEx and OpEx starting in 2024)\n\n      Provide the page number, where the respective information was found. \n      Also provide the text section where you found the information.\n      If the Information we are looking for is not disclosed in the document, set the is_disclosed field to 0.\n      The required output format is JSON.\n      Example:\n      {            \n            "is_disclosed": 1, \n            "indicator_id": "environmental_ex":,\n            "value": "1.695",\n            "unit": "SEK Millions",\n            "page_number": "92",\n            "section" : "Capital expenditure of environmentally sustainable activities (Taxonomy-aligned) \n(A.1) 832  Operational expenditure of environmentally sustainable activities (Taxonomy\naligned) (A.1) 863"\n    

In [8]:
import json

json_file_path = 'batch_requests_with_pdfs.json'

print(f"\nCreating JSONL file: {json_file_path}")
with open(json_file_path, 'w') as f:
    for req in requests_data:
        f.write(json.dumps(req) + '\n')

print(f"Uploading JSONL file: {json_file_path}")
batch_input_file = client.files.upload(
    file=json_file_path
    )
print(f"Uploaded JSONL file: {batch_input_file.name}")

print("\nCreating batch job...")
batch_job_from_file = client.batches.create(
    model="gemini-2.5-flash",
    src=batch_input_file.name,
    config={
        'display_name': 'my-batch-job-with-file-and-thoughts-thinking_config',
    }
)
print(f"Created batch job from file: {batch_job_from_file.name}")
print("You can now monitor the job status using its name.")


Creating JSONL file: batch_requests_with_pdfs.json
Uploading JSONL file: batch_requests_with_pdfs.json
Uploaded JSONL file: files/4odmts83t579

Creating batch job...
Created batch job from file: batches/vnb8p71ws0ug2mgdkb6vmo11xlxspy40okyd
You can now monitor the job status using its name.


In [18]:
# Use the name of the job you want to check
# e.g., inline_batch_job.name from the previous step
job_name0 = "batches/68ohlbuc7numz7ketcvkwizg4dh8povs8eoa"  # (e.g. 'batches/your-batch-id')
job_name = "batches/zi9hlczk1uwzljrevytef2qwqw41djv4wfnq"  # (e.g. 'batches/your-batch-id')
batch_job_from_file = client.batches.get(name=job_name)

import time

completed_states = set([
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED',
    'JOB_STATE_EXPIRED',
])

while batch_job_from_file.state.name not in completed_states:
  print(f"Current state: {batch_job_from_file.state.name}")
  time.sleep(30) # Wait for 30 seconds before polling again

print(f"Job finished with state: {batch_job_from_file.state.name}")
if batch_job_from_file.state.name == 'JOB_STATE_FAILED':
    print(f"Error: {batch_job_from_file.error}")

Current state: JOB_STATE_PENDING


KeyboardInterrupt: 

In [14]:
completed_states = set([
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED',
    'JOB_STATE_EXPIRED',
])

print("Listing recent batch jobs:\n")

# Note: The list API currently doesn't return inlined_responses.
# As a workaround,you can make a `get` call for inline jobs to see their results.
batches = client.batches.list(config={'page_size': 10})

for b in batches.page:
    print(f"Job Name: {b.name}")
    print(f"  - Display Name: {b.display_name}")
    print(f"  - State: {b.state.name}")
    print(f"  - Create Time: {b.create_time.strftime('%Y-%m-%d %H:%M:%S')}")
    if b.state.name in completed_states:
        print(f"  - End Time: {b.end_time.strftime('%Y-%m-%d %H:%M:%S')}")




    # Check if it was an inline job (no destination file)
    if b.dest is not None:
      if not b.dest.file_name:
        full_job = client.batches.get(name=b.name)
        if full_job.inlined_responses:
            print("  - Type: Inline ({} responses)".format(len(full_job.inlined_responses)))
      else:
          print(f"  - Type: File-based (Output: {b.dest.file_name})")

    print("-" * 20)

Listing recent batch jobs:

Job Name: batches/vnb8p71ws0ug2mgdkb6vmo11xlxspy40okyd
  - Display Name: my-batch-job-with-file-and-thoughts-thinking_config
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-11 16:44:56
  - End Time: 2025-09-11 17:48:30
  - Type: File-based (Output: files/batch-vnb8p71ws0ug2mgdkb6vmo11xlxspy40okyd)
--------------------
Job Name: batches/f9twk9y9jhwrgfyjrpq9h3ucnqr5cypgorio
  - Display Name: my-batch-job-with-file-and-thoughts
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-11 00:14:25
  - End Time: 2025-09-11 03:29:17
  - Type: File-based (Output: files/batch-f9twk9y9jhwrgfyjrpq9h3ucnqr5cypgorio)
--------------------
Job Name: batches/zi9hlczk1uwzljrevytef2qwqw41djv4wfnq
  - Display Name: my-batch-job-with-file
  - State: JOB_STATE_SUCCEEDED
  - Create Time: 2025-09-10 23:57:10
  - End Time: 2025-09-11 03:16:01
  - Type: File-based (Output: files/batch-zi9hlczk1uwzljrevytef2qwqw41djv4wfnq)
--------------------
Job Name: batches/68ohlbuc7numz7k

In [13]:
import json

batch_job = client.batches.get(name="batches/vnb8p71ws0ug2mgdkb6vmo11xlxspy40okyd")

if batch_job.state.name == 'JOB_STATE_SUCCEEDED':
    # The output is in another file.
    result_file_name = batch_job.dest.file_name
    print(f"Results are in file: {result_file_name}")

    print("\nDownloading and parsing result file content...")
    file_content_bytes = client.files.download(file=result_file_name)
    file_content = file_content_bytes.decode('utf-8')

    # The result file is also a JSONL file. Parse and print each line.
    for line in file_content.splitlines():
      if line:
        parsed_response = json.loads(line)
        # Pretty-print the JSON for readability
        print(json.dumps(parsed_response, indent=2))
        print("-" * 20)
else:
    print(f"Job did not succeed. Final state: {batch_job.state.name}")

Results are in file: files/batch-vnb8p71ws0ug2mgdkb6vmo11xlxspy40okyd

Downloading and parsing result file content...
{
  "generation_config": {
    "response_schema": {
      "title": "IndicatorExtraction",
      "required": [
        "indicator_id",
        "value",
        "unit",
        "page_number",
        "section"
      ],
      "type": "object",
      "properties": {
        "value": {
          "type": "string",
          "title": "Value"
        },
        "indicator_id": {
          "title": "Indicator Id",
          "type": "string"
        },
        "isDisclosed": {
          "type": "integer",
          "title": "Isdisclosed",
          "default": 1
        },
        "page_number": {
          "title": "Page Number",
          "type": "string"
        },
        "section": {
          "title": "Section",
          "type": "string"
        },
        "unit": {
          "type": "string",
          "title": "Unit"
        }
      }
    },
    "thinking_config": {
     