# Notebook Setup
<a target="_blank" href="https://colab.research.google.com/github/PacktPublishing/Generative-AI-Integration-Patterns-1E/blob/main/01-Batch-Metadata/Chapter_5_batch_metadata_extraction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
#Install dependencies

!pip install --upgrade google-cloud-aiplatform

In [None]:
#Authenticate
!gcloud auth application-default login

In [None]:
#General Imports

import base64
import json

#GCP Imports
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models


In [None]:
# Notebook parameters: the GCP project and region passed to vertexai.init(),
# and the model name passed to GenerativeModel(). Replace the PROJECT
# placeholder with a real project ID before running the cells below.
PROJECT = "YOUR-GCP-PROJECT"#@param {type:"string"}
LOCATION = "us-central1"#@param {type:"string"}
MODEL = "gemini-1.5-pro-001"#@param {type:"string"}

# Function Definitions

In [None]:
# Decoding parameters forwarded to the model on every generate() call.
generation_config = dict(
    max_output_tokens=8192,
    temperature=0,
    top_p=0.95,
)

# Every listed harm category shares the same BLOCK_ONLY_HIGH threshold.
safety_settings = dict.fromkeys(
    (
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT,
    ),
    generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH,
)

def generate(prompt_template_beggining,document,prompt_template_end):
    """Run a single, non-streaming generation request against MODEL.

    The three arguments are sent to the model in order, so the document
    ends up between the two prompt halves.

    Args:
        prompt_template_beggining: Prompt text placed before the document.
        document: The document content to analyze.
        prompt_template_end: Prompt text placed after the document.

    Returns:
        Whatever ``GenerativeModel.generate_content`` returns for the request.
    """
    vertexai.init(project=PROJECT, location=LOCATION)
    contents = [prompt_template_beggining, document, prompt_template_end]
    response = GenerativeModel(MODEL).generate_content(
        contents,
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=False,
    )
    return response


# Entry Point

In [None]:
#In this section we will define the documents we are going to ingest.
#For this example we are going to download a file directly.
#In the case of a Cloud Function, you can leverage the object that triggered the function

!wget https://s1.q4cdn.com/806093406/files/doc_downloads/2021/08/Nike10k2021.pdf -O document.pdf


In [None]:
#Load pdf file

def get_file(file_path, mime_type="application/pdf"):
    """Read a local file and wrap its raw bytes as a model input part.

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type declared for the data. Defaults to
            "application/pdf", so existing single-argument calls behave
            exactly as before.

    Returns:
        The object produced by ``Part.from_data`` for the file's bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the bytes are handed to the SDK untouched.
    with open(file_path, "rb") as file:
        file_data = file.read()
    return Part.from_data(mime_type=mime_type, data=file_data)

pdf_data = get_file("document.pdf")

# Prompt Preprocessing

In [None]:
#In this section we define the prompt in two parts. The document is sent to the model between them, so it sits inside the <document> tags.
prompt_template_beggining = """
You are an expert business analyst specialized in 10k documents.

Your task is to extract information from 10K documents, to achieve this task, follow these steps:

1. Carefully analyze the document provided as context.
2. Use the template as a reference to understand which data points to extract.
3. Never make up information, if you don't remember, go back to read the document. If the data is not available, add "Not available" as the value.
4. Return a well formed JSON document following the template using the data from the document.
5. Always think step by step.
<template>
{
  "10k_template": {
    "part_I": {
      "item_1_business": {
        "description": "Company’s main products and services, subsidiaries, and markets",
        "recent_events": null,
        "competition": null,
        "regulations": null,
        "labor_issues": null,
        "operating_costs": null,
        "seasonal_factors": null,
        "summary": null
      },
      "item_1a_risk_factors": {
        "risk_factors": [],
        "summary": null
      },
      "item_1b_unresolved_staff_comments": {
        "comments": [],
        "summary": null
      },
      "item_2_properties": {
        "properties": [],
        "summary": null
      },
      "item_3_legal_proceedings": {
        "proceedings": [],
        "summary": null
      },
      "item_4": {
        "reserved": true
      }
    },
    "part_II": {
      "item_5_market_for_registrants_common_equity": {
        "market_information": null,
        "number_of_holders": null,
        "dividends": null,
        "stock_repurchases": null,
        "summary": null
      },
      "item_6_selected_financial_data": {
        "financial_data": {},
        "summary": null
      },
      "item_7_managements_discussion_and_analysis": {
        "operations_and_financial_results": null,
        "liquidity_and_capital_resources": null,
        "trends_and_uncertainties": null,
        "critical_accounting_judgments": null,
        "off_balance_sheet_arrangements": null,
        "contractual_obligations": null,
        "summary": null
      },
      "item_7a_quantitative_and_qualitative_disclosures_about_market_risk": {
        "market_risk_exposures": null,
        "risk_management": null,
        "summary": null
      },
      "item_8_financial_statements_and_supplementary_data": {
        "income_statement": null,
        "balance_sheets": null,
        "statement_of_cash_flows": null,
        "statement_of_stockholders_equity": null,
        "notes_to_financial_statements": null,
        "auditors_report": null,
        "summary": null
      },
      "item_9_changes_in_and_disagreements_with_accountants": {
        "changes_in_accountants": null,
        "disagreements": null,
        "summary": null
      },
      "item_9a_controls_and_procedures": {
        "disclosure_controls_and_procedures": null,
        "internal_control_over_financial_reporting": null,
        "summary": null
      },
      "item_9b_other_information": {
        "other_information": null,
        "summary": null
      }
    },
    "part_III": {
      "item_10_directors_executive_officers_and_corporate_governance": {
        "directors_and_executive_officers": [],
        "code_of_ethics": null,
        "board_qualifications": null,
        "summary": null
      },
      "item_11_executive_compensation": {
        "compensation_policies_and_programs": null,
        "executive_compensation": {},
        "summary": null
      },
      "item_12_security_ownership": {
        "ownership_information": {},
        "equity_compensation_plans": null,
        "summary": null
      },
      "item_13_certain_relationships_and_related_transactions": {
        "relationships_and_transactions": [],
        "director_independence": null,
        "summary": null
      },
      "item_14_principal_accountant_fees_and_services": {
        "fees_for_services": {},
        "summary": null
      }
    },
    "part_IV": {
      "item_15_exhibits_financial_statement_schedules": {
        "exhibits": [],
        "financial_statement_schedules": null,
        "summary": null
      }
    }
  }
}

</template>
<document>
"""
# Closing half of the prompt: terminates the <document> section that
# prompt_template_beggining opens and ends with a "Response:" cue.
prompt_template_end="""
</document>

Response:

"""

# Inference

In [None]:
# Run the extraction: the PDF part is sent between the two prompt halves.
# The raw response object is printed for inspection before any parsing.
result = generate(prompt_template_beggining,pdf_data,prompt_template_end)
print(result)

# Result Postprocessing

In [None]:
def _parse_model_json(text):
    """Parse JSON from model output, tolerating text around the JSON object.

    The text is first parsed as-is. If that fails, everything outside the
    outermost ``{ ... }`` pair (for example a Markdown code fence or
    introductory prose) is dropped and parsing is retried once.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            # No brace-delimited candidate to retry with; surface the
            # original parse error.
            raise
        return json.loads(text[start:end + 1])


# Reset first so that a failed parse never leaves `data` undefined (or stale
# from an earlier run of this cell) for the cells that follow.
data = None
try:
    data = _parse_model_json(result.text)
except json.JSONDecodeError as err:
    print("Error: Invalid JSON data")
    print(f"Details: {err}")
else:
    print(data['10k_template'])
    # Ingest the data in a database here. This can be a full ingestion in the
    # case of a document DB, or it may require additional post-processing.


# Result Presentation

In [None]:
#In this case we do not expose the results, since they are intended to be ingested, but the following is an example of how to display the contents of the JSON document

def traverse_json(data, path="", indent=""):
    """Print a nested JSON-like value as an indented tree of slash-separated paths.

    Each dict key or list index is printed on its own header line as
    ``<path>:``; its content is then printed one level deeper (two extra
    spaces). A scalar is printed as ``<path>: <value>``.

    Args:
        data: The value to walk (dict, list, or scalar).
        path: Path of ``data`` within the overall structure (optional).
        indent: Whitespace prefix for the current nesting level (optional).
    """
    if isinstance(data, dict):
        children = data.items()
    elif isinstance(data, list):
        # List positions become string path segments.
        children = ((str(position), element) for position, element in enumerate(data))
    else:
        # Leaf value (string, number, boolean, None, ...).
        print(f"{indent}{path}: {data}")
        return

    deeper_indent = indent + "  "
    for label, child in children:
        child_path = f"{path}/{label}" if path else label
        print(f"{indent}{child_path}:")
        traverse_json(child, child_path, deeper_indent)



# Print the entire JSON structure with indentation.
# NOTE: `data` is assigned in the post-processing cell; that cell must have
# parsed the model output successfully before this line is run.
traverse_json(data)
