In [1]:
# Mount Google Drive at /content/drive so that later cells can read files
# stored under /content/drive/MyDrive (the LLMCode folder lives there).
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [2]:
#Install the system packages and Python packages used by this notebook.
#Lines starting with "!" are executed as shell commands by Colab/IPython.
#NOTE(review): matplotlib-venn, libarchive, pydot and cartopy are not referenced
#by any other cell visible in this notebook — confirm they are still needed.
!pip install matplotlib-venn
#NOTE(review): the recorded output of this cell shows
#"E: Package 'libfluidsynth1' has no installation candidate", i.e. this line
#currently installs nothing on the runtime image that was used.
!apt-get -qq install -y libfluidsynth1
# https://pypi.python.org/pypi/libarchive
!apt-get -qq install -y libarchive-dev && pip install -U libarchive
import libarchive
# https://pypi.python.org/pypi/pydot
!apt-get -qq install -y graphviz && pip install pydot
import pydot
!pip install cartopy
import cartopy
#NOTE(review): later cells import `google.genai`. It is not obvious that either
#of the next two packages provides that module — presumably it comes from the
#requirements file below; verify and drop whatever is not needed.
!pip install genai
!pip install -U --quiet google-generativeai
#Install the packages listed in the requirements file stored in the LLMCode Drive folder
!pip install -q -r /content/drive/MyDrive/LLMCode/requirements_notebooks.txt

E: Package 'libfluidsynth1' has no installation candidate
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for titlecase (pyproject.toml) ... [?25l[?25hdone


In [3]:
#include the LLMCode toolkits
#Folder on the mounted Drive that contains the `llmcode` package
CODE_PATH = '/content/drive/MyDrive/LLMCode'

import sys, os
#Make the folder importable. append() adds it at the END of the search path,
#so a copy of llmcode found earlier on sys.path would be imported instead.
sys.path.append(CODE_PATH)

import llmcode
#Show which file was actually imported, to confirm the Drive copy is the one in use
print("llmcode loaded from:", llmcode.__file__)

llmcode loaded from: /content/drive/MyDrive/LLMCode/llmcode/__init__.py


In [4]:
#Initial setup code. If you opened this notebook in Colab, this code is hidden
#by default to avoid unnecessary user interface clutter

#-------------------------------------------------------
#User-defined parameters. You can freely edit the values
#Name of the LLM backend; it is passed to llmcode.init() at the end of this cell.
#NOTE(review): "Aalto" can be selected here, but the API-key code further down
#in this cell only handles "OpenAI" and "Gemini" and prints "Invalid API type"
#for anything else — confirm whether "Aalto" is still meant to be supported.
llm_API = "Gemini" # @param ["OpenAI","Aalto","Gemini"]
#Model name passed as `model=` to the llmcode calls in later cells.
#Presumably it has to be a model served by the API selected above — verify.
LLM_model = "gemini-2.5-pro" # @param ["gpt-4o-mini","gpt-4o","gpt-4-turbo","gemini-2.5-pro","gemini-2.5-flash"]


#-------------------------------------------------------------------
#Implementation. Only edit this part if you know what you are doing

#Import packages
import pandas as pd
import numpy as np
from IPython.display import HTML, clear_output
import getpass
import os
import html
import plotly.express as px
import textwrap
import openpyxl
import re
import google.genai as genai


#Jupyter is already running an asyncio event loop => need this hack for async OpenAI API calling
import nest_asyncio
nest_asyncio.apply()

#Ask for an API key interactively unless the matching environment variable is
#already set. Each supported API maps to (environment variable, prompt text).
_api_key_settings = {
    "OpenAI": ("OPENAI_API_KEY", "Please input an OpenAI API key"),
    "Gemini": ("GOOGLE_API_KEY", "Please input an Gemini API key"),
}
_api_key_setting = _api_key_settings.get(llm_API)
if _api_key_setting is None:
    #No key handling is defined for this API name; only report it and carry on
    print(f"Invalid API type: {llm_API}")
else:
    _api_key_env_var, _api_key_prompt = _api_key_setting
    if os.environ.get(_api_key_env_var) is None:
        print(_api_key_prompt)
        #getpass hides the typed key so that it does not end up in the cell output
        api_key = getpass.getpass()
        os.environ[_api_key_env_var] = api_key

#Hand the selected API name over to the LLMCode library
llmcode.init(API=llm_API)

Please input an Gemini API key
··········


In [5]:
#Sanity check: print the version of the google.genai package that is installed.
#The module was already imported by the setup cell; importing it again here
#only makes this cell runnable on its own.
import google.genai as genai
print(genai.__version__)

1.25.0


In [None]:
#Define the prompt and store it in a variable (a container for some data)
#called "my_prompt".
my_prompt="Hi!"

#Call the query_LLM() function from the LLMCode library.
#Functions are pieces of Python code that perform some functionality.
#Here, the query_LLM() function takes in the "prompts" and "model" parameters
#and sends the prompts to the LLM. The "LLM_model" is the model you defined above.
#The LLM response is stored in the "response" variable.
response = llmcode.query_LLM(prompts=my_prompt,
                             model=LLM_model)

#Print out the response.
print("LLM response:")
print(response)

LLM response:
Hello! How can I help you today?


In [None]:
#Test for giving parent content analysis
#-------------------------------------------------------
#User-defined parameters. You can freely edit the values
#Path or URL of the data file. load_data() below accepts .xlsx, .docx and .csv.
data_filename_or_URL="" #@param {type:"string"}
#Optional separate file with examples; when empty, the examples are copied from the data file.
examples_filename_or_URL="" #@param {type:"string", placeholder:"leave this empty to use examples from the data file"}
#Name of the column holding the text that is passed to the LLM (data_col of extract_relevant).
data_column="" #@param {type:"string"}
#Name of the column holding the reference extracts that the LLM output is scored against.
ground_truth_column="" #@param {type:"string"}
#The three integer fields below must always hold a value: an assignment with
#nothing on its right-hand side is a SyntaxError that stops the whole cell from
#running. 0 is only a placeholder — set the real numbers before running.
#Number of leading rows of the data file kept as the working data set `df`.
validation_data=0 #@param {type:"integer"}
#Number of rows directly after those, kept as `df_test`.
test_data=0 #@param {type:"integer"}
#Number of example rows shown in the preview at the end of this cell.
examples_to_view=0 #@param {type:"integer"}

#-------------------------------------------------------------------
#Implementation. Only edit this part if you know what you are doing

#data load helper function
def load_data(filename_or_URL):
  """Load a data table and clean up its text columns.

  Reads the notebook-level variables `data_column` and `ground_truth_column`
  to decide which columns to clean.

  Args:
    filename_or_URL: Path or URL whose name ends in .xlsx, .docx or .csv
      (the extension is matched case-insensitively).

  Returns:
    The loaded DataFrame. `data_column` is converted to str and passed through
    openpyxl's unescape(); `ground_truth_column`, when present, gets the same
    treatment and additionally has every <sup>...</sup> span removed.

  Raises:
    ValueError: If the extension is not one of the supported ones.
    KeyError: If `data_column` is not a column of the loaded table.
  """
  #Pick the reader from the extension. Compare in lower case so that a file
  #named e.g. "DATA.XLSX" is accepted as well.
  lowered_name=filename_or_URL.lower()
  if lowered_name.endswith(".xlsx"):
    df = pd.read_excel(filename_or_URL)
  elif lowered_name.endswith(".docx"):
    df = llmcode.open_docx_and_process_codes(filename_or_URL)
  elif lowered_name.endswith(".csv"):
    df = pd.read_csv(filename_or_URL)
  else:
    raise ValueError(f"File type not supported. Got {filename_or_URL!r}; expected a .xlsx, .docx or .csv file.")

  #Report a missing data column with the names that ARE available, instead of
  #the bare KeyError pandas would raise on the next statement
  if data_column not in df.columns:
    raise KeyError(f"Column {data_column!r} not found in {filename_or_URL!r}; available columns: {list(df.columns)}")

  #Fix a possible Excel import issue
  unescape=openpyxl.utils.escape.unescape
  df[data_column]=df[data_column].astype(str).apply(unescape)

  if ground_truth_column in df.columns:
    ground_truth=df[ground_truth_column].astype(str).apply(unescape)
    #In this notebook, we only focus on the highlights
    #Thus, we remove any codes defined for the highlights enclosed between <sup> and </sup>
    df[ground_truth_column]=ground_truth.str.replace(r'<sup>.*?</sup>', '', regex=True)
  return df

#Read the complete data file once
_df_all=load_data(data_filename_or_URL)

#Split by row position: the first `validation_data` rows form the working set,
#the `test_data` rows that follow are kept aside as the test set
df_test=_df_all.iloc[validation_data:validation_data+test_data]
df=_df_all.head(validation_data)

#Examples come from their own file when one is given, otherwise from a copy of the working set
df_examples=load_data(examples_filename_or_URL) if examples_filename_or_URL else df.copy()

#Preview the first rows of the examples, formatted so that the highlights are
#in bold: first the ground-truth column, then the data column
for _heading,_column in (
  (f"{examples_to_view} first rows of the example data:",ground_truth_column),
  (f"{examples_to_view} first rows of the processed data:",data_column),
):
  print(_heading)
  html_text=llmcode.extracts_to_html(df_examples.head(examples_to_view)[_column])
  display(HTML(html_text))



In [None]:
# June 18, 2025 PDT 15:40
# Model: Gemini 2.5 Pro
# Test dataset capacity: 175
# Example Input: 11
# IoU 0.902 > 0.9
# 11 results IoU = 0
# 7 FN results, 4 FP results

#-------------------------------------------------------
#User-defined parameters. You can freely edit the values

#Number of examples to use from the example data
#Note that the numbering is 0-based, i.e., the rows at positions
#0 ... num_examples-1 of the example data are added to the prompt.
num_examples=11

#Define the prompt beginning. The code below will automatically add the examples.
#The text between the triple quotes is a placeholder: replace it with your own instructions.
improved_prompt=""" SET YOUR PROMPT TO HERE


"""

#-------------------------------------------------------------------
#Implementation. Only edit this part if you know what you are doing


#Add the examples to the prompt

#Fail early with a readable message when more examples are requested than the
#example data contains; otherwise .iloc below stops with an opaque
#"positional indexer is out-of-bounds" error
if num_examples>len(df_examples):
  raise IndexError(f"num_examples={num_examples} exceeds the {len(df_examples)} rows available in df_examples")

#Every example contributes its input text followed by the expected output.
#All pieces are built first and appended in one go.
improved_prompt+="".join(
  f"EXAMPLE INPUT:\n\n{example_row[data_column]}\n\n"
  f"EXAMPLE OUTPUT:\n\n{example_row[ground_truth_column]}\n\n"
  for example_row in (df_examples.iloc[position] for position in range(num_examples))
)

#Closing instructions; the actual input text is expected to follow this block
improved_prompt+="""
Your Task
Now, apply the same logic to the following input.

INPUT:

"""
#improved_prompt+="ACTUAL INPUT:\n\n"



#=====
#call the extract_relevant method with the prompt and data
#The result is stored in df_extracts; the LLM output is expected in its
#"llm_extracts" column (that column is read below and by the PDF export cell).
#NOTE(review): when no separate examples file is given, df_examples is a copy
#of df, so the rows used as few-shot examples in the prompt are also among the
#rows processed and scored here. The average IoU then includes rows whose
#expected output was shown to the model — confirm this is intended.
df_extracts=llmcode.extract_relevant(improved_prompt,
                          df,
                          data_col=data_column,
                          extracts_col="llm_extracts",
                          model=LLM_model
                      )


#calculate the IoU
#extract_IoU returns the IoU values (averaged below) and an HTML report;
#presumably there is one IoU value per row — verify against the library.
IoU,html_report=llmcode.extract_IoU(df_extracts,
                                    extracts_col="llm_extracts",
                                    reference_col=ground_truth_column)

#display the quality report and print out the average IoU
display(HTML(html_report))
print(f"Average IoU = {np.mean(IoU)}")




In [None]:
#-------------------------------------------------------
#User-defined parameters. You can freely edit the values

#Where the PDF with the LLM extracts is written
pdf_filename = "/content/drive/MyDrive/LLMCode/LLM_highlights.pdf"  #@param {type:"string"}

#-------------------------------------------------------------------
#Implementation. Only edit this part if you know what you are doing
#Join all LLM extracts into one markdown document, separated by blank lines.
#NOTE(review): str.join() requires every entry to be a string — confirm that
#extract_relevant never leaves missing values in the "llm_extracts" column.
markdown_output="\n\n".join(df_extracts["llm_extracts"])
from markdown_pdf import MarkdownPdf, Section
#Put the whole document into a single section and save it as a PDF
pdf = MarkdownPdf(toc_level=2)
pdf.add_section(Section(markdown_output))
pdf.save(pdf_filename)
