## Install Additional Packages

In [None]:
!pip install transformers==4.49
!pip install sentence-transformers
!pip install langchain
!pip install langchain_community
!pip install faiss-cpu
!pip install ragatouille
!pip install "unstructured[all-docs]"
!pip install rouge-score
!pip install bert-score

Collecting unstructured[all-docs]
  Using cached unstructured-0.18.3-py3-none-any.whl.metadata (24 kB)
Collecting python-magic (from unstructured[all-docs])
  Using cached python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured[all-docs])
  Using cached emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured[all-docs])
  Using cached python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured[all-docs])
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured[all-docs])
  Using cached rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting backoff (from unstructured[all-docs])
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting unstructured-client (from unstructured[all-docs])
  Downloading unstructured_client-0.38.1-py3-no

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


## Imports

In [None]:
import gc
import inspect
import json
import os
import re
import time
from datetime import datetime
from functools import wraps
from typing import Optional, List, Tuple, Any, Union, Pattern, Dict, Callable, Type

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from bert_score import score
from huggingface_hub import login
from langchain.docstore.document import Document as LangchainDocument
from langchain.embeddings.base import Embeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from ragatouille import RAGPretrainedModel
from requests.exceptions import HTTPError
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
pd.set_option("display.max_colwidth", None)

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


## Login to HuggingFace

In [None]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Tokenisers

In [None]:
# NOTE: only one tokeniser was implemented in these experiments due to time constraints. Future work would implement multiple tokenisers to compare performance between them
# The same model id is used both to count tokens when chunking (split_documents) and to load the embedding model further down

# LARGE_TOKENISER_MODEL_NAME = "????"
MEDIUM_TOKENISER_MODEL_NAME = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
# SMALL_TOKENISER_MODEL_NAME = "intfloat/multilingual-e5-large-instruct"

## LLMs

In [None]:
# Model names are defined once as constants because they are re-used throughout the file
# The LARGE / MEDIUM / SMALL prefixes follow the parameter counts in the names (7B / 3B / 1.5B)

LARGE_FINETUNED_LLM_MODEL_NAME = "johndennehy101/Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1"
LARGE_LLM_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
MEDIUM_LLM_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
SMALL_LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

## Prompt Types

In [None]:
# Defining prompt types
# Values are lower case with single spaces: get_prompt_by_type and get_post_process_function_by_prompt_type
# strip and lower-case the requested type before using it as a dictionary key

ZERO_SHOT_PROMPT_TYPE = "zero shot"
PERSONA_PROMPT_TYPE = "persona"
FEW_SHOT_PROMPT_TYPE = "few shot"
INSTRUCTION_PROMPT_TYPE = "instruction"
CHAIN_OF_THOUGHT_PROMPT_TYPE = "chain of thought"
TREE_OF_THOUGHT_PROMPT_TYPE = "tree of thought"

## Common variables

In [None]:
# These were implemented to ease testing (for chunk size for RAG & path to selected vector db)
# NOTE(review): VECTOR_DB_PATH hard-codes both the embedding model name and the chunk size ("..._512_chunk");
# it is not derived from CHUNK_SIZE / MEDIUM_TOKENISER_MODEL_NAME, so update it by hand when either of those changes

CHUNK_SIZE = 512
VECTOR_DB_PATH = "/content/faiss_vector_store_Alibaba-NLP_gte-Qwen2-1_512_chunk"

## Utility Functions

In [None]:
def get_device() -> str:
    """
    Determines what device is available (e.g. running on colab gpu instance should return cuda whereas local run might return cpu)

    Backends are probed in the order CUDA, MPS, OpenCL and the first one reporting itself as available wins.
    `torch.backends.mps` and `torch.backends.opencl` are looked up defensively because not every torch build
    exposes them: a missing backend is treated as unavailable instead of raising AttributeError, which keeps
    the CPU fallback reachable.

    Args:
        None

    Returns:
        str: device string ("cuda", "mps", "opencl" or "cpu")
    """

    def backend_is_available(backend_name: str) -> bool:
        # Missing backend module or missing is_available() both count as "not available"
        backend = getattr(torch.backends, backend_name, None)
        is_available = getattr(backend, "is_available", None)
        return bool(is_available()) if callable(is_available) else False

    # Print statements for easier visibility on what is being used
    if torch.cuda.is_available():
        device = "cuda"
        print("Using CUDA")
    elif backend_is_available("mps"):
        device = "mps"
        print("Using MPS")
    elif backend_is_available("opencl"):
        device = "opencl"
        print("Using OpenCL")
    else:
        device = "cpu"
        print("Using CPU")
    return device

In [None]:
# These were the only markdown separators tested which appeared to split the documents reasonably well
# This is another area where additional work could be completed to test if utilising different separators would lead to better performance for RAG
# The list is ordered from coarsest (headings, code fences, horizontal rules) to finest (single characters)
# NOTE(review): several entries are written as regular expressions ("#{1,6}", "\\*\\*\\*+", "---+", "___+"), but
# split_documents does not pass is_separator_regex=True to the splitter - presumably they are therefore matched
# as literal text; confirm against the RecursiveCharacterTextSplitter documentation before relying on them
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: Optional[str] = MEDIUM_TOKENISER_MODEL_NAME,
) -> List[LangchainDocument]:
    """
    Chunk every document in the knowledge base and return the chunks with duplicate texts removed.

    Chunk length is measured with the Hugging Face tokenizer named by `tokenizer_name`, and consecutive
    chunks overlap by 10% of `chunk_size` to reduce the chance of losing context at chunk boundaries.

    Args:
        chunk_size (int): maximum size of each chunk
        knowledge_base (List[LangchainDocument]): documents to be split into chunks
        tokenizer_name (Optional[str]): name of the tokenizer model used to measure chunk length

    Returns:
        List[LangchainDocument]: chunks in their original order, keeping only the first chunk for any given text
    """

    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    # Split one document at a time and flatten the per-document chunk lists
    all_chunks = [
        chunk
        for document in knowledge_base
        for chunk in splitter.split_documents([document])
    ]

    # Keep the first occurrence of each chunk text, preserving order
    seen_texts = set()
    unique_chunks = []
    for chunk in all_chunks:
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        unique_chunks.append(chunk)

    return unique_chunks

In [None]:
def generate_faiss_embeddings_vector_store(documents: List[LangchainDocument], embedding_model: Embeddings, batch_size: int = 32) -> Optional[FAISS]:
  """
  Generate embedding store based on provided documents with provided embedding model

  Documents are embedded in batches so that only `batch_size` texts are passed to the embedding model at once.
  Each batch is turned into its own FAISS index and merged into the running index.
  The metadata of every document is stored alongside its text so it remains available on retrieval.

  Args:
    documents (List[LangchainDocument]): knowledge base which is to be used for vector store
    embedding_model (Embeddings): model used to generate embeddings for documents
    batch_size (int): how many documents should embeddings be generated for in each iteration (must be >= 1)

  Returns:
    Optional[FAISS]: vector store for the provided documents, or None when `documents` is empty

  Raises:
    ValueError: if `batch_size` is smaller than 1
  """

  # A non-positive batch size would either crash range() or silently produce no index at all
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer, got {}".format(batch_size))

  # Stays None until the first batch has been indexed
  faiss_index = None

  # Loop over all documents, increment based on batch size variable value
  for batch_start in tqdm(range(0, len(documents), batch_size)):
    documents_current_batch = documents[batch_start:batch_start + batch_size]

    # Text and metadata are extracted in the same order so they stay aligned with the embeddings
    batch_texts = [doc.page_content for doc in documents_current_batch]
    batch_metadatas = [doc.metadata for doc in documents_current_batch]

    # Use the embedding model to generate embeddings for the current batch text
    batch_embeddings = embedding_model.embed_documents(batch_texts)

    # Create FAISS index for current batch from (text, embedding) pairs plus the matching metadata
    batch_faiss = FAISS.from_embeddings(
      text_embeddings=list(zip(batch_texts, batch_embeddings)),
      embedding=embedding_model,
      metadatas=batch_metadatas,
    )

    # First batch becomes the index, later batches are merged so earlier batches are not overwritten
    if faiss_index is None:
      faiss_index = batch_faiss
    else:
      faiss_index.merge_from(batch_faiss)

  # Once all batches have been processed, return the full index store
  return faiss_index

In [None]:
def check_file_exists(file_path: str) -> bool:
  """
  Report whether a regular file is present at the given path

  Args:
    file_path (str): path to test

  Returns:
    bool: True when the path points at an existing file, False otherwise (including when it is a directory)
  """

  # os.path.isfile already yields the boolean we need
  return os.path.isfile(file_path)

In [None]:
def write_json_file(output_file_path: str, content: Any) -> bool:
  """
  Writes content to json file at provided output path

  The content is serialised before the file is opened, so content that cannot be serialised never
  truncates an existing file. The file is written as UTF-8 to match read_json_file.

  Args:
    output_file_path (str): File path at which file should be generated
    content (Any): The content to be written within the file (must be JSON serialisable)

  Returns:
    bool: True if the file was written, False if the directory or file could not be created / written

  Raises:
    TypeError: if `content` is not JSON serialisable
  """

  # Serialise first: a failure here must not leave a half-written file behind
  serialised_content = json.dumps(content, indent=4)

  try:
    # Create the parent directory if needed (exist_ok avoids a check-then-create race)
    output_directory = os.path.dirname(output_file_path)
    if output_directory:
      os.makedirs(output_directory, exist_ok=True)

    with open(output_file_path, "w", encoding="utf-8") as output_file:
      output_file.write(serialised_content)
  except OSError as e:
    # File system problems are reported through the return value, with the reason logged
    print("Error saving content to {} ({})".format(output_file_path, e))
    return False

  print("Successfully saved content to {}".format(output_file_path))
  return True

In [None]:
# Sentinel distinguishing "no default supplied" from an explicitly supplied default (including None)
_READ_JSON_DEFAULT_NOT_SET = object()


def read_json_file(input_file_path: str, default: Any = _READ_JSON_DEFAULT_NOT_SET) -> Any:
  """
  Reads content from json file at provided input file path

  Args:
    input_file_path (str): the file path where the target file resides
    default (Any): the value returned when the file is missing or cannot be read / parsed.
      When omitted, a new empty list is returned on every call, so callers can safely mutate the result.

  Returns:
    Any: the content read from the file, or `default` on any failure
  """

  # Build a fresh list per call instead of sharing one mutable default between all callers
  if default is _READ_JSON_DEFAULT_NOT_SET:
    default = []

  # If no file exists at the provided path, return the default data structure
  if not check_file_exists(input_file_path):
    return default

  # ValueError covers both json.JSONDecodeError and UnicodeDecodeError (file is not valid UTF-8)
  try:
    with open(input_file_path, "r", encoding="utf-8") as input_file:
      content = json.load(input_file)
  except (OSError, ValueError) as e:
    print("Error reading from file path: {} ({})".format(input_file_path, e))
    return default

  print("Successfully loaded content from {} file".format(input_file_path))
  return content

In [None]:
def generate_json_file_name(prompt_type: str, model_name: str, model_type: str, temperature: float, top_k: int, top_p: float, repetition_penalty: float, no_repeat_n_grams: float, max_new_tokens: int, instance_name: str, output_directory: str) -> str:
  """
  Generate unique file name to be used for model experiment files

  Every free-text component (prompt type, model name, instance name, model type) has spaces and "/"
  replaced by "_" so that none of them can introduce an extra directory level into the path.

  Args:
    prompt_type (str): experiment prompt type (zero-shot, few-shot, instruction, persona, chain-of-thought, tree-of-thought)
    model_name (str): experiment model
    model_type (str): experiment model type (rag / llm)
    temperature (float): value for temperature hyperparameter
    top_k (int): top k hyperparameter value
    top_p (float): top p hyperparameter value
    repetition_penalty (float): rep penalty hyperparameter value
    no_repeat_n_grams (float): no repeat n grams hyperparameter value
    max_new_tokens (int): hyperparameter value for max new tokens
    instance_name (str): Google Colaboratory instance name used in experiment
    output_directory (str): output directory where results are to be written

  Returns:
    str: the unique file path for the model experiment results (ends with a dd-mm-YYYY_HH-MM-SS timestamp and .json)
  """

  def make_path_safe(value: str) -> str:
    # "/" would be read as a directory separator and spaces are awkward in file names
    return str(value).replace("/", "_").replace(" ", "_")

  safe_prompt_name = make_path_safe(prompt_type)
  safe_model_name = make_path_safe(model_name)
  safe_instance_name = make_path_safe(instance_name)
  safe_model_type = make_path_safe(model_type)

  # Timestamp (second resolution) keeps names from repeated runs with the same settings apart
  timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

  # Substitute parameters into file path structure
  file_path = "{}/golden_results_{}_{}_{}_{}_temp{}_topk{}_topP{}_repPenalty{}_noRepeatNGrams{}_max{}_{}.json".format(output_directory, safe_prompt_name, safe_model_name, safe_instance_name, safe_model_type, temperature, top_k, top_p, repetition_penalty, no_repeat_n_grams, max_new_tokens, timestamp)

  return file_path

In [None]:
def get_instance_name() -> str:
  """
  Name the hardware the notebook is currently running on

  Args:
    None

  Returns:
    str: the name torch reports for CUDA device 0, or "CPU" when CUDA is not available
  """

  # Without CUDA there is no device name to query
  if not torch.cuda.is_available():
    return "CPU"

  return torch.cuda.get_device_name(0)

In [None]:
def generate_rouge_scores(ground_truth: str, generated_answer: str) -> Tuple[float, float, float]:
  """
  Score a generated answer against its ground truth with ROUGE-1, ROUGE-2 and ROUGE-L (stemming enabled)

  Args:
    ground_truth (str): reference text the answer is compared against
    generated_answer (str): model generated output

  Returns:
    Tuple[float, float, float]: F-measure for rouge1, rouge2 and rougeL, in that order
  """

  # Order of this list fixes the order of the returned tuple
  metric_names = ["rouge1", "rouge2", "rougeL"]

  scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
  scores_by_metric = scorer.score(ground_truth, generated_answer)

  # Only the F-measure of each metric is reported
  rouge_1_score, rouge_2_score, rouge_l_score = (
    scores_by_metric[metric_name].fmeasure for metric_name in metric_names
  )
  return rouge_1_score, rouge_2_score, rouge_l_score

In [None]:
def get_file_paths_matching_regex(directory: str, regex_pattern: Union[str, Pattern]) -> List[str]:
  """
  Recursively collect the paths of files whose name matches a regex

  Results are unique per file name: when the same name occurs in several sub-directories,
  only the path visited last during the walk is kept.

  Args:
    directory (str): root directory to search (sub-directories included)
    regex_pattern (Union[str, Pattern]): pattern searched for within each file name (not the full path)

  Returns:
    List[str]: full paths of the matching files
  """

  # file name -> full path; a later hit for the same name replaces the earlier path
  path_by_file_name = {}

  for current_root, _sub_directories, file_names in os.walk(directory):
    path_by_file_name.update({
      name: os.path.join(current_root, name)
      for name in file_names
      if re.search(regex_pattern, name)
    })

  return list(path_by_file_name.values())

In [None]:
def plot_rouge_score_comparison(first_model_results: List[Dict[str, float]], first_model_name: str, second_model_results: List[Dict[str, float]], second_model_name: str) -> None:
  """
  Draw three side-by-side scatter plots (ROUGE-1, ROUGE-2, ROUGE-L) comparing two models

  Each point pairs the first model's score (x axis) with the second model's score (y axis) at the same
  list position; the dashed grey diagonal marks equal scores.

  Args:
    first_model_results (List[Dict[str, float]]): results for first model, each with rouge1_score / rouge2_score / rougeL_score
    first_model_name (str): first model name (used in x axis labels)
    second_model_results (List[Dict[str, float]]): results for second model, same keys
    second_model_name (str): second model name (used in y axis labels)

  Returns:
    None
  """

  # (result key, display label, point colour) for each panel, left to right
  panel_specs = [
    ("rouge1_score", "ROUGE-1", "blue"),
    ("rouge2_score", "ROUGE-2", "green"),
    ("rougeL_score", "ROUGE-L", "red"),
  ]

  # Pull every score series out before any figure is created
  panels = []
  for score_key, metric_label, point_colour in panel_specs:
    first_scores = [result[score_key] for result in first_model_results]
    second_scores = [result[score_key] for result in second_model_results]
    panels.append((metric_label, point_colour, first_scores, second_scores))

  plt.figure(figsize=(12,4))

  for panel_number, (metric_label, point_colour, first_scores, second_scores) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(first_scores, second_scores, color=point_colour, alpha=0.6, label=metric_label)
    plt.plot([0,1], [0,1], "--", color="gray")
    plt.xlabel("{} {}".format(first_model_name, metric_label))
    plt.ylabel("{} {}".format(second_model_name, metric_label))
    plt.title("{} Comparison".format(metric_label))
    plt.legend()

  plt.tight_layout()
  plt.show()

In [None]:
def plot_score_comparison_recall_precision_f1(first_model_results: List[Dict[str, float]], first_model_name: str, second_model_results: List[Dict[str, float]], second_model_name: str) -> None:
  """
  Draw three side-by-side scatter plots (BERT precision, recall, F1) comparing two models

  Each point pairs the first model's score (x axis) with the second model's score (y axis) at the same
  list position; the dashed grey diagonal marks equal scores.

  Args:
    first_model_results (List[Dict[str, float]]): results for first model, each with bert_precision / bert_recall / bert_f1
    first_model_name (str): first model name (used in x axis labels)
    second_model_results (List[Dict[str, float]]): results for second model, same keys
    second_model_name (str): second model name (used in y axis labels)

  Returns:
    None
  """

  # (result key, display label, point colour) for each panel, left to right
  panel_specs = [
    ("bert_precision", "BERT Precision", "blue"),
    ("bert_recall", "BERT Recall", "green"),
    ("bert_f1", "BERT F1", "red"),
  ]

  # Pull every score series out before any figure is created
  panels = []
  for score_key, metric_label, point_colour in panel_specs:
    first_scores = [result[score_key] for result in first_model_results]
    second_scores = [result[score_key] for result in second_model_results]
    panels.append((metric_label, point_colour, first_scores, second_scores))

  plt.figure(figsize=(12,4))

  for panel_number, (metric_label, point_colour, first_scores, second_scores) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(first_scores, second_scores, color=point_colour, alpha=0.6, label=metric_label)
    plt.plot([0,1], [0,1], "--", color="gray")
    plt.xlabel("{} {}".format(first_model_name, metric_label))
    plt.ylabel("{} {}".format(second_model_name, metric_label))
    plt.title("{} Comparison".format(metric_label))
    plt.legend()

  plt.tight_layout()
  plt.show()

In [None]:
def delete_model(model_variable: Any) -> None:
  """
  Free up GPU and CPU resources when a model is no longer needed

  The wrapped model is moved to the CPU, the local reference is dropped, the garbage collector is run and,
  when CUDA is available, the torch CUDA cache and IPC handles are released. The CUDA calls are skipped
  on machines without CUDA so the function also works on CPU / MPS runs.

  Note: `del` only removes this function's own reference. The caller must also drop (or reassign) its
  variable for the model object to actually be collected.

  Args:
    model_variable (Any): object exposing the model as `model_variable.pipeline.model`

  Returns:
    None
  """

  # Move the weights off the accelerator first so its memory can be released
  model_variable.pipeline.model.to("cpu")
  del model_variable
  gc.collect()

  # CUDA housekeeping is only meaningful (and only safe to call) when CUDA is present
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [None]:
def get_prompt_by_type(prompt_type: str, prompt_dict: Dict = {}) -> str:
  """
  Look up the prompt template registered for a prompt type

  Args:
    prompt_type (str): prompt type to look up; surrounding whitespace and letter case are ignored
    prompt_dict (Dict): maps prompt types (lower case) to prompt templates

  Returns:
    str: the prompt template stored for the prompt type

  Raises:
    ValueError: if the normalised prompt type is not a key of `prompt_dict`
  """

  # Normalise so that e.g. " Zero Shot " finds the "zero shot" entry
  prompt_type = prompt_type.strip().lower()

  try:
    prompt_template = prompt_dict[prompt_type]
  except KeyError:
    raise ValueError("Unknown prompt type: {}".format(prompt_type))

  return prompt_template

In [None]:
def get_post_process_function_by_prompt_type(prompt_type: str, prompt_dict: Dict = {}) -> Callable[..., Any]:
  """
  Utility function to easily extract post process function based on prompt type parameter value

  Args:
    prompt_type (str): the target key for which the value should be returned; surrounding whitespace and letter case are ignored
    prompt_dict (Dict): the dictionary that stores prompt types (lower case) as keys and post-process functions as values

  Returns:
    Callable[..., Any]: the post-process function stored for the prompt type

  Raises:
    ValueError: if the normalised prompt type is not a key of `prompt_dict`
  """

  # Sanitise input for prompt type
  prompt_type = prompt_type.strip().lower()

  # Return the registered function, otherwise report the unknown type as a ValueError
  try:
    return prompt_dict[prompt_type]
  except KeyError:
    raise ValueError("Unknown prompt type: {}".format(prompt_type))

In [None]:
def retry_hugging_face_inference(func: Callable[..., Any], *args: Any, max_retries: int = 5, base_wait_period: int = 10, retry_on: Tuple[Type[BaseException], ...] = (HTTPError,)) -> Any:
  """
  Occasionally, hugging face inference rate limiting was observed (many requests in quick succession in a loop)
  Utility function to add retry functionality to catch these errors and retry to improve stability

  Only errors of the types in `retry_on` whose response carries HTTP status 429 are retried. The wait
  doubles after every failed attempt: base_wait_period, 2x, 4x, ... Any other error is re-raised immediately.

  Args:
    func (Callable[..., Any]): the function which should be retried in case of failure
    args (Any): the positional arguments passed to the function
    max_retries (int): maximum number of attempts before giving up
    base_wait_period (int): wait in seconds before the first retry; doubled for each further retry
    retry_on (Tuple[Type[BaseException], ...]): exception types which are candidates for a retry

  Returns:
    Any: whatever `func(*args)` returns on the first successful attempt

  Raises:
    Exception: "max retries exceeded" once every attempt has been rate limited
    BaseException: any error raised by `func` that is not a rate-limit (429) error is propagated unchanged
  """

  # Remembered so the final error can point at the rate-limit error that caused it
  last_rate_limit_error = None

  for attempt in range(max_retries):
    try:
      return func(*args)
    except retry_on as e:
      # Not every exception type carries a response, and the response itself may be None
      response = getattr(e, "response", None)
      status_code = getattr(response, "status_code", None)

      if status_code != 429:
        raise

      last_rate_limit_error = e

      # No point in waiting after the final attempt
      if attempt == max_retries - 1:
        break

      wait = base_wait_period * (2 ** attempt)
      print("Too many requests, retrying in {} seconds...".format(wait))
      time.sleep(wait)
    except Exception as e:
      print("Error {}: {}".format(type(e).__name__, e))
      raise

  raise Exception("max retries exceeded") from last_rate_limit_error

In [None]:
def delete_files_in_directory(directory_path: str) -> None:
  """
  Delete files in a provided directory

  Only regular files directly inside the directory are removed; sub-directories and their contents are left untouched.

  Args:
    directory_path (str): the directory path at which the files should be deleted

  Returns:
    None
  """

  # Nothing to delete if the path is missing (or is not a directory): inform the user and stop here
  if not os.path.isdir(directory_path):
    print("Directory {} does not exist".format(directory_path))
    return

  # Counts how many files were actually removed
  file_count = 0

  for file_name in os.listdir(directory_path):
    file_path = os.path.join(directory_path, file_name)

    # Skip sub-directories
    if os.path.isfile(file_path):
      os.remove(file_path)
      file_count += 1

  # Once complete, inform user how many files were deleted from the directory
  print("Deleted {} files from {} directory".format(file_count, directory_path))


## Clone Repo with Scraped Data

In [None]:
# Citizen information data was scraped over a number of runs on my local machine
# Therefore, need to clone down the repo to get access to the data
!git clone https://github.com/JohnDennehy101/webScraperCitizensInformation.git

Cloning into 'webScraperCitizensInformation'...
remote: Enumerating objects: 41164, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 41164 (delta 27), reused 37 (delta 21), pack-reused 41111 (from 2)[K
Receiving objects: 100% (41164/41164), 59.75 MiB | 14.49 MiB/s, done.
Resolving deltas: 100% (6051/6051), done.
Updating files: 100% (43422/43422), done.


## Extract Documents (and filter out Gaeilge documents)

In [None]:
# Directory path within the repo where target documents are available
directory_path = "/content/webScraperCitizensInformation/src/data/markdown"

# Use the directory loader to load the files, note .md used as files are in markdown format
# recursive=True so that markdown files in sub-directories are picked up as well
loader = DirectoryLoader(
    path=directory_path,
    glob="*.md",
    loader_cls=UnstructuredMarkdownLoader,
    recursive=True
)

# Load documents
documents = loader.load()

# Total number of documents loaded (before the language filter below)
print(len(documents))

# Filter out documents whose file name starts with 'ga_' which signifies that they are in Irish (which is not well supported by LLMs)
# Therefore, decision made to only use English language documents for the experiments
filtered_documents = [
    doc for doc in documents if not os.path.basename(doc.metadata['source']).startswith('ga_')
]

# Preview of the first loaded document - note this reads from the unfiltered list
if documents:
    print("Contents of the first file:")
    print(documents[0].page_content)
else:
    print("No documents found.")

2840
Contents of the first file:
English | Gaeilge

You are here: Home > Housing > Planning permission > Development plans

Development plans

What is a development plan?

Do national plans influence local development plans?

Can I give feedback about a development plan?

Can a development plan influence my application for planning permission?

More information on development plans

What is a development plan?

A development plan describes how your local authority will develop and use particular areas in your local authority area, for example:

Residential areas

Commercial areas

Industrial areas

Recreational areas

Agricultural areas

It also sets out the development objectives for your local authority area, such as plans to improve roads and local amenities.

A development plan is made up of a written statement and series of maps.

How long does a development plan last?

A development plan lasts for 6 years. Local authorities must begin work on a new development plan 4 years after 

In [None]:
# Number of documents remaining after the 'ga_' file name filter
print(len(filtered_documents))

# Preview of the first document that survived the filter
if filtered_documents:
    print("Contents of the first file:")
    print(filtered_documents[0].page_content)
else:
    print("No documents found.")

1924
Contents of the first file:
English | Gaeilge

You are here: Home > Moving Country > Moving to Ireland > Rights of residence in Ireland > Residence rights of family members

Residence rights of family members

Introduction

Family members of Irish citizens

Family members of UK citizens

Family members of EU, EEA and Swisscitizens

People who have internationalprotection status

Other Non-EEA citizens

Dependent elderly relatives

Further information and contacts

Introduction

If you move to Ireland you may be able to bring your Non-EEA family members to live with you. Your right to have your family join you in Ireland depends on:

Your residence status in Ireland

Your family relationship with the person

In some cases, you have a legal right to be joined by your family. But in many situations, you must show that you have the income to support them and satisfy eligibility criteria. The information on this page is mostly about spouses, partners and dependent children.

Family mem

In [None]:
# Extract page content for each English language document and store as list of LangchainDocuments
# Only page_content is copied, so the loader metadata (e.g. the 'source' path) is not carried over
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc.page_content) for doc in tqdm(filtered_documents)
]

100%|██████████| 1924/1924 [00:00<00:00, 246603.13it/s]


In [None]:
# Split the documents into chunks using the utility split_documents function
docs_processed = split_documents(
    CHUNK_SIZE,
    RAW_KNOWLEDGE_BASE,
    tokenizer_name=MEDIUM_TOKENISER_MODEL_NAME
)

In [None]:
def avg_doc_length(docs: List[Any]) -> int:
    """
    Average number of characters per document (integer division, rounds down)

    Args:
        docs (List[Any]): documents exposing their text as `page_content`

    Returns:
        int: floor of the mean `page_content` length, or 0 when `docs` is empty
    """
    # An empty list has no average; return 0 instead of dividing by zero
    if not docs:
        return 0
    return sum(len(doc.page_content) for doc in docs) // len(docs)

In [None]:
avg_char_before_split = avg_doc_length(RAW_KNOWLEDGE_BASE)
avg_char_after_split = avg_doc_length(docs_processed)

In [None]:
# Print out documents for values before split
print("Before split, there were {} documents loaded, with average characters equal to {}.".format(len(RAW_KNOWLEDGE_BASE), avg_char_before_split))

Before split, there were 1924 documents loaded, with average characters equal to 6807.


In [None]:
# Print out documents for values after chunking to show that there are more chunks with fewer characters in each
print("After split, there were {} documents (chunks), with average characters equal to {} (average chunk length).".format(len(docs_processed), avg_char_after_split))

After split, there were 3632 documents (chunks), with average characters equal to 3373 (average chunk length).


In [None]:
# Get device type by calling utility function
device = get_device()

# Initialise embedding model
# The same model id that split_documents uses for token counting is used here to embed the chunks
# normalize_embeddings=True is passed through to the encode call
# NOTE(review): the wrapper class is named for BGE models while the model id is a gte-Qwen2 model -
# confirm that any BGE-specific behaviour of this class (e.g. a default query instruction) is wanted here
huggingface_embeddings_medium_model = HuggingFaceBgeEmbeddings(
    model_name=MEDIUM_TOKENISER_MODEL_NAME,
    model_kwargs={"device": device,},
    encode_kwargs={"normalize_embeddings": True, "multi_process": False}
)

Using CUDA


  huggingface_embeddings_medium_model = HuggingFaceBgeEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/901 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
# Start with no vector store; one of the two branches below assigns it
vectorstore_medium_model = None

# Vector store generation took a significant amount of time, therefore the index files are saved for re-use between runs to speed up execution
# First, check whether the directory holding the saved index is present (if so, load it instead of rebuilding)
if os.path.isdir(VECTOR_DB_PATH):
  print("Vector db found locally, loading")
  # NOTE(review): allow_dangerous_deserialization=True permits unpickling of the stored index data;
  # only point VECTOR_DB_PATH at an index produced by this notebook, never at untrusted files.
  vectorstore_medium_model = FAISS.load_local(
      folder_path=VECTOR_DB_PATH,
      embeddings=huggingface_embeddings_medium_model,
      allow_dangerous_deserialization=True
  )
# Otherwise, construct the vector db by calling the utility generate_faiss_embeddings_vector_store function
else:
  print("Not found, building vector db")
  vectorstore_medium_model = generate_faiss_embeddings_vector_store(docs_processed, huggingface_embeddings_medium_model)
  # Save to the same path the check above looks for, so the next run finds and re-uses this index
  vectorstore_medium_model.save_local(VECTOR_DB_PATH)

Vector db found locally, loading


In [None]:

# Expose the vector store as a similarity-search retriever returning the top 3 matches
retriever = vectorstore_medium_model.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [None]:
# Experiment configuration, defined once here so that a tweak applies consistently
# throughout the file (easier when running multiple experiments).

# Model and prompt selection for the current run; LLM_IN_USE_NAME is the model id
# handed to HuggingFacePipeline.from_model_id further down.
LLM_IN_USE_NAME = LARGE_LLM_MODEL_NAME
PROMPT_TYPE_IN_USE = FEW_SHOT_PROMPT_TYPE
# NOTE(review): presumably switches between running every prompt type and only
# PROMPT_TYPE_IN_USE — confirm against the code that reads it.
RUN_ALL_PROMPTS = False

# Re-ranking settings. NOTE(review): presumably NUMBER_RETRIEVED_DOCS candidates are
# retrieved and the re-ranker keeps NUMBER_FINAL_DOCS of them — confirm at the point of use.
RE_RANKER = "colbert-ir/colbertv2.0"
NUMBER_RETRIEVED_DOCS = 20
NUMBER_FINAL_DOCS = 5


# Generation settings. NOTE(review): LLM_HYPERPARAMETERS (defined in a later cell)
# hard-codes its own values rather than reading these — confirm where these are consumed.
TEMPERATURE = 0
TOP_K = 0
TOP_P = 0
REPETITION_PENALTY = 0
NO_REPEAT_N_GRAMS = 0
MAX_TOKENS = 500

In [None]:
# Experiments already run (hyperparameter combinations tried for each prompt type and model size):

##### Zero Shot

# LARGE MODEL

# {}
# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}


##### Basic Instruction Prompt

# LARGE MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}
# {"temperature": 0, "do_sample": False, "max_new_tokens": 150}
# {"temperature": 1.0, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 0, "max_new_tokens": 100}
# {"temperature": 0, "max_new_tokens": 200}

# {"max_new_tokens": 100}
# {"max_new_tokens": 200}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}

##### Few shot prompt

# LARGE MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Chain of thought prompt

# LARGE MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Tree of thought prompt

# LARGE MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}
# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Persona prompt

# LARGE MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

# **************************

##### Zero shot

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Basic Instruction Prompt

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Few shot prompt

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Chain of thought prompt

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Tree of thought prompt

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Persona prompt

# MEDIUM MODEL

# {}

# {"temperature": 1e-5}
# {"temperature": 1e-5, "max_new_tokens": 500}
# {"temperature": 1e-5, "max_new_tokens": 750}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 750}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}


# **************************


##### Zero Shot

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Basic Instruction Prompt

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Few shot prompt

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"temperature": 0.5, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Chain of thought prompt

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Tree of thought prompt

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}

##### Persona prompt

# SMALL MODEL

# {}

# {"temperature": 1e-5}

# {"max_new_tokens": 100}
# {"max_new_tokens": 150}
# {"max_new_tokens": 200}

# {"temperature": 1e-5, "max_new_tokens": 100}
# {"temperature": 1e-5, "max_new_tokens": 150}
# {"temperature": 1e-5, "max_new_tokens": 200}

# {"top_k": 10, "do_sample": True}
# {"top_k": 30, "do_sample": True}
# {"top_k": 50, "do_sample": True}

# {"top_p": 0.7, "do_sample": True}
# {"top_p": 0.8, "do_sample": True}
# {"top_p": 0.9, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True}
# {"repetition_penalty": 1.2, "do_sample": True}
# {"repetition_penalty": 1.3, "do_sample": True}

# {"no_repeat_ngram_size": 2, "do_sample": True}

# {"temperature": 0.5, "do_sample": True}

# {"repetition_penalty": 1.1, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.2, "do_sample": True, "max_new_tokens": 150}
# {"repetition_penalty": 1.3, "do_sample": True, "max_new_tokens": 150}

# {"top_k": 10, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 30, "do_sample": True, "max_new_tokens": 150}
# {"top_k": 50, "do_sample": True, "max_new_tokens": 150}

# {"top_p": 0.7, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.8, "do_sample": True, "max_new_tokens": 150}
# {"top_p": 0.9, "do_sample": True, "max_new_tokens": 150}

# {"no_repeat_ngram_size": 2, "do_sample": True, "max_new_tokens": 150}

# {"max_new_tokens": 500}
# {"max_new_tokens": 1000}
# {"max_new_tokens": 1500}
# {"max_new_tokens": 2000}


# Hyperparameters to be passed to model (note commented out ones above show previous runs)
# This dict is handed to the LLM pipeline as `pipeline_kwargs` when the model is initialised.
# NOTE(review): unlike the sampling configurations listed above, no "do_sample" key is set here -
# presumably decoding is greedy and "temperature" has no effect on the output; confirm against
# the Hugging Face generation documentation.
LLM_HYPERPARAMETERS = {"temperature": 1e-5, "max_new_tokens": 500}

In [None]:
# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1
# (presumably CPU by the Hugging Face pipeline convention - confirm)
device_id = 0 if torch.cuda.is_available() else -1

# Initialise the llm using the model name in use, passing the hyperparameters and device_id (to ensure use of GPU if available)
# LLM_HYPERPARAMETERS is forwarded unchanged as the text-generation pipeline keyword arguments
llm = HuggingFacePipeline.from_model_id(
    model_id=LLM_IN_USE_NAME,
    task="text-generation",
    device=device_id,
    pipeline_kwargs=LLM_HYPERPARAMETERS
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# The different prompt types used for the RAG model runs
#
# Every template takes two variables: {context} (the retrieved document chunks) and {question}.
# The post-processing helpers in this notebook locate the generated answer using markers that
# appear in these templates, so keep them in sync when editing a prompt:
#   - "Helpful Answer:" -> post_process_helpful_answer_prompt
#   - "A:"              -> post_process_few_shot
#   - "<answer>"        -> post_process_helpful_answer_prompt_chain_of_thought
#                          (the chain/tree of thought templates mention <answer> exactly once)

##### Zero Shot

zero_shot_rag_prompt = """Use the following pieces of context to answer the question.

{context}

Question: {question}

Helpful Answer:
"""

##### Persona

persona_rag_prompt = """You are a warm, empathetic, and knowledgeable assistant for the Irish Citizen's Information service. Respond clearly, simply, and accurately, and use the following pieces of context to answer the question as well as your existing knowledge of official information relevant to Ireland.

{context}

Question: {question}

Helpful Answer:
"""


##### Few shot

few_shot_rag_prompt = """Answer the question carefully, using the following pieces of context and examples below to guide the style, structure and level of detail expected in the answer. ONLY provide the answer to Example 3. Follow the same structure and level of detail as Example 1 and Example 2 - short, factual, and readable. Do not list more than necessary. Stop after answering Example 3.

{context}

Example 1:

Q: What documents do I need to apply online for a driver's license in Ireland?
A: You need a Public Services Card and verified MyGovID, proof of your address if your address is different to the one you provided when you got your Public Services Card, proof that you are normally resident in Ireland if you are not an EU or EEA citizen, a medical report form, dated within one month if required, and a Certificate of Professional Competence (CPC) for professional drivers.

Example 2:

Q: What criteria do I need to meet to qualify for unemployment benefits in Ireland?
A: You must be under 66 years of age, or under 70 years and deferring your State Pension (Contributory), be unemployed (you must be unemployed for at least 4 days out of 7), have had a substantial loss of employment and as a result be unemployed for at least 4 days out of 7 (this does not apply to casual workers and part-time fire fighters), be capable of work, be available for and genuinely seeking work, and have enough social insurance (PRSI) contributions.

Example 3:
Q: {question}
A:
"""


##### Basic Instruction Prompt

instruction_rag_prompt = """Use the following pieces of context to answer the question at the end. Follow these rules carefully:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.
3. DO NOT copy entire sections, lists, or extracted content. Instead, extract only the relevant points to answer the question effectively.
4. Exclude any headings, subheadings, or introductory information. Only include the final, relevant points in your response.
5. DO NOT include extracted documents section.

{context}

Question: {question}

Helpful Answer:
"""

##### Chain of thought Prompt

chain_of_thought_rag_prompt = """Use the following pieces of context to reason step by step before answering the question.

{context}

Question: {question}

Begin by enclosing all thoughts within <thinking> tags, exploring multiple angles and approaches. Break down the solution into clear steps within <step> tags. Start with a 10-step budget, requesting more for complex problems if needed. Use <count> tags after each step to show the remaining budget. Stop when reaching 0. Continuously adjust your reasoning based on intermediate results and reflections, adapting your strategy as you progress. Regularly evaluate progress using <reflection> tags. Be critical and honest about your reasoning process. Assign a quality score between 0.0 and 1.0 using <reward> tags after each reflection. Use this to guide your approach: 0.8+: Continue current approach 0.5-0.7: Consider minor adjustments Below 0.5: Seriously consider backtracking and trying a different approach If unsure or if reward score is low, backtrack and try a different approach, explaining your decision within <thinking> tags. Explore multiple solutions individually if possible, comparing approaches in reflections. Use thoughts as a scratchpad, writing out all calculations and reasoning explicitly. Synthesize the final answer within <answer> tags, providing a clear, concise summary. Conclude with a final reflection on the overall solution, discussing effectiveness, challenges, and solutions. Assign a final reward score.

Helpful Answer:
"""

##### Tree of thought Prompt

tree_of_thought_rag_prompt = """Use the following pieces of context to reason step by step before answering the question.

{context}

Question: {question}

Begin by enclosing all thoughts within <thinking> tags. Instead of a single linear chain, explicitly generate and explore multiple branches of reasoning as separate <branch id="X"> elements, each representing a distinct approach or angle.

Within each <branch>:
- Break down the reasoning into clear <step> tags.
- After each step, include a <count> tag showing the remaining step budget (start with 10 steps per branch; request more if needed).
- Use <reflection> tags periodically within branches to critically evaluate progress.
- Assign a <reward> score (0.0 to 1.0) after reflections to assess quality of that branch's approach:
   - 0.8+ Continue this branch
   - 0.5-0.7 Consider minor adjustments
   - Below 0.5 Backtrack or abandon this branch
- If reward is low or you're unsure, backtrack and try a different <branch>, explaining the reasoning inside new <thinking> tags.
- Use thoughts as a scratchpad: show calculations, considerations, pros/cons explicitly.

Once multiple branches have been explored and evaluated, compare and synthesize insights in a <comparison> tag, weighing strengths and weaknesses.
Finally, provide a consolidated final answer inside <answer> tags - a clear, concise summary based on the best branch or synthesis.
Conclude with an overall <reflection> and a final <reward> score reflecting the whole process's effectiveness.
"""

# Lookup from prompt-type constant to the RAG template text for that type
prompt_dict_rag = {
  ZERO_SHOT_PROMPT_TYPE: zero_shot_rag_prompt,
  PERSONA_PROMPT_TYPE: persona_rag_prompt,
  FEW_SHOT_PROMPT_TYPE: few_shot_rag_prompt,
  INSTRUCTION_PROMPT_TYPE: instruction_rag_prompt,
  CHAIN_OF_THOUGHT_PROMPT_TYPE: chain_of_thought_rag_prompt,
  TREE_OF_THOUGHT_PROMPT_TYPE: tree_of_thought_rag_prompt
}


# Select the template for the prompt type configured for this run
prompt_template_rag = get_prompt_by_type(PROMPT_TYPE_IN_USE, prompt_dict_rag)

# Wrap the selected template so that {context} and {question} are filled in at inference time
PROMPT_RAG = PromptTemplate(
  template=prompt_template_rag, input_variables=["context", "question"]
)

In [None]:
# The different prompt types used for the LLM model runs
#
# These mirror the RAG templates but take only {question} (no retrieved context).
# The post-processing helpers in this notebook locate the generated answer using markers that
# appear in these templates, so keep them in sync when editing a prompt:
#   - "Helpful Answer:" -> post_process_helpful_answer_prompt
#   - "A:"              -> post_process_few_shot
#   - "<answer>"        -> post_process_helpful_answer_prompt_chain_of_thought
#                          (the chain/tree of thought templates mention <answer> exactly once)

##### Zero Shot

zero_shot_llm_prompt = """Use your existing knowledge to answer the question.

Question: {question}

Helpful Answer:
"""

##### Persona

persona_llm_prompt = """You are a warm, empathetic, and knowledgeable assistant for the Irish Citizen's Information service. Respond clearly, simply, and accurately, and use your existing knowledge of official information relevant to Ireland.

Question: {question}

Helpful Answer:
"""


##### Few shot

few_shot_llm_prompt = """Answer the question carefully, using the examples below to guide the style, structure and level of detail expected in the answer. ONLY provide the answer to Example 3. Follow the same structure and level of detail as Example 1 and Example 2 - short, factual, and readable. Do not list more than necessary. Stop after answering Example 3.

Example 1:

Q: What documents do I need to apply online for a driver's license in Ireland?
A: You need a Public Services Card and verified MyGovID, proof of your address if your address is different to the one you provided when you got your Public Services Card, proof that you are normally resident in Ireland if you are not an EU or EEA citizen, a medical report form, dated within one month if required, and a Certificate of Professional Competence (CPC) for professional drivers.

Example 2:

Q: What criteria do I need to meet to qualify for unemployment benefits in Ireland?
A: You must be under 66 years of age, or under 70 years and deferring your State Pension (Contributory), be unemployed (you must be unemployed for at least 4 days out of 7), have had a substantial loss of employment and as a result be unemployed for at least 4 days out of 7 (this does not apply to casual workers and part-time fire fighters), be capable of work, be available for and genuinely seeking work, and have enough social insurance (PRSI) contributions.

Example 3:
Q: {question}
A:
"""

##### Basic Instruction Prompt

instruction_llm_prompt = """Answer the following question carefully. Follow these rules carefully:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.
3. DO NOT copy entire sections, lists, or extracted content. Instead, extract only the relevant points to answer the question effectively.
4. Exclude any headings, subheadings, or introductory information. Only include the final, relevant points in your response.

Question: {question}

Helpful Answer:
"""

##### Chain of thought Prompt

chain_of_thought_llm_prompt = """Reason step by step before answering the question.

Question: {question}

Begin by enclosing all thoughts within <thinking> tags, exploring multiple angles and approaches. Break down the solution into clear steps within <step> tags. Start with a 10-step budget, requesting more for complex problems if needed. Use <count> tags after each step to show the remaining budget. Stop when reaching 0. Continuously adjust your reasoning based on intermediate results and reflections, adapting your strategy as you progress. Regularly evaluate progress using <reflection> tags. Be critical and honest about your reasoning process. Assign a quality score between 0.0 and 1.0 using <reward> tags after each reflection. Use this to guide your approach: 0.8+: Continue current approach 0.5-0.7: Consider minor adjustments Below 0.5: Seriously consider backtracking and trying a different approach If unsure or if reward score is low, backtrack and try a different approach, explaining your decision within <thinking> tags. Explore multiple solutions individually if possible, comparing approaches in reflections. Use thoughts as a scratchpad, writing out all calculations and reasoning explicitly. Synthesize the final answer within <answer> tags, providing a clear, concise summary. Conclude with a final reflection on the overall solution, discussing effectiveness, challenges, and solutions. Assign a final reward score.

Helpful Answer:
"""

##### Tree of thought Prompt

tree_of_thought_llm_prompt = """Reason step by step before answering the question.

Question: {question}

Begin by enclosing all thoughts within <thinking> tags. Instead of a single linear chain, explicitly generate and explore multiple branches of reasoning as separate <branch id="X"> elements, each representing a distinct approach or angle.

Within each <branch>:
 - Break down the reasoning into clear <step> tags.
 - After each step, include a <count> tag showing the remaining step budget (start with 10 steps per branch; request more if needed).
 - Use <reflection> tags periodically within branches to critically evaluate progress.
 - Assign a <reward> score (0.0 to 1.0) after reflections to assess quality of that branch's approach:
   - 0.8+ Continue this branch
   - 0.5-0.7 Consider minor adjustments
   - Below 0.5 Backtrack or abandon this branch
 - If reward is low or you're unsure, backtrack and try a different <branch>, explaining the reasoning inside new <thinking> tags.
 - Use thoughts as a scratchpad: show calculations, considerations, pros/cons explicitly.

Once multiple branches have been explored and evaluated, compare and synthesize insights in a <comparison> tag, weighing strengths and weaknesses.
Finally, provide a consolidated final answer inside <answer> tags - a clear, concise summary based on the best branch or synthesis.
Conclude with an overall <reflection> and a final <reward> score reflecting the whole process's effectiveness.
"""

# Lookup from prompt-type constant to the LLM-only template text for that type
prompt_dict_llm = {
  ZERO_SHOT_PROMPT_TYPE: zero_shot_llm_prompt,
  PERSONA_PROMPT_TYPE: persona_llm_prompt,
  FEW_SHOT_PROMPT_TYPE: few_shot_llm_prompt,
  INSTRUCTION_PROMPT_TYPE: instruction_llm_prompt,
  CHAIN_OF_THOUGHT_PROMPT_TYPE: chain_of_thought_llm_prompt,
  TREE_OF_THOUGHT_PROMPT_TYPE: tree_of_thought_llm_prompt
}

# Select the template for the prompt type configured for this run
prompt_template_llm = get_prompt_by_type(PROMPT_TYPE_IN_USE, prompt_dict_llm)

# Wrap the selected template so that {question} is filled in at inference time
PROMPT_LLM = PromptTemplate(
  template=prompt_template_llm, input_variables=["question"]
)

In [None]:
# Load the ColBERT re-ranking model only when the RE_RANKER flag is switched on; RERANKER is None otherwise
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if RE_RANKER else None

artifact.metadata: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler()


In [None]:
def post_process_helpful_answer_prompt(answer: str) -> str:
  """
  Pull the answer text out of LLM output generated from a "Helpful Answer:" style prompt

  Args:
    answer (str): LLM output

  Returns:
    str: the text after the last "Helpful Answer:" marker, or the whole output when the marker is absent; surrounding whitespace is stripped in both cases
  """

  # rpartition splits on the last occurrence of the marker; when the marker is missing,
  # the final element is simply the untouched input
  _, _, text_after_marker = answer.rpartition("Helpful Answer:")

  return text_after_marker.strip()

In [None]:
def post_process_helpful_answer_prompt_chain_of_thought(answer: str) -> str:
  """
  Extract actual answer text from chain of thought & tree of thought prompts

  The answer is the text between the last <answer> tag and the first </answer> tag
  that follows it (tags are matched case-insensitively). When no closing tag follows,
  everything up to the end of the output is returned.

  Args:
    answer (str): LLM output

  Returns:
    str: actual answer text extracted from generated LLM output, or an empty string
         when fewer than two <answer> tags are present

  """

  # Use re package to find matches for the <answer> tag (used in prompt)
  opening_tags = list(re.finditer(r"<answer>", answer, flags=re.IGNORECASE))

  # The chain/tree of thought templates themselves mention <answer> once in their instructions,
  # so at least two occurrences are required - presumably because the output echoes the prompt
  # (TODO confirm). Fewer than two is treated as "no answer generated".
  if len(opening_tags) < 2:
    return ""

  # The answer starts straight after the last opening tag
  start_index = opening_tags[-1].end()

  # The answer ends at the FIRST closing tag at or after that position, so an empty
  # <answer></answer> pair yields "" and stray later closing tags are not pulled into the answer
  closing_tag = re.compile(r"</answer>", flags=re.IGNORECASE).search(answer, start_index)
  end_index = closing_tag.start() if closing_tag else len(answer)

  return answer[start_index:end_index].strip()

In [None]:
def post_process_few_shot(answer: str) -> str:
  """
  Pull the generated answer out of LLM output produced from the few-shot prompts

  The few-shot templates end with an "A:" line, so the answer is taken to be whatever
  follows the LAST "A:" in the output. Note that any "A:" inside the generated text
  itself also counts as the last occurrence.

  Args:
    answer (str): LLM output

  Returns:
    str: stripped text after the last "A:" marker, or an empty string when the marker is absent

  """

  # rpartition splits on the last occurrence; the middle element is empty when "A:" was not found
  _, found_marker, text_after_marker = answer.rpartition("A:")

  # No marker means there is nothing that can be identified as the answer
  if not found_marker:
    return ""

  return text_after_marker.strip()

In [None]:
def trim_generated_output_to_last_complete_sentence(generated_text: str) -> str:
  """
  Cut generated output back to its final sentence terminator so that it does not end on a partial sentence

  Args:
    generated_text (str): generated output, possibly ending part-way through a sentence

  Returns:
    str: the text up to and including the last ".", "!" or "?"; the text unchanged when it contains none of these characters
  """

  # Position of the right-most terminator of any kind (-1 when none of them occurs)
  last_terminator_index = max(generated_text.rfind(terminator) for terminator in ".!?")

  # Without any terminator there is no complete sentence to trim back to
  if last_terminator_index == -1:
    return generated_text

  # Keep everything up to and including the terminator, dropping whatever trails it
  return generated_text[: last_terminator_index + 1]

In [None]:
def track_runtime(func: Callable[..., Any]) -> Callable[..., Any]:
  """
  Decorator to track the runtime of a function and add metadata around timing to the output

  The wrapped function always returns a 6-tuple:
  (raw_output, formatted_output, relevant_docs, timing_info, post_process_function, run_time)

  When func returns a tuple, its first five elements fill the first five slots in order and
  missing trailing slots default to [], [], {} and {} respectively. When func returns anything
  else, that value is used for both raw_output and formatted_output.

  Args:
    func (Callable[..., Any]): the function to be wrapped

  Returns:
    Callable[..., Any]: A wrapped function that returns a tuple with output and runtime information

  Raises:
    Exception: any exception raised by func is printed and then re-raised unchanged
  """
  @wraps(func)
  def wrapper(*args, **kwargs):
    # Capture start time of function; perf_counter is monotonic, so the measured duration
    # cannot be distorted by system clock adjustments the way time.time() can
    start_time = time.perf_counter()

    # Call function
    try:
      result = func(*args, **kwargs)
    except Exception as e:
      print("Error in function execution: {}".format(e))
      # Bare raise re-raises the active exception without adding this frame to its traceback again
      raise

    # Run time in seconds is the elapsed time between the start and the end of the call
    run_time = time.perf_counter() - start_time

    # Return both function outputs of wrapped function and add run time information to returned outputs
    if isinstance(result, tuple):
      raw_output = result[0]
      formatted_output = result[1] if len(result) > 1 else []
      relevant_docs = result[2] if len(result) > 2 else []
      timing_info = result[3] if len(result) > 3 else {}
      post_process_function = result[4] if len(result) > 4 else {}
    else:
      # A single return value doubles as both the raw and the formatted output
      raw_output = result
      formatted_output = result
      relevant_docs = []
      timing_info = {}
      post_process_function = {}

    return (raw_output, formatted_output, relevant_docs, timing_info, post_process_function, run_time)

  return wrapper

In [None]:
def extract_relevant_document_chunks(question: str, knowledge_index: FAISS, num_retrieved_docs: int, num_docs_final: int, reranker: Any) -> List[str]:
  """
  Extract relevant documents from provided vector store for given question value

  Args:
    question (str): actual question against which relevant documents should be retrieved
    knowledge_index (FAISS): vector db variable
    num_retrieved_docs (int): number of docs to retrieve
    num_docs_final (int): number of final docs to return (if re-ranking enabled)
    reranker (Any): re-ranker model if provided

  Returns:
    List[str]: text content (page_content) of at most num_docs_final documents, in re-ranked
               order when a reranker is given, otherwise in similarity-search order
  """

  # Extract relevant documents using the similarity_search function
  retrieved_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)

  # Extract content from retrieved documents
  relevant_docs = [doc.page_content for doc in retrieved_docs]

  # If re-ranking enabled, use reranker model to rerank the retrieved documents.
  # Skipped when nothing was retrieved, since there is nothing to rank.
  if reranker and relevant_docs:
    print("=> Reranking documents...")
    # Never ask the re-ranker for more results than there are candidates
    rerank_k = min(num_docs_final, len(relevant_docs))
    reranked_docs = reranker.rerank(question, relevant_docs, k=rerank_k)
    relevant_docs = [doc["content"] for doc in reranked_docs]

  # Filter to number of final documents value and return retrieved documents
  return relevant_docs[:num_docs_final]

In [None]:
# This cell was used to test that the re-ranking was working as expected
# NOTE: it re-runs document retrieval for every stored question and writes the retrieved
# documents back into the same results files, so the files on disk are modified in place
# Use get_file_paths_matching_regex function to match files within content directory with golden results and rag within the file name
rag_results_file_paths = get_file_paths_matching_regex("/content", r"(?=.*golden_results)(?=.*rag)")


for results_file_path in rag_results_file_paths:
  current_result_info = read_json_file(results_file_path)

  # Retrieve 20 candidate chunks per question and keep the top 5 (re-ranked when RERANKER is set)
  for info in current_result_info["results"]:
    retrieved_documents = extract_relevant_document_chunks(info["question"], vectorstore_medium_model, 20, 5, RERANKER)
    info["final_documents"] = retrieved_documents

  write_json_file(results_file_path, current_result_info)

Successfully loaded content from /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max750_24-06-2025_14-08-06.json file
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 36.71it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 35.84it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 36.51it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.02it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.67it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.78it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 36.92it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.69it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.36it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.80it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.76it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.17it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.98it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.39it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.59it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 37.20it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.00it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.25it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.17it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.49it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.98it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.71it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.69it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.29it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.40it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.11it/s]


Successfully saved content to /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max750_24-06-2025_14-08-06.json
Successfully loaded content from /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0.7_repPenalty0_noRepeatNGrams0_max150_24-06-2025_13-06-06.json file
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.69it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.01it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.99it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.09it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.05it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.27it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.37it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.34it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.23it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.83it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.00it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.21it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.10it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.34it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.67it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.90it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 43.35it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.02it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 43.39it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.69it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.79it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.18it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.73it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.89it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.23it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.21it/s]


Successfully saved content to /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0.7_repPenalty0_noRepeatNGrams0_max150_24-06-2025_13-06-06.json
Successfully loaded content from /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_24-06-2025_14-24-18.json file
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.26it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.40it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.65it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 43.28it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.69it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.30it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.26it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.38it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.12it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.76it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.82it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.36it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.22it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.25it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.38it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.41it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.68it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.81it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.61it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.38it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.76it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.51it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.10it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.38it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.29it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.99it/s]


Successfully saved content to /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_24-06-2025_14-24-18.json
Successfully loaded content from /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max500_24-06-2025_13-31-11.json file
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.76it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.17it/s]

=> Reranking documents...



100%|██████████| 1/1 [00:00<00:00, 39.36it/s]

=> Reranking documents...



100%|██████████| 1/1 [00:00<00:00, 39.40it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.57it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.47it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.41it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.83it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.40it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.02it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.94it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.90it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.10it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.71it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.13it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.29it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.13it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 38.56it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.56it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.06it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.32it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.89it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.08it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.22it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.63it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.51it/s]


Successfully saved content to /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max500_24-06-2025_13-31-11.json
Successfully loaded content from /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max2000_24-06-2025_13-51-24.json file
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.81it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.05it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.68it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.14it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.31it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.56it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.37it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.13it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.03it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.74it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.68it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.91it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.45it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.72it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.42it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.55it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.99it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.06it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 43.14it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 39.99it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.70it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.00it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.82it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.76it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.12it/s]


=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.37it/s]

Successfully saved content to /content/golden_results_few_shot_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max2000_24-06-2025_13-51-24.json





In [None]:
# Note use of decorator function here to ensure run time information is extracted
@track_runtime
def answer_with_rag(
    question: str,
    llm,
    knowledge_index: FAISS,
    prompt_template,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 5,
    post_process_function = None,
):
    """
    Answer a question with retrieval-augmented generation (RAG).

    Retrieves candidate documents from the vector store, optionally re-ranks
    them, builds the final prompt from the template, calls the model and
    post-processes the generated text.

    Args:
      question (str): the user query
      llm: the model used for inference; called through ``invoke`` when available,
        otherwise called directly
      knowledge_index (FAISS): vector store queried with ``similarity_search``
      prompt_template: template formatted with ``question`` and ``context``
      reranker (Optional[RAGPretrainedModel]): re-ranking model; skipped when falsy
      num_retrieved_docs (int): number of documents requested from the vector store
      num_docs_final (int): number of documents kept for the prompt
      post_process_function: optional callable applied to the raw generated output

    Returns:
      tuple of
        answer: raw generated output
        formatted_answer (str): output after post-processing
        relevant_docs (list[str]): text of the documents placed in the prompt
        timing_info (dict): ``retrieval_duration``, ``rerank_duration`` (only when a
          reranker is used), ``generation_duration``, ``number_output_tokens`` and
          ``time_per_output_token``
        post_process_function: the post-processing callable that was passed in
    """

    # Initialise empty dict for timing information
    timing_info = {}

    # Extract documents with the retriever
    print("=> Retrieving documents...")

    # Time only the vector store lookup
    start_retrieval = time.time()
    retrieved_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    end_retrieval = time.time()
    timing_info["retrieval_duration"] = end_retrieval - start_retrieval

    # Keep only the text content of the retrieved documents
    relevant_docs = [doc.page_content for doc in retrieved_docs]

    # Re-rank results if enabled
    if reranker:
        print("=> Reranking documents...")

        # Time only the re-ranking call
        start_rerank = time.time()
        reranked_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        end_rerank = time.time()
        timing_info["rerank_duration"] = end_rerank - start_rerank

        # Extract content for re-ranked documents
        relevant_docs = [doc["content"] for doc in reranked_docs]

    # Keep at most num_docs_final documents (also applies when no reranker is used)
    relevant_docs = relevant_docs[:num_docs_final]

    # Construct the final prompt with the extracted documents text included
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = prompt_template.format(question=question, context=context)

    # Call model to generate answer
    print("=> Generating answer...")

    # Time only the model call. Calling a LangChain LLM object directly is deprecated
    # in favour of ``invoke``; plain callables without ``invoke`` are still supported.
    start_gen = time.time()
    answer = llm.invoke(final_prompt) if hasattr(llm, "invoke") else llm(final_prompt)
    end_gen = time.time()
    timing_info["generation_duration"] = end_gen - start_gen

    # Extract number of output tokens and time per output token if a tokenizer is
    # available (NOTE: this wasn't used in final analysis as latency alone was
    # sufficient to show differences). The lookup is defensive so that an llm without
    # a ``pipeline``/``tokenizer`` attribute uses the fallback instead of raising.
    # NOTE(review): the raw answers printed in this notebook begin with the prompt
    # text, so this count appears to include prompt tokens - confirm.
    tokenizer = getattr(getattr(llm, "pipeline", None), "tokenizer", None)
    if hasattr(tokenizer, "encode"):
        output_tokens = tokenizer.encode(answer, add_special_tokens=False)
        number_output_tokens = len(output_tokens)
        timing_info["number_output_tokens"] = number_output_tokens
        # Avoid division by zero when nothing was generated
        timing_info["time_per_output_token"] = (
            timing_info["generation_duration"] / number_output_tokens if number_output_tokens else None
        )
    else:
        timing_info["number_output_tokens"] = -1
        timing_info["time_per_output_token"] = None

    # Format generated output by calling post process function
    formatted_answer = post_process_function(answer) if post_process_function else answer

    # Then once again post-process by replacing list markers such as "1. " or "a. " with - bullets
    # NOTE(review): the pattern matches any single alphanumeric character followed by a
    # full stop and whitespace wherever it occurs, so a sentence ending in e.g. "8. " is
    # rewritten as well. Left unchanged to keep outputs comparable with earlier runs.
    formatted_answer = re.sub(r'\b([a-zA-Z0-9])\.\s+', "- ", formatted_answer).strip()

    # Trim output to last complete sentence
    formatted_answer = trim_generated_output_to_last_complete_sentence(formatted_answer)

    # Return raw answer, formatted answer, retrieved documents, timing information and post process function used for post-processing
    return answer, formatted_answer, relevant_docs, timing_info, post_process_function

In [None]:
# Note use of decorator function here to ensure run time information is extracted
@track_runtime
def answer_without_rag(
    question: str,
    llm,
    prompt_template,
    post_process_function = None,
    ):
    """
    Answer a question with the LLM alone (no retrieval).

    Args:
      question (str): actual user query
      llm: model to be used for inference; called through ``invoke`` when available,
        otherwise called directly
      prompt_template: template formatted with ``question``
      post_process_function: optional callable applied to the raw generated output

    Returns:
      tuple of
        answer: raw output from the model
        formatted_answer (str): output after post-processing
        retrieved documents: always an empty list, as no retrieval is performed
        timing_info (dict): ``generation_duration``, ``number_output_tokens`` and
          ``time_per_output_token``
        post_process_function: the post-processing callable that was passed in
    """

    # Initialise empty timing info dict
    timing_info = {}

    # Fill the prompt template with the question
    final_prompt = prompt_template.format(question=question)

    # Inference
    print("=> Generating answer...")

    # Time only the model call. Calling a LangChain LLM object directly is deprecated
    # in favour of ``invoke``; plain callables without ``invoke`` are still supported.
    start_gen = time.time()
    answer = llm.invoke(final_prompt) if hasattr(llm, "invoke") else llm(final_prompt)
    end_gen = time.time()

    # Set inferencing duration on timing info dict
    timing_info["generation_duration"] = end_gen - start_gen

    # If a tokenizer is available, extract number of output tokens and time per output
    # token (NOTE: in final output, didn't need these as latency alone showed difference).
    # The lookup is defensive so that an llm without a ``pipeline``/``tokenizer``
    # attribute uses the fallback instead of raising.
    tokenizer = getattr(getattr(llm, "pipeline", None), "tokenizer", None)
    if hasattr(tokenizer, "encode"):
        output_tokens = tokenizer.encode(answer, add_special_tokens=False)
        number_output_tokens = len(output_tokens)
        timing_info["number_output_tokens"] = number_output_tokens
        # Avoid division by zero when nothing was generated
        timing_info["time_per_output_token"] = (
            timing_info["generation_duration"] / number_output_tokens if number_output_tokens else None
        )
    else:
        timing_info["number_output_tokens"] = -1
        timing_info["time_per_output_token"] = None

    # Call post-processing function to format answer
    formatted_answer = post_process_function(answer) if post_process_function else answer

    # Then once again post-process by replacing list markers such as "1. " or "a. " with - bullets
    # NOTE(review): the pattern matches any single alphanumeric character followed by a
    # full stop and whitespace wherever it occurs, so a sentence ending in e.g. "8. " is
    # rewritten as well. Left unchanged to keep outputs comparable with earlier runs.
    formatted_answer = re.sub(r'\b([a-zA-Z0-9])\.\s+', "- ", formatted_answer).strip()

    # Trim output to last complete sentence
    formatted_answer = trim_generated_output_to_last_complete_sentence(formatted_answer)

    # Return raw answer, formatted answer, empty list as no retrieval here, timing information and post process function used for post-processing
    return answer, formatted_answer, [], timing_info, post_process_function

In [None]:
# Initialise dict with prompt types as keys and the post-processing function for each prompt type as values
# (the two thought-based prompt types share one function, as do zero-shot, instruction and persona)
post_process_function_dict = {
    ZERO_SHOT_PROMPT_TYPE: post_process_helpful_answer_prompt,
    INSTRUCTION_PROMPT_TYPE: post_process_helpful_answer_prompt,
    PERSONA_PROMPT_TYPE: post_process_helpful_answer_prompt,
    TREE_OF_THOUGHT_PROMPT_TYPE: post_process_helpful_answer_prompt_chain_of_thought,
    CHAIN_OF_THOUGHT_PROMPT_TYPE: post_process_helpful_answer_prompt_chain_of_thought,
    FEW_SHOT_PROMPT_TYPE: post_process_few_shot,
}

In [None]:
# Look up the post-processing function registered for the prompt type currently in use
post_process_function = get_post_process_function_by_prompt_type(PROMPT_TYPE_IN_USE, post_process_function_dict)

In [None]:
# Test call for RAG workflow to ensure everything working as expected
# Six values are unpacked: the five returned by answer_with_rag plus a total run time
# (presumably appended by the track_runtime / retry wrapper - confirm against their definitions)
raw_answer_rag, formatted_answer_rag, relevant_docs, timing_info_rag, post_process_function_rag, total_run_time_rag = retry_hugging_face_inference(answer_with_rag, "Who is entitled to Irish citizenship?", llm, vectorstore_medium_model, PROMPT_RAG, RERANKER, NUMBER_RETRIEVED_DOCS, NUMBER_FINAL_DOCS, post_process_function)

=> Retrieving documents...
=> Generating answer...


  answer = llm(final_prompt)


In [None]:
# Print the post-processed answer from the RAG test call to validate the formatting
print(formatted_answer_rag)

* A person born in Ireland or a child born abroad to an Irish parent or parents
* A person who has been granted Irish citizenship by naturalisation
* A person who has been granted Irish citizenship by registration
* A person who has been granted Irish citizenship by descent
* A person who has been granted Irish citizenship by marriage to an Irish citizen
* A person who has been granted Irish citizenship by operation of law
* A person who has been granted Irish citizenship by order of the President


Answer:

* A person born in Ireland or a child born abroad to an Irish parent or parents
* A person who has been granted Irish citizenship by naturalisation
* A person who has been granted Irish citizenship by registration
* A person who has been granted Irish citizenship by descent
* A person who has been granted Irish citizenship by marriage to an Irish citizen
* A person who has been granted Irish citizenship by operation of law
* A person who has been granted Irish citizenship by order 

In [None]:
# Print the raw, unprocessed output from the RAG test call to inspect the full model response
print(raw_answer_rag)

Answer the question carefully, using the following pieces of context and examples below to guide the style, structure and level of detail expected in the answer. ONLY provide the answer to Example 3. Follow the same structure and level of detail as Example 1 and Example 2 - short, factual, and readable. Do not list more than necessary. Stop after answering Example 3.


Extracted documents:


Example 1:

Q: What documents do I need to apply online for a driver's license in Ireland?
A: You need a Public Services Card and verified MyGovID, proof of your address if your address is different to the one you provided when you got your Public Services Card, proof that you are normally resident in Ireland if you are not an EU or EEA citizen, a medical report form, dated within one month if required, and a Certificate of Professional Competence (CPC) for professional drivers.

Example 2:

Q: What criteria do I need to meet to qualify for unemployment benefits in Ireland?
A: You must be under 66 

In [None]:

# Test call for LLM-only workflow (no retrieval) to ensure everything working as expected
# The third returned value is the always-empty document list, so it is discarded
raw_answer_llm, formatted_answer_llm, _, timing_info_llm, post_process_function_llm, total_run_time_llm = retry_hugging_face_inference(answer_without_rag, "Who is entitled to Irish citizenship?", llm, PROMPT_LLM, post_process_function)

=> Generating answer...


In [None]:
# Print the post-processed answer from the LLM-only test call to validate the formatting
print(formatted_answer_llm)




In [None]:
# Print the raw, unprocessed output from the LLM-only test call to inspect the full model response
print(raw_answer_llm)

Reason step by step before answering the question.

Question: Who is entitled to Irish citizenship?

Begin by enclosing all thoughts within <thinking> tags. Instead of a single linear chain, explicitly generate and explore multiple branches of reasoning as separate <branch id="X"> elements, each representing a distinct approach or angle.

Within each <branch>:
 - Break down the reasoning into clear <step> tags.
 - After each step, include a <count> tag showing the remaining step budget (start with 10 steps per branch; request more if needed).
 - Use <reflection> tags periodically within branches to critically evaluate progress.
 - Assign a <reward> score (0.0 to 1.0) after reflections to assess quality of that branch's approach:
   - 0.8+ Continue this branch
   - 0.5-0.7 Consider minor adjustments
   - Below 0.5 Backtrack or abandon this branch
 - If reward is low or you're unsure, backtrack and try a different <branch>, explaining the reasoning inside new <thinking> tags.
 - Use thou

In [None]:
# Sanity check before the full run: show the RAG prompt, the LLM-only prompt and the
# post-process function currently in use, each separated by a row of asterisks
print(PROMPT_RAG, "*" * 50, PROMPT_LLM, "*" * 50, post_process_function, sep="\n")

input_variables=['context', 'question'] input_types={} partial_variables={} template="Answer the question carefully, using the following pieces of context and examples below to guide the style, structure and level of detail expected in the answer. ONLY provide the answer to Example 3. Follow the same structure and level of detail as Example 1 and Example 2 - short, factual, and readable. Do not list more than necessary. Stop after answering Example 3.\n\n{context}\n\nExample 1:\n\nQ: What documents do I need to apply online for a driver's license in Ireland?\nA: You need a Public Services Card and verified MyGovID, proof of your address if your address is different to the one you provided when you got your Public Services Card, proof that you are normally resident in Ireland if you are not an EU or EEA citizen, a medical report form, dated within one month if required, and a Certificate of Professional Competence (CPC) for professional drivers.\n\nExample 2:\n\nQ: What criteria do I ne

## Golden Dataset - Generate answers with LLM + RAG

In [None]:
# Open the golden_dataset.json file and read contents
# Each entry is read below through its "question" and "answer" keys
with open("golden_dataset.json", "r", encoding="utf-8") as f:
      golden_dataset = json.load(f)


# If set to True, run golden dataset inference for every prompt type in prompt_dict_rag
# and write one results file per prompt type
if RUN_ALL_PROMPTS:
  print("Running all prompts...")
  print("*" * 100)
  print("\n")
  print(LLM_HYPERPARAMETERS)
  # Loop over prompts (prompt_text is not used; the template is looked up again below)
  for prompt_type, prompt_text in prompt_dict_rag.items():
    print("Running {} prompt type".format(prompt_type))
    print("*" * 100)
    print("\n")
    # Results are collected afresh for each prompt type
    rag_answers_large_model_golden_questions = []


    # Get prompt template contents for current prompt type
    prompt_template_rag = get_prompt_by_type(prompt_type, prompt_dict_rag)

    # Construct prompt template
    # NOTE: this rebinds the notebook-level PROMPT_RAG, so after the loop it holds the last prompt type
    PROMPT_RAG = PromptTemplate(
      template=prompt_template_rag, input_variables=["context", "question"]
    )

    # Extract post-process function for current prompt type
    # NOTE: this rebinds the notebook-level post_process_function as well
    post_process_function = get_post_process_function_by_prompt_type(prompt_type, post_process_function_dict)

    # Loop over golden dataset, generate output for each question within and store generated outputs
    for i in golden_dataset:
      raw_rag_answer, formatted_rag_answer, relevant_docs, timing_info, post_process_function_rag, run_time_rag = retry_hugging_face_inference(answer_with_rag, i["question"], llm, vectorstore_medium_model, PROMPT_RAG, RERANKER, NUMBER_RETRIEVED_DOCS, NUMBER_FINAL_DOCS, post_process_function)
      # Store the source code of the post-process function so the results file records exactly what was applied
      post_process_function_rag_string = inspect.getsource(post_process_function_rag)
      rag_answers_large_model_golden_questions.append({"ground_truth": i["answer"], "question": i["question"], "raw_answer": raw_rag_answer, "answer": formatted_rag_answer, "total_run_time": run_time_rag, "timing_info": timing_info, "post_process_function": post_process_function_rag_string, "final_documents": relevant_docs})

    instance_name = get_instance_name()

    # Extract unique json file path name
    golden_large_model_file_path = generate_json_file_name(prompt_type, LLM_IN_USE_NAME, "rag", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "golden")


    # initialise dict with all info to be stored in results file for current run
    # NOTE(review): "reranker" stores RE_RANKER, which is a different name from the RERANKER
    # object passed to answer_with_rag above - presumably a serialisable model name; confirm
    rag_experiment_results = {
    "hyperparameters": LLM_HYPERPARAMETERS,
    "model": LLM_IN_USE_NAME,
    "results": rag_answers_large_model_golden_questions,
    "colab_instance": instance_name,
    "prompt_template": prompt_template_rag,
    "prompt_type": prompt_type,
    "knowledge_index": "FAISS",
    "reranker": RE_RANKER,
    "num_retrieved_docs": NUMBER_RETRIEVED_DOCS,
    "num_docs_final":  NUMBER_FINAL_DOCS,
    "embeddings": { "model_name": MEDIUM_TOKENISER_MODEL_NAME,
                    "model_kwargs": {"device": device,},
                    "encode_kwargs": {"normalize_embeddings": True, "multi_process": False}
                   },
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": int(CHUNK_SIZE / 10),
    "add_start_index": True,
    "strip_whitespace": True,
    "separators": MARKDOWN_SEPARATORS,
    }

    # write json file with this information
    write_json_file(golden_large_model_file_path, rag_experiment_results)

# If set to False, only run for the prompt template and post-process function already set in earlier cells
# NOTE: this branch only collects results in rag_answers_large_model_golden_questions; no file is written in this cell
else:
  rag_answers_large_model_golden_questions = []

  # Loop over each question in the dataset, generate outputs and append information to list for each qa pair
  for i in golden_dataset:
    print("Question: {}".format(i["question"]))
    print("\n")
    print("*" * 50)
    raw_rag_answer, formatted_rag_answer, relevant_docs, timing_info, post_process_function_rag, run_time_rag = retry_hugging_face_inference(answer_with_rag, i["question"], llm, vectorstore_medium_model, PROMPT_RAG, RERANKER, NUMBER_RETRIEVED_DOCS, NUMBER_FINAL_DOCS, post_process_function)
    # Store the source code of the post-process function so the results record exactly what was applied
    post_process_function_rag_string = inspect.getsource(post_process_function_rag)
    print("Answer: {}".format(formatted_rag_answer))
    print("\n")
    print("*" * 50)
    rag_answers_large_model_golden_questions.append({"ground_truth": i["answer"], "question": i["question"], "raw_answer": raw_rag_answer, "answer": formatted_rag_answer, "total_run_time": run_time_rag, "timing_info": timing_info, "post_process_function": post_process_function_rag_string, "final_documents": relevant_docs})

  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Question: Who is entitled to Irish citizenship?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 35.62it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: You are entitled to claim Irish citizenship if any one of the following applies:

- You were born in Ireland or Northern Ireland before 1 January 2005
- You were born in Ireland or Northern Ireland after 31 December 2004
and any of the following applies to you:
   - One (or both) of your parents was Irish or entitled to be an Irish citizen or a British citizen at the time of your birth
   - One (or both) of your parents was an Irish or British citizen and died before you were born
   - One (or both) of your parents was entitled to live in Ireland or Northern Ireland without any restriction on their period of residency
   - One (or both) of your parents was legally resident on the island of Ireland for 3 out of the 4 years immediately before your birth (this does not include residence on a student visa, or residence while awaiting an international protection decision or residence under a declaration of subsidiary protection)
- You were born abroad and any of the following apply 

100%|██████████| 1/1 [00:00<00:00, 40.50it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Ordinary residence is the country where you are usually resident over a number of years. It is not the same as your tax residence, which is based on the number of days you spend in Ireland. If you have been resident for the previous 3 tax years, then you become ordinarily resident from the start of the fourth year. If you leave the country, you will continue to be ordinarily resident until you have been non-resident for 3 continuous tax years.


**************************************************
Question: What counts as genuine redundancy?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.53it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Genuine redundancy is when your employer has a real business reason to make you redundant.

Reasons for redundancy include:

Financial difficulties within the business: for example, your employer needs to cut costs and this means staff numbers must be reduced.

Lack of work : for example, your employer no longer needs or has a reduced need for employees with your skills or new technology has made your job unnecessary.

Reorganisation within the business: for example, your employer has decided to carry on the business with fewer or no staff or the job you do no longer exists.

Business closure : the business is closing down or moving.

The difference between redundancy and dismissal

Redundancy

Redundancy is dismissal from your job, caused by your employer needing to reduce the workforce resulting in your job no longer existing.

The burden of proof is on your employer to show that a legitimate redundancy situation exists and that therefore the dismissal is fair.

For the dismi

100%|██████████| 1/1 [00:00<00:00, 42.33it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: A GP visit card is a card that gives you free visits to a participating family doctor (GP). If you are not eligible for a medical card, you may be eligible for a GP visit card. You can apply for a GP visit card online.


**************************************************
Question: Who can get a GP visit card?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.84it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: * You must be ordinarily resident in Ireland
* You must be living in Ireland and intend to live here for at least one year
* You will qualify for a GP visit card with no means test if you are:
	+ Aged under 8
	+ Aged over 70
	+ Getting Carer’s Benefit or Carer’s Allowance, at full or half-rate
* If you are aged under 70, your income is assessed by the HSE to see whether you qualify for a GP visit card.
* If you are not eligible for a medical card, you may still qualify for a GP visit card.


**************************************************
Question: How long will my passport application take?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.74it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: - Simple online renewals take around 10 working days to process.
- Complex online renewals take around 15 working days to process. A complex renewal is when you are asked to submit extra documentation or you are changing information on your passport. You could be changing your name or reporting a lost or stolen passport.
- If you are applying by post, you should allow at least 8 weeks for your application to be processed (not including postage times). This period may be extended at certain times of the year due to increased application numbers.
- Check the Passport Service website for current processing times.


**************************************************
Question: What is a Police Certificate?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.95it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: A Police Certificate is a document issued by the Gardaí for various reasons, such as starting a business abroad or applying for a visa in another country. It is different from Garda vetting, which is used to check if you have a criminal record or any history that might pose a threat to vulnerable people.


**************************************************
Question: How to register to vote?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.86it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: To register to vote, you can:

- Register online at checktheregister.ie
- Fill in a paper application form (ERF 2)

What do I need to register to vote?

To register, you will need to provide:

* Your name
* Your date of birth
* Your PPS number
* An address - when you register, you can state that you have ‘no fixed address’. But you will need to tell your local authority the area where you live and spend most of your time. This will affect where you vote.

If you are in Dublin and want to register online, you will need to provide an address or Eircode.

PPS number

You have to provide a PPS number when you register to vote.

If you do not give a PPS number, you must get your paper application form (ERF 2) and identity witnessed at a local Garda station. You should bring your ID with you.


**************************************************
Question: What is the Vacant Property Refurbishment Grant?


**************************************************
=> Retrieving documents...
=>

100%|██████████| 1/1 [00:00<00:00, 41.38it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: The Vacant Property Refurbishment Grant provides funding so you can refurbish vacant properties to live in or rent out. It covers the work listed below. The amount you get is based on a cost assessment by your local authority. It will not go above the funding limits for each type of work set-out in this table. This rule does not apply if you live on an off-shore island, where you can get up to 20% above these funding limits.


**************************************************
Question: Can my employer force me to take annual leave?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.42it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Usually, employees can ask to take annual leave at specific times. Your employer can accept your request, or refuse your request.

Your employer decides when annual leave may be taken, but this is subject to a number of conditions. Your employer must:

Take into account your family responsibilities, as well as the available opportunities for rest and recreation.

Discuss your annual leave with you (or your union) at least one month before you are to take the leave.

If you have worked for at least 8 months, you are entitled to an unbroken period of 2 weeks' annual leave. This means you can get 2 weeks off in a row.

Holiday pay

Holiday pay (pay for annual leave) must be paid in advance at your normal weekly rate.

If your pay changes from week-to-week (for example, because of commission or bonus payments), your holiday pay is the average of your pay over the 13 weeks before you take holidays.

Part-time employees

Generally, annual leave for part-time workers is calculated usi

100%|██████████| 1/1 [00:00<00:00, 41.69it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: If you get sick while on annual leave, you can continue to build up your annual leave entitlement once you have a medical certificate.

Your employer cannot insist you take annual leave on days you are off sick and have a medical certificate.

If you are on long-term sick leave and cannot take your annual leave due to illness, you can carry it over for up to 15 months after the end of the year you built it up. If you leave your job within these 15 months, you should get holiday pay instead of the annual leave days you could not take due to illness.


**************************************************
Question: What is a valid reason for ending a tenancy after the first 6 months?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.35it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Generally, your landlord can only end a tenancy after the first 6 months if:

* You do not comply with the obligations of the tenancy, for example, by not paying your rent on time
* The property is no longer suited to your needs, for example, if it is too small
* The landlord intends to sell the property within 9 months. However, this may not apply if the landlord plans to sell 10 or more dwellings in a development within a 6-month period – see ‘Restriction on terminating when selling multiple properties’ below.


**************************************************
Question: What are your rights when an order is delayed or not delivered?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.57it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: * You have the right to a refund within 14 days of cancellation if you bought online on or before 28 November 2022.
* If you bought online from a UK trader after 1 January 2021, you may not automatically have the same consumer rights.
* If you bought online from a UK trader, you can read more about buying online from the UK after Brexit.
* If you bought online from a trader in the EU, you have the same rights as buying in a shop.
* If you bought online from a trader in the UK, you may not have the same rights as buying in a shop.
* If you bought online from a trader in the EU, you have the same rights as buying in a shop.
* If you bought online from a trader in the UK, you may not have the same rights as buying in a shop.
* If you bought online from a trader in the EU, you have the same rights as buying in a shop.
* If you bought online from a trader in the UK, you may not have the same rights as buying in a shop.
* If you bought online from a trader in the EU, you have the sam

100%|██████████| 1/1 [00:00<00:00, 41.52it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: * You must be at least 21 years of age
* You must have a valid 'declaration of eligibility and suitability'
* You must be 'habitually resident in the State' (living in Ireland for at least one year before the date of the making of the adoption order)
* You may be eligible to adopt if you are the mother, father or relative of the child
* You may also be eligible to adopt if you are a sole applicant, provided the Adoption Authority is satisfied that it is desirable and in the best interests of the child


**************************************************
Question: How do penalty points work?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.53it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Penalty points are recorded on your driving licence when you are convicted in court of a driving offence that attracts penalty points or when you pay a fixed charge notice that was issued to you for an offence that also carries penalty points. If you get 12 penalty points in any 3 year period, you are automatically disqualified from driving for 6 months. Learner permit drivers are disqualified if they have 7 penalty points. This lower threshold also applies for the first 2 years of a driver’s first full driving licence. Penalty points are not added to your licence immediately. Fixed charge and penalty offences have specific procedures for payment and notification.


**************************************************
Question: Do I have a right to redundancy pay?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.30it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: If you lose your job you may be eligible for redundancy pay. This page summarises the rules on qualifying for a redundancy payment.

To qualify for redundancy pay, you must have 2 years' (104 weeks) service in your job. This means you must have worked for your employer for at least 104 weeks (2 years) in the same job.

If you are an agency worker, you are also protected under redundancy legislation. If the employment agency pays your wages, it is responsible for paying the statutory redundancy payment.

If you are an apprentice and are made redundant during the apprenticeship, you may qualify for a redundancy payment. You must meet the conditions of having 2 years' service (104 weeks) over the age of 16.

If you are on a fixed-term contract, you may be entitled to statutory redundancy if your employer does not renew your fixed-term contract under the same or a similar contract before the term expires.


**************************************************
Question: How to apply f

100%|██████████| 1/1 [00:00<00:00, 39.83it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: - **Apply online**: You can apply for most social welfare payments online at MyWelfare.ie.
- **Get an application form**: If you can’t apply online, you can get an application form from the Department of Social Protection, your local Intreo Centre or Social Welfare Branch Office, or your local Citizens Information Centre.
- **Complete the form**: Fill in the application form with all the required details.
- **Provide supporting documents**: You may need to provide additional documents, such as payslips or medical reports, to support your claim.
- **Send the form**: Send the completed application form to the address printed on the form.
- **Wait for a decision**: The Department of Social Protection will review your application and make a decision on whether you qualify for the payment.
- **Appeal the decision**: If you are unhappy with the decision, you can appeal to the Social Welfare Appeals Office.


**************************************************
Question: How can I comme

100%|██████████| 1/1 [00:00<00:00, 41.01it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: You can comment on a planning application by writing to the local authority where the application was made.

You must include your name, address, phone number, and email address (if you have one).

Your submission or observation must be acknowledged by the local authority.

The local authority cannot make a decision on an application until they have had it for 5 weeks. This is so people have time to view and comment on the application.

When the local authority decides on the planning application, you must be informed of their decision within 3 days of them making it.

If your submission is received after the deadline, it will be returned to you along with your fee. You will also be contacted by your local authority to tell you that your submission can't be considered. The deadline for commenting on a planning application is 5 weeks from when the local authority received the planning application.


**************************************************
Question: What is the Citizen

100%|██████████| 1/1 [00:00<00:00, 43.29it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: The Citizens Information Service is a free, confidential, independent, and impartial information service provided by the Citizens Information Board. It supports the provision of information, advice, and advocacy on a broad range of public and social services. The service includes the Citizens Information website, citizensinformation.ie, and the Citizens Information Phone Service 0818 07 4000. It also funds and supports the Money Advice and Budgeting Service (MABS) and the National Advocacy Service for People with Disabilities.


**************************************************
Question: Who was eligible for YESS (Youth Employment Support Scheme)?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.00it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: * People aged between 18 and 24
* Had been out of work and getting a qualifying payment (see below) for at least 12 months or
* Faced a significant barrier to work (if unemployed for less than 12 months)

What were the qualifying payments?


**************************************************
Question: What is respite care?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.57it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Respite care is when you can take a break from caring, and the person you care for is looked after by someone else. It can be covered by family members or an organisation so you can take a short break, a holiday or a rest. Respite care can be for carers of older people or people with different disabilities.


**************************************************
Question: What is a medical card?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 40.86it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: A medical card is a card that gives you free access to certain medical services in Ireland.

You can get a medical card if you are ordinarily resident in Ireland and meet certain criteria.

There are two types of medical cards:

- Full medical cards for people who qualify for a means test
- Emergency medical cards for people in certain emergency situations

You can apply for a medical card online or by post.

If you are under 70 years old, you can apply online using the online medical card application system.

If you are over 70 years old, you can apply using the MC1 Medical Card and GP Visit Card Application Form (pdf).

You can also get the application form and a list of participating GPs from your local health centre or Local Health Office for your area.

If you are applying online, you can upload photos, scans or photocopies of the original documents specified on the form.

You can track the progress of your medical card application on the HSE website.

If you are refused b

100%|██████████| 1/1 [00:00<00:00, 41.19it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: Pre-nuptial agreements are not legally binding in Ireland. They are not based on any specific law. However, they can serve as a guide for the courts in judicial separation and divorce cases. If a pre-nuptial agreement makes ‘proper provision’ for each person, it is more likely to be persuasive on the judge.


**************************************************
Question: Who regulates aviation in Ireland?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 42.28it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: The Department of Transport regulates air transport. This includes commercial and private services.

EU airlines can fly freely within the EU/European Economic Area (EEA) without any restrictions on flight frequency or routes.

Airlines from countries outside of the EU/EEA must request permission. They must do this each time they want to fly to, from, or over Ireland.

The Department of Defence regulates military air transport in Ireland.


**************************************************
Question: What is the Cycle to Work Scheme?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.16it/s]

=> Generating answer...



  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


Answer: The Cycle to Work Scheme allows an employer to buy a new bike and bike safety equipment up to a value of €3,000 for employees. Employees can then repay the purchase cost to their employer from their gross salary. This document provides information on the scheme, including the types of bikes covered, the equipment covered, and the frequency of use.


**************************************************
Question: What are the operating hours and frequency of the Luas?


**************************************************
=> Retrieving documents...
=> Reranking documents...


100%|██████████| 1/1 [00:00<00:00, 41.37it/s]

=> Generating answer...





Answer: Operating hours vary by stop and direction of service. You can view Luas operating hours and Luas frequency on luas.ie.

There are electronic displays telling you the time of the next tram at every Luas stop. In addition, there are electronic displays and announcements informing passengers of the next stop on board every tram.

Special services operate during Christmas, New Year and on other occasions.


**************************************************


In [None]:
# Resolve the identifier of the current runtime; it is embedded in the result file name and stored under "colab_instance" below
instance_name = get_instance_name()

In [None]:
# Generate the unique file name for the RAG run from the prompt type, model, generation settings and instance name.
# NOTE: this path is only meaningful when a single prompt type was run above.
golden_large_model_file_path = generate_json_file_name(PROMPT_TYPE_IN_USE, LLM_IN_USE_NAME, "rag", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "golden")

In [None]:
# Embedding configuration recorded alongside the results so the retrieval setup can be reproduced
rag_embedding_settings = {
    "model_name": MEDIUM_TOKENISER_MODEL_NAME,
    "model_kwargs": {"device": device},
    "encode_kwargs": {"normalize_embeddings": True, "multi_process": False},
}

# Assemble the record for the RAG experiment run: answers plus every setting that produced them
# (again note this cell is only used if one prompt type was used)
rag_experiment_results = dict(
    hyperparameters=LLM_HYPERPARAMETERS,
    model=LLM_IN_USE_NAME,
    results=rag_answers_large_model_golden_questions,
    colab_instance=instance_name,
    prompt_template=prompt_template_rag,
    prompt_type=PROMPT_TYPE_IN_USE,
    knowledge_index="FAISS",
    reranker=RE_RANKER,
    num_retrieved_docs=NUMBER_RETRIEVED_DOCS,
    num_docs_final=NUMBER_FINAL_DOCS,
    embeddings=rag_embedding_settings,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=int(CHUNK_SIZE / 10),
    add_start_index=True,
    strip_whitespace=True,
    separators=MARKDOWN_SEPARATORS,
)

In [None]:
# Write the single RAG prompt type results to the file.
# Only done for a single-prompt run: the cells above state that these variables are only valid in that mode,
# so writing them after a run over all prompt types would store results under the wrong prompt type label.
# NOTE(review): assumes the RAG generation cell writes its own per-prompt files when RUN_ALL_PROMPTS is set,
# as the LLM generation cell does - confirm.
if not RUN_ALL_PROMPTS:
  write_json_file(golden_large_model_file_path, rag_experiment_results)

Successfully saved content to golden/golden_results_few_shot_johndennehy101_Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_05-07-2025_14-20-08.json


True

## Golden Dataset - Generate answers with LLM

In [None]:
# Load the golden question/answer pairs from the stored json file
with open("golden_dataset.json", "r", encoding="utf-8") as f:
  golden_dataset = json.load(f)


if not RUN_ALL_PROMPTS:
  # Single prompt type: PROMPT_LLM and post_process_function are expected to be defined by an earlier cell
  llm_answers_large_model_golden_questions = []

  # Answer every golden question, echoing question and answer, and keep one record per question
  for i in golden_dataset:
    print("Question: {}".format(i["question"]))
    print("\n")
    print("*" * 50)

    (
      raw_llm_answer,
      formatted_llm_answer,
      _,
      timing_info,
      post_process_function_llm,
      run_time_llm,
    ) = retry_hugging_face_inference(answer_without_rag, i["question"], llm, PROMPT_LLM, post_process_function)

    # Store the source of the post-processing function so the record documents how the answer was formatted
    post_process_function_llm_string = inspect.getsource(post_process_function_llm)

    print("Answer: {}".format(formatted_llm_answer))
    print("\n")
    print("*" * 50)

    answer_record = {
      "ground_truth": i["answer"],
      "question": i["question"],
      "raw_answer": raw_llm_answer,
      "answer": formatted_llm_answer,
      "timing_info": timing_info,
      "post_process_function": post_process_function_llm_string,
      "total_run_time": run_time_llm,
    }
    llm_answers_large_model_golden_questions.append(answer_record)

else:
  # All prompt types: run inference once per prompt type and write one results file for each
  print("Running all prompts...")
  print("*" * 100)
  print("\n")
  print(LLM_HYPERPARAMETERS)

  for prompt_type, prompt_text in prompt_dict_llm.items():
    print("Running {} prompt type".format(prompt_type))
    print("*" * 100)
    print("\n")

    # Fresh answer list for this prompt type
    llm_answers_large_model_golden_questions = []

    # Look up the template text for this prompt type and wrap it in a PromptTemplate
    prompt_template_llm = get_prompt_by_type(prompt_type, prompt_dict_llm)
    PROMPT_LLM = PromptTemplate(
      template=prompt_template_llm,
      input_variables=["question"],
    )

    # Post-processing function that matches this prompt type
    post_process_function = get_post_process_function_by_prompt_type(prompt_type, post_process_function_dict)

    # Answer every golden question and keep one record per question
    for i in golden_dataset:
      (
        raw_llm_answer,
        formatted_llm_answer,
        _,
        timing_info,
        post_process_function_llm,
        run_time_llm,
      ) = retry_hugging_face_inference(answer_without_rag, i["question"], llm, PROMPT_LLM, post_process_function)

      post_process_function_llm_string = inspect.getsource(post_process_function_llm)

      answer_record = {
        "ground_truth": i["answer"],
        "question": i["question"],
        "raw_answer": raw_llm_answer,
        "answer": formatted_llm_answer,
        "total_run_time": run_time_llm,
        "timing_info": timing_info,
        "post_process_function": post_process_function_llm_string,
      }
      llm_answers_large_model_golden_questions.append(answer_record)

    instance_name = get_instance_name()

    # File name is keyed on the prompt type of this iteration
    golden_large_model_file_path = generate_json_file_name(
      prompt_type, LLM_IN_USE_NAME, "llm", TEMPERATURE, TOP_K, TOP_P,
      REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "golden",
    )

    llm_experiment_results = {
      "hyperparameters": LLM_HYPERPARAMETERS,
      "model": LLM_IN_USE_NAME,
      "results": llm_answers_large_model_golden_questions,
      "colab_instance": instance_name,
      "prompt_template": prompt_template_llm,
      "prompt_type": prompt_type,
    }

    write_json_file(golden_large_model_file_path, llm_experiment_results)

Question: Who is entitled to Irish citizenship?


**************************************************
=> Generating answer...
Answer: - You are entitled to Irish citizenship if you were born in Ireland, or if you were born outside Ireland to an Irish parent.
- If you were born outside Ireland, you may be entitled to Irish citizenship if you were born to an Irish parent and your parent was born in Ireland.
- If you were born outside Ireland, you may be entitled to Irish citizenship if you were born to an Irish parent and your parent was born in Ireland.
- If you were born outside Ireland, you may be entitled to Irish citizenship if you were born to an Irish parent and your parent was born in Ireland.
- If you were born outside Ireland, you may be entitled to Irish citizenship if you were born to an Irish parent and your parent was born in Ireland.
- If you were born outside Ireland, you may be entitled to Irish citizenship if you were born to an Irish parent and your parent was born in I

In [None]:
# Resolve the identifier of the current runtime; it is embedded in the result file name and stored under "colab_instance" below
instance_name = get_instance_name()

In [None]:
# Generate the unique file path name for the LLM-only (no retrieval) run from the prompt type, model,
# generation settings and instance name
golden_llm_large_model_file_path = generate_json_file_name(PROMPT_TYPE_IN_USE, LLM_IN_USE_NAME, "llm", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "golden")

In [None]:
# Assemble the record for the LLM run: answers plus the settings that produced them
# (note this should only be used if only one prompt type was run)
llm_experiment_results = dict(
    hyperparameters=LLM_HYPERPARAMETERS,
    model=LLM_IN_USE_NAME,
    results=llm_answers_large_model_golden_questions,
    colab_instance=instance_name,
    prompt_template=prompt_template_llm,
    prompt_type=PROMPT_TYPE_IN_USE,
)

In [None]:
# Write the single prompt type experiment to the json file.
# Skipped when RUN_ALL_PROMPTS is set: in that mode the generation cell has already written one file per
# prompt type, and the answer list / prompt template left in scope belong to the last prompt type that was
# looped over, so saving them under PROMPT_TYPE_IN_USE would produce a mislabelled results file.
if not RUN_ALL_PROMPTS:
  write_json_file(golden_llm_large_model_file_path, llm_experiment_results)

Successfully saved content to golden/golden_results_few_shot_johndennehy101_Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1_NVIDIA_A100-SXM4-40GB_llm_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_05-07-2025_14-29-09.json


True

## Evaluation

In [None]:
# Extract all result files for rag from the golden output directory.
# The two lookaheads require both "golden_results" and "rag" to appear in the matched path.
# Only /content/golden is searched: evaluation outputs saved under /content/results also carry
# "golden_results" and "rag" in their names and must not be read back in as evaluation inputs.
rag_results_file_paths = get_file_paths_matching_regex("/content/golden", r"(?=.*golden_results)(?=.*rag)")

In [None]:
# Extract all result files for llm from the golden output directory.
# The two lookaheads require both "golden_results" and "llm" to appear in the matched path.
# Only /content/golden is searched: evaluation outputs saved under /content/results also carry
# "golden_results" and "llm" in their names and must not be read back in as evaluation inputs.
llm_results_file_paths = get_file_paths_matching_regex("/content/golden", r"(?=.*golden_results)(?=.*llm)")

In [None]:
# Map each rag results file path to the parsed json content of that file
rag_model_results_dict = {
    results_path: read_json_file(results_path)
    for results_path in rag_results_file_paths
}

Successfully loaded content from /content/golden/golden_results_few_shot_johndennehy101_Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_05-07-2025_14-20-08.json file


In [None]:
# Map each llm results file path to the parsed json content of that file
llm_model_results_dict = {
    results_path: read_json_file(results_path)
    for results_path in llm_results_file_paths
}

Successfully loaded content from /content/golden/golden_results_few_shot_johndennehy101_Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1_NVIDIA_A100-SXM4-40GB_llm_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max1000_05-07-2025_14-29-09.json file


### Zero shot classification

In [None]:
def generate_zero_shot_classification(model, tokeniser, ground_truth: str, generated_text: str):
  """
  Classify a generated answer against its ground truth with a sequence classification model.

  The ground truth and the generated text are tokenised together as a pair, passed through
  the model, and the resulting logits are turned into one probability per label.

  Args:
    model: Classification model. It is called with the tokenised inputs, must return an object
      exposing `.logits`, and must expose the id -> label mapping as `model.config.id2label`.
    tokeniser: Callable that tokenises the (ground_truth, generated_text) pair into model inputs.
    ground_truth: Reference answer, passed as the first sequence of the pair.
    generated_text: Generated answer, passed as the second sequence of the pair.

  Returns:
    Tuple `(max_label, label_probabilities)`: the label with the highest probability, and a dict
    mapping every label to its probability so that the full distribution can be compared.
  """

  # Tokenise inputs (over-long pairs are truncated)
  inputs = tokeniser(ground_truth, generated_text, return_tensors="pt", truncation=True)

  # Keep the inputs on the same device as the model; otherwise a model that was moved to the GPU
  # fails on CPU tensors. Skipped when the model/inputs do not expose a device / `.to`.
  model_device = getattr(model, "device", None)
  if model_device is not None and hasattr(inputs, "to"):
    inputs = inputs.to(model_device)

  # Disable gradient calculations to speed inference
  with torch.no_grad():
    logits = model(**inputs).logits

  # Apply softmax over the label dimension to convert logits into probabilities
  probs = F.softmax(logits, dim=1)

  # Extract the label mapping from the configuration
  labels = model.config.id2label

  # Construct dict of label -> probability for the single pair in the batch
  label_probabilities = {label: probs[0][i].item() for i, label in labels.items()}

  # Extract label with max probability value
  max_label = max(label_probabilities, key=label_probabilities.get)

  # Return both the most probable label and the probabilities for all labels to enable comparison
  return max_label, label_probabilities

In [None]:
# Initialise zero-shot labelling model and tokeniser.
# The checkpoint name is kept in its own variable because it is also recorded in every evaluation result
# under the "zero_shot" key.
zero_shot_classification_model_name = "facebook/bart-large-mnli"
# Tokeniser and model are loaded from the same checkpoint so that their vocabularies match
zero_shot_tokeniser = AutoTokenizer.from_pretrained(zero_shot_classification_model_name)
zero_shot_model = AutoModelForSequenceClassification.from_pretrained(zero_shot_classification_model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

### Run evaluation suite - RAG

In [None]:
# Run evaluation suite for RAG: every loaded results file gets ROUGE, BERTScore and zero-shot labels
# added to each of its answers, and is then written out as an evaluation results file.
for rag_model_results_values in rag_model_results_dict.values():
  rag_result_items = rag_model_results_values["results"]

  # Short status line instead of dumping the whole results dict into the cell output
  print("Evaluating {} RAG answers for prompt type: {}".format(len(rag_result_items), rag_model_results_values["prompt_type"]))

  # Record which model produced the zero-shot labels
  rag_model_results_values["zero_shot"] = {
      "model": zero_shot_classification_model_name,
  }

  # Calculate precision, recall, f1 scores for all answers in a single call: each call to `score` sets up
  # the scoring model again, so calling it once per answer repeats that work for every item.
  # Skipped for an empty results list, for which there is nothing to score.
  if rag_result_items:
    rag_bert_precisions, rag_bert_recalls, rag_bert_f1s = score(
        [item["answer"] for item in rag_result_items],
        [item["ground_truth"] for item in rag_result_items],
        model_type="bert-base-uncased", lang="en", rescale_with_baseline=True)

  for item_index, individual_item in enumerate(rag_result_items):
    # Calculate rouge scores
    rag_rouge_1, rag_rouge_2, rag_rouge_l = generate_rouge_scores(individual_item["ground_truth"], individual_item["answer"])

    # Calculate zero-shot labels
    zero_shot_label_prediction, zero_shot_probabilities = generate_zero_shot_classification(zero_shot_model, zero_shot_tokeniser, individual_item["ground_truth"], individual_item["answer"])

    # Assign results to dict (BERTScore values are picked from the batched result by position)
    individual_item["rouge1_score"] = rag_rouge_1
    individual_item["rouge2_score"] = rag_rouge_2
    individual_item["rougeL_score"] = rag_rouge_l
    individual_item["bert_precision"] = rag_bert_precisions[item_index].item()
    individual_item["bert_recall"] = rag_bert_recalls[item_index].item()
    individual_item["bert_f1"] = rag_bert_f1s[item_index].item()
    individual_item["zero_shot_label"] = zero_shot_label_prediction
    individual_item["zero_shot_probabilities"] = zero_shot_probabilities

  # Generate unique file path name.
  # NOTE(review): only the prompt type comes from the loaded file; model name and generation settings are
  # the notebook's current values - confirm that is intended when evaluating files from other runs.
  golden_evaluation_results_rag_large_model_file_path = generate_json_file_name(rag_model_results_values["prompt_type"], LLM_IN_USE_NAME, "rag", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "results")

  # Write contents to file
  write_json_file(golden_evaluation_results_rag_large_model_file_path, rag_model_results_values)

{'hyperparameters': {'max_new_tokens': 1000}, 'model': 'johndennehy101/Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1', 'results': [{'ground_truth': 'You are entitled to claim Irish citizenship if any one of the following applies: 1. You were born in Ireland or Northern Ireland before 1 January 2005 2. You were born in Ireland or Northern Ireland after 31 December 2004 and any of the following applies to you: One (or both) of your parents was Irish or entitled to be an Irish citizen or a British citizen at the time of your birth One (or both) of your parents was an Irish or British citizen and died before you were born One (or both) of your parents was entitled to live in Ireland or Northern Ireland without any restriction on their period of residency One (or both) of your parents was legally resident on the island of Ireland for 3 out of the 4 years immediately before your birth (this does not include residence on a student visa, or residence while awaiting an international p

In [None]:
# Generate a file path name for the combined RAG evaluation output, labelled with the currently configured prompt type
golden_evaluation_results_rag_large_model_file_path = generate_json_file_name(PROMPT_TYPE_IN_USE, LLM_IN_USE_NAME, "rag", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "results")

In [None]:
# Write the whole dict of evaluated RAG results (keyed by source file path) to a single file
write_json_file(golden_evaluation_results_rag_large_model_file_path, rag_model_results_dict)

Successfully saved content to results/golden_results_instruction_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_rag_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max0_17-06-2025_09-17-25.json


True

### Run evaluation suite - LLM

In [None]:
# Run evaluation suite for LLM: every loaded results file gets ROUGE, BERTScore and zero-shot labels
# added to each of its answers, and is then written out as an evaluation results file.
for llm_model_results_values in llm_model_results_dict.values():
  llm_result_items = llm_model_results_values["results"]

  # Short status line instead of dumping the whole results dict into the cell output
  print("Evaluating {} LLM answers for prompt type: {}".format(len(llm_result_items), llm_model_results_values["prompt_type"]))

  # Record which model produced the zero-shot labels
  llm_model_results_values["zero_shot"] = {
      "model": zero_shot_classification_model_name,
  }

  # Generate precision, recall, F1 scores for all answers in a single call: each call to `score` sets up
  # the scoring model again, so calling it once per answer repeats that work for every item.
  # Skipped for an empty results list, for which there is nothing to score.
  if llm_result_items:
    llm_bert_precisions, llm_bert_recalls, llm_bert_f1s = score(
        [item["answer"] for item in llm_result_items],
        [item["ground_truth"] for item in llm_result_items],
        model_type="bert-base-uncased", lang="en", rescale_with_baseline=True)

  for item_index, individual_item in enumerate(llm_result_items):
    # Generate ROUGE scores
    llm_rouge_1, llm_rouge_2, llm_rouge_l = generate_rouge_scores(individual_item["ground_truth"], individual_item["answer"])

    # Generate zero-shot labels
    zero_shot_label_prediction, zero_shot_probabilities = generate_zero_shot_classification(zero_shot_model, zero_shot_tokeniser, individual_item["ground_truth"], individual_item["answer"])

    # Assign result values to individual items (BERTScore values are picked from the batched result by position)
    individual_item["rouge1_score"] = llm_rouge_1
    individual_item["rouge2_score"] = llm_rouge_2
    individual_item["rougeL_score"] = llm_rouge_l
    individual_item["bert_precision"] = llm_bert_precisions[item_index].item()
    individual_item["bert_recall"] = llm_bert_recalls[item_index].item()
    individual_item["bert_f1"] = llm_bert_f1s[item_index].item()
    individual_item["zero_shot_label"] = zero_shot_label_prediction
    individual_item["zero_shot_probabilities"] = zero_shot_probabilities

  # Generate unique file path name.
  # NOTE(review): only the prompt type comes from the loaded file; model name and generation settings are
  # the notebook's current values - confirm that is intended when evaluating files from other runs.
  golden_evaluation_results_llm_large_model_file_path = generate_json_file_name(llm_model_results_values["prompt_type"], LLM_IN_USE_NAME, "llm", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "results")

  # Write contents to results file
  write_json_file(golden_evaluation_results_llm_large_model_file_path, llm_model_results_values)

{'hyperparameters': {'max_new_tokens': 1000}, 'model': 'johndennehy101/Mistral-7B-Instruct-v0.3-finetune-irish-citizen-info-v1', 'results': [{'ground_truth': 'You are entitled to claim Irish citizenship if any one of the following applies: 1. You were born in Ireland or Northern Ireland before 1 January 2005 2. You were born in Ireland or Northern Ireland after 31 December 2004 and any of the following applies to you: One (or both) of your parents was Irish or entitled to be an Irish citizen or a British citizen at the time of your birth One (or both) of your parents was an Irish or British citizen and died before you were born One (or both) of your parents was entitled to live in Ireland or Northern Ireland without any restriction on their period of residency One (or both) of your parents was legally resident on the island of Ireland for 3 out of the 4 years immediately before your birth (this does not include residence on a student visa, or residence while awaiting an international p

In [None]:
# Generate a file path name for the combined LLM evaluation output, labelled with the currently configured prompt type
golden_evaluation_results_llm_large_model_file_path = generate_json_file_name(PROMPT_TYPE_IN_USE, LLM_IN_USE_NAME, "llm", TEMPERATURE, TOP_K, TOP_P, REPETITION_PENALTY, NO_REPEAT_N_GRAMS, MAX_TOKENS, instance_name, "results")

In [None]:
# Write the whole dict of evaluated LLM results (keyed by source file path) to a single file
write_json_file(golden_evaluation_results_llm_large_model_file_path, llm_model_results_dict)

Successfully saved content to results/golden_results_instruction_mistralai_Mistral-7B-Instruct-v0.3_NVIDIA_A100-SXM4-40GB_llm_temp0_topk0_topP0_repPenalty0_noRepeatNGrams0_max0_17-06-2025_09-18-02.json


True

In [None]:
# Tidy up the loaded LLM so that a new iteration (e.g. with a different model) can be run
# without requiring a reload of the full notebook
delete_model(llm)

In [None]:
# Tidy up file directories to remove generated outputs, so that files from this run
# are not picked up again by the evaluation cells of the next iteration
delete_files_in_directory("/content/golden")
delete_files_in_directory("/content/results")

Deleted 1 files from /content/golden directory
Deleted 1 files from /content/results directory
