<a href="https://colab.research.google.com/github/Janhvi007/Flexible-RAG-style-system/blob/main/Copy_of_LLM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is a complete “question-answering” setup for working with PDFs and images that may contain both digital text and scanned pages. It starts by installing the tools it needs: Poppler (to convert PDFs into images), Tesseract (for OCR), and some Python libraries for reading PDFs, doing OCR, working with text, and running a small open-source language model.

The workflow looks like this:

1. Unpack the data – It takes the 'data.zip' file we upload and extracts everything into a folder.
2. Read or OCR each file – For every PDF page, it tries to pull the text directly. If that page has little or no text, it turns the page into an image and runs OCR on it. This way we capture both real text and text hidden inside scanned images.
3. Create a single text file – It then saves all extracted text from all files into one big file, 'ocr_output.txt', adding a note of which file each part came from.
4. Break the text into chunks – Because a language model can’t read a huge document in one go, the script splits the text into overlapping pieces. These chunks are turned into a TF-IDF index (a way of quickly finding the chunks most related to a question).
5. Load a model – It loads Microsoft’s Phi-2, a small but capable free language model, so all processing runs locally in Colab without needing an API key.
6. Ask questions – There are two main ways to query:

   * Semantic search – 'ask_one' finds the chunks most similar to your question (using the TF-IDF index), then has the model answer using only that context.
   * Keyword search – 'ask_forced' pulls in small snippets of text that match specific words or phrases you give it, then answers strictly from those snippets.

     Both methods are instructed to say “Not enough evidence” if the answer isn’t in the retrieved text.

---




In [None]:


import os, re, zipfile, io, sys, warnings, subprocess
from pathlib import Path


# Exact library versions requested by the pip install command further down.
PIN_TRANSFORMERS = "4.41.2"
PIN_ACCELERATE   = "0.30.1"


ZIP_PATH = "data.zip"            # your uploaded zip file
OUT_DIR = Path("/content/uploads")  # folder the archive is extracted into
OCR_TXT = Path("ocr_output.txt")    # combined text extracted from every processed file


def _run(cmd):
    """Echo a shell command line, run it through the shell, and return its exit status.

    ``subprocess.check_call`` raises ``CalledProcessError`` when the command
    exits with a non-zero status, so a normal return always yields 0.
    """
    print(f"→ {cmd}")
    status = subprocess.check_call(cmd, shell=True)
    return status


# Refresh the apt package index, then install Poppler (PDF → image conversion)
# and Tesseract (OCR); the install output is discarded.
_run("apt-get -qq update")
_run("apt-get -qq install -y poppler-utils tesseract-ocr >/dev/null")


# Python dependencies; transformers and accelerate use the pinned versions defined above.
_run(f"pip -q install PyPDF2 pdf2image pytesseract scikit-learn pillow transformers=={PIN_TRANSFORMERS} accelerate=={PIN_ACCELERATE}")


from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

warnings.filterwarnings("ignore")


# Make sure the extraction folder exists, then unpack the uploaded archive into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)
if not os.path.exists(ZIP_PATH):
    print(" data.zip not found. Upload it to the Colab working directory.")
else:
    with zipfile.ZipFile(ZIP_PATH, "r") as archive:
        archive.extractall(OUT_DIR)
    print(f" Extracted: {ZIP_PATH} → {OUT_DIR}")


def ocr_pil(img: Image.Image) -> str:
    """Run Tesseract OCR on a PIL image and return the recognised text.

    Best-effort: any exception raised by pytesseract is swallowed and an
    empty string is returned instead.
    """
    try:
        recognized = pytesseract.image_to_string(img)
    except Exception:
        recognized = ""
    return recognized

def pdf_text_or_ocr(pdf_path: str, dpi=220) -> str:
    """Extract the text of a PDF, using OCR for pages with little embedded text.

    Each page is first read with PyPDF2. A page whose stripped text is shorter
    than 50 characters is rasterised on its own at ``dpi`` and OCR'd instead.
    If the per-page pass raises, the whole document is rasterised and OCR'd
    as a fallback.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Resolution used when rasterising pages for OCR.

    Returns:
        The page texts joined with newlines; "" when nothing could be extracted.
    """
    text_all = []
    try:
        reader = PdfReader(pdf_path)
        for i, page in enumerate(reader.pages):
            t = (page.extract_text() or "").strip()
            if len(t) >= 50:
                text_all.append(t)
                continue
            # Too little embedded text: OCR that page only.
            pages = convert_from_path(pdf_path, dpi=dpi, first_page=i + 1, last_page=i + 1)
            if pages:
                text_all.append(ocr_pil(pages[0]))
    except Exception:
        # Fallback: OCR the whole document. Build the result separately so the
        # pages already collected above are not duplicated when it succeeds,
        # and are still returned when the fallback itself fails.
        try:
            fallback = [ocr_pil(p) for p in convert_from_path(pdf_path, dpi=dpi)]
        except Exception:
            pass  # best effort: keep whatever the per-page pass extracted
        else:
            text_all = fallback
    return "\n".join(text_all)

def ocr_any(path: Path) -> str:
    """Extract text from a PDF or an image file, chosen by file extension.

    PDFs go through ``pdf_text_or_ocr``; the listed image formats are opened
    with PIL and OCR'd. Any other extension, or an image that cannot be
    opened, yields an empty string.
    """
    p = str(path)
    low = p.lower()
    if low.endswith(".pdf"):
        return pdf_text_or_ocr(p)
    if low.endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")):
        try:
            # The context manager closes the underlying file once OCR is done;
            # previously the handle stayed open until garbage collection.
            with Image.open(p) as img:
                return ocr_pil(img)
        except Exception:
            return ""
    return ""


# Collect every PDF/image file under OUT_DIR (matched by extension, case-insensitive).
files = []
for root, dirs, fnames in os.walk(OUT_DIR):
    for fn in fnames:
        fp = Path(root) / fn
        if fp.suffix.lower() in (".pdf",".png",".jpg",".jpeg",".tif",".tiff",".bmp",".webp"):
            files.append(fp)

print(f" Found {len(files)} PDF/image files to process.")
# Write every non-empty extraction into one file, each preceded by a
# "--- FILE: <path> ---" header naming its source.
with open(OCR_TXT, "w", encoding="utf-8") as f:
    for i, fp in enumerate(files, 1):
        print(f"[{i}/{len(files)}] {fp}")
        txt = ocr_any(fp)
        if txt.strip():
            f.write(f"\n\n--- FILE: {fp} ---\n{txt}")

print(" Built ocr_output.txt")


# Re-read the combined text (falling back to latin1 if it does not decode as
# UTF-8) and collapse every run of whitespace into a single space.
try:
    raw = OCR_TXT.read_text(encoding="utf-8")
except UnicodeDecodeError:
    raw = OCR_TXT.read_text(encoding="latin1")
raw = re.sub(r"\s+", " ", raw).strip()
print(" OCR text length:", len(raw))


def chunk_text(t, chunk_size=2000, overlap=250):
    """Split ``t`` into overlapping character windows.

    Windows are ``chunk_size`` characters long and start every
    ``chunk_size - overlap`` characters (at least 1). Windows that contain
    only whitespace are dropped.
    """
    stride = max(1, chunk_size - overlap)
    pieces = (t[start:start + chunk_size] for start in range(0, len(t), stride))
    return [piece for piece in pieces if piece.strip()]


# Module-level retriever state, (re)assigned by rebuild_retriever().
chunks = []  # text chunks that were indexed
vectorizer = None  # fitted TfidfVectorizer, or None while no index exists
X = None  # TF-IDF matrix produced by fitting on `chunks`, or None

def rebuild_retriever(text=None, chunk_size=2000, overlap=250):
    """(Re)build the global TF-IDF index.

    Args:
        text: Text to index. When None, the contents of ``OCR_TXT`` are used
            (read as UTF-8, falling back to latin1).
        chunk_size: Chunk length in characters, forwarded to ``chunk_text``.
        overlap: Overlap between consecutive chunks, forwarded to ``chunk_text``.

    Side effects:
        Reassigns the globals ``chunks``, ``vectorizer`` and ``X``. When no
        chunk could be built, ``vectorizer`` and ``X`` are reset to None.
    """
    global chunks, vectorizer, X

    source = text
    if source is None:
        try:
            source = OCR_TXT.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            source = OCR_TXT.read_text(encoding="latin1")
    # Collapse whitespace so chunks are single-line strings.
    normalized = re.sub(r"\s+", " ", source).strip()

    chunks = chunk_text(normalized, chunk_size=chunk_size, overlap=overlap)
    if chunks:
        vectorizer = TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\b\w+\b", ngram_range=(1,2), min_df=1)
        X = vectorizer.fit_transform(chunks)
        print(f" Retriever ready. Chunks: {len(chunks)}")
        return

    vectorizer = None
    X = None
    print(" No chunks built (is OCR empty?).")

def retrieve(question, top_k=5):
    """Return up to ``top_k`` indexed chunks, most similar to ``question`` first.

    The index is rebuilt from ``OCR_TXT`` when it is missing or when the
    matrix row count no longer matches ``len(chunks)``. If no usable index
    exists after a rebuild, an empty list is returned.

    Args:
        question: Query text.
        top_k: Maximum number of chunks to return; values below 1 yield [].

    Returns:
        Chunks ordered by descending TF-IDF cosine similarity.
    """
    global chunks, vectorizer, X
    index_missing = vectorizer is None or X is None or not chunks
    if index_missing or X.shape[0] != len(chunks):
        rebuild_retriever()
        # A rebuild over empty text leaves the index unset; without this check
        # the shape-drift path would fall through to `None.transform(...)`.
        if vectorizer is None or X is None or not chunks:
            return []
    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, X)[0]
    # Clamp so a negative top_k cannot turn into a "drop the last N" slice.
    limit = max(0, min(top_k, len(chunks)))
    order = sims.argsort()[::-1][:limit]
    return [chunks[i] for i in order]


# Build the initial index from ocr_output.txt.
rebuild_retriever()


# Text-generation pipeline for Phi-2. When CUDA is available the model is loaded
# with device_map="auto" in float16; otherwise both options are passed as None.
model_name = "microsoft/phi-2"
has_cuda = torch.cuda.is_available()
generator = pipeline(
    "text-generation",
    model=model_name,
    device_map="auto" if has_cuda else None,
    torch_dtype=torch.float16 if has_cuda else None,
)


def ask_one(question, top_k=6, max_ctx_chars=2200, max_new_tokens=160):
    """Answer ``question`` from the chunks retrieved for it.

    The ``top_k`` retrieved chunks are joined with newlines, truncated to
    ``max_ctx_chars`` characters and placed in a prompt that tells the model
    to answer only from that context. Decoding is greedy.

    Returns:
        The text after the last "Answer:" marker in the generated output, or
        "Not enough evidence" when the context or that text is empty.
    """
    retrieved = retrieve(question, top_k=top_k)
    ctx = "\n".join(retrieved)[:max_ctx_chars]
    if ctx.strip() == "":
        return "Not enough evidence"

    prompt = "".join([
        "Context:\n", ctx,
        "\n\nQuestion: ", question,
        "\nAnswer clearly using only the context above. If not in context, say 'Not enough evidence'.\n\nAnswer:",
    ])
    generations = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        truncation=True,
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    full_text = generations[0]["generated_text"]
    # The output is split on the prompt's "Answer:" marker; keep what follows the last one.
    answer = full_text.rpartition("Answer:")[2].strip()
    return answer if answer else "Not enough evidence"


import re
from textwrap import shorten

def _windows_by_regex(text, patterns_any=None, patterns_all=None, window=360, max_hits=30):
    """Collect evidence snippets from ``text`` around regex matches.

    Args:
        text: Text to search.
        patterns_any: Regex patterns (case-insensitive). A snippet is cut
            around every match of any of them, extending ``window`` characters
            to each side. When empty or None, the text is instead cut into
            consecutive, non-overlapping tiles of ``2 * window`` characters.
        patterns_all: Regex patterns (case-insensitive) that must ALL match
            inside a snippet for it to be kept.
        window: Characters of context kept on each side of a match.
        max_hits: Maximum number of snippets returned.

    Returns:
        Unique, stripped snippets with newlines replaced by spaces, sorted so
        that snippets containing more number-like tokens (then longer ones)
        come first.
    """
    if not text:
        return []
    any_res = [re.compile(p, re.I) for p in (patterns_any or [])]
    all_res = [re.compile(p, re.I) for p in (patterns_all or [])]
    size = len(text)

    if any_res:
        bounds = (
            (max(0, m.start() - window), min(size, m.end() + window))
            for rex in any_res
            for m in rex.finditer(text)
        )
    else:
        # The old fallback pattern "." produced one ~2*window snippet per
        # character: quadratic memory and near-identical overlapping windows.
        # Tiling covers the same text once.
        tile = max(1, 2 * window)
        bounds = ((s, min(size, s + tile)) for s in range(0, size, tile))

    seen = set()
    uniq = []
    for s, e in bounds:
        snip = text[s:e].replace("\n", " ")
        if not all(ar.search(snip) for ar in all_res):
            continue
        key = snip.strip()
        if key not in seen:
            seen.add(key)
            uniq.append(key)

    number_re = re.compile(r"\$?\d[\d,\.]*%?")

    def score(sn):
        # Prefer snippets with more number-like tokens, then longer snippets.
        return (len(number_re.findall(sn)), len(sn))

    uniq.sort(key=score, reverse=True)
    return uniq[:max_hits]

def ask_forced(question, patterns_any=None, patterns_all=None, max_ctx_chars=2200, max_snips=8, max_new_tokens=160):
    """Answer ``question`` from regex-anchored snippets of the OCR text file.

    Reads ``OCR_TXT`` (UTF-8, falling back to latin1), gathers snippets with
    ``_windows_by_regex`` and prompts the model with the first ``max_snips``
    of them, truncated to ``max_ctx_chars`` characters. Decoding is greedy.

    Returns:
        The text after the last "Answer:" marker in the generated output, or
        "Not enough evidence" when no snippet matched or that text is empty.
    """
    try:
        corpus = OCR_TXT.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        corpus = OCR_TXT.read_text(encoding="latin1")

    snippets = _windows_by_regex(corpus, patterns_any, patterns_all, window=360, max_hits=30)
    if len(snippets) == 0:
        return "Not enough evidence"

    evidence = " \n\n---\n\n ".join(snippets[:max_snips])
    ctx = evidence[:max_ctx_chars]
    prompt = "".join([
        "Evidence:\n", ctx,
        "\n\nQuestion: ", question,
        "\nAnswer strictly from the evidence above. If not present, say 'Not enough evidence'.\n\nAnswer:",
    ])
    generations = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        truncation=True,
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    full_text = generations[0]["generated_text"]
    answer = full_text.rpartition("Answer:")[2].strip()
    return answer if answer else "Not enough evidence"

# Usage hints printed once setup has finished.
print("\n Setup complete.")
print("Use ask_one('your question') for semantic retrieval, or ask_forced('q', patterns_any=[...]) for regex-anchored evidence.")
print("Examples:")
print(" ask_one('Is the lending portfolio described as diversified or specialized?')")
print(" ask_forced('What is the reported EPS?', patterns_any=[r'\\bEPS\\b', r'earnings per share'])")


For example, I asked the question below, but the answer was drawn from the entire zip file (all of the PDFs) rather than from one specific document.

In [None]:
# Example: corpus-wide query; evidence windows are cut around any of these phrases in ocr_output.txt.
ask_forced(
  "What is the weighted average yield on total investments at amortized cost?",
  patterns_any=["weighted average yield", "total investments", "amortized cost"]
)

Earlier, the code was extracting answers by searching through all the PDFs in the uploaded ZIP file at once. That worked fine when I didn’t care where the answer came from, but sometimes I need to ask a question about a specific PDF. To do that, I built this extra helper code.


1. **Setup and cache** – The code imports 'os' and 'Path' from 'pathlib' for file handling. It also creates '_pdf_cache', which stores text that’s already been extracted from a PDF so you don’t run OCR on it again. This saves time if you ask multiple questions about the same file.
2. **find_pdf – locating the file**
You give it the filename, or just a fragment of it. It searches the extracted folder (OUT_DIR) for any PDF whose name contains that fragment.
If there’s more than one match, it picks the shortest file path (a simple way to pick a best match). If nothing matches, it returns None.
3. **pdf_text – getting the text from that PDF**
Given the full PDF path, it first checks '_pdf_cache' to see if the text is already stored.
If not, it calls the earlier 'ocr_any' function to extract the text (running OCR if needed), then stores and returns it.
4. **windows_by_keywords – pulling small evidence windows**
This is for “keyword mode” (no regex needed).
You give it the PDF’s text and a list of keywords.
It finds each keyword in the text and grabs a small snippet (by default, 200 characters before and after).
It removes duplicates, keeps them in the order found, and limits the result to max_hits snippets.
This keeps the evidence short and relevant.
5. **ask_from_pdf – the main function you’ll call**
This is what you use to ask your question about a single PDF.

Inputs:

pdf_fragment: part of the filename to pick the right PDF.

question: your natural-language question.

mode: "semantic" or "keywords".

keywords: required if mode="keywords".

Other parameters control chunk sizes, snippet sizes, number of results, etc.










In [None]:

import os
from pathlib import Path

_pdf_cache = {}  # str(pdf_path) -> text already extracted by pdf_text()

def find_pdf(name_fragment: str) -> Path | None:
    """Locate a PDF under ``OUT_DIR`` whose file name contains ``name_fragment``.

    Matching is case-insensitive and ignores surrounding whitespace in the
    fragment. When several PDFs match, the one with the shortest full path is
    returned (the first one found wins a tie). Returns None if nothing matches.
    """
    needle = name_fragment.lower().strip()
    candidates = [
        Path(folder) / fname
        for folder, _, fnames in os.walk(OUT_DIR)
        for fname in fnames
        if fname.lower().endswith(".pdf") and needle in fname.lower()
    ]
    if not candidates:
        return None
    return min(candidates, key=lambda candidate: len(str(candidate)))

def pdf_text(pdf_path: Path) -> str:
    """Return the extracted text of ``pdf_path``, memoised in ``_pdf_cache``.

    The cache key is ``str(pdf_path)``; extraction via ``ocr_any`` runs only
    on the first request for a given path.
    """
    cache_key = str(pdf_path)
    if cache_key not in _pdf_cache:
        _pdf_cache[cache_key] = ocr_any(pdf_path)
    return _pdf_cache[cache_key]

def windows_by_keywords(text: str, keywords: list[str], window=200, max_hits=8):
    """Collect short snippets of ``text`` around every keyword occurrence.

    Args:
        text: Text to search.
        keywords: Plain-text keywords, matched case-insensitively. Empty
            strings are ignored.
        window: Characters of context kept on each side of a match.
        max_hits: Maximum number of snippets returned.

    Returns:
        Unique snippets (newlines replaced by spaces, stripped) in the order
        they were found, limited to ``max_hits``.
    """
    if not text or not keywords:
        return []
    haystack = text.lower()
    seen, uniq = set(), []
    for kw in keywords:
        needle = kw.lower()
        if not needle:
            # str.find("") succeeds at every offset without advancing, which
            # used to spin forever on an empty keyword.
            continue
        pos = haystack.find(needle)
        while pos != -1:
            start = max(0, pos - window)
            end = min(len(text), pos + len(kw) + window)
            snippet = text[start:end].replace("\n", " ").strip()
            if snippet not in seen:
                seen.add(snippet)
                uniq.append(snippet)
            pos = haystack.find(needle, pos + len(kw))
    return uniq[:max_hits]

def ask_from_pdf(pdf_fragment: str,
                 question: str,
                 mode: str = "semantic",
                 keywords: list[str] | None = None,
                 *,
                 top_k: int = 8,
                 chunk_size: int = 1400,
                 overlap: int = 200,
                 window: int = 220,
                 max_snips: int = 6,
                 max_ctx_chars: int = 1800,
                 max_new_tokens: int = 200):
    """Answer ``question`` using the text of a single PDF.

    Args:
        pdf_fragment: Part of the PDF's file name, resolved with ``find_pdf``.
        question: Natural-language question.
        mode: "semantic" indexes the PDF and answers through ``ask_one``; any
            other value uses keyword mode.
        keywords: Keywords for keyword mode (required there).
        top_k: Number of chunks retrieved in semantic mode.
        chunk_size: Chunk length used for the per-PDF index in semantic mode.
        overlap: Chunk overlap used for the per-PDF index in semantic mode.
        window: Context characters around each keyword hit in keyword mode.
        max_snips: Maximum number of keyword snippets used as evidence.
        max_ctx_chars: Maximum characters of context/evidence in the prompt.
        max_new_tokens: Generation budget for the model.

    Returns:
        The model's answer, "Not enough evidence", or a short message when the
        PDF is not found, has no text, or keywords are missing.
    """
    global chunks, vectorizer, X

    pdf_path = find_pdf(pdf_fragment)
    if not pdf_path:
        return f"Not found: a PDF containing '{pdf_fragment}'."
    text = pdf_text(pdf_path)
    if not text.strip():
        return f"OCR produced no text for: {pdf_path.name}"

    if mode == "semantic":
        # ask_one() reads the shared retriever globals, so index this PDF only
        # temporarily and put the previous index back afterwards. Otherwise
        # every later ask_one()/retrieve() call would silently search just
        # this PDF instead of the full corpus.
        previous_index = (chunks, vectorizer, X)
        try:
            rebuild_retriever(text=text, chunk_size=chunk_size, overlap=overlap)
            return ask_one(question, top_k=top_k, max_ctx_chars=max_ctx_chars, max_new_tokens=max_new_tokens)
        finally:
            chunks, vectorizer, X = previous_index

    # Keyword mode: build the evidence from windows around literal keyword hits.
    if not keywords:
        return "Please provide keywords for keyword mode (e.g., ['weighted average yield','amortized cost'])."
    snips = windows_by_keywords(text, keywords, window=window, max_hits=max_snips)
    if not snips:
        return "Not enough evidence"
    ctx = " \n\n---\n\n ".join(snips)[:max_ctx_chars]
    prompt = (
        "Evidence:\n" + ctx +
        "\n\nQuestion: " + question +
        "\nAnswer strictly from the evidence above. If not present, say 'Not enough evidence'.\n\nAnswer:"
    )
    out = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        truncation=True,
        pad_token_id=generator.tokenizer.eos_token_id,
    )[0]["generated_text"]
    return (out.split("Answer:")[-1].strip() or "Not enough evidence")


In [None]:
# Example: semantic query restricted to the PDF whose file name contains this fragment.
ask_from_pdf(
  "Ares Capital Corporation_Earnings Call_2024-02-07",
  "Summarize how the weighted average yield changed compared with prior periods.",
  mode="semantic",
  top_k=10,
  max_ctx_chars=3500
)


I tried this example to see if it works with longer prompts.


In [None]:
# Example: a much longer question; the adjacent string literals are concatenated into one prompt.
ask_from_pdf(
  "Ares Capital Corporation_Earnings Call_2024-02-07_English.pdf",
  "Using only the information provided in this specific earnings call transcript, provide a detailed explanation "
  "of how the weighted average yield on total investments at amortized cost and on the company's debt and other "
  "income-producing securities has changed compared with prior reporting periods. Include the exact percentage "
  "values for each relevant period mentioned, explain the direction and magnitude of any changes, and summarize "
  "any factors or market conditions that management cited as contributing to these changes.",
  mode="semantic",
  top_k=10,
  max_ctx_chars=3500
)


This add-on is specifically for extracting information from charts and figures. It explicitly skips macOS junk files.
Here’s a step-by-step explanation in simple language:



### 1) Get the tools ready

It installs:

* transformers (for AI models)
* timm and pillow (for working with images)
* pymupdf (fast way to turn PDF pages into images)


### 2) Turn each PDF page into an image

* Looks through your uploads folder for all PDF files.
* Skips macOS junk files like __MACOSX.
* Opens each PDF and saves every page as a PNG image.
* Makes images twice as big (for better readability), but keeps them under 1400 pixels wide so they’re not too heavy.
* Stores them in '/content/chart_pages'.





### 3) Load two AI models for charts

* **BLIP** → Looks at an image and writes a short caption, often catching titles, labels, or keywords.
* **DePlot** → Looks at the image and tries to turn it into a table of numbers. It’s not perfect but can give useful data.


### 4) Pick only the pages that seem like charts

* The code checks BLIP’s caption for chart-related words like “chart”, “figure”, “EPS”, “revenue”, “yield”, “ratio”, etc.
* If the caption matches, it:

  1. Keeps the caption.
  2. Runs DePlot to get a table of numbers (if possible).

### 5) Save the chart info and add it to your search

* Everything found is written to 'charts_extracted.txt', including:

  1. The chart’s file name
  2. The BLIP caption
  3. Any table text from DePlot


This chart text is combined with your regular OCR text ('ocr_output.txt'), and the retriever is rebuilt so that searches now include chart info too.














In [None]:


# Extra dependencies for the chart pipeline (IPython shell commands).
# NOTE(review): "transformers>=4.41.0" is only a lower bound, whereas the first
# cell pins transformers to 4.41.2 — confirm that an upgrade here is acceptable.
!pip -q install "transformers>=4.41.0" pillow timm==0.9.16
!pip -q install pymupdf

import os, io, re, math
from pathlib import Path
from PIL import Image
import fitz  # PyMuPDF
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

CHART_IMG_DIR = Path("/content/chart_pages")  # rendered page PNGs are stored here
CHART_TXT = Path("charts_extracted.txt")  # output file for captions and DePlot tables
CHART_IMG_DIR.mkdir(parents=True, exist_ok=True)

# Use the first CUDA device when available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else -1
dev_str = "cuda:0" if device == 0 else "cpu"
print("Using device:", dev_str)


def is_sidecar(p: Path):
    """Return True for macOS archive junk.

    That is: anything inside a ``__MACOSX`` folder, ``._*`` files, and
    ``.DS_Store``.
    """
    if "/__MACOSX/" in str(p):
        return True
    fname = p.name
    return fname.startswith("._") or fname == ".DS_Store"

def render_pdf_pages_to_images(root="/content/uploads", max_width=1400):
    """Render every page of every PDF under ``root`` to a PNG in ``CHART_IMG_DIR``.

    macOS sidecar files and PDFs smaller than 2,000 bytes are skipped, as are
    PDFs that PyMuPDF cannot open. Pages are rendered at 2x zoom and scaled
    down to ``max_width`` pixels wide when larger.

    Args:
        root: Folder searched recursively for PDFs.
        max_width: Maximum width, in pixels, of the saved images.

    Returns:
        List of paths of the PNG files written.
    """
    saved = []
    pdfs = []
    for r, _, fns in os.walk(root):
        for fn in fns:
            p = Path(r) / fn
            if p.suffix.lower() != ".pdf" or is_sidecar(p):
                continue
            try:
                if p.stat().st_size >= 2_000:
                    pdfs.append(p)
            except Exception:
                continue
    pdfs.sort()

    # The zoom matrix is the same for every page, so build it once.
    zoom = 2.0
    mat = fitz.Matrix(zoom, zoom)
    for pdf in pdfs:
        try:
            doc = fitz.open(str(pdf))
        except Exception:
            continue
        try:
            for i in range(len(doc)):
                page = doc[i]
                pix = page.get_pixmap(matrix=mat, alpha=False)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

                if img.width > max_width:
                    ratio = max_width / img.width
                    img = img.resize((max_width, int(img.height * ratio)), Image.LANCZOS)
                # NOTE(review): the name uses only the PDF stem, so two PDFs with
                # the same stem in different folders write to the same PNG paths.
                out_path = CHART_IMG_DIR / f"{pdf.stem}_p{i+1}.png"
                img.save(out_path, "PNG", optimize=True)
                saved.append(out_path)
        finally:
            # Release the document even when rendering or saving a page raises.
            doc.close()
    print(f" Rendered {len(saved)} page images.")
    return saved


# BLIP captioning model and its processor, moved to the selected device.
blip_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_name).to(dev_str)

# DePlot (loaded through the Pix2Struct classes) and its processor, on the same device.
deplot_name = "google/deplot"
deplot_processor = Pix2StructProcessor.from_pretrained(deplot_name)
deplot_model = Pix2StructForConditionalGeneration.from_pretrained(deplot_name).to(dev_str)

def caption_image(img: Image.Image, max_new_tokens=64):
    """Generate a caption for ``img`` with the BLIP model.

    Returns the decoded caption with special tokens removed and surrounding
    whitespace stripped.
    """
    batch = blip_processor(images=img, return_tensors="pt").to(blip_model.device)
    generated = blip_model.generate(**batch, max_new_tokens=max_new_tokens)
    caption = blip_processor.decode(generated[0], skip_special_tokens=True)
    return caption.strip()

def chart_to_table_text(img: Image.Image, max_new_tokens=256):
    """Ask the DePlot model for the data table behind the chart in ``img``.

    Returns the decoded model output with special tokens removed and
    surrounding whitespace stripped.
    """
    batch = deplot_processor(
        images=img,
        text="Generate the data table of the figure below:",
        return_tensors="pt",
    ).to(deplot_model.device)
    generated = deplot_model.generate(**batch, max_new_tokens=max_new_tokens)
    table_text = deplot_processor.decode(generated[0], skip_special_tokens=True)
    return table_text.strip()


def looks_like_chart(text_caption: str):
    """Return True when a caption contains any chart-related keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word.
    """
    # All keys must be lowercase because the caption is lowercased before the
    # comparison; the former "LTV" entry could never match.
    keys = (
        "chart", "figure", "stock", "price", "volume", "eps", "surprise",
        "trend", "normalized", "revenue", "mm", "vs.", "index", "yield",
        "spread", "ltv", "loan-to-value", "nonaccrual", "coverage", "ratio",
    )
    t = text_caption.lower()
    return any(k in t for k in keys)

def process_charts(max_pages=120):
    """Caption page images and extract tables from those that look like charts.

    Uses the PNGs already present in ``CHART_IMG_DIR``; when there are none,
    the PDFs are rendered first. At most ``max_pages`` images are processed.

    Returns:
        A list of ``(image file name, caption, table text)`` tuples, one per
        page whose caption passed ``looks_like_chart``. The table text is ""
        when DePlot raised.
    """
    page_imgs = sorted(CHART_IMG_DIR.glob("*.png"))
    if not page_imgs:
        page_imgs = render_pdf_pages_to_images()
    total = min(len(page_imgs), max_pages)
    results = []
    for i, p in enumerate(page_imgs[:max_pages], 1):
        try:
            # convert() returns an independent RGB copy, so the source file can
            # be closed immediately instead of staying open for the whole run.
            with Image.open(p) as src:
                img = src.convert("RGB")
        except Exception:
            continue
        cap = caption_image(img)
        if looks_like_chart(cap):
            try:
                table_txt = chart_to_table_text(img)
            except Exception:
                table_txt = ""
            results.append((p.name, cap, table_txt))
        if i % 10 == 0:
            print(f"Processed {i}/{total} pages...")
    return results


# Caption up to 250 page images and keep those that look like charts.
chart_items = process_charts(max_pages=250)

# Write one section per chart page: a header, the caption and, when DePlot
# produced any text, the table.
with open(CHART_TXT, "w", encoding="utf-8") as f:
    for name, cap, table in chart_items:
        f.write(f"\n\n--- CHART_PAGE: {name} ---\n")
        f.write(f"[Caption]\n{cap}\n")
        if table and len(table.strip()) > 0:
            f.write("\n[DePlot table]\n")
            f.write(table)

print(f" Charts processed: {len(chart_items)}")
print(f" Saved chart text to: {CHART_TXT}")

# Merge into retriever
# Both files are read as UTF-8 with a latin1 fallback.
try:
    base_text = Path("ocr_output.txt").read_text(encoding="utf-8")
except UnicodeDecodeError:
    base_text = Path("ocr_output.txt").read_text(encoding="latin1")

try:
    chart_text = CHART_TXT.read_text(encoding="utf-8")
except UnicodeDecodeError:
    chart_text = CHART_TXT.read_text(encoding="latin1")

# Index the OCR text and the chart text together, replacing the global index.
merged = (base_text + "\n\n" + chart_text).strip()
rebuild_retriever(text=merged, chunk_size=2200, overlap=300)
print(" Retriever rebuilt with chart captions + DePlot tables.")
print("Now ask chart questions via ask_one(...).")


Here’s what it does step by step:

1) Get the most relevant text for your question

Uses the retrieve() function to grab the top_k chunks that are most related to your question. It joins them into one text block, but limits it to max_ctx_chars characters so it’s not too big.

2)  Keep only lines that have certain keywords

If require_keyword_in_line=True and you provide a keywords list, it will only keep the lines from that text that contain at least one of those keywords. This helps focus on exactly the part of the text you care about.

3) Decide what kinds of numbers to look for. If you don’t give your own regex patterns, it defaults to two patterns:

-Percentages

-Plain numbers

4) Search for matching numbers

Runs each regex pattern on the text and collects all matches into a list.

5) Return the first match

If it found any numbers, it returns:

answer=  the first number found.

evidence =the list of all matches found.

raw_context = the chunk of text where it found them.

If no numbers were found, it returns "Not enough evidence".

In [None]:
def ask_number(question, keywords=None, number_regexes=None, top_k=5, max_ctx_chars=2000, require_keyword_in_line=False):
    """Extract a numeric answer from retrieved context using regular expressions.

    Args:
        question: Query used to retrieve context chunks.
        keywords: Optional keywords; only used when ``require_keyword_in_line``
            is True.
        number_regexes: Regex patterns tried in order. Defaults to
            percentages first, then plain numbers.
        top_k: Number of chunks to retrieve.
        max_ctx_chars: Maximum characters of the joined context that are kept.
        require_keyword_in_line: When True (and keywords are given), keep only
            the context lines containing at least one keyword
            (case-insensitive).

    Returns:
        A dict with "answer" (the first match, or "Not enough evidence"),
        "evidence" (all matches in pattern order) and "raw_context" (the text
        that was searched).
    """
    import re

    context = "\n".join(retrieve(question, top_k=top_k))[:max_ctx_chars]

    if require_keyword_in_line and keywords:
        lowered = [kw.lower() for kw in keywords]
        kept = [
            line for line in context.split("\n")
            if any(kw in line.lower() for kw in lowered)
        ]
        context = "\n".join(kept)

    if number_regexes is None:
        number_regexes = [
            # No trailing \b: "%" followed by a space, punctuation or the end
            # of the text is not a word boundary, so the former "%\b" missed
            # ordinary percentages such as "12.5% ".
            r'\b\d+(?:\.\d+)?%',    # percentages
            r'\b\d+(?:\.\d+)?\b',   # plain numbers
        ]

    matches = []
    for pattern in number_regexes:
        matches.extend(re.findall(pattern, context))

    if matches:
        return {"answer": matches[0], "evidence": matches, "raw_context": context}

    return {"answer": "Not enough evidence", "evidence": [], "raw_context": context}



In [None]:
# Example: numeric extraction restricted to context lines that mention one of the keywords.
ask_number(
    "What was the weighted average LTV of all new investments this quarter?",
    keywords=["weighted average LTV", "LTV", "loan-to-value"],
    require_keyword_in_line=True
)

**Answer to the second task: developing a prompt**




In [None]:
# Example: long, multi-part prompt answered from a single PDF in semantic mode.
ask_from_pdf(
  "Ares Capital Corporation_Earnings Call_2024-02-07_English.pdf",
  "For the Ares Capital earnings call on February 7, 2024, check whether the lending portfolio is described as diversified or specialized. Include the strongest evidence, such as the number of portfolio companies, mix of industries or sectors, geographic reach, top-10 investment share, average deal size, and any mention of first/second-lien or unitranche loans. Summarize all the reasons management gives for following this approach, explain the benefits they mention, and note any performance results they connect to it, such as returns, yield, non-accrual rates, credit ratings, or EBITDA growth. Also point out any changes from previous periods and why they happened.",
  mode="semantic",
  top_k=12,
  max_ctx_chars=5000
)
