# Environment Setup

In [None]:
# Install necessary Python packages
!pip install -q python-docx google-api-python-client faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import necessary libraries

# Standard library
import os, io, re, json, math
import warnings
from json import JSONDecodeError
from dataclasses import dataclass
from typing import Any, Dict

# Third-party libraries
import numpy as np
import pandas as pd
import faiss
from tqdm import tqdm
from docx import Document

# Google Colab & Drive API
from google.colab import auth
from oauth2client.client import GoogleCredentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload, MediaFileUpload

# Drive Utilities and Global Configuration

In [None]:
# [Global Config + Drive Session]
# ---------------------------------------------------------------------
# Centralised configuration and reusable Google Drive client/session.
# - Define filenames and folder IDs used across the notebook.
# - Provide small, focused helpers for Drive lookups and downloads.
# - Cache the Drive service so we authenticate only once.
# ---------------------------------------------------------------------

# --- Global configuration -----------------------------------------------------
# Google Drive folder that is searched for every artefact named below.
DATABASE_FOLDER_ID      = "1AxyCDytXdUViBaOAT5dyQbOmbECIqS4C"
# JSONL file that load_database() downloads and reads into `database_df`.
DATABASE_FILENAME       = "database_df.jsonl"
# JSONL file that load_chunks() downloads and turns into Chunk objects.
CHUNKS_FILENAME         = "chunks.jsonl"
# NumPy .npy file loaded into `dense_embeddings`.
DEMBEDDINGS_FILENAME    = "dense_embeddings.npy"
# Serialised FAISS index that is deserialised into `faiss_index`.
FAISS_FILENAME          = "faiss.index"

# --- Drive session (cached) ---------------------------------------------------
drive = None

def _drive():
  """Return a cached Google Drive v3 client, authenticating on first use."""
  global drive
  if drive is None:
    auth.authenticate_user()
    creds = GoogleCredentials.get_application_default()
    drive = build("drive", "v3", credentials=creds)
  return drive

def _find_file(drive, folder_id: str, name: str):
  """Find a file by exact name within a folder; return its file ID or None."""
  q = f"'{folder_id}' in parents and trashed=false and name='{name}'"
  res = drive.files().list(q=q, fields="files(id,name)", pageSize=1).execute()
  files = res.get("files", [])
  return files[0]["id"] if files else None

def _download_file_to_path(drive, file_id: str, path: str):
  """Download a Drive file (by ID) to a local filesystem path."""
  req = drive.files().get_media(fileId=file_id)
  with open(path, "wb") as fh:
    downloader = MediaIoBaseDownload(fh, req)
    done = False
    while not done:
      _, done = downloader.next_chunk()

def _download_file_to_bytes(drive, file_id: str) -> bytes:
  """Download a Drive file (by ID) fully into memory and return raw bytes."""
  buf = io.BytesIO()
  req = drive.files().get_media(fileId=file_id)
  downloader = MediaIoBaseDownload(buf, req)
  done = False
  while not done:
    _, done = downloader.next_chunk()
  return buf.getvalue()

# Load Core Database

In [None]:
# [Load Database (JSONL Only) into database_df]
# ---------------------------------------------------------------------
# Loads a JSONL dataset from Google Drive into a pandas DataFrame.
# - Robust JSONL reader with a fallback line-by-line parser.
# - Coerces the `year` and `quarter` columns to nullable integers.
# - Returns a DataFrame sorted by filename (descending) with at least:
#   ["filename", "content", "preprocessed"].
# ---------------------------------------------------------------------

# -----------------------
# JSONL I/O
# -----------------------
def _read_jsonl_df(path: str) -> pd.DataFrame:
  """Read a JSONL file into a DataFrame (fallback parser on error).

  Ensures the columns ["filename", "content", "preprocessed"] exist.
  """
  if not os.path.exists(path):
    return pd.DataFrame(columns=["filename", "content", "preprocessed"])

  # Fast path: pandas JSONL reader
  try:
    df = pd.read_json(path, orient="records", lines=True, dtype=False)
  except ValueError:
    # Fallback: line-by-line robust reader
    rows = []
    with open(path, "r", encoding="utf-8") as f:
      for line in f:
        s = line.strip()
        if not s:
          continue
        try:
          rows.append(json.loads(s))
        except JSONDecodeError:
          continue
    df = pd.DataFrame(rows)

  # Ensure required columns exist
  for col in ("filename", "content", "preprocessed"):
    if col not in df.columns:
      df[col] = None

  return df

# -----------------------
# Coerce temporal fields
# -----------------------
def _coerce_temporal_fields(df: pd.DataFrame) -> pd.DataFrame:
  """Normalize year and quarter columns in-place to numeric types."""
  if df.empty:
    return df

  df = df.copy()

  df["year"] = (
    pd.to_numeric(df.get("year"), errors="coerce")
      .astype("Int64")
  )

  quarter_numbers = (
    df.get("quarter")
      .astype(str)
      .str.extract(r"Q([1-4])", expand=False)
  )
  df["quarter"] = pd.to_numeric(quarter_numbers, errors="coerce").astype("Int64")

  return df

# -----------------------
# Load database
# -----------------------
def load_database():
  """Fetch the database JSONL from Drive and return it as a DataFrame.

  Steps: look the file up in the configured Drive folder, download it to
  /content, parse it, coerce the temporal columns and, when the frame is
  non-empty and has a "filename" column, sort by filename in descending order
  with a fresh 0..n-1 index.

  Raises:
    FileNotFoundError: if the database file is not present in the folder.
  """
  service = _drive()

  remote_id = _find_file(service, DATABASE_FOLDER_ID, DATABASE_FILENAME)
  if not remote_id:
    raise FileNotFoundError(f"'{DATABASE_FILENAME}' not found in folder {DATABASE_FOLDER_ID}.")

  local_copy = f"/content/{DATABASE_FILENAME}"
  _download_file_to_path(service, remote_id, local_copy)
  frame = _coerce_temporal_fields(_read_jsonl_df(local_copy))

  # Nothing to sort for an empty frame or one without a filename column.
  if frame.empty or "filename" not in frame.columns:
    return frame

  ordered = frame.sort_values(by="filename", ascending=False)
  return ordered.reset_index(drop=True)

# -----------------------
# Entrypoint
# -----------------------
# Runs when the cell executes: downloads and parses the database file and binds
# the resulting DataFrame to the notebook-level name `database_df`.
database_df = load_database()

# Load Chunk Artefacts

In [None]:
# [Load Chunk Artefacts]
# ---------------------------------------------------------------------
# Loads precomputed chunks from Google Drive (JSONL) and materialises
# them as a list of Chunk dataclass instances.
# ---------------------------------------------------------------------

@dataclass
class Chunk:
  """Atomic transcript unit for retrieval/embedding.

  Instances are created by `load_chunks`, which fills every field from the key
  of the same name in one row of the chunks JSONL file.
  """
  # Identifier of the chunk, taken from the row's "chunk_id".
  chunk_id: str
  # Chunk text; "" when the row provides none.
  text: str
  # Taken from the row's "filename"; "" when absent.
  # NOTE(review): presumably the source document's file name - confirm.
  filename: str
  # Taken from the row's "doc_id"; "" when absent.
  doc_id: str
  # Taken from the row's "speaker"; "" when absent.
  speaker: str
  # Taken from the row's "row_index"; -1 when the row has no value.
  row_index: int
  # Taken from the row's "block_index"; -1 when the row has no value.
  block_index: int
  # Free-form metadata dict. `load_chunks` reads its optional "embedding_row"
  # entry to decide the position of the chunk in the returned list.
  metadata: Dict[str, Any]

def _as_int(value):
  if isinstance(value, (int, float)) and not math.isnan(value):
    return int(value)
  if isinstance(value, str) and value.isdigit():
    return int(value)
  return None

def _is_missing_value(value) -> bool:
  """Return True for None and float NaN (pandas' filler for absent row values)."""
  return value is None or (isinstance(value, float) and math.isnan(value))

def _chunk_from_row(row: dict):
  """Build a Chunk from one JSONL row; return (embedding_row or None, chunk)."""
  meta = row.get("metadata")
  if isinstance(meta, str):
    # Metadata may arrive as a JSON-encoded string; undecodable text is dropped.
    try:
      meta = json.loads(meta)
    except ValueError:
      meta = {}
  if not isinstance(meta, dict):
    # None, NaN or non-object JSON: normalise before any `.get` is attempted.
    meta = {}

  def _text(key: str):
    # Missing values (None/NaN) and falsy values collapse to "".
    value = row.get(key)
    return "" if _is_missing_value(value) else (value or "")

  def _index(key: str) -> int:
    # Missing values (None/NaN) map to -1; int(NaN) would raise otherwise.
    value = row.get(key)
    return -1 if _is_missing_value(value) else int(value)

  chunk = Chunk(
    chunk_id    = _text("chunk_id"),
    text        = _text("text"),
    filename    = _text("filename"),
    doc_id      = _text("doc_id"),
    speaker     = _text("speaker"),
    row_index   = _index("row_index"),
    block_index = _index("block_index"),
    metadata    = meta,
  )
  return _as_int(meta.get("embedding_row")), chunk

def load_chunks() -> list[Chunk]:
  """Download the chunks JSONL from Drive and return it as a list of Chunks.

  Ordering of the result:
    1. chunks whose metadata has a non-negative integer "embedding_row",
       sorted by that value (gaps are closed; if two chunks share a value,
       the one appearing later in the file wins);
    2. all remaining chunks, in file order.

  Raises:
    FileNotFoundError: if the chunks file is not present in the Drive folder.
  """
  local_tmp = f"/content/{CHUNKS_FILENAME}"
  service = _drive()
  file_id = _find_file(service, DATABASE_FOLDER_ID, CHUNKS_FILENAME)
  if not file_id:
    raise FileNotFoundError(f"{CHUNKS_FILENAME} not found in Drive folder")

  _download_file_to_path(service, file_id, local_tmp)
  df = _read_jsonl_df(local_tmp)

  # A dict keyed by embedding_row gives the same order as a dense slot list
  # without allocating max(embedding_row) + 1 entries for sparse indices.
  by_embedding_row = {}
  tail = []
  for row in df.to_dict("records"):
    idx, chunk = _chunk_from_row(row)
    if idx is not None and idx >= 0:
      by_embedding_row[idx] = chunk
    else:
      tail.append(chunk)

  return [by_embedding_row[i] for i in sorted(by_embedding_row)] + tail

# Entrypoint
# Runs when the cell executes: downloads the chunks file and binds the resulting
# list of Chunk objects to the notebook-level name `chunks`.
chunks = load_chunks()

# Load Dense Retrieval Artefacts

In [None]:
# [Load Dense Embeddings from Drive]
# ---------------------------------------------------------------------
# Downloads the dense embedding matrix (.npy) from Google Drive and loads
# it into memory as a NumPy array for use in dense retrieval or FAISS search.
# ---------------------------------------------------------------------

# Look the embeddings file up in the shared Drive folder and stop with a clear
# error when it is absent.
file_id = _find_file(_drive(), DATABASE_FOLDER_ID, DEMBEDDINGS_FILENAME)
if not file_id:
    raise FileNotFoundError(f"{DEMBEDDINGS_FILENAME} not found in folder {DATABASE_FOLDER_ID}")

# The .npy payload is downloaded fully into memory and parsed from a BytesIO
# buffer, so no local file is written. allow_pickle=False makes np.load refuse
# pickled object arrays instead of unpickling downloaded data.
# NOTE(review): nothing here checks the array's row count against `chunks`;
# presumably row i belongs to the chunk with metadata "embedding_row" == i - confirm.
dense_embeddings = np.load(io.BytesIO(_download_file_to_bytes(_drive(), file_id)), allow_pickle=False)

In [None]:
# [Load FAISS Index from Drive]
# ---------------------------------------------------------------------
# Downloads the FAISS index binary from Google Drive and deserialises it
# into a FAISS index object for dense retrieval.
# ---------------------------------------------------------------------

# Look the serialised index up in the shared Drive folder and stop with a clear
# error when it is absent. Note that this rebinds the notebook-level `file_id`.
file_id = _find_file(_drive(), DATABASE_FOLDER_ID, FAISS_FILENAME)
if not file_id:
    raise FileNotFoundError(f"{FAISS_FILENAME} not found in Drive folder")

# The index is downloaded into memory, viewed as a flat uint8 array and handed
# to faiss.deserialize_index; no local file is written.
payload = np.frombuffer(_download_file_to_bytes(_drive(), file_id), dtype=np.uint8)
faiss_index = faiss.deserialize_index(payload)