## Data Preparation

In [1]:
import io
import os
import re
import gzip
import pickle
import zipfile
import requests
from requests.exceptions import RequestException
from typing import Iterable, Any, List, Dict, Optional
from pathlib import Path
import shutil

#### Download multivariate time-series data from GitHub

GitHub repo: https://github.com/laiguokun/multivariate-time-series-data/tree/master

Information about datasets: 
* Electricity consumption: The raw dataset is in https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014 (electricity consumption in kWh recorded every 15 minutes from 2011 to 2014). After processing, the dataset records the hourly electricity consumption of 321 clients from 2012 to 2014.
* Traffic Usage: The raw data is in http://pems.dot.ca.gov. After processing, the dataset is a collection of 24 months (2015-2016) of hourly data from the California Department of Transportation. The data describes the road occupancy rates (between 0 and 1) measured by different sensors on San Francisco Bay area freeways.
* Solar Energy: The raw data is in http://www.nrel.gov/grid/solar-power-data.html. After processing, the dataset contains solar power production records for the year 2006, sampled every 10 minutes from 137 PV plants in Alabama.
* Exchange Rate: A collection of daily exchange rates of eight countries (Australia, Britain, Canada, Switzerland, China, Japan, New Zealand and Singapore), ranging from 1990 to 2016.

In [2]:
class GithubTxtGzDownloader:
    """
    Download all `.txt.gz` files from a GitHub repository (any subdirectory)
    and save them under the same subfolder structure in the current working dir.
    """

    def __init__(
        self,
        repo_owner: str,
        repo_name: str,
        branch: Optional[str] = None,  # if None, try 'main' then 'master'
        save_root: Path | str = ".",
    ):
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.branch = branch
        self.save_root = Path(save_root)

    def _zip_url(self, branch: str) -> str:
        return f"https://codeload.github.com/{self.repo_owner}/{self.repo_name}/zip/refs/heads/{branch}"

    def _get_repo_zip_bytes(self) -> bytes:
        branches_to_try = [self.branch] if self.branch else ["main", "master"]
        if self.branch and self.branch not in branches_to_try:
            branches_to_try.insert(0, self.branch)

        last_status = None
        for br in branches_to_try:
            url = self._zip_url(br)
            resp = requests.get(url, timeout=60)
            last_status = resp.status_code
            if resp.status_code == 200:
                self.branch = br  # remember the branch that worked
                return resp.content

        raise RuntimeError(
            f"Failed to download repository zip. "
            f"Tried branches {branches_to_try}, last HTTP status: {last_status}"
        )

    @staticmethod
    def _strip_top_level_dir(zip_path: str) -> str:
        """
        Remove the first path component added by GitHub zip (e.g. 'repo-branch/').
        'repo-branch/sub/dir/file.txt.gz' -> 'sub/dir/file.txt.gz'
        """
        parts = zip_path.split("/", maxsplit=1)
        return parts[1] if len(parts) > 1 else ""

    def download(self) -> int:
        """
        Download and save all .txt.gz files. Returns the number of files saved.
        """
        zip_bytes = self._get_repo_zip_bytes()
        saved = 0

        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
            for info in zf.infolist():
                # Skip directories
                if info.is_dir():
                    continue

                # Normalize path to drop top-level folder
                rel_path = self._strip_top_level_dir(info.filename)
                if not rel_path:
                    continue

                # Only keep .txt.gz files
                if not rel_path.lower().endswith(".txt.gz"):
                    continue

                # Build local path (mirror subdirectories)
                local_path = self.save_root / rel_path
                local_path.parent.mkdir(parents=True, exist_ok=True)

                # Copy raw bytes (do NOT decode)
                with zf.open(info, "r") as src, open(local_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)

                saved += 1

        return saved

In [3]:
# Fetch every .txt.gz dataset from the laiguokun/multivariate-time-series-data
# GitHub repo and mirror its folder layout under the current working directory.
downloader = GithubTxtGzDownloader(
    repo_owner="laiguokun",
    repo_name="multivariate-time-series-data",
    branch="master",
    save_root=".",
)
count = downloader.download()
print(f"Saved {count} .txt.gz files.")

Saved 4 .txt.gz files.


#### Download textual data from arXiv

Paper: "Connecting the Dots: Multivariate Time Series Forecasting with Graph Neural Networks" by Zonghan Wu, Shirui Pan, Guodong Long, Jing Jiang, Xiaojun Chang, Chengqi Zhang. May 2020. Available at arXiv:2005.11650 (url: https://arxiv.org/abs/2005.11650).

In [4]:
def _extract_arxiv_id(s: str) -> Optional[str]:
    """
    Extract an arXiv ID from a URL or return the ID if already given.
    Supports new-style IDs (e.g., 2005.11650 or 2005.11650v2).
    Also handles arxiv.org/abs/<id> and arxiv.org/pdf/<id>.pdf.
    """
    s = (s or "").strip()

    # From URL: /abs/<id> or /pdf/<id>.pdf
    m = re.search(r"arxiv\.org/(?:pdf|abs)/([^\s/#?]+)", s, flags=re.IGNORECASE)
    if m:
        raw = m.group(1)
        if raw.lower().endswith(".pdf"):
            raw = raw[:-4]
        return raw

    # New-style ID directly
    if re.fullmatch(r"\d{4}\.\d{4,5}(?:v\d+)?", s):
        return s

    # (Optional) very light old-style support like cs/0704xxx(vN)
    if re.fullmatch(r"[a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?", s, flags=re.IGNORECASE):
        return s

    return None


def fetch_arxiv_textual_content(arxiv_url_or_id: str) -> Optional[str]:
    """
    Fetch readable text for an arXiv paper by resolving to ar5iv (HTML) and
    passing that through Jina Reader. Falls back to the arXiv ABS page.

    Args:
        arxiv_url_or_id: e.g. 'https://arxiv.org/pdf/2005.11650',
                         'https://arxiv.org/abs/2005.11650v2', or '2005.11650'

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or None when every candidate URL failed or returned an empty body.

    Raises:
        ValueError: if the argument is not a non-empty string, or no arXiv ID
            can be extracted from it.
    """
    if not arxiv_url_or_id or not isinstance(arxiv_url_or_id, str):
        raise ValueError("The 'arxiv_url_or_id' parameter must be a non-empty string.")

    arxiv_id = _extract_arxiv_id(arxiv_url_or_id)
    if not arxiv_id:
        raise ValueError("Could not extract a valid arXiv ID from the input.")

    jina_base = "https://r.jina.ai/"
    ar5iv_url = f"https://ar5iv.org/html/{arxiv_id}"
    abs_url = f"https://arxiv.org/abs/{arxiv_id}"

    # Try ar5iv first (best structured text), then fallback to ABS page
    for target in (ar5iv_url, abs_url):
        try:
            resp = requests.get(jina_base + target, timeout=20)
            resp.raise_for_status()
        except RequestException as e:
            print(f"Error fetching via Jina '{target}': {e}")
            continue

        # errors="replace" means decoding itself cannot raise.
        text = resp.content.decode("utf-8", errors="replace")
        if text.strip():
            return text
        # An empty body is not usable; move on to the next candidate.
        print(f"Empty response for '{target}'.")

    return None

In [5]:
# Fetch the paper text once; `raw_text` is None if both the ar5iv and the abs lookups fail.
raw_text = fetch_arxiv_textual_content("https://arxiv.org/pdf/2005.11650")

In [6]:
# Show only the beginning of the paper: printing the full text floods the
# notebook output, and a failed fetch leaves `raw_text` as None.
print(raw_text[:1000] if raw_text else "No text was fetched.")

Title: Connecting the Dots: Multivariate Time Series Forecasting with Graph Neural Networks

URL Source: https://ar5iv.org/html/2005.11650

Markdown Content:
(2020)

###### Abstract.

Modeling multivariate time series has long been a subject that has attracted researchers from a diverse range of fields including economics, finance, and traffic. A basic assumption behind multivariate time series forecasting is that its variables depend on one another but, upon looking closely, it’s fair to say that existing methods fail to fully exploit latent spatial dependencies between pairs of variables. In recent years, meanwhile, graph neural networks (GNNs) have shown high capability in handling relational dependencies. GNNs require well-defined graph structures for information propagation which means they cannot be applied directly for multivariate time series where the dependencies are not known in advance. In this paper, we propose a general graph neural network framework designed specifically

In [7]:
# Remove mathematical content
_MATH_ENV_NAMES = [
    "equation", "equation*", "align", "align*", "aligned",
    "gather", "gather*", "multline", "multline*", "eqnarray", "eqnarray*",
    "split", "cases", "array", "pmatrix", "bmatrix", "vmatrix", "Vmatrix"
]

def strip_equations(text: str) -> str:
    """
    Remove (La)TeX-style math from plain text:
      - \\begin{equation}...\\end{equation}, align, gather, etc.
        (including the starred variants such as equation*)
      - $$...$$ display math
      - \\[ ... \\] display math
      - \\( ... \\) inline math
      - $...$ inline math

    Any backslash or dollar sign left over afterwards is replaced by a space,
    so the result contains neither character.

    Args:
        text: Input text; falsy input (empty string, None) is returned unchanged.

    Returns:
        Text with math removed and whitespace collapsed.
    """
    if not text:
        return text

    # 1) Remove \begin{...} ... \end{...} blocks for common math envs.
    #    The name is escaped so the '*' in starred environments is matched
    #    literally instead of acting as a regex quantifier.
    for env in _MATH_ENV_NAMES:
        name = re.escape(env)
        pattern = rf"\\begin\{{{name}\}}.*?\\end\{{{name}\}}"
        text = re.sub(pattern, " ", text, flags=re.DOTALL)

    # 2) Remove display math: $$ ... $$
    text = re.sub(r"\$\$.*?\$\$", " ", text, flags=re.DOTALL)

    # 3) Remove display math: \[ ... \]
    text = re.sub(r"\\\[(.*?)\\\]", " ", text, flags=re.DOTALL)

    # 4) Remove inline math: \( ... \)
    text = re.sub(r"\\\((.*?)\\\)", " ", text, flags=re.DOTALL)

    # 5) Remove inline math: $ ... $   (avoid greediness / nested $)
    #    This matches a single pair of $...$ without spanning another $
    text = re.sub(r"\$(?:\\.|[^$\\])+\$", " ", text, flags=re.DOTALL)

    # 6) Clean up stray math delimiters if any remain
    text = re.sub(r"[\\$]", " ", text)

    # 7) Collapse whitespace/newlines
    text = re.sub(r"\s+\n\s+", "\n\n", text)         # keep paragraph breaks
    text = re.sub(r"[ \t]{2,}", " ", text)           # collapse long spaces
    text = re.sub(r"\n{3,}", "\n\n", text)           # max two newlines in a row
    return text.strip()


def prepare_text_for_llm(raw_text: str) -> str:
    """
    Clean fetched text before it enters the LLM pipeline.

    This is a single entry point that delegates to `strip_equations`, which
    removes math and tidies whitespace.
    """
    return strip_equations(raw_text)

In [8]:
def sliding_window(
        seq: Iterable[Any],
        size: int,
        step: int
    ) -> List[Dict[str, Any]]:
    """
    Create overlapping chunks from a sequence using a sliding window approach.

    Windows start at 0, step, 2*step, ... and iteration stops as soon as a
    window reaches the end of the input, so no chunk is entirely contained in
    the previous one. The last chunk may be shorter than `size`.

    Args:
        seq: The input to be chunked. Strings and lists are sliced directly;
            an iterable without a length (e.g. a generator) is first
            materialised into a list.
        size (int): The size of each chunk/window.
        step (int): The step size between consecutive windows.

    Returns:
        list of dicts: [{'start': int, 'content': <slice>}, ...]
        An empty input yields an empty list.

    Raises:
        ValueError: if `size` or `step` is not positive.

    Example:
        >>> sliding_window("hello world", size=5, step=3)
        [{'start': 0, 'content': 'hello'}, {'start': 3, 'content': 'lo wo'}, {'start': 6, 'content': 'world'}]
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    # Slicing needs a real sequence; materialise lazy iterables up front.
    if not hasattr(seq, "__len__"):
        seq = list(seq)

    n = len(seq)
    if n == 0:
        return []

    result = []
    for i in range(0, n, step):
        result.append({'start': i, 'content': seq[i:i + size]})
        # Once a window touches the end, later windows would only repeat
        # part of this one.
        if i + size >= n:
            break

    return result

# TODO: consider paragraph-aware chunking instead of fixed-size character windows.
def chunk_documents(
        documents: Iterable[Dict[str, Any]],
        size: int = 2000,
        step: int = 1000,
        content_field_name: str = 'content',
        min_chars: int = 0,
) -> List[Dict[str, Any]]:
    """
    Split documents into overlapping chunks while preserving metadata.

    Args:
        documents: Dicts holding the text under `content_field_name` plus any
            metadata. Documents without that key are skipped; a None/empty
            value produces no chunks.
        size: Window size passed to `sliding_window`.
        step: Window step passed to `sliding_window`.
        content_field_name: Key of the text field in each input document.
        min_chars: String chunks shorter than this are dropped (useful for
            tiny tail chunks or stubs left after math removal). 0 keeps all.

    Returns:
        list of chunk dicts: 'start' (offset into the document text) and
        'content' (the chunk), followed by the document's metadata. If a
        metadata key is itself named 'start' or 'content', the chunk's own
        value is kept.
    """
    results: List[Dict[str, Any]] = []

    for doc in documents:
        if content_field_name not in doc:
            continue
        metadata = {key: value for key, value in doc.items() if key != content_field_name}
        doc_content = doc[content_field_name] or ""

        for ch in sliding_window(doc_content, size=size, step=step):
            piece = ch['content']
            if isinstance(piece, str) and len(piece) < min_chars:
                continue
            # Merge metadata without clobbering the chunk's own keys.
            for key, value in metadata.items():
                ch.setdefault(key, value)
            results.append(ch)

    return results

In [9]:
def save_pickle(obj: Any, filename: str = "docs.pkl", compress: Optional[bool] = None) -> Path:
    """
    Save any Python object to a pickle file.

    Args:
        obj: Object to serialise.
        filename: Target file name (str or path-like). Relative names are
            resolved against the current working directory; an absolute path
            is used as given. Missing parent directories are created.
        compress: True/False forces gzip on/off. If None, gzip is enabled
            automatically when the file name ends with '.gz'.

    Returns:
        The path the object was written to.
    """
    name = os.fspath(filename)  # accept Path objects as well as plain strings
    path = Path.cwd() / name
    path.parent.mkdir(parents=True, exist_ok=True)

    use_compress = compress if compress is not None else name.endswith(".gz")
    opener = gzip.open if use_compress else open
    with opener(path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved {type(obj).__name__} to {path}")
    return path

def load_pickle(filename: str) -> Any:
    """
    Load a pickle file and return the stored object.

    Relative names are resolved against the current working directory.
    Gzip compression is detected from the file's first two bytes rather than
    its extension, so files written with `compress=True` under a name without
    '.gz' load correctly as well.

    WARNING: Only load pickles from sources you trust; unpickling can execute
    arbitrary code.
    """
    path = Path.cwd() / os.fspath(filename)

    # Every gzip stream starts with the magic bytes 0x1f 0x8b.
    with open(path, "rb") as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"

    opener = gzip.open if is_gzip else open
    with opener(path, "rb") as f:
        return pickle.load(f)

In [12]:
# Stop here if the fetch cell produced nothing; otherwise an empty document
# would be pickled and chunked without any warning.
if not raw_text:
    raise RuntimeError("No arXiv text available; re-run the fetch cell before preparing documents.")

prepared_text = prepare_text_for_llm(raw_text)

# Metadata must describe the paper actually fetched (arXiv:2005.11650).
docs = [{
        "content": prepared_text,
        "source": "arxiv:2005.11650",
        "title": "Connecting the Dots: Multivariate Time Series Forecasting with Graph Neural Networks",
    }]

# Pickle save
save_pickle(docs, "arXiv_text.pkl")

# To reload the cached text later instead of fetching again:
#docs = load_pickle("arXiv_text.pkl")
#docs[0]['content'] # text

Saved list to /workspaces/Agentic-AI-course/Project/arXiv_text.pkl


PosixPath('/workspaces/Agentic-AI-course/Project/arXiv_text.pkl')

In [11]:
# Character-based windows with 50% overlap; chunks under 200 characters are
# dropped. Tune these values to your model/pipeline.
chunks = chunk_documents(
    docs,
    size=2000,
    step=1000,
    content_field_name="content",
    min_chars=200,
)

# Each entry in `chunks` is a dict ready for indexing/RAG
print(f"{len(chunks)} chunks; first content preview:\n")
if len(chunks) > 0:
    print(chunks[0]["content"][:600])

75 chunks; first content preview:

Title: Connecting the Dots: Multivariate Time Series Forecasting with Graph Neural Networks

URL Source: https://ar5iv.org/html/2005.11650

Markdown Content:
(2020)

###### Abstract.

Modeling multivariate time series has long been a subject that has attracted researchers from a diverse range of fields including economics, finance, and traffic. A basic assumption behind multivariate time series forecasting is that its variables depend on one another but, upon looking closely, it’s fair to say that existing methods fail to fully exploit latent spatial dependencies between pairs of variables. In
