In [1]:
import dspy
import os

# Provider credentials are read from the environment so no secret is stored in
# the notebook. ANTHROPIC_API_KEY is read here but not used by the cells below.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")

# DeepSeek-R1 via Together AI is registered as the default LM for DSPy.
# Response caching is switched off, so every call reaches the provider.
lm = dspy.LM(
    "together_ai/deepseek-ai/DeepSeek-R1",
    api_key=TOGETHER_API_KEY,
    temperature=0.1,
    max_tokens=2500,
    stop=None,
    cache=False,
)
dspy.configure(lm=lm)

In [2]:
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub only when a non-empty token is
# available in the environment; otherwise just tell the user how to enable it.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("HF_TOKEN not found in environment. Set this to authenticate with Hugging Face Hub.")
else:
    login(token=HF_TOKEN)
    print("Logged in to Hugging Face Hub")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in to Hugging Face Hub


In [None]:
import dspy
from dspy.datasets import HotPotQA
import numpy as np
from collections import defaultdict

dataset = HotPotQA(train_seed=1, train_size=50, eval_seed=2, dev_size=50)

# Build a title-keyed corpus from the training split. If the same title shows
# up more than once, the later paragraph replaces the earlier one.
# NOTE(review): this relies on every training example exposing `titles` and
# `paragraphs`; this cell has no recorded execution, so confirm those fields
# exist on the examples produced by the installed dspy HotPotQA loader.
corpus = {}
for example in dataset.train:
    corpus.update(
        (f"{title}", {"title": title, "text": text})
        for title, text in zip(example.titles, example.paragraphs)
    )

class SimpleRetriever:
    """Keyword-overlap retriever backed by an in-memory inverted index.

    A document's score is the number of distinct query tokens that also occur
    in its text. Tokens are lower-cased runs of alphanumeric characters, so
    punctuation glued to a word (``"1921?"`` vs. ``"1921."``) does not prevent
    a match. Documents with equal scores are returned in corpus order, which
    keeps results reproducible across kernel restarts.
    """

    def __init__(self, corpus):
        """Index ``corpus``.

        Args:
            corpus: Mapping of doc id -> dict holding a ``"text"`` entry and,
                optionally, a ``"title"`` entry.
        """
        self.corpus = corpus
        self.doc_ids = list(corpus.keys())
        # Position of each document in the corpus; used as a stable tie-breaker.
        self._position = {doc_id: pos for pos, doc_id in enumerate(self.doc_ids)}
        self.index = self._build_index()

    @staticmethod
    def _tokenize(text):
        """Return the set of lower-cased alphanumeric tokens found in ``text``."""
        lowered = (text or "").lower()
        # Turn every non-alphanumeric character into a separator before splitting.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in lowered)
        return set(cleaned.split())

    def _build_index(self):
        """Build the inverted index: token -> list of doc ids containing it."""
        index = defaultdict(list)
        for doc_id, doc in self.corpus.items():
            for word in self._tokenize(doc.get("text", "")):
                index[word].append(doc_id)
        return index

    def __call__(self, query, k=3):
        """Return up to ``k`` best-matching documents for ``query``.

        Args:
            query: Free-text query string.
            k: Maximum number of results. ``None`` returns every match;
                zero or a negative value returns an empty list.

        Returns:
            A list of dicts with keys ``"id"``, ``"text"`` and ``"title"``,
            ordered by descending score, then by corpus order.
        """
        scores = defaultdict(int)
        for word in self._tokenize(query):
            for doc_id in self.index.get(word, ()):
                scores[doc_id] += 1

        # Ties are broken by corpus position rather than by the (hash-dependent)
        # order in which query tokens happened to be visited.
        ranked = sorted(scores, key=lambda d: (-scores[d], self._position[d]))
        top_ids = ranked if k is None else ranked[:max(k, 0)]

        return [
            {
                "id": doc_id,
                "text": self.corpus[doc_id]["text"],
                "title": self.corpus[doc_id].get("title", ""),
            }
            for doc_id in top_ids
        ]

# Register the keyword retriever, together with the LM, as the DSPy defaults.
retriever = SimpleRetriever(corpus)
dspy.configure(lm=lm, rm=retriever)

# Smoke test: print the first 100 characters of each of the top three hits.
test_results = retriever("Who won the Nobel Prize in Physics in 1921?", k=3)
print(
    "Retriever test:",
    [hit["text"][:100] + "..." for hit in test_results],
)

class Hop(dspy.Module):
    """Multi-hop note-taking program.

    On every hop it asks the LM for a search query, retrieves passages for
    that query, and asks the LM to turn the passages into new notes and
    document titles. Notes and titles accumulate across hops.
    """

    def __init__(self, num_docs=10, num_hops=4):
        """
        Args:
            num_docs: Number of passages requested from the retriever per hop.
            num_hops: Number of query/retrieve/summarise rounds to run.
        """
        super().__init__()
        self.num_docs, self.num_hops = num_docs, num_hops
        self.generate_query = dspy.ChainOfThought('claim, notes -> query')
        self.append_notes = dspy.ChainOfThought('claim, notes, context -> new_notes: list[str], titles: list[str]')

    def forward(self, claim: str) -> dspy.Prediction:
        """Run ``num_hops`` retrieval rounds for ``claim``.

        Returns:
            A ``dspy.Prediction`` with ``notes`` (all notes in the order they
            were produced) and ``titles`` (unique titles in first-seen order).
        """
        notes = []
        titles = []

        for _ in range(self.num_hops):
            query = self.generate_query(claim=claim, notes=notes).query
            # `retriever` is resolved from module globals at call time, so this
            # uses whichever retriever the most recently run cell bound to it.
            context = retriever(query, k=self.num_docs)
            prediction = self.append_notes(claim=claim, notes=notes, context=context)
            notes.extend(prediction.new_notes)
            titles.extend(prediction.titles)

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the title list is identical from run to run (a set's order is not).
        return dspy.Prediction(notes=notes, titles=list(dict.fromkeys(titles)))

# Small configuration (3 passages per hop, 2 hops) to keep the demo run short.
hop_search = Hop(num_docs=3, num_hops=2)

result = hop_search(claim="The first Olympic Games were held in Athens in 1896.")

# Numbered list of the notes gathered across all hops.
print("\nNotes:")
for position, note in enumerate(result.notes, start=1):
    print(f"{position}. {note}")

# Bulleted list of the de-duplicated titles.
print("\nTitles:")
for found_title in result.titles:
    print(f"- {found_title}")

In [None]:
import dspy
import random

# Ten small synthetic Olympics documents, keyed "doc_0" .. "doc_9". Each text
# is three templated sentences built from a (topic, subtopic, details) triple.
mock_corpus = {
    f"doc_{i}": {
        "title": f"Document {i}",
        "text": (
            f"This is document {i} about {topic}. "
            f"It contains information about {subtopic}. "
            f"Some additional details about {details}."
        ),
    }
    for i, (topic, subtopic, details) in enumerate(
        (
            ("Ancient Olympics", "origin in Greece", "started in 776 BCE in Olympia"),
            ("Modern Olympics", "revival in Athens", "first held in 1896"),
            ("Pierre de Coubertin", "Olympic movement", "founder of the International Olympic Committee"),
            ("Olympic medals", "gold, silver and bronze", "design and symbolism"),
            ("Olympic torch", "lighting ceremony", "relay to the host city"),
            ("Olympic rings", "symbol design", "five interlocking rings representing continents"),
            ("Winter Olympics", "winter sports", "first held in 1924 in Chamonix"),
            ("Olympic host cities", "selection process", "bidding and voting by IOC members"),
            ("Olympic boycotts", "political protests", "notable boycotts in history"),
            ("Olympic records", "world records", "fastest, highest, strongest performances"),
        )
    )
}

class MockRetriever:
    """Substring-matching retriever over a small in-memory corpus.

    A document's score is the number of distinct lower-cased query words that
    appear anywhere in its lower-cased text (substring match, not whole-word).
    """

    def __init__(self, corpus):
        """Keep a reference to ``corpus`` (doc id -> dict with "title" and "text")."""
        self.corpus = corpus

    def __call__(self, query, k=3):
        """Return up to ``k`` matching documents, highest score first.

        Documents that match no query word are omitted; documents with equal
        scores keep their corpus order. Each result dict carries the keys
        "id", "title", "text" and "long_text" (a copy of "text").
        """
        terms = set(query.lower().split())

        scored = []
        for doc_id, doc in self.corpus.items():
            haystack = doc["text"].lower()
            hits = len([term for term in terms if term in haystack])
            if hits:
                scored.append((hits, doc_id, doc))

        # Stable sort on the negated score: best first, ties in corpus order.
        ranked = sorted(scored, key=lambda entry: -entry[0])

        passages = []
        for _, doc_id, doc in ranked[:k]:
            body = doc["text"]
            passages.append(
                {
                    "id": doc_id,
                    "title": doc["title"],
                    "text": body,
                    "long_text": body,
                }
            )
        return passages

# Rebind `retriever` to the mock implementation and make it the DSPy default RM.
retriever = MockRetriever(mock_corpus)
dspy.configure(rm=retriever)

# Smoke test: print the first 100 characters of each of the top three hits.
test_results = retriever("Olympic Games in Athens", k=3)
print(
    "Retriever test:",
    [hit["text"][:100] + "..." for hit in test_results],
)

class CustomRetrieve(dspy.Module):
    """DSPy module that delegates retrieval to the RM set in ``dspy.settings``."""

    def __init__(self, k=3):
        """Store ``k``, the number of passages to request on each call."""
        super().__init__()
        self.k = k

    def forward(self, query):
        """Look up ``query`` with the configured RM.

        Returns:
            A ``dspy.Prediction`` whose ``passages`` field is whatever the RM
            returned for ``query`` with ``k=self.k``.

        Raises:
            AssertionError: If no RM has been configured.
        """
        rm = dspy.settings.rm
        if rm is None:
            raise AssertionError("No RM is loaded.")
        return dspy.Prediction(passages=rm(query, k=self.k))

class Hop(dspy.Module):
    """Multi-hop note-taking program built on ``CustomRetrieve``.

    On every hop it asks the LM for a search query, retrieves passages for
    that query, and asks the LM to turn the passages into new notes and
    document titles. Notes and titles accumulate across hops.
    """

    def __init__(self, num_docs=10, num_hops=4):
        """
        Args:
            num_docs: Number of passages requested from the retriever per hop.
            num_hops: Number of query/retrieve/summarise rounds to run.
        """
        super().__init__()
        self.num_docs, self.num_hops = num_docs, num_hops
        self.generate_query = dspy.ChainOfThought('claim, notes -> query')
        self.append_notes = dspy.ChainOfThought('claim, notes, context -> new_notes: list[str], titles: list[str]')
        # Forward num_docs to the retrieve module; previously it was ignored
        # and retrieval always used CustomRetrieve's default of k=3.
        self.retrieve = CustomRetrieve(k=num_docs)

    def forward(self, claim: str) -> dspy.Prediction:
        """Run ``num_hops`` retrieval rounds for ``claim``.

        Returns:
            A ``dspy.Prediction`` with ``notes`` (all notes in the order they
            were produced) and ``titles`` (unique titles in first-seen order).
        """
        notes = []
        titles = []

        for _ in range(self.num_hops):
            query = self.generate_query(claim=claim, notes=notes).query
            context = self.retrieve(query).passages
            prediction = self.append_notes(claim=claim, notes=notes, context=context)
            notes.extend(prediction.new_notes)
            titles.extend(prediction.titles)

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the title list is identical from run to run (a set's order is not).
        return dspy.Prediction(notes=notes, titles=list(dict.fromkeys(titles)))

# Instantiate Hop with a small configuration (3 passages, 2 hops) for a quick test.
hop_search = Hop(num_docs=3, num_hops=2)

result = hop_search(claim="The first Olympic Games were held in Athens in 1896.")

# Numbered list of the notes gathered across all hops.
print("\nNotes:")
for position, note in enumerate(result.notes, start=1):
    print(f"{position}. {note}")

# Bulleted list of the de-duplicated titles.
print("\nTitles:")
for found_title in result.titles:
    print(f"- {found_title}")

Retriever test: ['This is document 1 about Modern Olympics. It contains information about revival in Athens. Some addi...', 'This is document 0 about Ancient Olympics. It contains information about origin in Greece. Some addi...', 'This is document 2 about Pierre de Coubertin. It contains information about Olympic movement. Some a...']

Notes:
1. Document 1 confirms the first modern Olympic Games were held in Athens in 1896, as part of their revival.
2. «Document 1 confirms the first modern Olympic Games were held in Athens in 1896, as part of their revival.»

Titles:
- Document 1
