In [5]:
import os
import re

import pandas as pd
import requests
from tqdm import tqdm

def extract_fact(
    filename: str,
    language: str,
    ollama_url: str = "http://localhost:11434/api/generate",
    model: str = "gemma3:1b",
    timeout: float = 300.0,
) -> str:
    """Ask an Ollama model to break the text of a file into a list of facts.

    The file is read as UTF-8 and embedded in a prompt that asks for the
    facts as a Markdown list using ``*``.

    Args:
        filename: Path of the text file to read.
        language: ``'en'`` selects the English prompt; any other value
            selects the Russian prompt.
        ollama_url: URL the JSON payload is POSTed to.
        model: Value sent as the ``"model"`` field of the payload.
        timeout: Seconds passed to ``requests.post`` as ``timeout``; pass
            ``None`` to wait indefinitely.

    Returns:
        The ``"response"`` field of the JSON reply, or ``""`` if the reply
        has no such field.

    Raises:
        OSError: If the file cannot be opened or read.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    with open(filename, "r", encoding="utf-8") as file:
        text = file.read()

    # Only the prompt that is actually sent gets built.
    if language == 'en':
        prompt = f"""
    Please breakdown the following paragraph into a list of independent facts. Output the facts as a list using * (Markdown), and say nothing else.\n
    {text}
    """
    else:
        prompt = f"""
    Пожалуйста, разбейте следующий абзац на список независимых фактов.Выведите факты в виде списка, используя * (Markdown), и больше ничего не говорите.\n 
    {text}
    """  # TODO add the output format to be the same as the English one

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    # requests never times out by default; without an explicit timeout a
    # stalled server would block the caller forever.
    response = requests.post(ollama_url, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json().get("response", "")

# A bullet marker is a single "*" that is the first non-blank character on its
# line. "**" is excluded so Markdown bold at the start of a line is not
# mistaken for a new list item.
BULLET_RE = re.compile(r'(?m)^[ \t]*\*(?!\*)[ \t]*')

# One record per processed file: {"filename": ..., "facts": [...]}.
facts = []
# os.listdir returns entries in arbitrary order; sort so the output file is
# reproducible from run to run.
for filename in tqdm(sorted(os.listdir('scraped')), desc="Processing files"):
    filepath = os.path.join('scraped', filename)
    if not os.path.isfile(filepath):
        continue  # skip anything that is not a regular file
    # Files prefixed with "en_" get the English prompt, all others the Russian one.
    language = 'en' if filename.startswith('en_') else 'ru'
    facts_md = extract_fact(filepath, language)
    # Split on line-leading bullets only: splitting on every "*" would break a
    # fact apart wherever it contains Markdown emphasis such as **bold**.
    facts_split = [fact.strip() for fact in BULLET_RE.split(facts_md) if fact.strip()]
    facts.append({"filename": filename, "facts": facts_split})

df = pd.DataFrame(facts)
# DataFrame.to_json does not create missing parent directories.
os.makedirs("output", exist_ok=True)
df.to_json("output/facts.json", orient="records", force_ascii=False, indent=2)

Processing files: 100%|██████████| 12/12 [00:44<00:00,  3.69s/it]
