# Create Q/A from Wikipédia

### Retrieve portal links

In [1]:
import requests
from bs4 import BeautifulSoup

# Home portal of the French Wikipedia; its anchors are scanned for portal links.
portail_link = "https://fr.wikipedia.org/wiki/Portail:Accueil"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(portail_link, timeout=30)
soup = BeautifulSoup(response.text, "html.parser")

# Get the Portail links: keep every anchor whose href contains the pattern.
pattern = "/wiki/Portail:"
portail_links = []
for match in soup.find_all("a"):
    link = match.get("href")
    if link and pattern in link:
        # The name is the segment that follows the first ":" of the href.
        name = link.split(":")[1]
        portail_links.append({"name": name, "link": link})

# Remove Accueil from the list
# NOTE(review): this assumes the first six matches are the links to drop --
# confirm against the live page, the count is hard-coded.
portail_links = portail_links[6:]

### Prepare useful functions

In [None]:
import requests
import dotenv
import os
import json
import re
dotenv.load_dotenv()

# Configuration constants
# File holding the prompt template read by request_llm.
PROMPT_FILE = "prompt2.md"
# Model identifier sent in the request payload.
# Alternative kept for reference: "deepseek/deepseek-chat"
MODEL = "google/gemini-2.0-flash-001"
# API key taken from the environment; None when the variable is not set.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY')

# Define request function
def request_llm(article):
    """Send the prompt template filled with *article* to the OpenRouter chat API.

    The template is read from ``PROMPT_FILE`` and every occurrence of the
    literal text ``wikipedia_article`` is replaced by *article*.

    Args:
        article: Text inserted into the prompt template.

    Returns:
        The content of the first choice's message, or ``None`` when the
        request fails, the status code is not 200, or the response body does
        not have the expected structure. Each failure is printed.
    """
    # Load prompt & add article.
    # NOTE(review): the template is assumed to be UTF-8, like the other files
    # this notebook reads and writes -- confirm for prompt2.md.
    with open(PROMPT_FILE, 'r', encoding='utf-8') as f:
        prompt = f.read()
    prompt = prompt.replace('wikipedia_article', article)

    # Request
    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            },
            data=json.dumps({
                "model": MODEL,
                "messages": [{"role": "user", "content": prompt}],
            }),
            # Without a timeout a stalled connection would block forever.
            timeout=120,
        )

    # Handle request error
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    # Check response
    if response.status_code != 200:
        # The error body is not guaranteed to be JSON; fall back to raw text.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        print(f"Error: {details}")
        return None

    # A 200 response can still lack the expected fields; report it the same
    # way as the other failures instead of raising.
    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        print(f"Error: unexpected response format ({e!r})")
        return None
    # TODO: the requests could be parallelized.

# Define function to extract json objects
def extract_json(answer):
    """Return the data parsed from the first ```json fenced block of *answer*.

    The block is parsed as JSON. If that fails, it is retried as a Python
    literal (tolerating single quotes and trailing commas) through
    ``ast.literal_eval``, which cannot execute code, unlike ``eval``.

    Args:
        answer: Text expected to contain a ```json ... ``` fenced block.

    Returns:
        The decoded content of the first fenced block.

    Raises:
        ValueError: If *answer* is not a string, contains no fenced block, or
            the block cannot be parsed.
    """
    import ast  # local import: only the fallback parser needs it

    if not isinstance(answer, str):
        raise ValueError("No answer to extract JSON from")

    match = re.search(r'```json\s*(.*?)\s*```', answer, re.DOTALL)
    if match is None:
        raise ValueError("No ```json block found in the answer")
    payload = match.group(1)

    try:
        return json.loads(payload)
    except json.JSONDecodeError as json_error:
        # Fallback for Python-literal-like output. Only whole words are
        # translated so text such as "nulle" is left alone.
        literal = re.sub(r'\bnull\b', 'None', payload)
        literal = re.sub(r'\btrue\b', 'True', literal)
        literal = re.sub(r'\bfalse\b', 'False', literal)
        try:
            return ast.literal_eval(literal)
        except (ValueError, SyntaxError):
            raise ValueError(
                f"Could not parse the ```json block: {json_error}"
            ) from json_error


### Initialize progress

In [3]:
# Persist the crawl state
def save_progress(portail_links, index_left):
    """Write the portal list and the remaining indices to ``progress.json``.

    The file is overwritten with an indented JSON object holding
    *portail_links* under ``"portail_links"`` and *index_left* under
    ``"index"``; non-ASCII characters are written as-is.
    """
    state = {"portail_links": portail_links, "index": index_left}
    with open('progress.json', mode='w', encoding='utf-8') as out:
        json.dump(state, out, indent=2, ensure_ascii=False)

def load_progress():
    """Read ``progress.json`` (UTF-8) and return its decoded JSON content."""
    with open('progress.json', mode='r', encoding='utf-8') as src:
        state = json.load(src)
    return state

def save_qa(qa):
    """Overwrite ``qa.json`` with *qa* serialized as indented UTF-8 JSON."""
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open('qa.json', mode='w', encoding='utf-8') as out:
        json.dump(qa, out, **dump_options)

def load_qa():
    """Return the Q/A entries stored in ``qa.json``.

    Returns:
        The decoded JSON content of ``qa.json``, or an empty list when the
        file is missing, unreadable, or does not contain valid JSON.
    """
    try:
        with open('qa.json', 'r', encoding='utf-8') as f:
            return json.load(f)
    # OSError covers a missing or unreadable file; ValueError covers invalid
    # JSON and undecodable bytes. Unlike a bare ``except``, this lets
    # KeyboardInterrupt and programming errors propagate.
    except (OSError, ValueError):
        # The notebook stores and extends Q/A as a list (save_qa([]),
        # [*qa, *objects]), so the fallback is a list as well.
        return []

# Reset the stored state: every portal index is still to be processed and the
# Q/A file starts as an empty list. Running this overwrites both files.
save_progress(portail_links, index_left=[*range(len(portail_links))])
save_qa(qa=[])

### Make QA

In [None]:
import random
from tqdm import tqdm

# Number of portals to process in this run.
NB_REQUESTS = 1

# Load progress
progress = load_progress()
portail_links = progress["portail_links"]
index_left = progress["index"]

# Load QA
qa = load_qa()

# Call the LLM for each Portail
for _ in tqdm(range(NB_REQUESTS)):
    # Every portal has been processed: random.choice would raise on an empty
    # list, so stop instead of burning the remaining iterations.
    if not index_left:
        break

    try:
        # Select a random portail. It is removed from index_left only once it
        # has been processed, so a failed portal can be retried later instead
        # of being dropped at the next save.
        index = random.choice(index_left)
        p = portail_links[index]

        # Request LLM
        page = requests.get("https://fr.wikipedia.org" + p["link"], timeout=30)
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, "html.parser")
        text = soup.get_text()
        answer = request_llm(text)
        if answer is None:
            # request_llm has already printed the cause of the failure.
            raise ValueError(f"no answer from the LLM for {p['link']}")

        # Extract JSON objects
        objects = extract_json(answer)
        if isinstance(objects, dict):
            # A single object instead of a list: iterating it would yield its
            # keys, so wrap it.
            objects = [objects]
        for o in objects:
            o['source'] = p['link']
            o['source_name'] = p['name']

        # Add to QA and mark the portal as done
        qa = [*qa, *objects]
        index_left.remove(index)

        # Save progress
        save_progress(portail_links, index_left)
        save_qa(qa)

    except Exception as e:
        # Best effort: report the failure and move on to the next request.
        print(f"Error: {e}")


print('{} questions generated'.format(len(qa)))
print('Subject left: {}'.format(len(index_left)))

100%|██████████| 1/1 [00:01<00:00,  1.59s/it]





Portail:Charleroi — Wikipédia



























Aller au contenu







Menu principal





Menu principal
déplacer vers la barre latérale
masquer



		Navigation
	


AccueilPortails thématiquesArticle au hasardContactPages spéciales





		Contribuer
	


Débuter sur WikipédiaAideCommunautéModifications récentes



















Rechercher











Rechercher






















Apparence
















Faire un don

Créer un compte

Se connecter








Outils personnels





Faire un don Créer un compte Se connecter





		Pages pour les contributeurs déconnectés en savoir plus



ContributionsDiscussion



























Portail:Charleroi



Ajouter des langues





Ajouter des liens











PortailDiscussion





français

















LireModifier le codeVoir l’historique







Outils





Outils
déplacer vers la barre latérale
masquer



		Actions
	


LireModifier le codeVoir l’historique





		Général
	


Pages liéesSuivi des pages liéesTélév




In [6]:
import json

# Load the generated Q/A file for inspection.
with open('qa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# The bare expression must be the last top-level statement of the cell for the
# notebook to display it; nested inside the `with` body it was evaluated and
# discarded.
data