# Create Q/A from Wikipédia

### Retrieve portal links

In [7]:
import requests
from bs4 import BeautifulSoup

portail_link = "https://fr.wikipedia.org/wiki/Portail:Accueil"
response = requests.get(portail_link)
soup = BeautifulSoup(response.text, "html.parser")
soup.encode('utf-8')

# Get the Portail links
pattern = "/wiki/Portail:"
portail_links = []
for match in soup.find_all("a"):
    if match.get("href") and pattern in match.get("href"):
        link = match.get("href")
        name = link.split(":")[1]
        portail_links.append({"name": name, "link": link})

# Remove Accueil from the list
portail_links = portail_links[6:]

### Prepare usefull functions

In [8]:
import requests
import dotenv
import os
import json
import re
dotenv.load_dotenv()

# Define Constants
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
# MODEL = "deepseek/deepseek-chat"
MODEL = "google/gemini-2.0-flash-001"
PROMPT_FILE = "prompt2.md"

# Define request function
def request_llm(article):
    # Load prompt & add article
    with open(PROMPT_FILE, 'r') as f:
        prompt = f.read()
        prompt = prompt.replace('wikipedia_article', article)
    
    # Request
    try :
      response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
          "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        },
        data=json.dumps({
          "model": MODEL,
          "messages": [{"role": "user", "content": prompt}],
        })
      )

    # Handle request error
    except Exception as e:
      print(f"Error: {e}")
      return None
    
    # Check response
    if response.status_code != 200:
      print(f"Error: {response.json()}")
      return None
    
    return response.json()['choices'][0]['message']['content'] # C'est possible de parraléliser les requetes

# Define function to extract json objects
def extract_json(answer):
    pattern = r'```json\n(.*?)\n```'
    matches = re.findall(pattern, answer, re.DOTALL)
    objects = [eval(m.replace('null', 'None').replace('true', 'True')) for m in matches] 
    return objects[0]


### Initialize progress (olf stuff)

In [9]:
# Save file
def save_progress(portail_links, index_left):
    with open('progress.json', 'w', encoding='utf-8') as f:
        json.dump({"portail_links": portail_links, "index": index_left}, f, ensure_ascii=False, indent=2)

def load_progress():
    with open('progress.json', 'r', encoding='utf-8') as f:
        return json.load(f) 

def save_qa(qa):
    with open('qa.json', 'w', encoding='utf-8') as f:
        json.dump(qa, f, ensure_ascii=False, indent=2)

def load_qa():
    try:
        with open('qa.json', 'r', encoding='utf-8') as f:
            return json.load(f)
    except:
        return {}

save_progress(portail_links, list(range(len(portail_links))))
save_qa([])

### Make QA (old stuff)

In [None]:
import random
from tqdm import tqdm

NB_REQUESTS = 10

# Load progress
progress = load_progress()
portail_links = progress["portail_links"]
index_left = progress["index"]

# Load QA
qa = load_qa()

# Call the LLM for each Portail
for _ in tqdm(range(NB_REQUESTS)):
    try:
        # Select a random portail
        index = random.choice(index_left)
        index_left.remove(index)
        p = portail_links[index]

        # Try to get article page of the portail
        try:
            page = requests.get("https://fr.wikipedia.org" + p["link"].replace('Portail:', ''))
            if page.status_code != 200:
                raise Exception(f"Error: {page.status_code}")
        # If not possible, get the portail page
        except Exception as e:
            page = requests.get("https://fr.wikipedia.org" + p["link"])
        # Get text
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, "html.parser")
        text = soup.get_text()

        # Request LLM
        answer = request_llm(text)

        # Extract JSON objects
        objects = extract_json(answer)
        for o in objects:
            o['source'] = p['link']
            o['source_name'] = p['name']
        
        # Add to QA
        qa = [*qa, *objects]
        
        # Save progress
        save_progress(portail_links, index_left)
        save_qa(qa)
    
    except Exception as e:
        print(f"Error: {e}")
        continue


print('{} questions generated'.format(len(qa)))
print('Subject left: {}'.format(len(index_left)))

100%|██████████| 10/10 [01:26<00:00,  8.66s/it]

100 questions generated
Subject left: 1797





# Make QA by category

In [None]:
import json
from tqdm import tqdm

with open('sujet_thematic.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


NB_REQUESTS = 10

# Load progress
progress = load_progress()
portail_links = progress["portail_links"]
index_left = progress["index"]

# Load QA
qa = load_qa()

# Call the LLM for each Portail
for _ in tqdm(range(NB_REQUESTS)):
    try:
        # Select a random portail
        index = random.choice(index_left)
        index_left.remove(index)
        p = portail_links[index]

        # Try to get article page of the portail
        try:
            page = requests.get("https://fr.wikipedia.org" + p["link"].replace('Portail:', ''))
            if page.status_code != 200:
                raise Exception(f"Error: {page.status_code}")
        # If not possible, get the portail page
        except Exception as e:
            page = requests.get("https://fr.wikipedia.org" + p["link"])
        # Get text
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, "html.parser")
        text = soup.get_text()

        # Request LLM
        answer = request_llm(text)

        # Extract JSON objects
        objects = extract_json(answer)
        for o in objects:
            o['source'] = p['link']
            o['source_name'] = p['name']
        
        # Add to QA
        qa = [*qa, *objects]
        
        # Save progress
        save_progress(portail_links, index_left)
        save_qa(qa)
    
    except Exception as e:
        print(f"Error: {e}")
        continue


print('{} questions generated'.format(len(qa)))
print('Subject left: {}'.format(len(index_left)))

Histoire & Géographie 100
Politique & Religion 101
Sport & Loisirs 100
Sciences & Technologie 100
Art & Culture 100
Société & Economie 100


In [13]:
portail_links

[{'name': 'Recherche_scientifique',
  'link': '/wiki/Portail:Recherche_scientifique'},
 {'name': 'Histoire_des_sciences',
  'link': '/wiki/Portail:Histoire_des_sciences'},
 {'name': 'Histoire_de_la_zoologie_et_de_la_botanique',
  'link': '/wiki/Portail:Histoire_de_la_zoologie_et_de_la_botanique'},
 {'name': 'Biologie', 'link': '/wiki/Portail:Biologie'},
 {'name': 'Anatomie', 'link': '/wiki/Portail:Anatomie'},
 {'name': 'Biochimie', 'link': '/wiki/Portail:Biochimie'},
 {'name': 'Bio%C3%A9thique', 'link': '/wiki/Portail:Bio%C3%A9thique'},
 {'name': 'Biologie_cellulaire_et_mol%C3%A9culaire',
  'link': '/wiki/Portail:Biologie_cellulaire_et_mol%C3%A9culaire'},
 {'name': 'Biologie_marine', 'link': '/wiki/Portail:Biologie_marine'},
 {'name': 'Conservation_de_la_nature',
  'link': '/wiki/Portail:Conservation_de_la_nature'},
 {'name': '%C3%89coatlas', 'link': '/wiki/Portail:%C3%89coatlas'},
 {'name': '%C3%89cologie', 'link': '/wiki/Portail:%C3%89cologie'},
 {'name': '%C3%89cotoxicologie', 'link

In [29]:
p = portail_links[0]
try:
    page = requests.get("https://fr.wikipedia.org" + p["link"].replace('Portail:', ''))
    if page.status_code != 200:
        raise Exception(f"Error: {page.status_code}")
    print('success')
except Exception as e:
    page = requests.get("https://fr.wikipedia.org" + p["link"])
page.encoding = 'utf-8'
soup = BeautifulSoup(page.text, "html.parser")
text = soup.get_text()
text

success


"\n\n\n\nRecherche scientifique — Wikipédia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAller au contenu\n\n\n\n\n\n\n\nMenu principal\n\n\n\n\n\nMenu principal\ndéplacer vers la barre latérale\nmasquer\n\n\n\n\t\tNavigation\n\t\n\n\nAccueilPortails thématiquesArticle au hasardContactPages spéciales\n\n\n\n\n\n\t\tContribuer\n\t\n\n\nDébuter sur WikipédiaAideCommunautéModifications récentes\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nRechercher\n\n\n\n\n\n\n\n\n\n\n\nRechercher\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nApparence\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nFaire un don\n\nCréer un compte\n\nSe connecter\n\n\n\n\n\n\n\n\nOutils personnels\n\n\n\n\n\nFaire un don Créer un compte Se connecter\n\n\n\n\n\n\t\tPages pour les contributeurs déconnectés en savoir plus\n\n\n\nContributionsDiscussion\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSommaire\ndéplacer vers la barre latérale\nmasquer\n\n\n\n\nDébut\n\n\n\n\n\n1\nHistoire\n\n\n\n\nAfficher\u20

In [26]:
page.__dict__

{'_content': b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-not-available" lang="fr" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Dada et Surr\xc3\xa9alisme \xe2\x80\x94 Wikip\xc3\xa9dia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 ve