### Retrieve portal links

In [None]:
import requests
from bs4 import BeautifulSoup

# Fetch the French Wikipedia portal index page.
portail_link = "https://fr.wikipedia.org/wiki/Portail:Accueil"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents silently scraping an HTTP error page.
response = requests.get(portail_link, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Get the Portail links: every anchor whose href contains "/wiki/Portail:".
pattern = "/wiki/Portail:"
portail_links = []
for match in soup.find_all("a"):
    link = match.get("href")
    if not link or pattern not in link:
        continue
    # The portal name is everything after the "/wiki/Portail:" marker.
    # Splitting on the first ":" instead would return part of the URL for
    # absolute links ("https://...") and truncate names that contain ":".
    name = link.split(pattern, 1)[1]
    portail_links.append({"name": name, "link": link})

# Remove Accueil from the list
# NOTE(review): this drops the first 6 matches by position; presumably those
# are the "Accueil" links in the current page layout -- confirm if the page
# structure changes.
portail_links = portail_links[6:]

### Prepare useful functions

In [1]:
import requests
import dotenv
import os
import json
import re
# Load variables from a .env file (if one is found) into the process
# environment before the constants below are read.
dotenv.load_dotenv()

# Define Constants
# API key sent as the Bearer token by request_llm. os.getenv returns None when
# the variable is not set, so a missing key is not detected here.
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
# Model identifier sent in the request payload. Alternative kept for reference:
# MODEL = "deepseek/deepseek-chat"
MODEL = "google/gemini-2.0-flash-001"
# Prompt template file; request_llm replaces its {wikipedia_article} and
# {question_category} placeholders.
PROMPT_FILE = "prompt_category.md"

# Define request function
def request_llm(article, category, timeout=120):
    """Send the filled-in prompt to the OpenRouter chat-completions endpoint.

    The template stored in PROMPT_FILE is read and its ``{wikipedia_article}``
    and ``{question_category}`` placeholders are replaced with ``article`` and
    ``category``. The resulting prompt is sent as a single user message to the
    model named by MODEL.

    Args:
        article: Text inserted in place of ``{wikipedia_article}``.
        category: Text inserted in place of ``{question_category}``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        request fails, the status code is not 200, or the response body does
        not have the expected structure. The reason is printed in each case.
    """
    # Load prompt & add article. The encoding is explicit so the template is
    # decoded the same way on every platform (the other files in this notebook
    # are read as utf-8 too).
    with open(PROMPT_FILE, 'r', encoding='utf-8') as f:
        prompt = f.read()
    prompt = prompt.replace('{wikipedia_article}', article)
    prompt = prompt.replace('{question_category}', category)

    # Request
    try:
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            },
            json={
                "model": MODEL,
                "messages": [{"role": "user", "content": prompt}],
            },
            # Without a timeout a stalled connection would block the notebook.
            timeout=timeout,
        )

    # Handle request error (connection failure, timeout, ...)
    except requests.RequestException as e:
        print(f"Error: {e}")
        return None

    # Check response
    if response.status_code != 200:
        # An error body is not guaranteed to be JSON; fall back to raw text
        # rather than raising while trying to report the error.
        try:
            details = response.json()
        except ValueError:
            details = response.text
        print(f"Error: {details}")
        return None

    # A 200 response can still lack the expected fields (for example when the
    # body carries an error object), so guard the lookup instead of raising.
    try:
        return response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        print(f"Error: unexpected response format ({e!r})")
        return None
    # NOTE: the requests could be parallelized by the caller.

# Define function to extract json objects
def extract_json(answer):
    """Parse the first ```json fenced block contained in ``answer``.

    The block is parsed with ``json.loads``. If that fails (for example
    because of a trailing comma or single-quoted strings), a lenient fallback
    maps the bare words ``null``/``true``/``false`` to their Python spelling
    and parses the text with ``ast.literal_eval``, which only accepts
    literals. The text is never executed as code.

    Args:
        answer: Text that contains at least one block delimited by a
            "```json" line and a closing "```" line.

    Returns:
        The Python object decoded from the first fenced block.

    Raises:
        IndexError: If ``answer`` contains no ```json fenced block.
        ValueError, SyntaxError: If the block cannot be parsed.
        TypeError: If ``answer`` is not a string.
    """
    import ast  # local import: only the lenient fallback needs it

    pattern = r'```json\n(.*?)\n```'
    matches = re.findall(pattern, answer, re.DOTALL)
    if not matches:
        raise IndexError("no ```json fenced block found in answer")

    # Only the first block is returned, so only the first block is parsed.
    raw = matches[0]
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Lenient path for almost-JSON output. Word boundaries keep words
        # such as "nullable" intact, unlike a plain substring replace.
        python_literals = {'null': 'None', 'true': 'True', 'false': 'False'}
        python_like = re.sub(
            r'\b(null|true|false)\b',
            lambda m: python_literals[m.group(1)],
            raw,
        )
        return ast.literal_eval(python_like)


### Make QA 

In [2]:
import json
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# Maximum number of articles processed per thematic in one run of this cell.
NB_REQUESTS = 1
# Seconds to wait for a Wikipedia page before giving up on it.
WIKI_TIMEOUT = 30


# Load data: maps each thematic to a list of article titles.
with open('sujet_thematic.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Load progress: for each thematic, the indices of articles still to process.
try:
    with open('progress.json', 'r', encoding='utf-8') as f:
        progress = json.load(f)
# If no progress file, create one. Only a missing file is tolerated: a
# corrupt file must not silently reset the progress and get overwritten.
except FileNotFoundError:
    progress = {k: list(range(len(v))) for k, v in data.items()}

# Load QA
try:
    with open('qa.json', 'r', encoding='utf-8') as f:
        qa = json.load(f)
# If no QA file, create one (same reasoning as for the progress file).
except FileNotFoundError:
    qa = []

# For each thematic
for thematic in data.keys():

    print(f"Thematic: {thematic}")

    # A thematic absent from an older progress file starts fully pending.
    pending = progress.setdefault(thematic, list(range(len(data[thematic]))))

    # Snapshot the indices to process. `pending` shrinks after every success,
    # so indexing it with the loop counter would skip articles.
    batch = list(pending[:NB_REQUESTS])

    # Make requests (an empty batch means no more articles for this thematic)
    for index in tqdm(batch):

        try:
            # Try to get article page
            title = data[thematic][index]
            link = "/wiki/" + title
            page = requests.get("https://fr.wikipedia.org" + link, timeout=WIKI_TIMEOUT)

            # If not possible, get the portail page
            if page.status_code != 200:
                link = "/wiki/Portail:" + title
                page = requests.get("https://fr.wikipedia.org" + link, timeout=WIKI_TIMEOUT)
                # Do not feed an HTTP error page to the LLM.
                page.raise_for_status()

            # Get text
            page.encoding = 'utf-8'
            soup = BeautifulSoup(page.text, "html.parser")
            text = soup.get_text()

            # Request LLM
            answer = request_llm(text, thematic)
            if answer is None:
                # request_llm already printed the reason; the article stays
                # pending and will be retried on a later run.
                continue

            # Extract JSON objects
            objects = extract_json(answer)
            for o in objects:
                o['category'] = thematic
                o['source'] = link
                o['source_name'] = title

            # Add to QA
            qa.extend(objects)

            # Save QA first: if a write fails, a duplicate on the next run is
            # preferable to marking the article done without its questions.
            with open('qa.json', 'w', encoding='utf-8') as f:
                json.dump(qa, f, ensure_ascii=False, indent=2)

            # Save progress
            pending.remove(index)
            with open('progress.json', 'w', encoding='utf-8') as f:
                json.dump(progress, f, ensure_ascii=False, indent=2)

        except Exception as e:
            # Best effort: report the failure and move on to the next article.
            print(f"Error: {e}")
            continue


print('{} questions generated'.format(len(qa)))

Thematic: Histoire & Géographie


100%|██████████| 1/1 [00:13<00:00, 13.26s/it]


Thematic: Politique & Religion


100%|██████████| 1/1 [00:11<00:00, 11.05s/it]


Thematic: Sport & Loisirs


100%|██████████| 1/1 [00:09<00:00,  9.73s/it]


Thematic: Sciences & Technologie


100%|██████████| 1/1 [00:12<00:00, 12.17s/it]


Thematic: Art & Culture


100%|██████████| 1/1 [00:10<00:00, 10.50s/it]


Thematic: Société & Economie


100%|██████████| 1/1 [00:10<00:00, 10.06s/it]

72 questions generated



