In [6]:
import requests
from bs4 import BeautifulSoup

def scrape_fandom_page(url):
    response = requests.get(url)
    response.raise_for_status()
    html = response.text

    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1').text.strip()
    content_div = soup.find('div', {'class': 'mw-parser-output'})

    chunks = []
    current_chunk = []
    current_section_title = "Introduction"

    for tag in content_div.find_all(['h2', 'h3', 'p']):
        if tag.name in ['h2', 'h3']:
            if current_chunk:
                chunks.append({
                    "section": current_section_title,
                    "text": " ".join(current_chunk)
                })
                current_chunk = []
            current_section_title = tag.get_text(strip=True)
        elif tag.name == 'p':
            text = tag.get_text(strip=True)
            if text:
                current_chunk.append(text)

    if current_chunk:
        chunks.append({
            "section": current_section_title,
            "text": " ".join(current_chunk)
        })

    return {
        'title': title,
        'url': url,
        'chunks': chunks
    }


In [7]:
from final_question_creator import (
    generate_questions, generate_answer,
    save_qa_pairs_to_csv, clean_section, clean_context,
    save_qa_pairs_to_excel
)

In [8]:
CSV_FILE = "/data/qa_dataset.csv"
EXCEL_FILE = "/data/qa_dataset.xlsx"

In [9]:
def process_chunk(chunk, chunk_index):
    questions = generate_questions(chunk, num_questions=3)

    if not questions:
        questions = generate_questions(chunk, num_questions=1, simple_prompt=True)

        if not questions:
            return []

    qa_pairs = []
    for q in questions:
        a = generate_answer(chunk, q)
        if not a:
            continue

        qa_pairs.append({
            "question": q,
            "answer": a,
            "section": clean_section(chunk.get("section", "")),
            "context": clean_context(chunk["text"])
        })

    return qa_pairs


In [10]:
def run_pipeline(url):
    data = scrape_fandom_page(url)
    chunks = data['chunks']
    all_qa_pairs = []
    for i, chunk in enumerate(chunks):
        qa_pairs = process_chunk(chunk, i)
        if qa_pairs:
            save_qa_pairs_to_csv(qa_pairs, file_path=CSV_FILE)
            save_qa_pairs_to_excel(qa_pairs, file_path=EXCEL_FILE)
            print(f" Saved {len(qa_pairs)} QA pairs.")
        else:
            print(" No valid QA pairs for this chunk.")

    print(f"\n Done! QA pairs saved to {CSV_FILE} and {EXCEL_FILE}")


In [11]:
def run_multiple_pipelines(links):
    for idx, link in enumerate(links):
        print(f"\n [{idx+1}/{len(links)}] Processing: {link}")
        try:
            run_pipeline(link)
        except Exception as e:
            print(f" Error processing {link}: {e}")

In [12]:
anime_fandom_links = [
    # Naruto
    "https://naruto.fandom.com/wiki/Naruto_Uzumaki",
    "https://naruto.fandom.com/wiki/Sasuke_Uchiha",
    "https://naruto.fandom.com/wiki/Sakura_Haruno",
    "https://naruto.fandom.com/wiki/Kakashi_Hatake",
    "https://naruto.fandom.com/wiki/Itachi_Uchiha",
    # One Piece
    "https://onepiece.fandom.com/wiki/Monkey_D._Luffy",
    "https://onepiece.fandom.com/wiki/Roronoa_Zoro",
    "https://onepiece.fandom.com/wiki/Nami",
    "https://onepiece.fandom.com/wiki/Sanji",
    "https://onepiece.fandom.com/wiki/Usopp",
    # Bleach
    "https://bleach.fandom.com/wiki/Ichigo_Kurosaki",
    "https://bleach.fandom.com/wiki/Rukia_Kuchiki",
    "https://bleach.fandom.com/wiki/Uryu_Ishida",
    "https://bleach.fandom.com/wiki/Orihime_Inoue",
    "https://bleach.fandom.com/wiki/Yasutora_Sado",
    # Dragon Ball
    "https://dragonball.fandom.com/wiki/Goku",
    "https://dragonball.fandom.com/wiki/Vegeta",
    "https://dragonball.fandom.com/wiki/Gohan",
    "https://dragonball.fandom.com/wiki/Piccolo",
    "https://dragonball.fandom.com/wiki/Bulma",
    # My Hero Academia
    "https://myheroacademia.fandom.com/wiki/Izuku_Midoriya",
    "https://myheroacademia.fandom.com/wiki/Katsuki_Bakugo",
    "https://myheroacademia.fandom.com/wiki/Todoroki_Shouto",
    "https://myheroacademia.fandom.com/wiki/All_Might",
    "https://myheroacademia.fandom.com/wiki/Ochaco_Uraraka",
    # Attack on Titan
    "https://attackontitan.fandom.com/wiki/Eren_Yeager",
    "https://attackontitan.fandom.com/wiki/Mikasa_Ackerman",
    "https://attackontitan.fandom.com/wiki/Armin_Arlert",
    "https://attackontitan.fandom.com/wiki/Levi_Ackerman",
    "https://attackontitan.fandom.com/wiki/Erwin_Smith",
    # Demon Slayer
    "https://demon-slayer.fandom.com/wiki/Tanjiro_Kamado",
    "https://demon-slayer.fandom.com/wiki/Nezuko_Kamado",
    "https://demon-slayer.fandom.com/wiki/Zenitsu_Agatsuma",
    "https://demon-slayer.fandom.com/wiki/Inosuke_Hashibira",
    "https://demon-slayer.fandom.com/wiki/Kyojuro_Rengoku",
    # Jujutsu Kaisen
    "https://jujutsu-kaisen.fandom.com/wiki/Yuji_Itadori",
    "https://jujutsu-kaisen.fandom.com/wiki/Megumi_Fushiguro",
    "https://jujutsu-kaisen.fandom.com/wiki/Nobara_Kugisaki",
    "https://jujutsu-kaisen.fandom.com/wiki/Satoru_Gojo",
    "https://jujutsu-kaisen.fandom.com/wiki/Yuta_Okkotsu",
    # Sword Art Online
    "https://swordartonline.fandom.com/wiki/Kirito",
    "https://swordartonline.fandom.com/wiki/Asuna",
    "https://swordartonline.fandom.com/wiki/Silica",
    "https://swordartonline.fandom.com/wiki/Lisbeth",
    "https://swordartonline.fandom.com/wiki/Klein",
    # Fullmetal Alchemist
    "https://fma.fandom.com/wiki/Edward_Elric",
    "https://fma.fandom.com/wiki/Alphonse_Elric",
    "https://fma.fandom.com/wiki/Roy_Mustang",
    "https://fma.fandom.com/wiki/Winry_Rockbell",
    "https://fma.fandom.com/wiki/Scar",
    # Tokyo Ghoul
    "https://tokyoghoul.fandom.com/wiki/Ken_Kaneki",
    "https://tokyoghoul.fandom.com/wiki/Touka_Kirishima",
    "https://tokyoghoul.fandom.com/wiki/Rize_Kamishiro",
    "https://tokyoghoul.fandom.com/wiki/Hideyoshi_Nishiki",
    "https://tokyoghoul.fandom.com/wiki/Ayato_Kirishima",
    # Hunter x Hunter
    "https://hunterxhunter.fandom.com/wiki/Gon_Freecss",
    "https://hunterxhunter.fandom.com/wiki/Killua_Zoldyck",
    "https://hunterxhunter.fandom.com/wiki/Leorio_Paradinight",
    "https://hunterxhunter.fandom.com/wiki/Kurapika",
    "https://hunterxhunter.fandom.com/wiki/Hisoka",
    # Fairy Tail
    "https://fairytail.fandom.com/wiki/Natsu_Dragneel",
    "https://fairytail.fandom.com/wiki/Lucy_Heartfilia",
    "https://fairytail.fandom.com/wiki/Gray_Fullbuster",
    "https://fairytail.fandom.com/wiki/Erza_Scarlet",
    "https://fairytail.fandom.com/wiki/Wendy_Marvell",
    # Black Clover
    "https://blackclover.fandom.com/wiki/Asta",
    "https://blackclover.fandom.com/wiki/Yuno",
    "https://blackclover.fandom.com/wiki/Noelle_Silva",
    "https://blackclover.fandom.com/wiki/Yami_Sukehiro",
    "https://blackclover.fandom.com/wiki/Mimosa_Vernos",
    # Re:Zero
    "https://rezero.fandom.com/wiki/Subaru_Natsuki",
    "https://rezero.fandom.com/wiki/Emilia",
    "https://rezero.fandom.com/wiki/Rem",
    "https://rezero.fandom.com/wiki/Rem",
    "https://rezero.fandom.com/wiki/Ram",
    # Steins;Gate
    "https://steinsgate.fandom.com/wiki/Rintarou_Okabe",
    "https://steinsgate.fandom.com/wiki/Kurisu_Makise",
    "https://steinsgate.fandom.com/wiki/Mayuri_Shri",
    "https://steinsgate.fandom.com/wiki/Ruka_Ushiya",
    "https://steinsgate.fandom.com/wiki/Daru",
    # Death Note
    "https://deathnote.fandom.com/wiki/Light_Yagami",
    "https://deathnote.fandom.com/wiki/L_(Dead_Or_Alive)",
    "https://deathnote.fandom.com/wiki/Misa_Amane",
    "https://deathnote.fandom.com/wiki/Near",
    "https://deathnote.fandom.com/wiki/Mello",
    # Code Geass
    "https://codegeass.fandom.com/wiki/Zero_(Char)",
    "https://codegeass.fandom.com/wiki/C.C.",
    "https://codegeass.fandom.com/wiki/Lelouch_Lamperouge",
    "https://codegeass.fandom.com/wiki/Kururugi_Suzaku",
    "https://codegeass.fandom.com/wiki/Nunnally_Lamperouge",
    # Gintama
    "https://gintama.fandom.com/wiki/Gintoki_Sakata",
    "https://gintama.fandom.com/wiki/Shinpachi_Shinso",
    "https://gintama.fandom.com/wiki/Kagura",
    "https://gintama.fandom.com/wiki/Toshiro_Kondo",
    "https://gintama.fandom.com/wiki/Sakamoto",
    # Mob Psycho 100
    "https://mobpsycho100.fandom.com/wiki/Shigeo_Kageyama",
    "https://mobpsycho100.fandom.com/wiki/Reigen_Arataka",
    "https://mobpsycho100.fandom.com/wiki/Ritsu_Kageyama",
    "https://mobpsycho100.fandom.com/wiki/Tsubomi_Kageyama",
    "https://mobpsycho100.fandom.com/wiki/Dimple",
    # One-Punch Man
    "https://onepunchman.fandom.com/wiki/Saitama",
    "https://onepunchman.fandom.com/wiki/Genos",
    "https://onepunchman.fandom.com/wiki/Speed_Oscillation_Expert",
    "https://onepunchman.fandom.com/wiki/Tornado_of_Terror",
    "https://onepunchman.fandom.com/wiki/Bang",
    # Haikyuu!!
    "https://haikyuu.fandom.com/wiki/Shoyo_Hinata",
    "https://haikyuu.fandom.com/wiki/Tobio_Kageyama",
    "https://haikyuu.fandom.com/wiki/Koshi_Sugawara",
    "https://haikyuu.fandom.com/wiki/Tetsuro_Kenma",
    "https://haikyuu.fandom.com/wiki/Kei_Tsukishima",
    # JoJo’s Bizarre Adventure
    "https://jojowiki.fandom.com/wiki/Jotaro_Kujo",
    "https://jojowiki.fandom.com/wiki/Joseph_Joestar",
    "https://jojowiki.fandom.com/wiki/Dio_Brando",
    "https://jojowiki.fandom.com/wiki/Holy_Van_Helsing",
    "https://jojowiki.fandom.com/wiki/Bruno_Bucciarati",
    # Blue Exorcist
    "https://blueexorcist.fandom.com/wiki/Rin_Okumura",
    "https://blueexorcist.fandom.com/wiki/Yukio_Okumura",
    "https://blueexorcist.fandom.com/wiki/Shiemi_Moriyama",
    "https://blueexorcist.fandom.com/wiki/Mephisto_Pheles",
    "https://blueexorcist.fandom.com/wiki/Ryuji_Suguro",
    # Noragami
    "https://noragami.fandom.com/wiki/Yato",
    "https://noragami.fandom.com/wiki/Hiyori_Iki",
    "https://noragami.fandom.com/wiki/Yukine",
    "https://noragami.fandom.com/wiki/Bishamon",
    "https://noragami.fandom.com/wiki/Kazuma",
    # The Seven Deadly Sins
    "https://nanatsu-no-taizai.fandom.com/wiki/Meliodas",
    "https://nanatsu-no-taizai.fandom.com/wiki/Elizabeth_Liones",
    "https://nanatsu-no-taizai.fandom.com/wiki/Ban",
    "https://nanatsu-no-taizai.fandom.com/wiki/Diane",
    "https://nanatsu-no-taizai.fandom.com/wiki/King",
    # Akame ga Kill!
    "https://akamegakill.fandom.com/wiki/Akame",
    "https://akamegakill.fandom.com/wiki/Tatsumi",
    "https://akamegakill.fandom.com/wiki/Esdeath",
    "https://akamegakill.fandom.com/wiki/Mine",
    "https://akamegakill.fandom.com/wiki/Leone",
    # Danganronpa
    "https://danganronpa.fandom.com/wiki/Makoto_Naegi",
    "https://danganronpa.fandom.com/wiki/Kyoko_Kirigiri",
    "https://danganronpa.fandom.com/wiki/Byakuya_Togami",
    "https://danganronpa.fandom.com/wiki/Aoi_Asahina",
    "https://danganronpa.fandom.com/wiki/Junko_Enoshima",
    # Soul Eater
    "https://souleater.fandom.com/wiki/Maka_Albarn",
    "https://souleater.fandom.com/wiki/Soul_Eater",
    "https://souleater.fandom.com/wiki/Death_the_Kid",
    "https://souleater.fandom.com/wiki/Black%E2%98%85Star",
    "https://souleater.fandom.com/wiki/Medusa_Gorgon",
    # Claymore
    "https://claymore.fandom.com/wiki/Clare",
    "https://claymore.fandom.com/wiki/Teresa",
    "https://claymore.fandom.com/wiki/Miria",
    "https://claymore.fandom.com/wiki/Helen",
    "https://claymore.fandom.com/wiki/Deneve",
    # Elfen Lied
    "https://elfenlied.fandom.com/wiki/Lucy",
    "https://elfenlied.fandom.com/wiki/Nyu",
    "https://elfenlied.fandom.com/wiki/Kohta",
    "https://elfenlied.fandom.com/wiki/Yuka",
    "https://elfenlied.fandom.com/wiki/Nana",
    # Psycho-Pass
    "https://psychopass.fandom.com/wiki/Akane_Tsunemori",
    "https://psychopass.fandom.com/wiki/Shinya_Kogami",
    "https://psychopass.fandom.com/wiki/Nobuchika_Ginoza",
    "https://psychopass.fandom.com/wiki/Shogo_Makishima",
    "https://psychopass.fandom.com/wiki/Yayoi_Kunizuka",
    # Vinland Saga
    "https://vinlandsaga.fandom.com/wiki/Thorfinn",
    "https://vinlandsaga.fandom.com/wiki/Askeladd",
    "https://vinlandsaga.fandom.com/wiki/Canute",
    "https://vinlandsaga.fandom.com/wiki/Bjorn",
    "https://vinlandsaga.fandom.com/wiki/Thors",
    # Trigun
    "https://trigun.fandom.com/wiki/Vash_the_Stampede",
    "https://trigun.fandom.com/wiki/Nicholas_D._Wolfwood",
    "https://trigun.fandom.com/wiki/Milly_Thompson",
    "https://trigun.fandom.com/wiki/Meryl_Stryfe",
    "https://trigun.fandom.com/wiki/Knives_Millions",
    # Devilman Crybaby
    "https://devilman.fandom.com/wiki/Akira_Fudo",
    "https://devilman.fandom.com/wiki/Ryo_Asuka",
    "https://devilman.fandom.com/wiki/Miki_Makimura",
    "https://devilman.fandom.com/wiki/Miko",
    "https://devilman.fandom.com/wiki/Silene",
    # Toradora!
    "https://toradora.fandom.com/wiki/Taiga_Aisaka",
    "https://toradora.fandom.com/wiki/Ryuuji_Takasu",
    "https://toradora.fandom.com/wiki/Minori_Kushieda",
    "https://toradora.fandom.com/wiki/Ami_Kawashima",
    "https://toradora.fandom.com/wiki/Yuusaku_Kitamura"
] 


In [13]:
run_multiple_pipelines(anime_fandom_links)


 [1/180] Processing: https://naruto.fandom.com/wiki/Naruto_Uzumaki

 Question prompt sent to model:
 You are an expert in anime trivia. Given the text below, generate exactly 3 questions only, each starting with 'Q:' on a new line.

TEXT:
Naruto Uzumaki(,Uzumaki Naruto) is ashinobiof Konohagakure's Uzumaki clanand areincarnationof Asura tsutsuki. He became thejinchrikiof the Nine-Tailson theday of his birth— a fate that caused him to be shunned by most of Konoha throughout his childhood. After joining Team Kakashi, Naruto worked hard to gain the village's acknowledgement all the while chasing h ...

Raw questions output:
 A: Who is Naruto Uzumaki?
Extracted 0 questions.

 Question prompt sent to model:
 Generate 1 simple questions from the text below, each starting with 'Q:' on a new line.

TEXT:
Naruto Uzumaki(,Uzumaki Naruto) is ashinobiof Konohagakure's Uzumaki clanand areincarnationof Asura tsutsuki. He became thejinchrikiof the Nine-Tailson theday of his birth— a fate that caused

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Raw questions output:
 A: What is Sakura's first name?
Extracted 0 questions.

 Question prompt sent to model:
 Generate 1 simple questions from the text below, each starting with 'Q:' on a new line.

TEXT:
Sakura as an Academy student. Sakura is the only child of Kizashi and Mebuki Haruno. She had a healthy childhood, raised by her parents without any serious tragedy or complications, unlike her team members. When she entered Konoha's Academy, a few of the girls in her class started picking on her because of her broad forehead. Sakura tried to combat their teasing by hiding her forehead with her bangs, b ...

Raw questions output:
 Q: What did Ino Yamanaka do for Sakura?
Extracted 1 questions.

 Answer prompt sent to model:
 You are an expert in anime trivia. Given the text below and a question, provide a clear and concise answer. Format your output as 'A: ...'

TEXT:
Sakura as an Academy student. Sakura is the only child of Kizashi and Mebuki Haruno. She had a healthy childhood, rai

KeyboardInterrupt: 