*** Scraping to JSON ***

In [3]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
from datetime import datetime

# Table-of-contents page that lists every story on the blog.
base_url = "https://albumkisahwayang.blogspot.com/2014/07/daftar-isi.html"

# A timeout keeps the script from hanging forever on a stalled connection
# (requests waits indefinitely by default), and raise_for_status() stops us
# from parsing an HTTP error page as if it were the real index.
res = requests.get(base_url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")

# The story links live in the first <ul> that follows the bold heading
# with this exact text.
section = soup.find("b", string="Kisah Pandawa dan Para Putra")

# List of (id, title, url) tuples; ids are "pandawa001", "pandawa002", ...
linkPandawa = []

if section:
    ul = section.find_next("ul")
    # find_next() returns None when no <ul> follows the heading; guard so a
    # layout change yields an empty list instead of an AttributeError.
    if ul is not None:
        for i, a in enumerate(ul.find_all("a", href=True), start=1):
            linkPandawa.append((f"pandawa{i:03}", a.text.strip(), a["href"]))

print("Jumlah cerita Pandawa ditemukan:", len(linkPandawa))

def clean_text(text):
    """Return *text* with bracketed spans removed and whitespace normalized.

    Every ``[...]`` span (matched non-greedily, and allowed to span line
    breaks) is deleted, then every run of whitespace is collapsed to a
    single space and leading/trailing whitespace is stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string; empty if nothing but whitespace/brackets remain.
    """
    # Remove bracketed spans first: deleting them after collapsing
    # whitespace would leave a double space where each span used to be.
    # DOTALL lets a span cross newlines that have not been collapsed yet.
    text = re.sub(r'\[.*?\]', '', text, flags=re.DOTALL)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# One dict per successfully scraped story; serialized to JSON at the end.
data = []

# Static character lookup: name -> list of description strings. A character
# is attached to a story only when its name appears in the story TITLE
# (case-insensitive substring match); the story body is not searched.
infoKarakterNya = {
    "Pandawa": ["Yudhishthira", "Bhima", "Arjuna", "Nakula", "Sahadeva"],
    "Arjuna": ["Pemuda pemanah ulung, sahabat Krishna, peran utama dalam Kurukshetra"],
    "Bhima": ["Saudara Pandawa yang kuat, terkenal karena kekuatannya"],
    "Krishna": ["Sahabat Pandawa, penyelamat, pembimbing spiritual Arjuna"]
}

for asin, title, url in linkPandawa:
    # Best-effort per story: any failure is reported and the loop moves on,
    # so one bad page does not abort the whole run.
    try:
        # Timeout prevents an indefinite hang on a stalled connection;
        # raise_for_status() keeps HTTP error pages out of the dataset.
        page = requests.get(url, timeout=30)
        page.raise_for_status()

        page_soup = BeautifulSoup(page.text, "html.parser")
        content_div = page_soup.find("div", class_="post-body")
        # A page without a "post-body" div still produces an entry, with an
        # empty answer and a word count of 0.
        content = content_div.get_text(separator=" ", strip=True) if content_div else ""
        cleaned_content = clean_text(content)
        word_count = len(cleaned_content.split())

        title_lower = title.lower()
        karakterdiCerita = [
            {"name": character, "description": descriptions}
            for character, descriptions in infoKarakterNya.items()
            if character.lower() in title_lower
        ]

        # Take a single timestamp so "unixTime" and "date_published" always
        # describe the same instant (two now() calls could straddle midnight).
        scraped_at = datetime.now()
        unix_time = int(scraped_at.timestamp())

        entry = {
            "asin": asin,
            "question": f"Ceritakan tentang {title}",
            "questionType": "open-ended",
            "answer": cleaned_content,
            "answerType": "informative",
            "characters": karakterdiCerita,
            "tags": ["Pandawa", "Wayang", title],
            "metadata": {
                "source_url": url,
                "author": "Unspecified",
                # NOTE(review): this is the scrape date, not the date the
                # post was published on the blog - confirm this is intended.
                "date_published": scraped_at.strftime('%Y-%m-%d'),
                "word_count": word_count
            },
            "categories": ["Perjalanan", "Perang", "Kehidupan Pandawa"],
            "unixTime": unix_time
        }

        data.append(entry)
        print("Scraped:", title)

    except Exception as e:
        print("Gagal scraping:", title, "-", e)

os.makedirs("output", exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the output file.
with open("output/pandawa_dataset_v2.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Dataset sudah disimpan di output/pandawa_dataset_v2.json")


Jumlah cerita Pandawa ditemukan: 33
Scraped: Sitija Takon Bapa
Scraped: Abimanyu Rabi
Scraped: Gatutkaca Rabi
Scraped: Wahyu Cakraningrat
Scraped: Gatutkaca Rante
Scraped: Irawan Maling
Scraped: Prabu Gambiranom
Scraped: Irawan Rabi
Scraped: Antasena Takon Bapa
Scraped: Wisanggeni Lahir
Scraped: Gandawardaya
Scraped: Bambang Danasalira
Scraped: Bratalaras Rabi
Scraped: Sumitra Rabi
Scraped: Endrasekti - Sugatawati
Scraped: Samba Rabi
Scraped: Bambang Pramusinta
Scraped: Partajumena Rabi
Scraped: Wisata Rabi
Scraped: Petruk Nagih Janji
Scraped: Wahyu Topeng Waja
Scraped: Antareja Mbalela
Scraped: Gatutkaca Jumeneng Ratu
Scraped: Kikis Tunggarana
Scraped: Purwaganti Takon Bapa
Scraped: Prabu Tuguwasesa
Scraped: Dewa Amral
Scraped: Bimasuci
Scraped: Gatutkaca Nagih Janji
Scraped: Talirasa - Rasatali
Scraped: Boma Rabi
Scraped: Wisanggeni Rabi
Scraped: Perang Gojalisuta
Dataset sudah disimpan di output/pandawa_dataset_v2.json
