In [2]:
import requests
from bs4 import BeautifulSoup
import os
import re
import csv
import string

# Create folders if they don't exist
RAW_DATA_FOLDER = 'Data/raw/'
METADATA_FILE = 'Data/metadata.csv'
os.makedirs(RAW_DATA_FOLDER, exist_ok=True)

def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()

def sanitize_filename(title):
    valid_chars = f"-_.() {string.ascii_letters}{string.digits}"
    return ''.join(c for c in title if c in valid_chars).strip() + ".txt"

def scrape_article(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        res = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        paragraphs = soup.find_all('p')

        content_list = []
        for p in paragraphs:
            text = p.get_text().strip()
            if len(text) > 40:
                content_list.append(text)

        return soup, "\n\n".join(content_list).strip()
    except Exception as e:
        print(f"[Error] Could not scrape {url}: {e}")
        return None, None

def extract_title(soup, fallback_url):
    title_tag = soup.find('title')
    if title_tag:
        return clean_text(title_tag.get_text().split("-")[0])
    return fallback_url.split("/")[-1].replace("_", " ")

def save_to_txt_and_log(index, title, url, content):
    numbered_title = f"{index}. {title}"
    filename = sanitize_filename(numbered_title)
    path = os.path.join(RAW_DATA_FOLDER, filename)

    # Skip if already downloaded
    if os.path.exists(path):
        print(f"⏩ Skipped (already exists): {filename}")
        return

    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Log metadata
    with open(METADATA_FILE, 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([numbered_title, url, filename])

    print(f"✅ Saved: {filename}")

urls = [
    # Kekauman Malaysia - General Knowledge
    "https://cabaranmasadepan5s5.blogspot.com/2014/03/kekauman-tempatan.html",
    "https://ms.wikipedia.org/wiki/Kaum-kaum_di_Malaysia",
    "https://ms.wikipedia.org/wiki/Perkauman_di_Malaysia",
    "https://ecentral.my/kaum-di-malaysia/",
    "https://sejkssrt6t2u4u5.wordpress.com/11-1-kaum-dan-etnik-di-malaysia/",
    "https://www.studocu.com/my/document/saito-university-college/ethics/pengenalan-pelbagai-kaum/33231064",
    "https://asamjawe.blogspot.com/2014/01/sejarah-suku-kaum-di-malaysia.html",
    "https://www.wikiwand.com/ms/articles/Kaum-kaum_di_Malaysia",
    "https://www.usim.edu.my/ms/berita/in-our-words-ms/kepelbagaian-kaum-dan-etnik-di-sarawak/",

    # Kekauman Malaysia - Permainan
    "https://pendidikansivikk3.wordpress.com/2012/05/14/permainan-tradisional-kaum-kaum-di-malaysia-17/",

    # Bumiputera di Sabah
    "https://ms.wikipedia.org/wiki/Bumiputera_di_Sabah",
    "https://asmidaling85.blogspot.com/2017/04/kepelbagaian-etnik-di-sabah.html",
    "https://pakaiantradisionaldimalaysia.weebly.com/sabah.html",
    "https://rolandanthonypereira.blogspot.com/2017/07/alat-alat-muzik-tradisional-sabah.html",

    # Kaum Murut (Sabah)
    "https://ms.wikipedia.org/wiki/Murut",
    "https://sites.google.com/student.kuptm.edu.my/kepelbagaianetniksabahdimalays/kaum-murut",
    "https://alliwannadoinmyheart.blogspot.com/p/etnik-murut-baju-nie-sangat-cantik-n.html",
    # Tarian
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1132/pengenalan",
    # Permainan
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/712/pengenalan",
    # Adat Resam dan Budaya
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1168/pengenalan",
    # Pakaian
    "https://budayapakaiandimalaysia.blogspot.com/2010/08/srchttph1_21.html",
    "https://dat3bcun.blogspot.com/2011/03/pakaian-tradisional.html",

    # Kaum Kadazan (Sabah)
    "https://ms.wikipedia.org/wiki/Kadazan",
    "https://sites.google.com/student.kuptm.edu.my/kepelbagaianetniksabahdimalays/kaum-kadazan",
    # Adat Resam dan Budaya
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1160/pengenalan",
    # Tarian
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1115/pengenalan",
    # Alat Muzik
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1122/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1124/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1125/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1126/pengenalan",
    # Makanan
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1179/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1174/pengenalan",
    # Pakaian
    "https://nycgkb1053.blogspot.com/2016/09/kaum-kadazan.html",

    # Kaum Bajau (Sabah)
    "https://ms.wikipedia.org/wiki/Bajau",
    "https://sites.google.com/student.kuptm.edu.my/kepelbagaianetniksabahdimalays/1",
    "https://alliwannadoinmyheart.blogspot.com/p/etnik-bajau.html",
    "https://leynaatourism.blogspot.com/p/assalamualaikum-hello-guys-harinie-saya.html",
    # Adat Resam dan Budaya
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1154/pengenalan",
    # Tarian
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/715/pengenalan",
    "https://sites.google.com/view/beritadimensi/urban/tarian-tradisi-kaum-bajau-igal-igal",
    # Makanan
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1177/pengenalan",
    # Pakaian
    "https://tintanynaa.blogspot.com/2016/10/pakaian-tradisi-orang-sabah.html",
    # Alat Muzik
    "https://alunanmustika.blogspot.com/2010/08/alat-muzik-tradisi-sabah.html",

    # Bumiputera di Sarawak (General)
    "https://ms.wikipedia.org/wiki/Bumiputera_di_Sarawak",
    "https://vacazzee.wordpress.com/2019/07/02/tahukah-anda-bahawa-sarawak-mempunyai-jumlah-suku-kaum-yang-terbesar-ini-jawapannya/",
    # Permainan
    "https://www.rakansarawak.com/v3/2023/10/29/kenali-permainan-tradisional-masyarakat-sarawak/",
    # Pakaian
    "https://pakaiantradisionaldimalaysia.weebly.com/sarawak.html",
    "https://maruwiah.wordpress.com/tag/pakaian-tradisional-melanau/",
    "https://upimbtpnsarawak.blogspot.com/2010/02/pakaian-tradisional-etnik-di-sarawak.html",
    # Alat Muzik
    "https://upimbtpnsarawak.blogspot.com/2010/02/alat-muzik-tradisional-sarawak.html",

    # Kaum Iban (Sarawak)
    "https://ms.wikipedia.org/wiki/Iban",
    "https://kpgbudayasarawak.blogspot.com/2013/07/kaum-iban.html",
    # Adat Resam dan Budaya
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1050/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1053/pengenalan",
    # Bahasa
    "https://core.ac.uk/download/pdf/322457022.pdf",
    # Perayaan
    "https://kaumibandimalaysia.wordpress.com/2017/06/05/perayaan/",
    # Makanan
    "https://ibans502sivik.wordpress.com/2017/06/23/makanan-tradisional-masyarakat-iban/",
    "http://mengenalimakanantradisionaliban.blogspot.com/2017/08/mengenali-makanan-tradisional-iban.html",
    # Permainan
    "https://suarasarawak.my/batak-lampung-permainan-tradisional-iban/",
    # Alat Muzik
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/707/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1084/pengenalan",
    # Pakaian
    "https://tebilangendaberupai2012.wordpress.com/pakaian-tradisional-iban/",
    "https://ms.wikipedia.org/wiki/Ngepan_Iban",
    # Tarian
    "https://ms.wikipedia.org/wiki/Ngajat",
    "https://kpgbudayasarawak.blogspot.com/2013/07/tarian-kaum-iban.html",

    # Kaum Bidayuh (Sarawak)
    "https://ms.wikipedia.org/wiki/Suku_Bidayuh",
    "https://bidayuhiban.blogspot.com/2010/05/suku-kaum-bidayuh-sarawak.html",
    # Pakaian
    "https://5duppertour.blogspot.com/2014/02/pakaian-tradisional-kaum-bidayuh.html",
    # Adat Resam dan Perayaan
    "https://masyarakatsabahdansarawak.blogspot.com/2018/02/adat-resam-dan-budaya-kaum-bidayuh.html",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1049/pengenalan",
    "https://www.rakansarawak.com/v3/2023/02/16/pesta-bung-bratak-menyingkap-peristiwa-penting-sejarah-dan-warisan-masyarakat-bidayuh-jagoi/",
    # Tarian
    "https://museum.sarawak.gov.my/web/subpage/webpage_view/335",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1090/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1097/pengenalan",
    # Makanan
    "https://www.iloveborneo.my/ketahui-5-jenis-makanan-kaum-bidayuh-yang-sedap-dan-underrated-di-sarawak/",

    # Kaum Melanau (Sarawak)
    "https://ms.wikipedia.org/wiki/Melanau",
    "https://bmpengurusan2010.blogspot.com/2010/09/masyarakat-melanau.html",
    "https://masyarakatsabahdansarawak.blogspot.com/2018/02/adat-resam-dan-budaya-kaum-melanau-di.html",
    # Makanan
    "https://hillblog95.blogspot.com/p/makanan-melanau-sagu.html",
    # Perayaan dan Adat
    "https://dewanbudaya.jendeladbp.my/2023/05/18/5466/",
    # Tarian
    "https://hillblog95.blogspot.com/p/tarian-melanau.html",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/704/pengenalan",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/1085/pengenalan",
    # Bahasa
    "https://isswak.weebly.com/kaum-melanau.html",
    # Permainan
    "https://www.iloveborneo.my/pisa-tibow-permainan-masyarakat-melanau/",
    # Alat Muzik
    "https://coconote.app/notes/6b47903d-54c1-4957-9010-da54e81fc6c5",

    # Kaum Melayu
    "https://ms.wikipedia.org/wiki/Orang_Melayu_Malaysia",
    "https://ms.wikipedia.org/wiki/Orang_Melayu",
    "https://ms.wikipedia.org/wiki/Kaum_Melayu",
    "https://ms.wikipedia.org/wiki/Melayu",
    # Pakaian
    "https://www.omarali.com.my/sejarah-pakaian-tradisional-melayu/",
    "https://ms.wikipedia.org/wiki/Baju_tradisional_Melayu",
    "https://ms.wikipedia.org/wiki/Baju_Melayu",
    "https://ms.wikipedia.org/wiki/Baju_kebaya",
    "https://ms.wikipedia.org/wiki/Baju_Kurung",
    "https://ms.wikipedia.org/wiki/Sarung",
    "https://ms.wikipedia.org/wiki/Pelikat",
    "https://ecentral.my/kain-pelikat/",
    # Makanan
    "https://craftla.co/blog/makanan-tempatan-orang-melayu/",
    "https://cariblogger.com/makanan-tradisional-melayu/",
    "https://groupmakanan.blogspot.com/2017/03/makanan-tradisional-kaum-di-malaysia_12.html",
    "https://ms.wikipedia.org/wiki/Belacan",
    "https://ms.wikipedia.org/wiki/Kuih_koci",
    "https://ms.wikipedia.org/wiki/Keropok_lekor",
    "https://ms.wikipedia.org/wiki/Sate",
    "https://ms.wikipedia.org/wiki/Nasi_lemak",
    "https://ms.wikipedia.org/wiki/Nasi_kerabu",
    "https://ms.wikipedia.org/wiki/Nasi_dagang",
    "https://ms.wikipedia.org/wiki/Nasi_kandar",
    "https://ms.wikipedia.org/wiki/Nasi_Tupe",
    "https://ms.wikipedia.org/wiki/Nasi_campur",
    "https://ms.wikipedia.org/wiki/Nasi_ulam",
    "https://ms.wikipedia.org/wiki/Keropok_keping",
    "https://ms.wikipedia.org/wiki/Budu",
    "https://ms.wikipedia.org/wiki/Sata",
    "https://ms.wikipedia.org/wiki/Tempoyak",
    "https://ms.wikipedia.org/wiki/Otak-otak",
    "https://ms.wikipedia.org/wiki/Roti_Jala",
    "https://ms.wikipedia.org/wiki/Lontong",
    "https://ms.wikipedia.org/wiki/Pecal",
    "https://ms.wikipedia.org/wiki/Burasak",
    "https://ms.wikipedia.org/wiki/Ketupat",
    "https://ms.wikipedia.org/wiki/Lemang",
    "https://ms.wikipedia.org/wiki/Wajik",
    "https://ms.wikipedia.org/wiki/Dodol",
    # Permainan
    "https://infopelajar.com.my/senarai-permainan-tradisional/",
    "https://blissbies.com/my/blog/permainan-tradisional-melayu/",
    "https://ms.wikipedia.org/wiki/Congkak",
    "https://ms.wikipedia.org/wiki/Gasing",
    "https://ms.wikipedia.org/wiki/Wau_bulan",
    # Alat Muzik
    "https://janecheahblogger.blogspot.com/p/alat-alat-muzik-tradisional.html",
    "http://rafidah-maribelajar.blogspot.com/2011/12/alat-alat-muzik-masyarakat-melayu.html",
    "https://ms.wikipedia.org/wiki/Kompang",
    "https://ms.wikipedia.org/wiki/Gong",
    "https://ms.wikipedia.org/wiki/Rebana",
    "https://ms.wikipedia.org/wiki/Gambus",
    # Perayaan
    "http://sosiobudayadanagamamalaysia.blogspot.com/2018/02/perayaan-masyarakat-melayu.html",
    "https://ms.wikipedia.org/wiki/Hari_Raya_Aidilfitri",
    "https://ms.wikipedia.org/wiki/Hari_Raya_Haji",
    # Tarian
    "https://ecentral.my/tarian-tradisional-melayu/",
    "https://uniqueofmalaysia.home.blog/2018/11/23/tarian-tarian-tradisional-di-malaysia/",
    "http://tarianmalaysia29.blogspot.com/",
    "https://baskl.com.my/12-persembahan-kebudayaan-anda-perlu-tahu/",
    "https://ms.wikipedia.org/wiki/Keris",
    "https://ms.wikipedia.org/wiki/Tarian_Inai",
    "https://ms.wikipedia.org/wiki/Tarian_Ceracap_Inai",
    "https://ms.wikipedia.org/wiki/Tarian_Piring",
    # Kepercayaan
    "https://ms.wikipedia.org/wiki/Islam",
    "https://ms.wikipedia.org/wiki/Sejarah_Islam",
    "https://ms.wikipedia.org/wiki/Islam_di_Malaysia",
    # Bahasa
    "https://ms.wikipedia.org/wiki/Bahasa_Melayu",
    "https://ms.wikipedia.org/wiki/Bahasa_Melayu_Malaysia",

    # Kaum Cina
    # Pakaian
    "https://id.wikipedia.org/wiki/Congsam",
    "https://pakaiantradisionaldimalaysia.weebly.com/cina.html",
    "https://ms.wikipedia.org/wiki/Hanfu",
    # Suku Kaum
    "https://ms.wikipedia.org/wiki/Bahasa_Hokkien",
    "https://ms.wikipedia.org/wiki/Bahasa_Hakka",
    "https://ms.wikipedia.org/wiki/Bahasa_Kantonis",
    "https://ms.wikipedia.org/wiki/Bahasa_Hainan",
    "https://ms.wikipedia.org/wiki/Loghat_Fuzhou",
    # Makanan
    "https://dewanbudaya.jendeladbp.my/2024/05/05/8918/",
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/825/pengenalan",
    "https://ms.wikipedia.org/wiki/Lo_mai_gai",
    "https://id.wikipedia.org/wiki/Dimsum",
    # Permainan
    "https://ms.wikipedia.org/wiki/Xiangqi",
    "https://id.wikipedia.org/wiki/Mahyong",
    # Alat Muzik
    "https://id.wikipedia.org/wiki/Erhu",
    "https://malay.cri.cn/741/2008/12/29/121s93600.htm",
    "https://id.wikipedia.org/wiki/Pipa_(alat_musik)",
    # Perayaan
    "https://ms.wikipedia.org/wiki/Tahun_Baru_Cina",
    "https://ms.wikipedia.org/wiki/Cap_Goh_Mei",
    "https://ms.wikipedia.org/wiki/Perayaan_Kuih_Bulan",
    "https://ms.wikipedia.org/wiki/Pesta_Tanglung",
    "https://ms.wikipedia.org/wiki/Perayaan_Duan_Wu",
    "https://id.wikipedia.org/wiki/Titik_balik_matahari_musim_dingin",
    # Tarian
    "https://pemetaanbudaya.jkkn.gov.my/senibudaya/detail/708/pengenalan",
    "https://ms.wikipedia.org/wiki/Tarian_singa",
    "https://id.wikipedia.org/wiki/Tari_Payung",
    "https://ms.wikipedia.org/wiki/Tarian_naga",
    # Kepercayaan
    "https://bjn.wikipedia.org/wiki/Buddha",
    "https://ms.wikipedia.org/wiki/Taoisme",
    "https://ms.wikipedia.org/wiki/Kristian",
    # Bahasa
    "https://ms.wikipedia.org/wiki/Bahasa_Mandarin",
    "https://ms.wikipedia.org/wiki/Bahasa_Hokkien",
    "https://ms.wikipedia.org/wiki/Bahasa_Hakka",
    "https://ms.wikipedia.org/wiki/Bahasa_Kantonis",
    "https://ms.wikipedia.org/wiki/Bahasa_Teochew",

    # Kaum India
    "https://ms.wikipedia.org/wiki/Orang_India_Malaysia",
    # Pakaian
    "https://ms.wikipedia.org/wiki/Sari",
    "https://ms.wikipedia.org/wiki/Serban",
    "https://id.wikipedia.org/wiki/Kurta",
    "https://ms.wikipedia.org/wiki/Dhoti",
    "https://ms.wikipedia.org/wiki/Sarung",
    "https://www.scribd.com/doc/49427201/SEJARAH-BAJU-PUNJABI",
    # Makanan
    "https://ms.wikipedia.org/wiki/Tosai",
    "https://ms.wikipedia.org/wiki/Idli",
    "https://ms.wikipedia.org/wiki/Muruku",
    "https://www.indianhealthyrecipes.com/rava-kesari/",
    "https://ms.wikipedia.org/wiki/Apam",
    "https://id.wikipedia.org/wiki/Laddu",
    "https://ms.wikipedia.org/wiki/Putu_mayam",
    "https://id.wikipedia.org/wiki/Naan",
    "https://ms.wikipedia.org/wiki/Nasi_daun_pisang",
    "https://ms.wikipedia.org/wiki/Panipuri",
    "https://ms.wikipedia.org/wiki/Vadai",
    # Permainan
    "https://ms.wikipedia.org/wiki/Kho-kho",
    "https://ms.wikipedia.org/wiki/Chaturanga",
    "https://id.wikipedia.org/wiki/Kabaddi",
    # Alat Muzik
    "https://id.wikipedia.org/wiki/Vina",
    "https://ms.wikipedia.org/wiki/Sitar",
    "https://id.wikipedia.org/wiki/Damaru",
    "https://blogspotnai.blogspot.com/2016/09/jenisalat-muzik-idiofon-manjira.html",
    "https://ms.wikipedia.org/wiki/Tabla",
    "https://subjeksivik.wordpress.com/alat-muzik/tradisional-india/tempura/",
    "https://azharhusin.blogspot.com/2008/07/persembahan-klasik.html",
    # Perayaan
    "https://iba.wikipedia.org/wiki/Hari_Deepavali",
    "https://ms.wikipedia.org/wiki/Thaipusam",
    "https://ms.wikipedia.org/wiki/Hari_Thaiponggal",
    "https://ms.wikipedia.org/wiki/Onam",
    "https://ms.wikipedia.org/wiki/Hari_Durga_Puja",
    "https://ms.wikipedia.org/wiki/Parchu_Bhogi",
    "https://ms.wikipedia.org/wiki/Puthandu",
    # Tarian
    "https://id.wikipedia.org/wiki/Srimpi",
    "https://ms.wikipedia.org/wiki/Bharathanatayam",
    "https://www.hmetro.com.my/nuansa/2019/11/521098/rentak-unik-tarian-kolattam",
    "https://ms.wikipedia.org/wiki/Kathakali",
    # Kepercayaan
    "https://ms.wikipedia.org/wiki/Hinduisme",
    "https://ms.wikipedia.org/wiki/Kristian",
    # Bahasa
    "https://ms.wikipedia.org/wiki/Bahasa_Telugu",
    "https://ms.wikipedia.org/wiki/Bahasa_Tamil",
    "https://ms.wikipedia.org/wiki/Bahasa_Hindi",
    "https://ms.wikipedia.org/wiki/Bahasa_Malayalam"
]

# Initialize metadata file if new
if not os.path.exists(METADATA_FILE):
    with open(METADATA_FILE, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "URL", "Filename"])

# Scrape and save articles with numbered filenames
for idx, url in enumerate(urls, 1):
    print(f"[{idx}/{len(urls)}] Processing: {url}")
    soup, content = scrape_article(url)
    if content:
        title = extract_title(soup, url)
        save_to_txt_and_log(idx, title, url, content)

[1/232] Processing: https://cabaranmasadepan5s5.blogspot.com/2014/03/kekauman-tempatan.html
[2/232] Processing: https://ms.wikipedia.org/wiki/Kaum-kaum_di_Malaysia
✅ Saved: 2. Kaum.txt
[3/232] Processing: https://ms.wikipedia.org/wiki/Perkauman_di_Malaysia
✅ Saved: 3. Perkauman di Malaysia.txt
[4/232] Processing: https://ecentral.my/kaum-di-malaysia/
✅ Saved: 4. Senarai Etnik  Kaum Di Malaysia.txt
[5/232] Processing: https://sejkssrt6t2u4u5.wordpress.com/11-1-kaum-dan-etnik-di-malaysia/
✅ Saved: 5. 11.1 KAUM DAN ETNIK DI MALAYSIA  SEJARAH TAHUN 6.txt
[6/232] Processing: https://www.studocu.com/my/document/saito-university-college/ethics/pengenalan-pelbagai-kaum/33231064
[7/232] Processing: https://asamjawe.blogspot.com/2014/01/sejarah-suku-kaum-di-malaysia.html
[8/232] Processing: https://www.wikiwand.com/ms/articles/Kaum-kaum_di_Malaysia
✅ Saved: 8. Kaum.txt
[9/232] Processing: https://www.usim.edu.my/ms/berita/in-our-words-ms/kepelbagaian-kaum-dan-etnik-di-sarawak/
✅ Saved: 9. Kepelb