In [1]:
import json
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space.

    Newlines, carriage returns, tabs and repeated spaces are all treated
    the same way, so text scraped from indented HTML comes back as a single
    tidy line.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string with leading and trailing whitespace removed.
    """
    # ``\s+`` (rather than only ``[\n\r]+``) also folds the spaces that
    # surround a line break, which would otherwise leave double spaces.
    return re.sub(r'\s+', ' ', text).strip()

In [7]:
def _list_items_to_records(list_items, text_from_link=False):
    """Turn ``<li>`` tags into ``{'text': ..., 'link': ...}`` records.

    Args:
        list_items: Iterable of BeautifulSoup ``<li>`` tags.
        text_from_link: When true and the item contains an ``<a>``, take the
            text from the anchor only; otherwise use the whole item's text.

    Returns:
        A list of dicts. ``link`` is the anchor's raw ``href`` value, or
        ``None`` when the item has no anchor.
    """
    records = []
    for item in list_items:
        link = item.find('a')
        text_source = link if (link and text_from_link) else item
        records.append({
            'text': clean_text(text_source.text.strip()),
            'link': link.get('href') if link else None,
        })
    return records


def extract_website_info(session, url, timeout=30):
    """Fetch one regulation detail page and extract its metadata.

    Args:
        session: Object exposing ``get(url, timeout=...)`` — in this file a
            ``requests.Session``.
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without it a stalled connection blocks forever.

    Returns:
        A dict with the page title (``judul``), one entry per metadata table
        row, and the ``mengubah`` / ``dasar_hukum`` lists of
        ``{'text', 'link'}`` records. Missing scalar fields are ``'-'`` and
        missing lists are empty. Returns ``None`` when the HTTP request
        fails.
    """
    base_url = 'https://peraturan.go.id'

    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()  # Treat 4xx/5xx as a failed fetch
        html = response.content
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Page title; '-' is the placeholder used for every missing field.
    h1 = soup.find('h1')
    judul = clean_text(h1.text.strip()) if h1 else '-'

    def get_table_data(header_name):
        """Return the value of the table row whose ``<th>`` is ``header_name``."""
        header = soup.find('th', string=header_name)
        td = header.find_next_sibling('td') if header else None
        if td is None:
            return '-'
        link = td.find('a')
        href = link.get('href') if link else None
        if href:
            # urljoin copes with relative and already-absolute hrefs alike;
            # an anchor without an href falls through to the cell text.
            return urljoin(base_url, href)
        return clean_text(td.text.strip())

    # Output key -> header text as it appears in the page's metadata table.
    # Insertion order defines the key order of the returned dict.
    table_fields = {
        'jenis_bentuk_peraturan': 'Jenis/Bentuk Peraturan',
        'pemrakarsa': 'Pemrakarsa',
        'nomor': 'Nomor',
        'tahun': 'Tahun',
        'tentang': 'Tentang',
        'tempat_penetapan': 'Tempat Penetapan',
        'tanggal_penetapan': 'Ditetapkan Tanggal',
        'pejabat_menetapkan': 'Pejabat yang Menetapkan',
        'status': 'Status',
        'dokumen_link': 'Dokumen Peraturan',
        'tahun_pengundangan': 'Tahun Pengundangan',
        'no_pengundangan': 'Nomor Pengundangan',
        'no_tambahan': 'Nomor Tambahan',
        'tanggal_pengundangan': 'Tanggal Pengundangan',
        'pejabat_pengundangan': 'Pejabat Pengundangan',
    }

    card_bodies = soup.find_all('div', class_='card-body')

    # "Mengubah" entries: every <li> inside any card body mentioning the label.
    mengubah = []
    for section in card_bodies:
        if 'Mengubah :' in section.text:
            mengubah.extend(
                _list_items_to_records(section.find_all('li'), text_from_link=True)
            )

    # "Dasar hukum" entries: first <ul> of the third card body (index 2).
    # The guard must be ``> 2``; ``> 1`` let a two-card page raise IndexError.
    # NOTE(review): the position is assumed from the site's layout — confirm.
    dasar_hukum = []
    if len(card_bodies) > 2:
        ul_tags = card_bodies[2].find_all('ul')
        if ul_tags:
            dasar_hukum = _list_items_to_records(ul_tags[0].find_all('li'))

    data = {'judul': judul}
    data.update(
        (key, get_table_data(header)) for key, header in table_fields.items()
    )
    data['mengubah'] = mengubah
    data['dasar_hukum'] = dasar_hukum
    return data

In [8]:
def send_to_api(data, api_url='http://localhost:3000/api/peraturan', timeout=30):
    """POST one scraped record to the API as JSON and report the outcome.

    Args:
        data: JSON-serialisable payload to send.
        api_url: Endpoint that receives the record. Defaults to the local
            development server.
        timeout: Seconds to wait for the API before the request is
            abandoned, so an unresponsive server cannot stall the crawl.

    Returns:
        None. Success or failure is reported on stdout; request errors are
        caught and printed rather than raised.
    """
    try:
        response = requests.post(api_url, json=data, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error sending data to API: {e}")
        return

    # The POST succeeded at this point; a body that is not JSON should not
    # be reported as a send failure, so fall back to the raw text.
    try:
        body = response.json()
    except ValueError:
        body = response.text
    print(f"Successfully sent data to API: {body}")

In [9]:
def extract_and_send(category_name, file_path="Link Peraturan Indonesia.json", delay=10):
    """Scrape every URL listed for one category and post each result to the API.

    The input file is expected to be a JSON list of objects, each with a
    ``category`` name and a ``wrapper_URL`` list of page addresses.

    Args:
        category_name: Value of ``category`` whose URLs should be processed.
        file_path: Path of the JSON file holding the per-category URL lists.
        delay: Seconds to pause after each URL, to avoid hammering the site.

    Returns:
        None. Progress and errors are printed; an unreadable or invalid
        input file is reported and the function returns early.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            url_data = json.load(file)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: invalid JSON or
        # undecodable bytes (JSONDecodeError and UnicodeDecodeError).
        print(f"Error reading JSON file: {e}")
        return

    category_found = False
    with requests.Session() as session:
        for category in url_data:
            if category.get('category') != category_name:
                continue
            category_found = True
            for url in category.get('wrapper_URL', []):
                extracted_data = extract_website_info(session, url)
                if extracted_data:
                    send_to_api(extracted_data)
                else:
                    print(f"Failed to extract data from {url}")
                time.sleep(delay)

    if not category_found:
        # Make a misspelled category visible instead of a silent no-op.
        print(f"No category named {category_name!r} found in {file_path}")

    print("Data extraction and sending completed")

In [10]:
# Scrape and upload every page listed under the "PERPPU" category.
if __name__ == "__main__":
    category_to_extract = "PERPPU"
    extract_and_send(category_name=category_to_extract)

Successfully sent data to API: {'judul': 'Peraturan Pemerintah Pengganti Undang-undang Nomor 1 Tahun 2022 Tentang Perubahan Atas Undang-undang Nomor 7 Tahun 2017 Tentang pemilihan Umum', 'jenis_bentuk_peraturan': 'PERATURAN PEMERINTAH PENGGANTI UNDANG-UNDANG', 'pemrakarsa': 'PEMERINTAH PUSAT', 'nomor': '1', 'tahun': 2022, 'tentang': 'PERUBAHAN ATAS UNDANG-UNDANG NOMOR 7 TAHUN 2017 TENTANG PEMILIHAN UMUM', 'tempat_penetapan': 'Jakarta', 'tanggal_penetapan': '12 Desember 2022', 'status': 'Berlaku', 'dokumen_link': 'https://peraturan.go.id/files/Salinan+Perpu+Nomor+1+tahun+2022.pdf', 'tahun_pengundangan': 2022, 'no_pengundangan': 224, 'no_tambahan': 6832, 'tanggal_pengundangan': '12 Desember 2022', 'pejabat_pengundangan': '', 'mengubah': [{'text': 'Undang-Undang Nomor 7 Tahun 2017', 'link': '/id/uu-no-7-tahun-2017', '_id': '6692e9186a72990078966f6f'}], 'dasar_hukum': [{'text': 'Tentang Undang-undang Dasar Negara Republik Indonesia Tahun 1945', 'link': '/id/uud-1945', '_id': '6692e9186a729

KeyboardInterrupt: 

In [None]:
# Scrape and upload every page listed under the "Peraturan Badan" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan Badan"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Peraturan Daerah" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan Daerah"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Peraturan Menteri" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan Menteri"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Peraturan Pemerintah" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan Pemerintah"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Peraturan Presiden" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan Presiden"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Peraturan lainnya" category.
if __name__ == "__main__":
    category_to_extract = "Peraturan lainnya"
    extract_and_send(category_name=category_to_extract)

In [None]:
# Scrape and upload every page listed under the "Undang undang" category.
if __name__ == "__main__":
    category_to_extract = "Undang undang"
    extract_and_send(category_name=category_to_extract)