In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient
import time
import os
import zipfile
import xmltodict
import json
import xml.etree.ElementTree as ET

# === Konfigurasi Paths ===
BASE_DIR = "C:/College/Big Data/downloads"
os.makedirs(BASE_DIR, exist_ok=True)

# === Koneksi MongoDB ===
MONGO_URI = "mongodb+srv://kelompok-5:FwJP0h7Bo6cTpEol@big-data.do3of.mongodb.net/"
DB_NAME = "Big_Data_kel_5"
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db["Tugas_1"]

# === Konfigurasi Selenium WebDriver ===
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": BASE_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})
options.add_experimental_option("detach", True)  # Keep browser open
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# === URL IDX Laporan Keuangan ===
url = "https://www.idx.co.id/id/perusahaan-tercatat/laporan-keuangan-dan-tahunan"
company_codes = [
    "BBCA",  # PT Bank Central Asia Tbk
    "BBNI",  # PT Bank Negara Indonesia (Persero) Tbk
    "BBRI",  # PT Bank Rakyat Indonesia (Persero) Tbk
    "BMRI",  # PT Bank Mandiri (Persero) Tbk
    "ASII",  # PT Astra International Tbk
    "UNVR",  # PT Unilever Indonesia Tbk
    "ICBP",  # PT Indofood CBP Sukses Makmur Tbk
    "INDF",  # PT Indofood Sukses Makmur Tbk
    "KLBF",  # PT Kalbe Farma Tbk
    "ADRO",  # PT Adaro Energy Indonesia Tbk
    "ANTM",  # PT Aneka Tambang Tbk
    "ITMG",  # PT Indo Tambangraya Megah Tbk
    "PTBA",  # PT Bukit Asam Tbk
    "TPIA",  # PT Chandra Asri Petrochemical Tbk
    "SMGR",  # PT Semen Indonesia (Persero) Tbk
    "INKP",  # PT Indah Kiat Pulp & Paper Tbk
    "BRPT",  # PT Barito Pacific Tbk
    "UNTR",  # PT United Tractors Tbk
    "AMRT",  # PT Sumber Alfaria Trijaya Tbk
    "MPMX"   # PT Mitra Pinasthika Mustika Tbk
]

# === Fungsi Konversi XML ke Dictionary ===
def xml_to_dict(element):
    """Mengubah XML menjadi dictionary secara rekursif"""
    data = {}
    for child in element:
        tag = child.tag.split("}")[-1]  # Hapus namespace
        if len(child) > 0:
            data[tag] = xml_to_dict(child)
        else:
            data[tag] = child.text
    return data

# === Fungsi Parsing Taxonomy (.xsd) ===
def parse_taxonomy(xsd_path):
    """Parsing file Taxonomy (.xsd) untuk mendapatkan definisi elemen"""
    if not os.path.exists(xsd_path):
        print(f"❌ Taxonomy file {xsd_path} not found!")
        return {}

    try:
        tree = ET.parse(xsd_path)
        root = tree.getroot()
        taxonomy_dict = {}

        for elem in root.iter():
            tag = elem.tag.split("}")[-1]  # Hapus namespace
            if "name" in elem.attrib:
                taxonomy_dict[elem.attrib["name"]] = {
                    "type": elem.attrib.get("type", "unknown"),
                    "documentation": elem.attrib.get("documentation", "No description")
                }

        print(f"✅ Parsed taxonomy {xsd_path}")
        return taxonomy_dict

    except Exception as e:
        print(f"❌ Error parsing taxonomy: {e}")
        return {}

# === Proses Scraping, Download, Ekstraksi, Parsing, dan Insert ke MongoDB ===
for company in company_codes:
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Input kode perusahaan
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "input.vs__search")))
        search_box.clear()
        search_box.send_keys(company)
        time.sleep(1)

        # Pilih hasil pertama
        first_suggestion = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".vs__dropdown-option")))
        first_suggestion.click()

        # Pilih filter laporan keuangan, saham, tahun 2024, tahunan
        driver.find_element(By.ID, "FinancialStatement").click()
        driver.find_element(By.ID, "TypeSaham").click()
        driver.find_element(By.ID, "year1").click()
        driver.find_element(By.ID, "period3").click()
        driver.find_element(By.XPATH, "//button[contains(text(), 'Terapkan')]").click()

        # Tunggu tabel laporan muncul
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))

        # Cari dan download file "instance.zip"
        rows = driver.find_elements(By.TAG_NAME, "tr")
        for row in rows:
            if "instance.zip" in row.text:
                link_element = row.find_element(By.TAG_NAME, "a")
                driver.execute_script("arguments[0].scrollIntoView();", link_element)
                driver.execute_script("arguments[0].click();", link_element)
                print(f"✅ Download started for {company}")
                break

        # Tunggu beberapa detik agar download selesai
        time.sleep(5)

        # === Ekstraksi File ZIP ===
        zip_path = os.path.join(BASE_DIR, "instance.zip")
        extract_dir = os.path.join(BASE_DIR, company)
        os.makedirs(extract_dir, exist_ok=True)

        if os.path.exists(zip_path):
            try:
                with zipfile.ZipFile(zip_path, "r") as zip_ref:
                    zip_ref.extractall(extract_dir)
                print(f"✅ Extracted {company} instance.zip")
            except zipfile.BadZipFile:
                print(f"❌ Error: {company} instance.zip is not a valid ZIP file")
                continue  # Lewati perusahaan ini jika ZIP rusak
            finally:
                os.remove(zip_path)  # Hapus ZIP setelah ekstraksi

        # === Parsing Taxonomy ===
        xsd_path = os.path.join(extract_dir, "taxonomy.xsd")
        taxonomy_dict = parse_taxonomy(xsd_path)

        # === Konversi XBRL ke JSON & Insert ke MongoDB ===
        xbrl_path = os.path.join(extract_dir, "instance.xbrl")

        if os.path.exists(xbrl_path):
            try:
                tree = ET.parse(xbrl_path)
                root = tree.getroot()
                xbrl_dict = xml_to_dict(root)

                # Gabungkan XBRL dengan Taxonomy
                enriched_data = {
                    "company": company,
                    "taxonomy": taxonomy_dict,
                    "xbrl_data": xbrl_dict
                }

                # Insert ke MongoDB dengan upsert (update jika sudah ada)
                filter_query = {"company": company}
                update_query = {"$set": enriched_data}
                collection.update_one(filter_query, update_query, upsert=True)

                print(f"✅ {company} data inserted/updated in MongoDB!")

            except Exception as e:
                print(f"❌ Error processing {company}.xbrl: {e}")
        else:
            print(f"❌ instance.xbrl not found for {company}")

    except Exception as e:
        print(f"❌ Error processing {company}: {e}")

# === Selesai, Tutup Browser ===
driver.quit()
print(f"📂 Semua data berhasil di-download dan dimasukkan ke MongoDB!")


✅ Download started for BBCA
✅ Parsed taxonomy C:/College/Big Data/downloads\BBCA\taxonomy.xsd
✅ BBCA data inserted/updated in MongoDB!
✅ Download started for BBNI
✅ Parsed taxonomy C:/College/Big Data/downloads\BBNI\taxonomy.xsd
✅ BBNI data inserted/updated in MongoDB!
✅ Download started for BBRI
✅ Parsed taxonomy C:/College/Big Data/downloads\BBRI\taxonomy.xsd
✅ BBRI data inserted/updated in MongoDB!
✅ Download started for BMRI
❌ Taxonomy file C:/College/Big Data/downloads\BMRI\taxonomy.xsd not found!
❌ instance.xbrl not found for BMRI
✅ Download started for ASII
❌ Taxonomy file C:/College/Big Data/downloads\ASII\taxonomy.xsd not found!
❌ instance.xbrl not found for ASII
✅ Download started for UNVR
❌ Taxonomy file C:/College/Big Data/downloads\UNVR\taxonomy.xsd not found!
❌ instance.xbrl not found for UNVR
❌ Error processing ICBP: Message: 
Stacktrace:
	GetHandleVerifier [0x008D0B43+25139]
	(No symbol) [0x008613F4]
	(No symbol) [0x007404E3]
	(No symbol) [0x007883D7]
	(No symbol) [0x00