In [1]:
# Import libraries and set up the output folder
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Folder (relative to the current working directory) that receives the CSV
# files written by the cells below.
OUTPUT_DIR = "hasil_scraping_snbp"
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# Starting URLs of the SNBP PTN listing, keyed by category name.
# All three entries point at the same page; "vokasi" and "kin" differ only in
# the value of the `ptn` query parameter.
_PTN_LIST_URL = "https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php"
BASE_URLS = {
    "akademik": _PTN_LIST_URL,
    "vokasi": _PTN_LIST_URL + "?ptn=-2",
    "kin": _PTN_LIST_URL + "?ptn=-3",
}

In [36]:
# Run the scraping of the PTN list and save the results
options = webdriver.ChromeOptions()
# Headless mode is currently disabled; the line is kept so it can be re-enabled.
# options.add_argument("--headless")
# `driver` and `wait` are module-level objects that scrape_ptn_page uses directly.
driver = webdriver.Chrome(options=options)
# Explicit waits made through `wait` time out after 12 seconds.
wait = WebDriverWait(driver, 12)

# Scrape the PTN list of one category with Selenium
def scrape_ptn_page(url, kategori):
    """Scrape the PTN table at `url` and save it as ``ptn_<kategori>.csv``.

    Relies on the module-level `driver` and `wait` objects, so the browser
    session must still be open when this function is called.

    Args:
        url: Address of the PTN listing page to load.
        kategori: Category label. Stored in the "Kategori" column and used in
            the name of the CSV file.

    Returns:
        pandas.DataFrame with one record per table row that has at least six
        cells. Columns: Kategori, Kode PTN, Nama PTN, Kota, Provinsi1,
        Provinsi2, Link Prodi. The same frame is written to
        ``OUTPUT_DIR/ptn_<kategori>.csv`` (UTF-8 with BOM, no index).

    Raises:
        Propagates whatever `wait.until` raises when no table row shows up
        within the wait timeout.

    Notes:
        "Nama PTN" is the stripped text of the third cell as rendered; in the
        saved sample output it contains a second line with the admissions URL.
    """
    row_selector = "table.table tbody tr"

    driver.get(url)
    # Block until at least one table row is present in the DOM.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, row_selector)))

    data = []
    for r in driver.find_elements(By.CSS_SELECTOR, row_selector):
        cols = r.find_elements(By.TAG_NAME, "td")
        if len(cols) < 6:
            # Rows with fewer than six cells are not PTN records; skip them.
            continue

        # Prefer the second cell for the code; fall back to the first when empty.
        kode = cols[1].text.strip() or cols[0].text.strip()

        # Look for the prodi link in the name cell first, then in the code cell.
        # If neither holds an <a>, keep the listing URL itself as the link.
        link = url
        for cell in (cols[2], cols[1]):
            try:
                link = cell.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception:
                # `Exception` (not a bare except) so Ctrl+C / SystemExit still
                # interrupt a long scrape instead of being silently swallowed.
                continue
            break

        data.append({
            "Kategori": kategori,
            "Kode PTN": kode,
            "Nama PTN": cols[2].text.strip(),
            "Kota": cols[3].text.strip(),
            "Provinsi1": cols[4].text.strip(),
            "Provinsi2": cols[5].text.strip(),
            "Link Prodi": link
        })

    df = pd.DataFrame(data)
    out = os.path.join(OUTPUT_DIR, f"ptn_{kategori}.csv")
    df.to_csv(out, index=False, encoding="utf-8-sig")
    return df

In [37]:
# Scrape every category listed in BASE_URLS, then merge the per-category frames.
dfs_ptn = {}
try:
    for k, u in BASE_URLS.items():
        print("PTN:", k)
        dfs_ptn[k] = scrape_ptn_page(u, k)
        time.sleep(1)  # short pause between page loads
finally:
    # Close the browser even when a page fails to load or parse, so an error
    # in the loop does not leave a Chrome process running.
    driver.quit()

# One combined table of all categories, saved next to the per-category files.
df_ptn_all = pd.concat(list(dfs_ptn.values()), ignore_index=True)
df_ptn_all.to_csv(os.path.join(OUTPUT_DIR, "ptn_all.csv"), index=False, encoding="utf-8-sig")
print("Data PTN digabung disimpan ke ptn_all.csv")

PTN: akademik
PTN: vokasi
PTN: kin
Data PTN digabung disimpan ke ptn_all.csv


In [42]:
# Read the ptn_all file for use in the next stage
# NOTE(review): read_csv infers column dtypes, so numeric-looking codes are
# loaded as numbers - confirm that no PTN code relies on leading zeros.
df_ptn = pd.read_csv(os.path.join(OUTPUT_DIR, "ptn_all.csv"))
print(f"Total PTN: {len(df_ptn)}")
# Last expression of the cell: shows the first rows as a preview.
df_ptn.head()

Total PTN: 146


Unnamed: 0,Kategori,Kode PTN,Nama PTN,Kota,Provinsi1,Provinsi2,Link Prodi
0,akademik,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),Kota Banda Aceh,Aceh,,https://sidatagrun-public-1076756628210.asia-s...
1,akademik,1112,UNIVERSITAS MALIKUSSALEH\n(http://pmb.unimal.a...,Kabupaten Aceh Utara,Aceh,,https://sidatagrun-public-1076756628210.asia-s...
2,akademik,1113,UNIVERSITAS TEUKU UMAR\n(https://pmb.utu.ac.id/),Kabupaten Aceh Barat,Aceh,,https://sidatagrun-public-1076756628210.asia-s...
3,akademik,1114,UNIVERSITAS SAMUDRA\n(https://camaba.unsam.ac....,Kota Langsa,Aceh,,https://sidatagrun-public-1076756628210.asia-s...
4,akademik,1115,ISBI ACEH\n(http://pmb.isbiaceh.ac.id/),Kabupaten Aceh Besar,Aceh,,https://sidatagrun-public-1076756628210.asia-s...


In [43]:
# Fetch a URL and return its parsed HTML (requests + BeautifulSoup)
def get_soup(url, timeout=10):
    """Download `url` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to `requests.get` as its timeout. Without a
            timeout a stalled server would block the scraping loop forever.
            The default matches the value used by `scrape_sebaran_data`.

    Returns:
        BeautifulSoup object built from the response text with the
        "html.parser" backend.

    Raises:
        Propagates the errors of `requests.get` (connection problems,
        timeout) and of `raise_for_status()` (HTTP error status codes).
    """
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    res.raise_for_status()
    return BeautifulSoup(res.text, "html.parser")

In [65]:
# Scrape the study programs (prodi) of one PTN (without Selenium)
def scrape_prodi_page(link_prodi, kode_ptn, nama_ptn):
    """Collect the study-program rows listed on one PTN page.

    The page is fetched with `get_soup`. Every row matched by
    ``table.table tbody tr`` that has at least six cells becomes one record.

    Args:
        link_prodi: URL of the PTN's prodi listing page.
        kode_ptn: PTN code, copied unchanged into every record.
        nama_ptn: PTN name, copied unchanged into every record.

    Returns:
        List of dicts with the keys Kode PTN, Nama PTN, Kode Prodi,
        Nama Prodi, Jenjang, "Daya Tampung 2025", "Peminat 2024",
        Jenis Portofolio and Link Detail. "Jenis Portofolio" is "Tidak Ada"
        when the row has no seventh cell. "Link Detail" is "" when the
        program-name cell holds no link.

    Raises:
        Propagates whatever `get_soup` raises for this URL.
    """
    detail_base = "https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?"

    soup = get_soup(link_prodi)
    prodi_data = []
    for r in soup.select("table.table tbody tr"):
        cols = r.find_all("td")
        if len(cols) < 6:
            # Not a prodi record (fewer than six cells); skip it.
            continue

        # Explicit None checks instead of a bare `except:` so that only the
        # expected "no link in this cell" case yields an empty link, while
        # unrelated errors (and Ctrl+C) are no longer hidden.
        anchor = cols[2].find("a")
        href = anchor.get("href") if anchor is not None else None
        if href is not None:
            # The href's leading "?" is dropped because detail_base already ends in one.
            full_link = detail_base + href.lstrip("?")
        else:
            full_link = ""

        prodi_data.append({
            "Kode PTN": kode_ptn,
            "Nama PTN": nama_ptn,
            "Kode Prodi": cols[1].text.strip(),
            "Nama Prodi": cols[2].text.strip(),
            "Jenjang": cols[3].text.strip(),
            "Daya Tampung 2025": cols[4].text.strip(),
            "Peminat 2024": cols[5].text.strip(),
            "Jenis Portofolio": cols[6].text.strip() if len(cols) > 6 else "Tidak Ada",
            "Link Detail": full_link
        })
    return prodi_data

In [66]:
# Trial run: scrape only the first 3 PTN to inspect the structure of the result
prodi_all = []
for i, row in df_ptn.iloc[:3].iterrows():
    print(f"[{i+1}] {row['Nama PTN']}")
    data = scrape_prodi_page(
        link_prodi=row["Link Prodi"],
        kode_ptn=row["Kode PTN"],
        nama_ptn=row["Nama PTN"],
    )
    prodi_all += data
    # One-second pause between PTN pages.
    time.sleep(1)

df_prodi_test = pd.DataFrame(prodi_all)
print(f"Jumlah data uji coba: {len(df_prodi_test)}")
df_prodi_test.head()

[1] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/)
[2] UNIVERSITAS MALIKUSSALEH
(http://pmb.unimal.ac.id/)
[3] UNIVERSITAS TEUKU UMAR
(https://pmb.utu.ac.id/)
Jumlah data uji coba: 134


Unnamed: 0,Kode PTN,Nama PTN,Kode Prodi,Nama Prodi,Jenjang,Daya Tampung 2025,Peminat 2024,Jenis Portofolio,Link Detail
0,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,Sarjana,72,350,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111001&jenis=0
1,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111002,TEKNIK SIPIL,Sarjana,84,525,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111002&jenis=0
2,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111003,TEKNIK MESIN,Sarjana,36,259,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111003&jenis=0
3,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111004,TEKNIK KIMIA,Sarjana,54,207,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111004&jenis=0
4,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111005,ARSITEKTUR,Sarjana,48,272,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111005&jenis=0


In [None]:
# Full run: scrape the prodi list of every PTN and save the combined table
prodi_all = []
for i, row in df_ptn.iterrows():
    print(f"[{i+1}/{len(df_ptn)}] {row['Nama PTN']}")
    try:
        data = scrape_prodi_page(
            link_prodi=row["Link Prodi"],
            kode_ptn=row["Kode PTN"],
            nama_ptn=row["Nama PTN"],
        )
        prodi_all += data
    except Exception as e:
        # A failing PTN is reported and skipped so the remaining ones still run.
        print(f"Error {row['Nama PTN']}: {e}")
    # One-second pause between PTN pages, whether or not the scrape succeeded.
    time.sleep(1)

df_prodi_all = pd.DataFrame(prodi_all)
output_path = os.path.join(OUTPUT_DIR, "prodi_all.csv")
df_prodi_all.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Total prodi: {len(df_prodi_all)}")
print(f"File disimpan di: {output_path}")

[1/146] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/)
[2/146] UNIVERSITAS MALIKUSSALEH
(http://pmb.unimal.ac.id/)
[3/146] UNIVERSITAS TEUKU UMAR
(https://pmb.utu.ac.id/)
[4/146] UNIVERSITAS SAMUDRA
(https://camaba.unsam.ac.id/)
[5/146] ISBI ACEH
(http://pmb.isbiaceh.ac.id/)
[6/146] UNIVERSITAS SUMATERA UTARA
(https://www.usu.ac.id/id/penerimaan)
[7/146] UNIVERSITAS NEGERI MEDAN
(https://www.unimed.ac.id/pmb/)
[8/146] UNIVERSITAS RIAU
(https://registrasiulang.unri.ac.id/ )
[9/146] UNIVERSITAS MARITIM RAJA ALI HAJI
(https://penerimaan.umrah.ac.id/)
[10/146] UNIVERSITAS ANDALAS
(https://pmb.unand.ac.id/)
[11/146] UNIVERSITAS NEGERI PADANG
(https://spmb.unp.ac.id)
[12/146] ISI PADANG PANJANG
(https://www.isi-padangpanjang.ac.id/)
[13/146] UNIVERSITAS JAMBI
(https://regis.unja.ac.id/)
[14/146] UNIVERSITAS BENGKULU
(https://regmaba.unib.ac.id)
[15/146] UNIVERSITAS SRIWIJAYA
(https://pmb.unsri.ac.id/)
[16/146] UNIVERSITAS BANGKA BELITUNG
(https://pmb.ubb.ac.id/hasilseleksi/snbt)
[17/146] U

In [68]:
# Preview the first rows of the combined prodi table.
df_prodi_all.head()

Unnamed: 0,Kode PTN,Nama PTN,Kode Prodi,Nama Prodi,Jenjang,Daya Tampung 2025,Peminat 2024,Jenis Portofolio,Link Detail
0,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,Sarjana,72,350,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111001&jenis=0
1,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111002,TEKNIK SIPIL,Sarjana,84,525,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111002&jenis=0
2,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111003,TEKNIK MESIN,Sarjana,36,259,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111003&jenis=0
3,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111004,TEKNIK KIMIA,Sarjana,54,207,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111004&jenis=0
4,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111005,ARSITEKTUR,Sarjana,48,272,Tidak Ada,https://sidatagrun-public-1076756628210.asia-southeast2.run.app/ptn_sn.php?ptn=111&prodi=111005&jenis=0


In [27]:
def scrape_sebaran_data(link_detail, kode_ptn, nama_ptn, kode_prodi, nama_prodi, jenis_portofolio):
    """Scrape the per-year "SEBARAN DATA" table of one study program.

    The detail page is fetched with a 10-second timeout. Inside the first
    ``div`` with class "panel panel-info" whose text contains "SEBARAN DATA"
    (case-insensitive), the table header supplies the year columns (header
    cells that are all digits). The first two body rows supply the values.
    The first cell of each of those rows is skipped as a row label.

    Args:
        link_detail: URL of the prodi detail page.
        kode_ptn: PTN code, copied unchanged into every record.
        nama_ptn: PTN name; stripped before it is stored.
        kode_prodi: Prodi code, copied unchanged into every record.
        nama_prodi: Prodi name; stripped before it is stored.
        jenis_portofolio: Portfolio type, copied unchanged into every record.

    Returns:
        List of dicts, one per year, with the keys Kode PTN, Nama PTN,
        Kode Prodi, Nama Prodi, Tahun, Peminat, Daya Tampung and
        Jenis Portofolio. "Peminat" comes from the first body row and
        "Daya Tampung" from the second. Both are kept as raw cell text (the
        saved sample output shows values such as "55(13.32%)").
        Returns [] when the request fails (the error is printed), or when the
        panel, table, header, body or the two data rows are missing.
    """
    try:
        resp = requests.get(link_detail, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        # Best effort: report the failing URL and let the caller continue.
        print(f"Error akses {link_detail}: {e}")
        return []

    soup = BeautifulSoup(resp.text, "html.parser")

    # Find the panel that contains the "SEBARAN DATA" table
    panel = next(
        (p for p in soup.find_all("div", class_="panel panel-info")
         if "SEBARAN DATA" in p.get_text(strip=True).upper()),
        None
    )
    if panel is None:
        return []

    table = panel.find("table")
    if table is None:
        return []

    # A table without <thead> or <tbody> is treated like a missing table
    # instead of failing with AttributeError on None.
    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        return []

    # Year columns: header cells whose text is purely digits (e.g. 2020–2024)
    years = []
    for th in thead.find_all("th"):
        label = th.get_text(strip=True)
        if label.isdigit():
            years.append(label)

    rows = tbody.find_all("tr")
    if len(rows) < 2:
        return []

    # Drop the leading label cell of each row so the cells line up with `years`.
    peminat_cells = rows[0].find_all("td")[1:]
    tampung_cells = rows[1].find_all("td")[1:]

    # zip() stops at the shortest sequence, so years without a matching cell
    # in either row are left out.
    data_tahun = []
    for tahun, peminat, tampung in zip(years, peminat_cells, tampung_cells):
        data_tahun.append({
            "Kode PTN": kode_ptn,
            "Nama PTN": nama_ptn.strip(),
            "Kode Prodi": kode_prodi,
            "Nama Prodi": nama_prodi.strip(),
            "Tahun": tahun,
            "Peminat": peminat.get_text(strip=True),
            "Daya Tampung": tampung.get_text(strip=True),
            "Jenis Portofolio": jenis_portofolio
        })

    return data_tahun

In [37]:
# Trial run: reload the saved prodi table and scrape the detail page of its
# first 5 rows only.
df_prodi_all = pd.read_csv(os.path.join(OUTPUT_DIR, "prodi_all.csv"))
df_prodi_sample = df_prodi_all.iloc[:5]

all_detail = []

for i, row in df_prodi_sample.iterrows():
    print(f"[{i+1}/{len(df_prodi_sample)}] {row['Nama PTN']} - {row['Nama Prodi']}")
    data = scrape_sebaran_data(
        link_detail=row["Link Detail"],
        kode_ptn=row["Kode PTN"],
        nama_ptn=row["Nama PTN"],
        kode_prodi=row["Kode Prodi"],
        nama_prodi=row["Nama Prodi"],
        jenis_portofolio=row["Jenis Portofolio"],
    )
    all_detail += data
    # No pause between requests in this small sample run.

df_sebaran_test = pd.DataFrame(all_detail)
print(f"Jumlah data uji: {len(df_sebaran_test)}")

[1/5] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - PENDIDIKAN DOKTER HEWAN
[2/5] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK SIPIL
[3/5] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK MESIN
[4/5] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK KIMIA
[5/5] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - ARSITEKTUR
Jumlah data uji: 25


In [38]:
# Preview the first rows of the sample "sebaran" table (one row per prodi and year).
df_sebaran_test.head()

Unnamed: 0,Kode PTN,Nama PTN,Kode Prodi,Nama Prodi,Tahun,Peminat,Daya Tampung,Jenis Portofolio
0,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,2020,413,55(13.32%),Tidak Ada
1,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,2021,391,55(14.07%),Tidak Ada
2,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,2022,389,55(14.14%),Tidak Ada
3,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,2023,401,50(12.47%),Tidak Ada
4,1111,UNIVERSITAS SYIAH KUALA\n(https://pmb.usk.ac.id/),11111001,PENDIDIKAN DOKTER HEWAN,2024,350,72(20.57%),Tidak Ada


In [41]:
# Full run: scrape the per-year table of every prodi and save the result
all_detail = []

for i, row in df_prodi_all.iterrows():
    print(f"[{i+1}/{len(df_prodi_all)}] {row['Nama PTN']} - {row['Nama Prodi']}")
    try:
        data = scrape_sebaran_data(
            link_detail=row["Link Detail"],
            kode_ptn=row["Kode PTN"],
            nama_ptn=row["Nama PTN"],
            kode_prodi=row["Kode Prodi"],
            nama_prodi=row["Nama Prodi"],
            jenis_portofolio=row["Jenis Portofolio"],
        )
        all_detail += data
    except Exception as e:
        # A failing prodi is reported and skipped so the rest of the run continues.
        print(f"Error {row['Nama Prodi']}: {e}")
    # No pause between requests in this loop.

df_sebaran = pd.DataFrame(all_detail)
output_path = os.path.join(OUTPUT_DIR, "sebaran_all.csv")
df_sebaran.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"Total baris: {len(df_sebaran)}")
print(f"File disimpan di: {output_path}")

[1/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - PENDIDIKAN DOKTER HEWAN
[2/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK SIPIL
[3/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK MESIN
[4/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK KIMIA
[5/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - ARSITEKTUR
[6/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK ELEKTRO
[7/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - AGROTEKNOLOGI
[8/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - AGRIBISNIS
[9/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - PETERNAKAN
[10/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNOLOGI HASIL PERTANIAN
[11/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - TEKNIK PERTANIAN
[12/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - PENDIDIKAN BIOLOGI
[13/4967] UNIVERSITAS SYIAH KUALA
(https://pmb.usk.ac.id/) - PENDIDIKAN MATEMATIKA
[14/496