In [4]:
import mysql.connector

database = mysql.connector.connect(
    host="localhost",
    user="root",
    password="",
)
cursor = database.cursor()
cursor.execute("USE nlp_thesis_similarity")

In [3]:
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
print("Tables in the database:")
for table in tables:
    print(table[0])

Tables in the database:
contributors
creators
dewey_papers
faculties
faculty_program
paper_contributors
paper_creators
paper_subjects
programs
subjects


In [4]:
cursor.execute("SELECT * FROM programs")
programs_data = cursor.fetchall()

# Get column names from cursor description
column_names = [column[0] for column in cursor.description]

# Print header
print("Programs table:")
print(column_names)
print("-" * 80)

# Print all rows
for row in programs_data:
    print(row)

Programs table:
['id', 'name', 'url']
--------------------------------------------------------------------------------
(1, 'Civil Engineering', None)
(2, 'Architecture', None)
(3, 'Architecture of Sustainable Housing and Real Estate', None)
(4, 'Master’s Program in Civil Engineering', None)
(5, 'Master’s Program in Architecture', None)
(6, 'Doctoral Program in Civil Engineering', None)
(7, 'Electrical Engineering', 'https://petra.ac.id/faculty/fti')
(8, 'Internet of Things', 'https://petra.ac.id/program/internet-of-things')
(9, 'Sustainable Mechanical Engineering and Design', 'https://petra.ac.id/program/sustainable-mechanical-engineering-and-design')
(10, 'Automotive', 'https://petra.ac.id/program/automotive')
(11, 'Industrial Engineering', 'https://petra.ac.id/program/industrial-engineering')
(12, 'Global Logistics and Supply Chain', 'https://petra.ac.id/program/global-logistics-and-supply-chain')
(13, 'International Business Engineering', 'https://petra.ac.id/program/international-b

In [8]:
# Check the faculties table structure
cursor.execute("SELECT * FROM faculties")
faculties_data = cursor.fetchall()

# Get column names from cursor description
column_names = [column[0] for column in cursor.description]

# Print header
print("Faculties table:")
print(column_names)
print("-" * 80)

# Print all rows
for row in faculties_data:
    print(row)

Faculties table:
['id', 'name', 'url']
--------------------------------------------------------------------------------
(17, 'Faculty of Civil Engineering and Planning', 'https://petra.ac.id/faculty/ftsp')
(18, 'Faculty of Industrial Technology', 'https://petra.ac.id/faculty/fti')
(19, 'School of Business and Management', 'https://petra.ac.id/faculty/sbm')
(20, 'Faculty of Teacher Education', 'https://petra.ac.id/faculty/fkip')
(21, 'Faculty of Humanities and Creative Industries', 'https://petra.ac.id/faculty/fhik')
(22, 'Fakultas Kedokteran', 'https://petra.ac.id/faculty/fk')
(23, 'Petra Business School', 'https://petra.ac.id/faculty/pbs')
(24, 'Fakultas Kedokteran Gigi', 'https://petra.ac.id/faculty/fkg')


In [9]:
# Check if faculty_program table exists
cursor.execute("SHOW TABLES LIKE 'faculty_program'")
if cursor.fetchone():
    cursor.execute("SELECT * FROM faculty_program LIMIT 10")
    fp_data = cursor.fetchall()
    
    # Get column names from cursor description
    column_names = [column[0] for column in cursor.description]
    
    # Print header
    print("Faculty_program table:")
    print(column_names)
    print("-" * 80)
    
    # Print rows
    for row in fp_data:
        print(row)
else:
    print("faculty_program table doesn't exist yet")

Faculty_program table:
['id', 'faculty_id', 'program_id']
--------------------------------------------------------------------------------
(1, 17, 1)
(2, 17, 2)
(3, 17, 3)
(4, 17, 4)
(5, 17, 5)
(6, 17, 6)
(7, 18, 7)
(8, 18, 8)
(9, 18, 9)
(10, 18, 10)


In [3]:
# Import necessary libraries for web scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

In [5]:
def scrape_program_urls_bener(faculty_id, faculty_name, faculty_url):
    """
    Scrape program URLs from a faculty page and update the database accordingly.
    
    Args:
        faculty_id (int): The ID of the faculty in the database
        faculty_name (str): The name of the faculty
        faculty_url (str): The URL of the faculty page to scrape
    """
    print(f"\nProcessing faculty: {faculty_name} (ID: {faculty_id})")
    
    # Setup Chrome options
    chrome_options = Options()
    # Comment out headless mode for debugging
    # chrome_options.add_argument("--headless")
    
    # Setup the Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        # Navigate to the faculty URL
        print(f"Scraping URL: {faculty_url}")
        driver.get(faculty_url)
        
        # Wait for JavaScript to load
        time.sleep(5)
        
        # Find all divs with class 'prodi'
        prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
        print(f"Found {len(prodi_divs)} programs for this faculty")
        
        print(prodi_divs)
        
        # Process each program
        for i, div in enumerate(prodi_divs):
            try:
                # Extract program name
                program_name = div.find_element(By.TAG_NAME, 'h5').text.strip()
                print(f"\nProgram found: {program_name}")
                
                # Check if the div has any attributes that might contain a URL
                div_attributes = driver.execute_script(
                    "var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;", 
                    div
                )
                print(f"Div attributes: {div_attributes}")
                
                # Store the current window handle (tab)
                original_window = driver.current_window_handle
                # Store the number of windows before clicking
                original_window_count = len(driver.window_handles)
                
                program_url = None
                
                # Try clicking the div to see if it opens a new tab or navigates
                try:
                    # Click the div
                    div.click()
                    time.sleep(3)  # Wait for any new tab or navigation
                    
                    # Check if a new tab was opened
                    if len(driver.window_handles) > original_window_count:
                        # Switch to the new tab
                        new_window = [window for window in driver.window_handles if window != original_window][0]
                        driver.switch_to.window(new_window)
                        
                        # Get the URL of the new tab
                        program_url = driver.current_url
                        print(f"Found URL after clicking (new tab): {program_url}")
                        
                        # Close the new tab and switch back to original
                        driver.close()
                        driver.switch_to.window(original_window)
                    else:
                        # Check if we navigated to a new page in the same tab
                        current_url = driver.current_url
                        if current_url != faculty_url:
                            program_url = current_url
                            print(f"Found URL after clicking (same tab): {program_url}")
                            
                            # Navigate back to the faculty page
                            driver.get(faculty_url)
                            time.sleep(3)
                            
                            # Refresh the prodi_divs as the page has changed
                            prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
                
                except Exception as e:
                    print(f"Error clicking div: {e}")
                    # Make sure we're back on the faculty page
                    if driver.current_url != faculty_url:
                        driver.get(faculty_url)
                        time.sleep(3)
                        # Refresh the prodi_divs as the page has changed
                        prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
                
                # If no URL found from clicking, try to find links within the div
                if not program_url:
                    links = div.find_elements(By.TAG_NAME, 'a')
                    if links:
                        for link in links:
                            href = link.get_attribute('href')
                            if href and href != '#' and not href.startswith('javascript:'):
                                program_url = href
                                print(f"Found URL in link: {program_url}")
                                break
                
                # As a last resort, try to construct a URL based on the program name
                if not program_url:
                    # Extract base URL (domain)
                    base_url_match = re.match(r'(https?://[^/]+)', faculty_url)
                    if base_url_match:
                        base_url = base_url_match.group(1)
                        # Convert program name to slug format
                        slug = program_name.lower().replace(' ', '-')
                        # Use as fallback
                        program_url = f"{base_url}/program/{slug}"
                        print(f"Using fallback constructed URL: {program_url}")
                
                # Store the URL in the database (actual database operations would be in the full function)
                if program_url:
                    try:
                        # Check if program already exists by name
                        cursor.execute("SELECT id, url FROM programs WHERE name = %s", (program_name,))
                        existing_program = cursor.fetchone()
                        
                        if existing_program:
                            program_id = existing_program[0]
                            existing_url = existing_program[1]
                            
                            print(f"Program already exists with ID: {program_id}")
                            
                            # Update the URL if it was previously NULL or empty or different
                            if not existing_url or existing_url != program_url:
                                cursor.execute(
                                    "UPDATE programs SET url = %s WHERE id = %s", 
                                    (program_url, program_id)
                                )
                                database.commit()
                                print(f"Updated program URL from '{existing_url}' to: '{program_url}'")
                            else:
                                print(f"URL already correct in database: {existing_url}")
                        else:
                            # Insert new program
                            cursor.execute(
                                "INSERT INTO programs (name, url) VALUES (%s, %s)",
                                (program_name, program_url)
                            )
                            database.commit()
                            program_id = cursor.lastrowid
                            print(f"Added new program with ID: {program_id}")
                        
                        # Check if relationship already exists
                        cursor.execute(
                            "SELECT id FROM faculty_program WHERE faculty_id = %s AND program_id = %s",
                            (faculty_id, program_id)
                        )
                        existing_relation = cursor.fetchone()
                        
                        if not existing_relation:
                            # Create relationship
                            cursor.execute(
                                "INSERT INTO faculty_program (faculty_id, program_id) VALUES (%s, %s)",
                                (faculty_id, program_id)
                            )
                            database.commit()
                            print(f"Created faculty-program relationship")
                        else:
                            print(f"Faculty-program relationship already exists")
                            
                    except Exception as e:
                        print(f"Error processing program {program_name}: {e}")
                        database.rollback()
                else:
                    print(f"Could not find a URL for program: {program_name}")
                    
            except Exception as e:
                print(f"Error processing program div: {e}")
                # Make sure we're back on the faculty page
                if driver.current_url != faculty_url:
                    driver.get(faculty_url)
                    time.sleep(3)
                    # Refresh the prodi_divs as the page has changed
                    prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
    
    finally:
        # Always close the browser
        driver.quit()
        print(f"Completed processing faculty: {faculty_name}")

# Test with one faculty
# scrape_program_urls(18, "Faculty of Industrial Technology", "https://petra.ac.id/faculty/fti")

In [None]:
# Get all faculties from the database
cursor.execute("SELECT id, name, url FROM faculties")
faculties = cursor.fetchall()
print(f"Found {len(faculties)} faculties in the database")

# Process each faculty
for faculty_id, faculty_name, faculty_url in faculties:
    print(f"\nProcessing faculty: {faculty_name}")
    # Ask for confirmation before processing each faculty
    # confirm = input(f"Process {faculty_name}? (y/n): ")
    scrape_program_urls_bener(faculty_id, faculty_name, faculty_url)

print("\nCompleted processing all faculties")


Found 8 faculties in the database

Processing faculty: Faculty of Civil Engineering and Planning

Processing faculty: Faculty of Civil Engineering and Planning (ID: 17)
Scraping URL: https://petra.ac.id/faculty/ftsp
Found 6 programs for this faculty
[<selenium.webdriver.remote.webelement.WebElement (session="58c0ce272206a2188a49975afee8171e", element="f.B68800CCD468FE8CEC1DEC046C6A081D.d.EE5AC31D031648211A8EFC93B09B3A8A.e.37")>, <selenium.webdriver.remote.webelement.WebElement (session="58c0ce272206a2188a49975afee8171e", element="f.B68800CCD468FE8CEC1DEC046C6A081D.d.EE5AC31D031648211A8EFC93B09B3A8A.e.38")>, <selenium.webdriver.remote.webelement.WebElement (session="58c0ce272206a2188a49975afee8171e", element="f.B68800CCD468FE8CEC1DEC046C6A081D.d.EE5AC31D031648211A8EFC93B09B3A8A.e.39")>, <selenium.webdriver.remote.webelement.WebElement (session="58c0ce272206a2188a49975afee8171e", element="f.B68800CCD468FE8CEC1DEC046C6A081D.d.EE5AC31D031648211A8EFC93B09B3A8A.e.40")>, <selenium.webdriver.r

In [4]:
def scrape_program_urls(faculty_id, faculty_name, faculty_url):
    """
    Scrape program URLs from a faculty page and update the database accordingly.
    
    Args:
        faculty_id (int): The ID of the faculty in the database
        faculty_name (str): The name of the faculty
        faculty_url (str): The URL of the faculty page to scrape
    """
    print(f"\nProcessing faculty: {faculty_name} (ID: {faculty_id})")
    
    # Setup Chrome options
    chrome_options = Options()
    # Comment out headless mode for debugging
    # chrome_options.add_argument("--headless")
    
    # Setup the Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    
    try:
        # Navigate to the faculty URL
        print(f"Scraping URL: {faculty_url}")
        driver.get(faculty_url)
        
        # Wait for JavaScript to load
        time.sleep(5)
        
        # Find all divs with class 'prodi'
        prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
        print(f"Found {len(prodi_divs)} programs for this faculty")
        
        print(prodi_divs)
        
        # Process each program
        for i, div in enumerate(prodi_divs):
            try:
                # Extract program name
                program_name = div.find_element(By.TAG_NAME, 'h5').text.strip()
                print(f"\nProgram found: {program_name}")
                
                # Check if the div has any attributes that might contain a URL
                div_attributes = driver.execute_script(
                    "var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;", 
                    div
                )
                print(f"Div attributes: {div_attributes}")
                
                # Check for any href or data-* attributes that might contain a URL
                program_url = None
                
                # 1. First check if div has a data-url or similar attribute
                for attr_name, attr_value in div_attributes.items():
                    if attr_name.startswith('data-') and ('url' in attr_name or 'link' in attr_name or 'href' in attr_name):
                        program_url = attr_value
                        print(f"Found URL in attribute {attr_name}: {program_url}")
                        break
                    
                print(attr_name, attr_value)
                
                # 2. If no URL found in attributes, check for any links inside the div
                if not program_url:
                    links = div.find_elements(By.TAG_NAME, 'a')
                    if links:
                        for link in links:
                            href = link.get_attribute('href')
                            if href and href != '#' and not href.startswith('javascript:'):
                                program_url = href
                                print(f"Found URL in link: {program_url}")
                                break
                
                # 3. If still no URL, try clicking the div and see if any modal appears with links
                if not program_url:
                    # Store the current URL to return to after clicking
                    current_url = driver.current_url
                    
                    # Click the div
                    try:
                        # Refresh the element reference if needed
                        if i > 0:
                            prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
                            div = prodi_divs[i]
                        
                        # Click and wait for any modals or popups
                        div.click()
                        time.sleep(2)
                        
                        # Check if a modal or popup appeared
                        modals = driver.find_elements(By.CSS_SELECTOR, '.modal, .popup, .dialog, [role="dialog"]')
                        for modal in modals:
                            if modal.is_displayed():
                                # Look for links in the modal
                                modal_links = modal.find_elements(By.TAG_NAME, 'a')
                                for link in modal_links:
                                    href = link.get_attribute('href')
                                    if href and href != '#' and not href.startswith('javascript:'):
                                        program_url = href
                                        print(f"Found URL in modal: {program_url}")
                                        break
                                
                                # Try to close the modal
                                close_buttons = modal.find_elements(By.CSS_SELECTOR, '.close, .btn-close, [aria-label="Close"]')
                                if close_buttons:
                                    close_buttons[0].click()
                                    time.sleep(1)
                        
                        # If URL still not found, check if we navigated to a new page
                        new_url = driver.current_url
                        if new_url != current_url:
                            program_url = new_url
                            print(f"Navigation detected: {program_url}")
                            # Navigate back
                            driver.get(current_url)
                            time.sleep(2)
                    except Exception as e:
                        print(f"Error clicking div: {e}")
                        driver.get(current_url)  # Make sure we're back on the faculty page
                        time.sleep(2)
                
                # 4. As a last resort, try to construct a URL based on the program name
                if not program_url:
                    # Extract base URL (domain)
                    base_url_match = re.match(r'(https?://[^/]+)', faculty_url)
                    if base_url_match:
                        base_url = base_url_match.group(1)
                        # Convert program name to slug format
                        slug = program_name.lower().replace(' ', '-')
                        # Try different URL patterns
                        possible_urls = [
                            f"{base_url}/program/{slug}",
                            f"{base_url}/programs/{slug}",
                            f"{base_url}/academic-programs/{slug}",
                            f"{base_url}/study-program/{slug}",
                            f"{base_url}/study-programs/{slug}"
                        ]
                        
                        # Try to visit each possible URL and see if it exists
                        for url in possible_urls:
                            try:
                                response = requests.head(url, timeout=5)
                                if response.status_code < 400:  # If URL returns a success status code
                                    program_url = url
                                    print(f"Verified constructed URL: {program_url}")
                                    break
                            except Exception as e:
                                print(f"Error checking URL {url}: {e}")
                        
                        if not program_url:
                            # Just use the first constructed URL as a fallback
                            program_url = possible_urls[0]
                            print(f"Using fallback constructed URL: {program_url}")
                
                # Check if program already exists and update database
                if program_url:
                    try:
                        # Check if program already exists by name
                        cursor.execute("SELECT id, url FROM programs WHERE name = %s", (program_name,))
                        existing_program = cursor.fetchone()
                        
                        if existing_program:
                            program_id = existing_program[0]
                            existing_url = existing_program[1]
                            
                            print(f"Program already exists with ID: {program_id}")
                            
                            # Update the URL if it was previously NULL or empty or different
                            if not existing_url or existing_url != program_url:
                                cursor.execute(
                                    "UPDATE programs SET url = %s WHERE id = %s", 
                                    (program_url, program_id)
                                )
                                database.commit()
                                print(f"Updated program URL from '{existing_url}' to: '{program_url}'")
                            else:
                                print(f"URL already correct in database: {existing_url}")
                        else:
                            # Insert new program
                            cursor.execute(
                                "INSERT INTO programs (name, url) VALUES (%s, %s)",
                                (program_name, program_url)
                            )
                            database.commit()
                            program_id = cursor.lastrowid
                            print(f"Added new program with ID: {program_id}")
                        
                        # Check if relationship already exists
                        cursor.execute(
                            "SELECT id FROM faculty_program WHERE faculty_id = %s AND program_id = %s",
                            (faculty_id, program_id)
                        )
                        existing_relation = cursor.fetchone()
                        
                        if not existing_relation:
                            # Create relationship
                            cursor.execute(
                                "INSERT INTO faculty_program (faculty_id, program_id) VALUES (%s, %s)",
                                (faculty_id, program_id)
                            )
                            database.commit()
                            print(f"Created faculty-program relationship")
                        else:
                            print(f"Faculty-program relationship already exists")
                            
                    except Exception as e:
                        print(f"Error processing program {program_name}: {e}")
                        database.rollback()
                else:
                    print(f"Could not find a URL for program: {program_name}")
                    
            except Exception as e:
                print(f"Error processing program div: {e}")
    
    finally:
        # Always close the browser
        driver.quit()
        print(f"Completed processing faculty: {faculty_name}")


In [5]:
# Get all faculties from the database
cursor.execute("SELECT id, name, url FROM faculties")
faculties = cursor.fetchall()
print(f"Found {len(faculties)} faculties in the database")

# Process each faculty
for faculty_id, faculty_name, faculty_url in faculties:
    print(f"\nProcessing faculty: {faculty_name}")
    # Ask for confirmation before processing each faculty
    confirm = input(f"Process {faculty_name}? (y/n): ")
    if confirm.lower() == 'y':
        scrape_program_urls(faculty_id, faculty_name, faculty_url)
    else:
        print(f"Skipping {faculty_name}")

print("\nCompleted processing all faculties")


Found 8 faculties in the database

Processing faculty: Faculty of Civil Engineering and Planning
Skipping Faculty of Civil Engineering and Planning

Processing faculty: Faculty of Industrial Technology

Processing faculty: Faculty of Industrial Technology (ID: 18)
Scraping URL: https://petra.ac.id/faculty/fti
Found 13 programs for this faculty
[<selenium.webdriver.remote.webelement.WebElement (session="b1cb8828f5ab00e72eb8753d86affbfe", element="f.DFB75E5846F3942B636B04F608F0750F.d.DFEAA4ABFB1BCC6E71032B783514D7D2.e.37")>, <selenium.webdriver.remote.webelement.WebElement (session="b1cb8828f5ab00e72eb8753d86affbfe", element="f.DFB75E5846F3942B636B04F608F0750F.d.DFEAA4ABFB1BCC6E71032B783514D7D2.e.38")>, <selenium.webdriver.remote.webelement.WebElement (session="b1cb8828f5ab00e72eb8753d86affbfe", element="f.DFB75E5846F3942B636B04F608F0750F.d.DFEAA4ABFB1BCC6E71032B783514D7D2.e.39")>, <selenium.webdriver.remote.webelement.WebElement (session="b1cb8828f5ab00e72eb8753d86affbfe", element="f.DF

KeyboardInterrupt: 

In [None]:
# Process a single faculty for testing
# Change the faculty_id to the one you want to process
faculty_id_to_process = 17  # ID for Faculty of Civil Engineering and Planning

cursor.execute("SELECT id, name, url FROM faculties WHERE id = %s", (faculty_id_to_process,))
faculty = cursor.fetchone()

if faculty:
    faculty_id, faculty_name, faculty_url = faculty
    print(f"Processing single faculty: {faculty_name} (ID: {faculty_id})")
    scrape_program_urls(faculty_id, faculty_name, faculty_url)
else:
    print(f"Faculty with ID {faculty_id_to_process} not found")


In [None]:
# Check the programs table after updates
cursor.execute("SELECT * FROM programs")
programs_data = cursor.fetchall()

# Get column names from cursor description
column_names = [column[0] for column in cursor.description]

# Print header
print("Updated Programs table:")
print(column_names)
print("-" * 80)

# Print all rows
for row in programs_data:
    print(row)


In [None]:
# Check faculty-program relationships
cursor.execute("""
    SELECT f.id AS faculty_id, f.name AS faculty_name, 
           p.id AS program_id, p.name AS program_name, p.url AS program_url
    FROM faculties f
    JOIN faculty_program fp ON f.id = fp.faculty_id
    JOIN programs p ON fp.program_id = p.id
    ORDER BY f.id, p.id
""")
relationships = cursor.fetchall()

# Get column names from cursor description
column_names = [column[0] for column in cursor.description]

# Print header
print("Faculty-Program Relationships:")
print(column_names)
print("-" * 100)

# Print rows
for row in relationships:
    print(row)


Faculty-Program Relationships:
['faculty_id', 'faculty_name', 'program_id', 'program_name', 'program_url']
----------------------------------------------------------------------------------------------------
(17, 'Faculty of Civil Engineering and Planning', 1, 'Civil Engineering', None)
(17, 'Faculty of Civil Engineering and Planning', 2, 'Architecture', None)
(17, 'Faculty of Civil Engineering and Planning', 3, 'Architecture of Sustainable Housing and Real Estate', None)
(17, 'Faculty of Civil Engineering and Planning', 4, 'Master’s Program in Civil Engineering', None)
(17, 'Faculty of Civil Engineering and Planning', 5, 'Master’s Program in Architecture', None)
(17, 'Faculty of Civil Engineering and Planning', 6, 'Doctoral Program in Civil Engineering', None)
(18, 'Faculty of Industrial Technology', 7, 'Electrical Engineering', 'https://petra.ac.id/faculty/fti')
(18, 'Faculty of Industrial Technology', 8, 'Internet of Things', 'https://petra.ac.id/program/internet-of-things')
(18, 'F

In [5]:
# Define a function that will process all faculties automatically
def process_all_faculties(headless=True):
    """
    Process all faculties in the database and update program URLs
    
    Args:
        headless (bool): Whether to run Chrome in headless mode
    """
    # Get all faculties from the database
    cursor.execute("SELECT id, name, url FROM faculties")
    faculties = cursor.fetchall()
    print(f"Found {len(faculties)} faculties in the database")
    
    # Track statistics
    total_programs_found = 0
    total_programs_updated = 0
    total_programs_added = 0
    
    # Process each faculty
    for faculty_id, faculty_name, faculty_url in faculties:
        print(f"\n{'='*80}")
        print(f"Processing faculty: {faculty_name} (ID: {faculty_id})")
        print(f"{'='*80}")
        
        # Setup Chrome options
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")
        
        # Setup the Chrome WebDriver
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        
        try:
            # Navigate to the faculty URL
            print(f"Scraping URL: {faculty_url}")
            driver.get(faculty_url)
            
            # Wait for JavaScript to load
            time.sleep(5)
            
            # Find all divs with class 'prodi'
            prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
            faculty_programs_found = len(prodi_divs)
            print(f"Found {faculty_programs_found} programs for this faculty")
            total_programs_found += faculty_programs_found
            
            faculty_programs_updated = 0
            faculty_programs_added = 0
            
            # Process each program
            for i, div in enumerate(prodi_divs):
                try:
                    # Extract program name
                    program_name = div.find_element(By.TAG_NAME, 'h5').text.strip()
                    print(f"\nProgram found: {program_name}")
                    
                    # Check if the div has any attributes that might contain a URL
                    div_attributes = driver.execute_script(
                        "var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;", 
                        div
                    )
                    
                    # Check for any href or data-* attributes that might contain a URL
                    program_url = None
                    
                    # 1. First check if div has a data-url or similar attribute
                    for attr_name, attr_value in div_attributes.items():
                        if attr_name.startswith('data-') and ('url' in attr_name or 'link' in attr_name or 'href' in attr_name):
                            program_url = attr_value
                            print(f"Found URL in attribute {attr_name}: {program_url}")
                            break
                    
                    # 2. If no URL found in attributes, check for any links inside the div
                    if not program_url:
                        links = div.find_elements(By.TAG_NAME, 'a')
                        if links:
                            for link in links:
                                href = link.get_attribute('href')
                                if href and href != '#' and not href.startswith('javascript:'):
                                    program_url = href
                                    print(f"Found URL in link: {program_url}")
                                    break
                    
                    # 3. If still no URL, try clicking the div and see if any modal appears with links
                    if not program_url:
                        # Store the current URL to return to after clicking
                        current_url = driver.current_url
                        
                        # Click the div
                        try:
                            # Refresh the element reference if needed
                            if i > 0:
                                prodi_divs = driver.find_elements(By.CLASS_NAME, 'prodi')
                                div = prodi_divs[i]
                            
                            # Click and wait for any modals or popups
                            div.click()
                            time.sleep(2)
                            
                            # Check if a modal or popup appeared
                            modals = driver.find_elements(By.CSS_SELECTOR, '.modal, .popup, .dialog, [role="dialog"]')
                            for modal in modals:
                                if modal.is_displayed():
                                    # Look for links in the modal
                                    modal_links = modal.find_elements(By.TAG_NAME, 'a')
                                    for link in modal_links:
                                        href = link.get_attribute('href')
                                        if href and href != '#' and not href.startswith('javascript:'):
                                            program_url = href
                                            print(f"Found URL in modal: {program_url}")
                                            break
                                    
                                    # Try to close the modal
                                    close_buttons = modal.find_elements(By.CSS_SELECTOR, '.close, .btn-close, [aria-label="Close"]')
                                    if close_buttons:
                                        close_buttons[0].click()
                                        time.sleep(1)
                            
                            # If URL still not found, check if we navigated to a new page
                            new_url = driver.current_url
                            if new_url != current_url:
                                program_url = new_url
                                print(f"Navigation detected: {program_url}")
                                # Navigate back
                                driver.get(current_url)
                                time.sleep(2)
                        except Exception as e:
                            print(f"Error clicking div: {e}")
                            driver.get(current_url)  # Make sure we're back on the faculty page
                            time.sleep(2)
                    
                    # 4. As a last resort, try to construct a URL based on the program name
                    if not program_url:
                        # Extract base URL (domain)
                        base_url_match = re.match(r'(https?://[^/]+)', faculty_url)
                        if base_url_match:
                            base_url = base_url_match.group(1)
                            # Convert program name to slug format
                            slug = program_name.lower().replace(' ', '-')
                            # Try different URL patterns
                            possible_urls = [
                                f"{base_url}/program/{slug}",
                                f"{base_url}/programs/{slug}",
                                f"{base_url}/academic-programs/{slug}",
                                f"{base_url}/study-program/{slug}",
                                f"{base_url}/study-programs/{slug}"
                            ]
                            
                            # Try to visit each possible URL and see if it exists
                            for url in possible_urls:
                                try:
                                    response = requests.head(url, timeout=5)
                                    if response.status_code < 400:  # If URL returns a success status code
                                        program_url = url
                                        print(f"Verified constructed URL: {program_url}")
                                        break
                                except Exception as e:
                                    print(f"Error checking URL {url}: {e}")
                            
                            if not program_url:
                                # Just use the first constructed URL as a fallback
                                program_url = possible_urls[0]
                                print(f"Using fallback constructed URL: {program_url}")
                    
                    # Check if program already exists and update database
                    if program_url:
                        try:
                            # Check if program already exists by name
                            cursor.execute("SELECT id, url FROM programs WHERE name = %s", (program_name,))
                            existing_program = cursor.fetchone()
                            
                            if existing_program:
                                program_id = existing_program[0]
                                existing_url = existing_program[1]
                                
                                print(f"Program already exists with ID: {program_id}")
                                
                                # Update the URL if it was previously NULL or empty or different
                                if not existing_url or existing_url != program_url:
                                    cursor.execute(
                                        "UPDATE programs SET url = %s WHERE id = %s", 
                                        (program_url, program_id)
                                    )
                                    database.commit()
                                    print(f"Updated program URL from '{existing_url}' to: '{program_url}'")
                                    faculty_programs_updated += 1
                                    total_programs_updated += 1
                                else:
                                    print(f"URL already correct in database: {existing_url}")
                            else:
                                # Insert new program
                                cursor.execute(
                                    "INSERT INTO programs (name, url) VALUES (%s, %s)",
                                    (program_name, program_url)
                                )
                                database.commit()
                                program_id = cursor.lastrowid
                                print(f"Added new program with ID: {program_id}")
                                faculty_programs_added += 1
                                total_programs_added += 1
                            
                            # Check if relationship already exists
                            cursor.execute(
                                "SELECT id FROM faculty_program WHERE faculty_id = %s AND program_id = %s",
                                (faculty_id, program_id)
                            )
                            existing_relation = cursor.fetchone()
                            
                            if not existing_relation:
                                # Create relationship
                                cursor.execute(
                                    "INSERT INTO faculty_program (faculty_id, program_id) VALUES (%s, %s)",
                                    (faculty_id, program_id)
                                )
                                database.commit()
                                print(f"Created faculty-program relationship")
                            else:
                                print(f"Faculty-program relationship already exists")
                                
                        except Exception as e:
                            print(f"Error processing program {program_name}: {e}")
                            database.rollback()
                    else:
                        print(f"Could not find a URL for program: {program_name}")
                        
                except Exception as e:
                    print(f"Error processing program div: {e}")
            
            print(f"\nFaculty Summary for {faculty_name}:")
            print(f"Programs found: {faculty_programs_found}")
            print(f"Programs updated: {faculty_programs_updated}")
            print(f"Programs added: {faculty_programs_added}")
            
        finally:
            # Always close the browser
            driver.quit()
            print(f"Completed processing faculty: {faculty_name}")
    
    # Print overall summary
    print(f"\n{'='*80}")
    print("OVERALL SUMMARY:")
    print(f"{'='*80}")
    print(f"Total programs found: {total_programs_found}")
    print(f"Total programs updated: {total_programs_updated}")
    print(f"Total programs added: {total_programs_added}")
    print(f"{'='*80}")

# Run the function to process all faculties
# Set headless=False if you want to see the browser while it works
process_all_faculties(headless=False)

Found 8 faculties in the database

Processing faculty: Faculty of Civil Engineering and Planning (ID: 17)
Scraping URL: https://petra.ac.id/faculty/ftsp
Found 6 programs for this faculty

Program found: Civil Engineering
Verified constructed URL: https://petra.ac.id/program/civil-engineering
Program already exists with ID: 1
Updated program URL from 'https://petra.ac.id/programs/civil-engineering' to: 'https://petra.ac.id/program/civil-engineering'
Faculty-program relationship already exists

Program found: Architecture
Verified constructed URL: https://petra.ac.id/program/architecture
Program already exists with ID: 2
URL already correct in database: https://petra.ac.id/program/architecture
Faculty-program relationship already exists

Program found: Architecture of Sustainable Housing and Real Estate
Verified constructed URL: https://petra.ac.id/program/architecture-of-sustainable-housing-and-real-estate
Program already exists with ID: 3
URL already correct in database: https://petra.

KeyboardInterrupt: 

# Scraping Academic Program URLs

This notebook demonstrates how to scrape and populate the URL field for academic programs in a university database. The process involves:

1. Connecting to the MySQL database
2. Checking the structure of tables (faculties, programs, faculty_program)
3. Setting up web scraping tools (Selenium with Chrome WebDriver)
4. Defining a function to scrape program URLs from faculty pages
5. Processing each faculty page to find and extract program URLs
6. Updating the database with the extracted URLs

The final cell provides a comprehensive function that:
- Processes all faculties automatically
- Uses multiple strategies to find program URLs:
  - Checking for data attributes in HTML elements
  - Looking for links within program divs
  - Clicking elements to detect navigation or modal popups
  - Constructing and validating potential URLs based on program names
- Maintains database relationships between faculties and programs
- Provides detailed statistics about the scraping process

You can adjust the `headless` parameter when calling the function to run Chrome in visible or invisible mode.

In [None]:
# Display final statistics and verify data

# Count programs with and without URLs
cursor.execute("SELECT COUNT(*) FROM programs")
total_programs = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(*) FROM programs WHERE url IS NOT NULL AND url != ''")
programs_with_url = cursor.fetchone()[0]

programs_without_url = total_programs - programs_with_url

print(f"Final Database Statistics:")
print(f"{'='*50}")
print(f"Total programs in database: {total_programs}")
print(f"Programs with URLs: {programs_with_url} ({programs_with_url/total_programs*100:.2f}%)")
print(f"Programs without URLs: {programs_without_url} ({programs_without_url/total_programs*100:.2f}%)")
print(f"{'='*50}")

# Check the faculty_program relationships
cursor.execute("""
    SELECT f.name AS faculty_name, 
           COUNT(p.id) AS total_programs,
           SUM(CASE WHEN p.url IS NOT NULL AND p.url != '' THEN 1 ELSE 0 END) AS programs_with_url
    FROM faculties f
    JOIN faculty_program fp ON f.id = fp.faculty_id
    JOIN programs p ON fp.program_id = p.id
    GROUP BY f.id
    ORDER BY f.name
""")

faculty_stats = cursor.fetchall()

print("\nURL Coverage by Faculty:")
print(f"{'='*80}")
print(f"{'Faculty Name':<40} | {'Total Programs':<15} | {'With URL':<10} | {'Coverage':<10}")
print(f"{'-'*40}-+-{'-'*15}-+-{'-'*10}-+-{'-'*10}")

for faculty_name, total, with_url in faculty_stats:
    coverage = (with_url / total * 100) if total > 0 else 0
    print(f"{faculty_name:<40} | {total:<15} | {with_url:<10} | {coverage:.2f}%")

# Close database connection
cursor.close()
database.close()
print("\nDatabase connection closed.")