In [2]:
import os

import mysql.connector

# Open the MySQL connection shared by every later cell.
# Connection settings are read from the environment so credentials do not have
# to be committed with the notebook; the fallbacks reproduce the original local
# development setup (server on localhost, user "root", empty password).
database = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
)
cursor = database.cursor()
# Select the schema that all subsequent statements operate on.
cursor.execute("USE nlp_thesis_similarity")

In [2]:
# Ensure the programs table exists.
# CREATE TABLE IF NOT EXISTS lets the server do the existence check and the
# creation in a single statement, so no separate SHOW TABLES lookup is needed
# and re-running this cell leaves an existing table untouched.
try:
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS programs (
        id INT PRIMARY KEY AUTO_INCREMENT,
        name VARCHAR(1024) NOT NULL,
        url VARCHAR(1024) NULL
    )
    """)
    database.commit()
    print("Programs table checked and created if needed")
except mysql.connector.Error as e:
    # Only database failures are reported here; programming errors (for
    # example an undefined cursor) propagate instead of being mistaken for a
    # schema problem.
    print(f"Error managing programs table: {e}")
    database.rollback()  # Rollback in case of error

Programs table checked and created if needed


In [3]:
# Ensure the faculty_program link table exists.
# CREATE TABLE IF NOT EXISTS is used instead of a SHOW TABLES LIKE lookup:
# "_" is a single-character wildcard in a LIKE pattern, so the pattern
# 'faculty_program' could match a differently named table and wrongly skip
# the creation.
# The foreign keys reference faculties(id) and programs(id), so both of those
# tables must already exist when this cell runs.
try:
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS faculty_program (
        id INT PRIMARY KEY AUTO_INCREMENT,
        faculty_id INT NOT NULL,
        program_id INT NOT NULL,
        FOREIGN KEY (faculty_id) REFERENCES faculties(id),
        FOREIGN KEY (program_id) REFERENCES programs(id),
        UNIQUE KEY unique_faculty_program (faculty_id, program_id)
    )
    """)
    database.commit()
    print("faculty_program table checked and created if needed")
except mysql.connector.Error as e:
    # Only database failures are reported here; programming errors (for
    # example an undefined cursor) propagate instead of being mistaken for a
    # schema problem.
    print(f"Error managing faculty_program table: {e}")
    database.rollback()  # Rollback in case of error

faculty_program table checked and created if needed


In [6]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

# Scrape the study programs ("prodi" elements) listed on a faculty page and
# record each program, plus its link to the faculty, in the database.

# Only the faculty with this name is processed; change it to scrape another one.
FACULTY_NAME = 'Faculty of Industrial Technology'
# CSS class of the elements that represent one study program each.
PRODI_CLASS = 'prodi'
# JavaScript that returns every attribute of the element passed as arguments[0].
ATTRIBUTES_SCRIPT = "var items = {}; for (index = 0; index < arguments[0].attributes.length; ++index) { items[arguments[0].attributes[index].name] = arguments[0].attributes[index].value }; return items;"


def _first_real_href(elements):
    """Return the first usable href among `elements`, or None.

    An href is usable when it is non-empty, is not '#', and is not a
    'javascript:' pseudo-link.
    """
    for element in elements:
        href = element.get_attribute('href')
        if href and href != '#' and not href.startswith('javascript:'):
            return href
    return None


def _url_from_attributes(driver, div):
    """Return the value of the first data-* attribute of `div` whose name hints at a URL.

    The attribute name must start with 'data-' and contain 'url', 'link' or
    'href'. Returns None when no attribute qualifies.
    """
    div_attributes = driver.execute_script(ATTRIBUTES_SCRIPT, div)
    print(f"Div attributes: {div_attributes}")
    for attr_name, attr_value in div_attributes.items():
        if attr_name.startswith('data-') and ('url' in attr_name or 'link' in attr_name or 'href' in attr_name):
            print(f"Found URL in attribute {attr_name}: {attr_value}")
            return attr_value
    return None


def _url_from_click(driver, div):
    """Click `div` and return a URL revealed by a modal or by navigation, or None.

    Any visible modal is searched for a usable link and then closed. If the
    click moved the browser to another page, that page's URL is returned and
    the browser is sent back to the page it started on. Click failures are
    reported and treated as "no URL found" (best effort).
    """
    page_url = driver.current_url
    found_url = None
    try:
        div.click()
        time.sleep(2)  # give a modal or a navigation time to happen

        modals = driver.find_elements(By.CSS_SELECTOR, '.modal, .popup, .dialog, [role="dialog"]')
        for modal in modals:
            if not modal.is_displayed():
                continue
            # Keep the first link found; later modals must not overwrite it.
            if not found_url:
                found_url = _first_real_href(modal.find_elements(By.TAG_NAME, 'a'))
                if found_url:
                    print(f"Found URL in modal: {found_url}")

            # Try to close the modal so it does not cover the next program.
            close_buttons = modal.find_elements(By.CSS_SELECTOR, '.close, .btn-close, [aria-label="Close"]')
            if close_buttons:
                close_buttons[0].click()
                time.sleep(1)

        # A changed address means the click navigated away; use that address
        # and return to the faculty page.
        new_url = driver.current_url
        if new_url != page_url:
            found_url = new_url
            print(f"Navigation detected: {found_url}")
            driver.get(page_url)
            time.sleep(2)
    except Exception as e:
        print(f"Error clicking div: {e}")
        driver.get(page_url)  # Make sure we're back on the faculty page
        time.sleep(2)
    return found_url


def _guess_program_url(faculty_url, program_name):
    """Build '<scheme>://<host>/program/<slug>' from the faculty URL and program name.

    This is a guess, not a scraped value. Returns None when `faculty_url` does
    not start with an http(s) origin.
    """
    base_url_match = re.match(r'(https?://[^/]+)', faculty_url)
    if not base_url_match:
        return None
    slug = program_name.lower().replace(' ', '-')
    guessed_url = f"{base_url_match.group(1)}/program/{slug}"
    print(f"Constructed URL based on program name: {guessed_url}")
    return guessed_url


def _save_program(faculty_id, program_name, program_url):
    """Insert or update the program and make sure it is linked to the faculty.

    Both writes are committed together, so a failure while creating the link
    rolls back the program write as well. Errors are reported, not raised.
    """
    try:
        # LIMIT 1 keeps the cursor free of unread rows if the name is duplicated.
        cursor.execute("SELECT id FROM programs WHERE name = %s LIMIT 1", (program_name,))
        existing_program = cursor.fetchone()

        if existing_program:
            program_id = existing_program[0]
            print(f"Program already exists with ID: {program_id}")

            # Fill in the URL only when none is stored yet.
            cursor.execute(
                "UPDATE programs SET url = %s WHERE id = %s AND (url IS NULL OR url = '')",
                (program_url, program_id)
            )
            # rowcount tells whether the guarded UPDATE actually changed the row.
            if cursor.rowcount > 0:
                print(f"Updated program URL to: {program_url}")
            else:
                print("Program URL already set; left unchanged")
        else:
            cursor.execute(
                "INSERT INTO programs (name, url) VALUES (%s, %s)",
                (program_name, program_url)
            )
            program_id = cursor.lastrowid
            print(f"Added new program with ID: {program_id}")

        cursor.execute(
            "SELECT id FROM faculty_program WHERE faculty_id = %s AND program_id = %s",
            (faculty_id, program_id)
        )
        if cursor.fetchone():
            print(f"Faculty-program relationship already exists")
        else:
            cursor.execute(
                "INSERT INTO faculty_program (faculty_id, program_id) VALUES (%s, %s)",
                (faculty_id, program_id)
            )
            print(f"Created faculty-program relationship")

        database.commit()
    except Exception as e:
        print(f"Error processing program {program_name}: {e}")
        database.rollback()


# Load the faculty (or faculties) matching FACULTY_NAME from the database.
cursor.execute("SELECT id, name, url FROM faculties WHERE name = %s", (FACULTY_NAME,))
faculties = cursor.fetchall()
print(f"Found {len(faculties)} faculties in the database")

for faculty_id, faculty_name, faculty_url in faculties:
    print(f"\nProcessing faculty: {faculty_name} (ID: {faculty_id})")

    # Without a URL there is nothing to scrape; skip before launching a browser.
    if not faculty_url:
        print(f"Skipping faculty without a URL: {faculty_name}")
        continue

    # Setup Chrome options
    chrome_options = Options()
    # Headless mode is left off so the browser can be watched while debugging.
    # chrome_options.add_argument("--headless")

    # Setup the Chrome WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    try:
        print(f"Scraping URL: {faculty_url}")
        driver.get(faculty_url)

        # Wait for JavaScript to load
        time.sleep(5)

        program_count = len(driver.find_elements(By.CLASS_NAME, PRODI_CLASS))
        print(f"Found {program_count} programs for this faculty")

        for i in range(program_count):
            try:
                # Re-locate the elements on every pass: the click fallback can
                # reload the page, which invalidates earlier element references.
                # If the page now has fewer elements, the IndexError is reported
                # by the handler below and the loop moves on.
                div = driver.find_elements(By.CLASS_NAME, PRODI_CLASS)[i]

                program_name = div.find_element(By.TAG_NAME, 'h5').text.strip()
                print(f"\nProgram found: {program_name}")

                # 1. data-* attribute on the element itself
                program_url = _url_from_attributes(driver, div)

                # 2. a link inside the element
                if not program_url:
                    program_url = _first_real_href(div.find_elements(By.TAG_NAME, 'a'))
                    if program_url:
                        print(f"Found URL in link: {program_url}")

                # 3. a modal or navigation triggered by clicking the element
                if not program_url:
                    program_url = _url_from_click(driver, div)

                # 4. last resort: a URL guessed from the program name
                if not program_url:
                    program_url = _guess_program_url(faculty_url, program_name)

                if program_url:
                    _save_program(faculty_id, program_name, program_url)
                else:
                    print(f"Could not find a URL for program: {program_name}")

            except Exception as e:
                print(f"Error processing program div: {e}")
    finally:
        # Always close the browser, even when the page load or scraping fails.
        driver.quit()

print("Completed processing all faculties")

Found 1 faculties in the database

Processing faculty: Faculty of Industrial Technology (ID: 18)
Scraping URL: https://petra.ac.id/faculty/fti
Scraping URL: https://petra.ac.id/faculty/fti
Found 13 programs for this faculty

Program found: Electrical Engineering
Div attributes: {'class': 'prodi', 'data-v-0d5227c3': ''}
Found 13 programs for this faculty

Program found: Electrical Engineering
Div attributes: {'class': 'prodi', 'data-v-0d5227c3': ''}
Constructed URL based on program name: https://petra.ac.id/program/electrical-engineering
Program already exists with ID: 7
Updated program URL to: https://petra.ac.id/program/electrical-engineering
Faculty-program relationship already exists
Constructed URL based on program name: https://petra.ac.id/program/electrical-engineering
Program already exists with ID: 7
Updated program URL to: https://petra.ac.id/program/electrical-engineering
Faculty-program relationship already exists

Program found: Internet of Things
Div attributes: {'class': 