## Selenium to scrap web content

Selenium is a browser automation framework. When we use Selenium, we are actually browsing the site using a real browser and then to scrap the HTML content from the webpage.

<Note that here we set `options.headless = True`, so the browser will be "headless", so we won't actually render anything on the screen>


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By  # For finding elements
from selenium.webdriver.support.ui import WebDriverWait # For explicit waits
from selenium.webdriver.support import expected_conditions as EC # For explicit waits
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm
import requests


## Get cities and amount of programs

In [None]:
#Get the initial web page HTML content that includes the brief study program details in the netherlands
options = Options()
options.headless = True
options.add_argument("--window-size=1920,1080")
url = "https://www.studyinnl.org/dutch-education/studies"
driver = webdriver.Chrome(options=options)
driver.get(url)
page_source = driver.page_source
#Save the html content
with open("html_scrapped/study_of_netherlands.html", "w") as f:
    f.write(page_source)
driver.quit()

In [None]:
#Parse the content to extract city and program info

#Set up a BeautifulSoup HTML parser to parse the HTML file
with open("html_scrapped/study_of_netherlands.html") as fp:
    soup = BeautifulSoup(fp, 'html.parser')

cities = {}

h3_city_title = soup.find('h3', class_='filter__title', string=lambda text: text and "City" in text.strip())
city_options_container = h3_city_title.find_parent('div', class_='filter')

# Find all 'div' tags that have the class 'filter__option' (and 'more')
# and also have the 'data-filter-value' attribute.
# The `attrs` argument is used to check for the presence of an attribute.
city_divs = city_options_container.find_all('div', class_='filter__option', attrs={'data-filter-value': True})
# If the 'more' class isn't always present or necessary for uniqueness, you could simplify to:
# city_divs = soup.find_all('div', class_='filter__option', attrs={'data-filter-value': True})


for div_tag in city_divs:
    city_name = div_tag['data-filter-value'] # Access the attribute value like a dictionary
    amount_of_programs = div_tag.find('span', class_='checkbox__amount')
    amount_of_programs = amount_of_programs.get_text(strip=True)
    cities[city_name]=amount_of_programs

display(cities)

{'Amsterdam': '387',
 'Groningen': '215',
 'Leiden': '207',
 'Maastricht': '136',
 'Nijmegen': '133',
 'Utrecht': '124',
 'Den Haag': '78',
 'Tilburg': '78',
 'Enschede': '70',
 'Delft': '69',
 'Rotterdam': '67',
 'Eindhoven': '65',
 'Wageningen': '54',
 'Apeldoorn': '33',
 'Arnhem': '27',
 'Leeuwarden': '25',
 'Breda': '22',
 'Middelburg': '8',
 'Ede': '7',
 'Emmen': '6',
 'Haarlem': '6',
 'Velp': '6',
 'Venlo': '6',
 'Zwolle': '6',
 'Den Bosch': '5',
 'Deventer': '5',
 'Hengelo': '5',
 'Vlissingen': '3',
 'Assen': '2',
 'Breukelen': '2',
 'Dronten': '2',
 'Meppel': '2',
 'Almere': '1',
 'Amersfoort': '1',
 'Hilversum': '1',
 'Nieuwegein': '1',
 'Sittard': '1',
 'Terschelling': '1'}

## Get programs information

In [None]:
#The function to scrap all the program details for a specific city, where max_clicks is used to expand the webpage by clicking on "show more"
def scraping_city(url, city, max_clicks):
    # --- Configuration ---
    # Replace this with the actual URL where the button exists
    # url = "https://www.studyinnl.org/dutch-education/studies" # <<<<< IMPORTANT: UPDATE THIS URL
    # Location = 'Amsterdam'
    url = url+f"?location={city}"

    options = Options()
    options.headless = True # Run in headless mode (no browser GUI)
    options.add_argument("--window-size=1920,1080")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # --- Initialize the WebDriver ---
    #Chrome driver is already put in c:\windows, where the directory is already a part of the system path
    driver = webdriver.Chrome(options=options)

    if driver is None:
        print("Failed to initialize WebDriver. Exiting.")
        exit()

    try:
        print(f"Attempting to fetch URL: {url}")
        driver.get(url)
        print("Initial page loaded.")

        # --- Selector for the "Show more" button based on your HTML ---
        # This XPath looks for a button with 'filter__options-button' in its class
        # and whose text content (normalized) contains 'Show more'.
        load_more_button_selector = "//button[contains(@class, 'filter__options-button') and contains(normalize-space(.), 'Show more')]"
        
        click_count = 0
        # Adjust max_clicks based on how many times you anticipate needing to click "Show more".
        # This is a safety measure to prevent an infinite loop.
        # max_clicks = 10 # Example: try clicking up to 10 times

        #Get the webpage after clicking the "Show more"
        while click_count < max_clicks:
            try:
                # Wait for the "Show more" button to be present and clickable
                # Increased timeout slightly as dynamic loading can take time
                show_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, load_more_button_selector))
                )
                
                print(f"Found 'Show more' button. Clicking... (Attempt {click_count + 1})")
                
                # Scroll the button into view (important for headless sometimes) and then click
                driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
                time.sleep(0.5) # Small pause for scrolling to settle if needed
                
                # Using JavaScript click can sometimes be more reliable for buttons
                # that have complex event listeners or might be slightly obscured.
                driver.execute_script("arguments[0].click();", show_more_button)
                # Alternative standard Selenium click:
                # show_more_button.click()

                print("'Show more' button clicked.")
                click_count += 1

                # IMPORTANT: Wait for new content to actually load.
                # This fixed time.sleep() is a placeholder.
                # For robust scraping, you should wait for a specific condition:
                # - e.g., an increase in the number of items displayed
                # - e.g., a specific new element to appear
                # - e.g., a "loading" spinner to disappear
                print("Waiting for more content to load (e.g., 5 seconds)...")
                time.sleep(5) # ADJUST THIS based on how long the site takes to load more content.

            except TimeoutException:
                print("'Show more' button not found or not clickable after waiting. Assuming all content is loaded or button disappeared.")
                break # Exit the loop if the button isn't found after the timeout
            except NoSuchElementException:
                # This might occur if the button is removed from the DOM entirely after the last click
                print("'Show more' button no longer exists on the page. Assuming all content is loaded.")
                break
            except ElementClickInterceptedException:
                print("Button click was intercepted, possibly by an overlay or another element. Trying to wait and see if it resolves.")
                time.sleep(2) # Wait for potential overlays to disappear
                # You might need more sophisticated handling here, e.g., closing a pop-up.
            except Exception as e:
                print(f"An error occurred while trying to find or click 'Show more': {e}")
                break # Exit loop on other errors
                
        print(f"Clicked 'Show more' {click_count} times.")

        # --- Get the final HTML content AFTER all clicks ---
        print("Fetching final page source...")
        page_source = driver.page_source

        # --- Save the final HTML content to a file ---
        file_name = f"html_scrapped/study_of_netherlands_{city}.html" # Choose a relevant file name
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(page_source)
        print(f"Full HTML content saved to {file_name}")

    except Exception as e:
        print(f"An overall error occurred: {e}")

    finally:
        # --- Close the browser ---
        # Important to release resources
        if driver: # Check if driver was successfully initialized
            print("Closing the browser...")
            driver.quit()


In [19]:
 # url = "https://www.studyinnl.org/dutch-education/studies" # <<<<< IMPORTANT: UPDATE THIS URL
# Location = 'Amsterdam'
url = "https://www.studyinnl.org/dutch-education/studies"
for city,amount in cities.items():
    print(f"Scraping city {city}:")
    max_clicks = int(amount)//10+1
    scraping_city(url, city, max_clicks)

Scraping city Amsterdam
Attempting to fetch URL: https://www.studyinnl.org/dutch-education/studies?location=Amsterdam
Initial page loaded.
Found 'Show more' button. Clicking... (Attempt 1)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 2)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 3)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 4)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 5)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 6)
'Show more' button clicked.
Waiting for more content to load (e.g., 5 seconds)...
Found 'Show more' button. Clicking... (Attempt 7)
'Show more' button 

### Parse and extract the information from html content

In [66]:
# --- Function to Extract Program Card Details ---
BASE_URL = 'https://www.studyinnl.org'
def extract_program_info_from_card(card_element):
    """Extracts program information from a single BeautifulSoup card element."""
    program_data = {
        "name": None,
        "university": None,
        "type": None,
        "location": None,
        "duration": None,
        "link": None # To store the program's specific link
    }

    # The main link often contains all other info for a program card
    link_tag = card_element.find('a', class_='card__link')
    if not link_tag:
        # This card might not be a program card (e.g., it could be a filter summary card)
        return None # Indicate that this wasn't a processable program card

    program_data["link"] = BASE_URL+link_tag.get('href') # Get the link URL

    # Program Name (within the link tag)
    h3_tag = link_tag.find('h3')
    if h3_tag:
        program_data["name"] = h3_tag.get_text(strip=True)
    else: # If no h3, this might not be a program card we want, or structure is different
        return None


    # University Name (within the link tag)
    columns_div = link_tag.find('div', class_='columns')
    if columns_div:
        logo_div = columns_div.find('div', class_='card__logo')
        if logo_div:
            uni_span = logo_div.find_next_sibling('span', class_='is-inline-block')
            if uni_span:
                program_data["university"] = uni_span.get_text(strip=True)
        else: # Fallback if only span is present directly under columns
            uni_span_direct = columns_div.find('span', class_='is-inline-block')
            if uni_span_direct: # Make sure this isn't picking up something else if structure varies
                 program_data["university"] = uni_span_direct.get_text(strip=True)
    
    # Program Type, Location, and Duration (within the link tag)
    info_divs = link_tag.find_all('div', class_='card__info')
    for info_div in info_divs:
        svg_tag = info_div.find('svg')
        if svg_tag:
            title_tag = svg_tag.find('title')
            text_node = svg_tag.next_sibling 
            text_content = ""
            if text_node and isinstance(text_node, str): # Check if it's a NavigableString/string
                text_content = text_node.strip()
            
            if title_tag and text_content: 
                title_text = title_tag.get_text(strip=True).lower() 
                if "studyhat" in title_text:
                    program_data["type"] = text_content
                elif "location marker" in title_text or "mapmarker" in title_text:
                    program_data["location"] = text_content
                elif "clock icon" in title_text:
                    program_data["duration"] = text_content
    
    return program_data

In [None]:

#Parse the extracted html per city
df_info = pd.DataFrame()

for city in cities.keys():
    #Get the html content of a specific city
    html_page = f"html_scrapped/study_of_netherlands_{city}.html"
    with open(html_page) as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    # --- Extract Program Card Details within the HTML content---
    all_programs_data = []
    # Program cards are typically in a specific section.
    # Based on the provided HTML, they seem to be within <div class="column is-three-quarters programs-column">
    programs_column_container = soup.find('div', class_='programs-column')

    if programs_column_container:
        # Find all 'div' elements with the class 'card' *within the programs_column_container*
        program_card_elements = programs_column_container.find_all('div', class_='card', recursive=False) 
        # recursive=False ensures we only get direct children 'card' divs, 
        # which seems to be the case for program cards vs the filter summary card.
        # However, a more robust way is to check for a characteristic of a program card.

        # Let's refine to get only actual program cards by checking for an 'a' tag with class 'card__link'
        potential_cards = programs_column_container.find_all('div', class_='card')
        actual_program_cards = [card for card in potential_cards if card.find('a', class_='card__link')]


        if not actual_program_cards:
            print("No program cards found within the 'programs-column' section.")
        else:
            print(f"\nFound {len(actual_program_cards)} actual program cards in {city}. Processing...")
            for card in actual_program_cards:
                data = extract_program_info_from_card(card)
                if data and data.get("name"): # Ensure data is not None and has a name
                    all_programs_data.append(data)
    else:
        print("Could not find the 'programs-column' div. Program card extraction might be incomplete.")
        # Fallback: if you are sure all "card" divs on the page are programs, you could use:
        # all_card_elements_fallback = soup.find_all('div', class_='card')
        # for card in all_card_elements_fallback:
        #     data = extract_program_info_from_card(card)
        #     if data and data.get("name"): all_programs_data.append(data)


    # print("\n--- Extracted Program Details ---")
    # if all_programs_data:
    #     for i, program in enumerate(all_programs_data):
    #         print(f"\n--- Program {i+1} ---")
    #         print(f"  Name: {program.get('name', 'N/A')}")
    #         print(f"  University: {program.get('university', 'N/A')}")
    #         print(f"  Type: {program.get('type', 'N/A')}")
    #         print(f"  Location: {program.get('location', 'N/A')}")
    #         print(f"  Duration: {program.get('duration', 'N/A')}")
    #         print(f"  Link: {program.get('link', 'N/A')}") # Displaying the extracted link
    #         print("-" * 20)
    # else:
    #     print("No program details extracted.")
    
    #Combine the information together
    df_info_city = pd.DataFrame(all_programs_data)
    df_info = pd.concat([df_info,df_info_city],axis = 0)


df_info


Found 387 actual program cards in Amsterdam. Processing...

Found 215 actual program cards in Groningen. Processing...

Found 207 actual program cards in Leiden. Processing...

Found 136 actual program cards in Maastricht. Processing...

Found 133 actual program cards in Nijmegen. Processing...

Found 124 actual program cards in Utrecht. Processing...

Found 78 actual program cards in Den Haag. Processing...

Found 78 actual program cards in Tilburg. Processing...

Found 70 actual program cards in Enschede. Processing...

Found 69 actual program cards in Delft. Processing...

Found 67 actual program cards in Rotterdam. Processing...

Found 65 actual program cards in Eindhoven. Processing...

Found 54 actual program cards in Wageningen. Processing...

Found 33 actual program cards in Apeldoorn. Processing...

Found 27 actual program cards in Arnhem. Processing...

Found 25 actual program cards in Leeuwarden. Processing...

Found 22 actual program cards in Breda. Processing...

Found 8 

Unnamed: 0,name,university,type,location,duration,link
0,Filosofie,Vrije Universiteit Amsterdam,Bachelor,Amsterdam,3 years,https://www.studyinnl.org/dutch-education/stud...
1,"Placemaking: Sense, Space & Strategy",University of Amsterdam,Short or summer course,Amsterdam,19 days,https://www.studyinnl.org/dutch-education/stud...
2,Social Sciences for a Digital Society (research),Vrije Universiteit Amsterdam,Master,Amsterdam,2 years,https://www.studyinnl.org/dutch-education/stud...
3,Finance: Climate and Sustainable Finance,Vrije Universiteit Amsterdam,Master,Amsterdam,1 year,https://www.studyinnl.org/dutch-education/stud...
4,International and European Law: International ...,University of Amsterdam,Master,Amsterdam,1 year,https://www.studyinnl.org/dutch-education/stud...
...,...,...,...,...,...,...
0,Physiotherapy and Rehabilitation sciences,SOMT,Bachelor,Amersfoort,3 years,https://www.studyinnl.org/dutch-education/stud...
0,Executive Master Media Innovation,Breda University of Applied Sciences,Master,Hilversum,1 year,https://www.studyinnl.org/dutch-education/stud...
0,Physiotherapy,THIM University of Applied Sciences in Physiot...,Bachelor,Nieuwegein,4 years,https://www.studyinnl.org/dutch-education/stud...
0,Biobased Materials,Maastricht University,Master,Sittard,2 years,https://www.studyinnl.org/dutch-education/stud...


In [None]:
#Save to excel
# df_info.to_csv("Program_info_netherlands_level0.csv")

### Further extract information from the course/program website link

In [133]:
#Fetch program further information from the link
def fetch_program_description(full_url):
    """
    Fetches the HTML of a program's detail page and extracts the description
    from the specified div.
    """
    
    # print(f"    Fetching details from: {full_url}")

    
    #Scrap the web page HTML content

    #Use requests to do web scrapping here: faster, and does not need to use selenium
    response = requests.get(full_url)
    html_content = response.content

    # #Use selenium to do web scrapping
    # options = Options()
    # options.headless = True
    # options.add_argument("--window-size=1920,1080")
    # driver = webdriver.Chrome(options=options)
    # driver.get(full_url)
    # html_content = driver.page_source
    # # with open("study_of_netherlands.html", "w") as f:
    # #     f.write(page_source)
    # driver.quit()
    
    soup = BeautifulSoup(html_content, 'html.parser')

    program_details = {
        "description": None,
        "language_requirements": [],
        "tuition_fees": {},
        "application_deadlines_and_start_dates": [], # To store start dates and deadlines
        "language": None,
        "degree_title": None, # e.g., Bachelor of Science
        "duration_detailed": None, # From #aboutContent
        "ects_credits": None,
        "accreditation": None,
        "course_website_link": None
    }

    # 1. Extract the link to the course website
    # Looking for an <a> tag with class "cta__program-button" and "ext" that contains "Visit course website"
    course_website_link_tag = soup.find('a', class_=lambda c: c and 'cta__program-button' in c.split() and 'ext' in c.split(), text=lambda t: t and "Visit course website" in t.strip())
    if course_website_link_tag and course_website_link_tag.has_attr('href'):
        program_details["course_website_link"] = course_website_link_tag['href']
    else:
        # Fallback: Sometimes the link is in a different structure or multiple CTAs exist.
        # The one within div.cta__course is a good candidate if the text search fails or is too broad.
        cta_course_div = soup.find('div', class_='cta__course')
        if cta_course_div:
            link_tag_in_cta = cta_course_div.find('a', class_='cta__program-button')
            if link_tag_in_cta and link_tag_in_cta.has_attr('href'):
                program_details["course_website_link"] = link_tag_in_cta['href']
        if not program_details.get("course_website_link"):
            program_details["course_website_link"] = "Not found"


    # 2. Extract Program Description Text
    description_div = soup.find('div', id='collapseCourseIntroduction')
    if description_div:
        # Extract all text within this div, using space as a separator for different child tags like <p>
        program_details["description"] = description_div.get_text(separator=' ', strip=True)

    # 3. Extract Other Details from #aboutContent section
    about_content_div = soup.find('div', id='aboutContent')
    if about_content_div:
        labels = about_content_div.find_all('h4', class_='course__label')
        for label_tag in labels:
            label_text = label_tag.get_text(strip=True).lower()
            value_tag = label_tag.find_next_sibling('p')
            if value_tag:
                value = value_tag.get_text(strip=True)
                if "language" in label_text:
                    program_details["language"] = value
                elif "title" in label_text: # For "Bachelor of Science"
                    program_details["degree_title"] = value
                elif "duration" in label_text: # This is likely the detailed duration
                    program_details["duration_detailed"] = value
                elif "ects credits" in label_text:
                    # The ECTS value is inside a button within the <p> tag
                    button_tag = value_tag.find('button')
                    if button_tag:
                        program_details["ects_credits"] = button_tag.get_text(strip=True)
                    else:
                        program_details["ects_credits"] = value # Fallback if no button
                elif "accreditation" in label_text:
                    # Accreditation is often within an <a> tag inside the <p>
                    a_tag = value_tag.find('a')
                    if a_tag:
                        program_details["accreditation"] = a_tag.get_text(strip=True)
                    else:
                        program_details["accreditation"] = value # Fallback
    
    # 4. Extract Admission Information (language requirements)
    # Find the main "Admission" content block
    admission_content_div = soup.find('div', id='admissionContent')

    if admission_content_div:
        # The admission details are within a div with class "columns admission__columns"
        admission_columns_div = admission_content_div.find('div', class_='columns admission__columns')
        
        if admission_columns_div:
            columns = admission_columns_div.find_all('div', class_='column', recursive=False) # Get direct children columns

            if len(columns) >= 2:
                # --- First Column: Admission URL and Application Requirements ---
                # first_column = columns[0]
                # --- Second Column: Language Requirements ---
                second_column = columns[1]
                h3_lang_req = second_column.find('h3', string="Language requirements")
                if h3_lang_req:
                    dl_lang_req = h3_lang_req.find_next_sibling('dl', class_='language-requirements')
                    if dl_lang_req:
                        dts = dl_lang_req.find_all('dt')
                        dds = dl_lang_req.find_all('dd')
                        for dt, dd in zip(dts, dds): # Pair them up
                            program_details["language_requirements"].append({
                                "score_or_level": dt.get_text(strip=True),
                                "test_name": dd.get_text(strip=True)
                            })

    # 5. Extract Tuition Information
    tuition_section = soup.find('div', class_='tuition-fee-information')
    if tuition_section:
        # Find the table containing tuition fees (usually under a specific year tab, e.g., id="2025")
        # Let's assume the first table with class 'table is-fullwidth' in this section is the one.
        fee_table = tuition_section.find('table', class_='table is-fullwidth')
        if fee_table:
            tbody = fee_table.find('tbody')
            if tbody:
                for row in tbody.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 2:
                        fee_type_button = cells[0].find('button') # Type is in a button
                        if fee_type_button:
                            fee_type = fee_type_button.get_text(strip=True)
                            fee_amount = cells[1].get_text(strip=True)
                            program_details["tuition_fees"][fee_type] = fee_amount
        
        # Extract Application Deadlines and Start Dates from the second table
        # This table is a sibling or nearby the tuition fee table.
        # Assume it's the next table with class 'table is-fullwidth' within the tuition_section columns.
        all_tables_in_tuition_section = tuition_section.find_all('table', class_='table is-fullwidth')
        if len(all_tables_in_tuition_section) > 1:
            deadline_table = all_tables_in_tuition_section[1] # Assuming it's the second table
            tbody_deadline = deadline_table.find('tbody')
            if tbody_deadline:
                for row in tbody_deadline.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) == 3: # Start Date, App. Deadline EU/EEA, App. Deadline Non-EU/EEA
                        start_date = cells[0].get_text(strip=True)
                        deadline_eu = cells[1].get_text(strip=True)
                        deadline_non_eu = cells[2].get_text(strip=True)
                        program_details["application_deadlines_and_start_dates"].append({
                            "start_date": start_date,
                            "deadline_eu_eea": deadline_eu,
                            "deadline_non_eu_eea": deadline_non_eu
                        })


    # # Print all extracted details
    # print("--- Extracted Program Details ---")
    # print(f"Course Website Link: {program_details.get('course_website_link', 'N/A')}")
    # print(f"Description: {program_details.get('description', 'N/A')}")
    # print(f"Language: {program_details.get('language', 'N/A')}")
    # print(f"Degree Title: {program_details.get('degree_title', 'N/A')}")
    # print(f"Duration (Detailed): {program_details.get('duration_detailed', 'N/A')}")
    # print(f"ECTS Credits: {program_details.get('ects_credits', 'N/A')}")
    # print(f"Accreditation: {program_details.get('accreditation', 'N/A')}")
    
    # # Print the extracted admission information
    # print("\nAdmission Requirements:")
    # print("Language Requirements:")
    # if program_details.get("language_requirements"):
    #     for req in program_details["language_requirements"]:
    #         print(f"  - {req['test_name']}: {req['score_or_level']}")
    # else:
    #     print("  N/A")

    # print("\nTuition Fees:")
    # if program_details.get("tuition_fees"):
    #     for fee_type, amount in program_details["tuition_fees"].items():
    #         print(f"  {fee_type}: {amount}")
    # else:
    #     print("  N/A")

    # print("\nApplication Deadlines & Start Dates:")
    # if program_details.get("application_deadlines_and_start_dates"):
    #     for item in program_details["application_deadlines_and_start_dates"]:
    #         print(f"  Start: {item['start_date']}, EU/EEA Deadline: {item['deadline_eu_eea']}, Non-EU/EEA Deadline: {item['deadline_non_eu_eea']}")
    # else:
    #     print("  N/A")
    
    return program_details

#Convert extracted additional program details into natural language sentences
def program_details_to_natural_language(program_details):
    parts = []

    # Basic Info

    # Program Introduction
    description = program_details.get('description', None)
    if description:
        parts.append(f"Program Overview: {description}")
    # Basic Information
    degree_title = program_details.get('degree_title', None)
    language = program_details.get('language', None)
    duration = program_details.get('duration_detailed', None)
    ects = program_details.get('ects_credits', None)
    accreditation = program_details.get('accreditation', None)

    if degree_title:
        parts.append(f"This program awards a degree titled '{degree_title}'.")
    if language:
        parts.append(f"The language of instruction is {language}.")
    if duration:
        parts.append(f"The program duration is {duration}.")
    if ects:
        parts.append(f"It carries a total of {ects} credits.")
    if accreditation:
        parts.append(f"The program is accredited by {accreditation}.")
    
    # Language Requirements — Threshold Satisfying Any One
    language_reqs = program_details.get("language_requirements")
    if language_reqs:
        reqs = [f"{req['test_name']} (minimum score or level: {req['score_or_level']})" for req in language_reqs]
        parts.append(
            "To meet the English language requirement, applicants must achieve at least one of the following minimum scores: "
            + "; or ".join(reqs) + "."
        )

    # Tuition Fees
    tuition = program_details.get("tuition_fees", None)
    if tuition:
        fee_sentences = []
        for fee_type, amount in tuition.items():
            fee_sentences.append(f"{fee_type} students pay {amount}")
        parts.append("Tuition fees are as follows: " + "; ".join(fee_sentences) + ".")

    # Deadlines and Start Dates
    deadlines = program_details.get("application_deadlines_and_start_dates", None)
    if deadlines:
        deadline_summaries = []
        for item in deadlines:
            start = item.get('start_date', 'an upcoming date')
            eu_deadline = item.get('deadline_eu_eea', 'unspecified')
            non_eu_deadline = item.get('deadline_non_eu_eea', 'unspecified')
            # construct the natural language sentence to summarize the deadline and start date info of program
            sentence = (
            f"If you want to join the program starting in {start}, "
            f"the application deadline is {eu_deadline} for EU/EEA applicants, "
            f"and {non_eu_deadline} for non-EU/EEA applicants."
        )
            deadline_summaries.append(sentence)
            
        parts.extend(deadline_summaries)
    
    # Program Official Website Link for Further Information
    website = program_details.get('course_website_link', None)
    if website:
        parts.append(f"For more information, please visit the program’s official website: {website}.")

    return " ".join(parts)

In [134]:
#Example
BASE_URL = 'https://www.studyinnl.org'
program_relative_url = '/dutch-education/studies/artificial-intelligence-1317'
full_url = BASE_URL + program_relative_url

#Extract information
df_link_info_check = fetch_program_description(full_url)
# Get the combined text
readable_context = program_details_to_natural_language(df_link_info_check)

print(readable_context)
pd.DataFrame([{'Context' :readable_context,
                'Course Website Link': df_link_info_check.get('course_website_link', 'N/A')
               }])
pd.DataFrame([df_link_info_check])

Program Overview: The Artificial Intelligence Master’s programme at VU Amsterdam looks specifically at hybrid intelligence, where AI and humans collaborate. Artificial Intelligence (AI) is widely used in our society: from cars that detect pedestrians to our smart phones’ virtual assistants. These applications use AI techniques to interpret information from a wide variety of sources, and in turn to enable intelligent, goal-directed behaviour. The Artificial Intelligence Master’s programme at Vrije Universiteit Amsterdam looks specifically at hybrid intelligence, where AI systems and humans collaborate. The first year is made up of broad courses that focus on core AI topics, while the second year is devoted to your chosen area of specialisation. AI’s applications are highly diverse, ranging from optimising internet searches to supporting elderly people with dementia. The VU’s Artificial Intelligence programme allows you to analyse, develop and apply new AI techniques to come up with solu

  course_website_link_tag = soup.find('a', class_=lambda c: c and 'cta__program-button' in c.split() and 'ext' in c.split(), text=lambda t: t and "Visit course website" in t.strip())


Unnamed: 0,description,language_requirements,tuition_fees,application_deadlines_and_start_dates,language,degree_title,duration_detailed,ects_credits,accreditation,course_website_link
0,The Artificial Intelligence Master’s programme...,"[{'score_or_level': '6.5', 'test_name': 'IELTS...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 23,490'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Science,2 years,120 ECTS,NVAO,https://vu.nl/en/education/master/artificial-i...


In [None]:

df_info = pd.read_csv("Program_info_netherlands_level0.csv",index_col=0)
#Extract program basic information for all the links
rows_tabular = [] #list to collect program details in tabular data format
rows_text = [] #convert the program details to natural language sentences, and save the text and link to official website information

for full_url_link in tqdm(df_info.link):
    # Extract information
    df_link_info_check = fetch_program_description(full_url_link)
    
    # Get the combined text (convert all program details into natural language sentences)
    readable_context = program_details_to_natural_language(df_link_info_check)

    # Collect results
    rows_tabular.append(df_link_info_check)
    rows_text.append({'Context' :readable_context,
                'Course Website Link': df_link_info_check.get('course_website_link', 'N/A')
               })

# Build DataFrame once
df_link_info_basic_tabular = pd.DataFrame(rows_tabular)
df_link_info_basic_text = pd.DataFrame(rows_text)
display(df_link_info_basic_tabular)
display(df_link_info_basic_text)


  course_website_link_tag = soup.find('a', class_=lambda c: c and 'cta__program-button' in c.split() and 'ext' in c.split(), text=lambda t: t and "Visit course website" in t.strip())
100%|██████████| 1867/1867 [05:12<00:00,  5.97it/s]


Unnamed: 0,description,language_requirements,tuition_fees,application_deadlines_and_start_dates,language,degree_title,duration_detailed,ects_credits,accreditation,course_website_link
0,,"[{'score_or_level': 'B2', 'test_name': 'Cambri...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 13,920'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Bachelor of Arts,3 years,180 ECTS,NVAO,https://vu.nl/en/education/bachelor/philosophy
1,This three-week programme will explore the web...,[],"{'EU/EEA': 'Information not available', 'Non-E...","[{'start_date': '20 Jul '25', 'deadline_eu_eea...",English,-,19 days,6 ECTS,Information not available,https://summerschool.uva.nl/content/summer-cou...
2,How do news reports frame disaster response af...,"[{'score_or_level': 'C1', 'test_name': 'Cambri...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 24,150'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Science,2 years,Information not available,NVAO,https://vu.nl/en/education/master/social-scien...
3,Are you ready to tackle the complexities of cl...,"[{'score_or_level': 'C1', 'test_name': 'Cambri...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 24,150'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Science,1 year,Information not available,NVAO,https://vu.nl/en/education/master/climate-and-...
4,,"[{'score_or_level': 'C1', 'test_name': 'Cambri...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 22,500'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Laws,1 year,60 ECTS,NVAO,http://www.uva.nl/llm-international-trade-and-...
...,...,...,...,...,...,...,...,...,...,...
1862,Would you like to apply to an undergraduate ph...,"[{'score_or_level': '6', 'test_name': 'IELTS r...","{'EU/EEA': 'Information not available', 'Non-E...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Bachelor of Science,3 years,180 ECTS,NVAO,https://www.somt.nl/en/education/bachelor-of-p...
1863,The objective of the Executive Master Media In...,"[{'score_or_level': 'C1', 'test_name': 'Cambri...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': '€ 16,950'...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Arts,1 year,60 ECTS,NVAO,https://www.buas.nl/en/programmes/executive-ma...
1864,,[],"{'EU/EEA': 'Information not available', 'Non-E...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Bachelor of Science,4 years,240 ECTS,NVAO,https://en.thim.nl
1865,Would you like to contribute to a sustainable ...,"[{'score_or_level': '6.5', 'test_name': 'IELTS...","{'EU/EEA': '€ 2,601', 'Non-EU/EEA': 'Informati...","[{'start_date': '1 Sep '25', 'deadline_eu_eea'...",English,Master of Science,2 years,120 ECTS,NVAO,https://curriculum.maastrichtuniversity.nl/edu...


Unnamed: 0,Context,Course Website Link
0,This program awards a degree titled 'Bachelor ...,https://vu.nl/en/education/bachelor/philosophy
1,Program Overview: This three-week programme wi...,https://summerschool.uva.nl/content/summer-cou...
2,Program Overview: How do news reports frame di...,https://vu.nl/en/education/master/social-scien...
3,Program Overview: Are you ready to tackle the ...,https://vu.nl/en/education/master/climate-and-...
4,This program awards a degree titled 'Master of...,http://www.uva.nl/llm-international-trade-and-...
...,...,...
1862,Program Overview: Would you like to apply to a...,https://www.somt.nl/en/education/bachelor-of-p...
1863,Program Overview: The objective of the Executi...,https://www.buas.nl/en/programmes/executive-ma...
1864,This program awards a degree titled 'Bachelor ...,https://en.thim.nl
1865,Program Overview: Would you like to contribute...,https://curriculum.maastrichtuniversity.nl/edu...


In [None]:
#Save to excel
#The program details in tabular format
df_link_info_basic_tabular.to_csv("Program_info_netherlands_level1_tabular.csv")
#The program details converted from tabular data into natural language text
df_link_info_basic_text.to_csv("Program_info_netherlands_level1_text.csv")

### Combine the information between level0 and level1 together

In [7]:
df_info = pd.read_csv("Program_info_netherlands_level0.csv",index_col = 0)
df_info.head()
df_link_info_basic_text = pd.read_csv("Program_info_netherlands_level1_text.csv",index_col = 0)
df_link_info_basic_text.head()

Unnamed: 0,Context,Course Website Link
0,This program awards a degree titled 'Bachelor ...,https://vu.nl/en/education/bachelor/philosophy
1,Program Overview: This three-week programme wi...,https://summerschool.uva.nl/content/summer-cou...
2,Program Overview: How do news reports frame di...,https://vu.nl/en/education/master/social-scien...
3,Program Overview: Are you ready to tackle the ...,https://vu.nl/en/education/master/climate-and-...
4,This program awards a degree titled 'Master of...,http://www.uva.nl/llm-international-trade-and-...


In [None]:
#Combine the program information at level0<the original webpage> and level1<the webpage of link in level0 webpage>
df_level0_level1 = pd.concat([df_info.reset_index(drop=True),df_link_info_basic_text.reset_index(drop=True)],axis = 1)
df_level0_level1.head()

Unnamed: 0,name,university,type,location,duration,link,Context,Course Website Link
0,Filosofie,Vrije Universiteit Amsterdam,Bachelor,Amsterdam,3 years,https://www.studyinnl.org//dutch-education/stu...,This program awards a degree titled 'Bachelor ...,https://vu.nl/en/education/bachelor/philosophy
1,"Placemaking: Sense, Space & Strategy",University of Amsterdam,Short or summer course,Amsterdam,19 days,https://www.studyinnl.org//dutch-education/stu...,Program Overview: This three-week programme wi...,https://summerschool.uva.nl/content/summer-cou...
2,Social Sciences for a Digital Society (research),Vrije Universiteit Amsterdam,Master,Amsterdam,2 years,https://www.studyinnl.org//dutch-education/stu...,Program Overview: How do news reports frame di...,https://vu.nl/en/education/master/social-scien...
3,Finance: Climate and Sustainable Finance,Vrije Universiteit Amsterdam,Master,Amsterdam,1 year,https://www.studyinnl.org//dutch-education/stu...,Program Overview: Are you ready to tackle the ...,https://vu.nl/en/education/master/climate-and-...
4,International and European Law: International ...,University of Amsterdam,Master,Amsterdam,1 year,https://www.studyinnl.org//dutch-education/stu...,This program awards a degree titled 'Master of...,http://www.uva.nl/llm-international-trade-and-...


In [9]:
#Function to convert each row of program information(in tabular data format) to natural language sentences
def row_to_natural_language(row):
    parts = []

    # University + program
    if row['university']:
        start = f"{row['university']} offers"
        if row['name'] and row['type']:
            start += f" a {row['type']} program called '{row['name']}'"
        elif row['name']:
            start += f" a program called '{row['name']}'"
        elif row['type']:
            start += f" a {row['type']} program"
        else:
            start += " a study program"
        parts.append(start)
    else:
        parts.append(f"{row['name']} is a study program")

    # Location
    if row.get('location'):
        parts[-1] += f" in {row['location']}."
    else:
        parts[-1] += "."

    # Duration
    if row.get('duration'):
        parts.append(f"The program lasts for {row['duration']}.")

    # Overview
    if row.get('Context'):
        parts.append(f"{row['Context']}")

    return " ".join(parts)


In [12]:
df_text = pd.DataFrame({'text':df_level0_level1.apply(row_to_natural_language, axis=1)})
df_text

Unnamed: 0,text
0,Vrije Universiteit Amsterdam offers a Bachelor...
1,University of Amsterdam offers a Short or summ...
2,Vrije Universiteit Amsterdam offers a Master p...
3,Vrije Universiteit Amsterdam offers a Master p...
4,University of Amsterdam offers a Master progra...
...,...
1862,SOMT offers a Bachelor program called 'Physiot...
1863,Breda University of Applied Sciences offers a ...
1864,THIM University of Applied Sciences in Physiot...
1865,Maastricht University offers a Master program ...


In [None]:
# #Save it as csv file
# df_text.to_csv("Program_info_netherlands_text.csv")