## Scraping DAAD Website for Courses

### Getting Result Page using Selenium

In [1]:
# Importing Dependencies
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
import time
import csv

In [4]:
# Getting the keyword to search
def get_keyword():
    """Prompt the user for the keyword to search courses with.

    Returns:
        str: The text typed by the user with leading and trailing whitespace
        removed, so accidental padding does not end up in the search box or
        in the name of the CSV file that is built from the keyword.
    """
    keyword = input("Enter the keyword with you want to search courses: ")
    # The keyword is typed into the site search and interpolated into a file
    # name later on; surrounding blanks are never intended there.
    return keyword.strip()

In [11]:
# Fetching the URL using Chrome
def fetch_url():
    """Start Chrome and open the DAAD international-programmes search page.

    Returns:
        The Selenium Chrome driver, maximised and pointed at the search page.
        The caller owns it and is responsible for calling ``quit()`` on it.

    Raises:
        Any exception raised while starting Chrome or loading the page. If the
        failure happens after Chrome has started, the browser is closed before
        the exception is re-raised.
    """
    url = "https://www2.daad.de/deutschland/studienangebote/international-programmes/en/"
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        browser.maximize_window()
        # Fixed pause so the page can finish rendering before the caller
        # starts looking up elements.
        time.sleep(3)
    except BaseException:
        # Nobody else holds a reference to the driver yet, so close it here
        # instead of leaving an orphaned Chrome process behind.
        browser.quit()
        raise
    return browser

In [12]:
# Handling Cookie Pop Up
def cookie_popup(browser):
    """Click the button of the cookie pop-up.

    Args:
        browser: Selenium driver that already has the DAAD page loaded.

    Raises:
        Whatever ``find_element`` raises when the button is not present at the
        hard-coded absolute XPath.
    """
    # Absolute XPath of the pop-up button this notebook clicks to get past
    # the cookie dialog.
    accept_button_xpath = "/html/body/div[2]/div[2]/div[2]/div/div/div[2]/button"
    browser.find_element(By.XPATH, accept_button_xpath).click()

In [36]:
# Selecting the Degree, Language and Field of the Program
def select_program(browser):
    """Pick degree, language and field of study on the search form.

    For each of the three criteria the menu is opened with a click, one
    hard-coded entry is clicked, and the function pauses two seconds before
    moving on. Which entries are chosen is fixed by the locators below
    (second list item for degree and language, fourth option for the field).

    Args:
        browser: Selenium driver showing the DAAD search form.
    """
    # (locator of the menu to open, locator of the entry to choose)
    selections = (
        # Degree programme
        (
            (By.XPATH, '//*[@id="search-form-homepage"]/div[2]/div[2]/div/div[2]/form/div[2]/div[1]/div/div/fieldset/div/button'),
            (By.XPATH, '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[1]/div/div/fieldset/div/ul/li[2]/label/input'),
        ),
        # Language of the programme
        (
            (By.XPATH, '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[2]/div/div/fieldset/div/button'),
            (By.XPATH, '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[2]/div/div/fieldset/div/ul/li[2]/label/input'),
        ),
        # Field of study
        (
            (By.ID, "filterFos"),
            (By.XPATH, '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[3]/div/div/select/option[4]'),
        ),
    )

    for menu_locator, entry_locator in selections:
        browser.find_element(*menu_locator).click()
        browser.find_element(*entry_locator).click()
        # Fixed pause before touching the next control.
        time.sleep(2)

In [32]:
# Searching for the keyword
def search_keyword(browser, keyword):
    """Enter the keyword into the search field and submit the search form.

    Args:
        browser: Selenium driver showing the DAAD search form.
        keyword: Text to type into the element with id ``suggest``.
    """
    # Type the keyword into the search input.
    browser.find_element(By.ID, "suggest").send_keys(keyword)

    # Click the element this notebook uses to close the search menu again
    # (presumably the suggestion list opened by typing - confirm on the page).
    close_target_xpath = '/html/body/div[2]/main/div[2]/div/aside/div/div/ul/li[2]/span[1]'
    browser.find_element(By.XPATH, close_target_xpath).click()

    # Submit the form with the criteria chosen so far, then wait a fixed five
    # seconds before returning.
    submit_button_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[3]/button'
    browser.find_element(By.XPATH, submit_button_xpath).click()
    time.sleep(5)

In [42]:
# Selecting the Filters for the Program
def select_filters(browser):
    """Apply the tuition-fee and programme-beginning filters on the result page.

    Each filter is applied by opening its control and clicking one hard-coded
    entry (second option of the fee select, second item of the beginning
    list). After each filter the function pauses so the result list can
    refresh before anything else reads the page.

    Args:
        browser: Selenium driver showing the search results.
    """
    # Tuition fees
    browser.find_element(By.ID, "filterFee").click()
    browser.find_element(By.XPATH, '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[10]/div/div/select/option[2]').click()
    time.sleep(4)

    # Beginning of the programme
    browser.find_element(By.XPATH, '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[11]/div/div[2]/div/button').click()
    browser.find_element(By.XPATH, '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[11]/div/div[2]/div/ul/li[2]/label/input').click()
    # Wait after the last filter as well: the page count is read right after
    # this function returns and must not race the refresh of the results.
    time.sleep(4)

### Getting all Data for all Pages

In [19]:
# Getting the number of pages
def get_no_pages(browser):
    """Read the number of result pages from the page.

    Args:
        browser: Selenium driver showing the search results.

    Returns:
        int: The text of the element at the hard-coded XPath, converted with
        ``int``.

    Raises:
        ValueError: If that text is not a valid integer.
    """
    page_count_element = browser.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/span/span")
    return int(page_count_element.text)

In [20]:
# Getting to the first webpage
def first_page(browser):
    """Jump to the first result page using the page dropdown.

    Clicks the page ``select`` element, clicks its first ``option`` and then
    pauses two seconds.

    Args:
        browser: Selenium driver showing the search results.
    """
    dropdown_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[1]/div/select"
    first_option_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[1]/div/select/option[1]"

    browser.find_element(By.XPATH, dropdown_xpath).click()
    browser.find_element(By.XPATH, first_option_xpath).click()
    time.sleep(2)

In [44]:
# Getting all data for all the webpages
def get_data(browser, no_pages):
    """Collect the raw result elements from every result page.

    For each page the current page source is parsed with BeautifulSoup and
    the matching elements are gathered; between pages the "next" link is
    clicked followed by a three second pause. No click happens after the
    last page.

    Args:
        browser: Selenium driver positioned on the first result page.
        no_pages: Number of pages to walk through.

    Returns:
        tuple: Five lists ``(course, university, location, subject, url)``
        holding the parsed elements of all pages in page order. ``subject``
        holds, per matching ``ul``, the first ``span`` of its first ``li``.

    Raises:
        IndexError: If a matching ``ul`` has no ``li`` or that ``li`` has no
        ``span``.
    """
    course, university, location, subject, url = [], [], [], [], []

    for page_index in range(no_pages):
        print(f"Scraping page {page_index + 1} of {no_pages}")

        # Parse a snapshot of the page as it is rendered right now.
        doc = BeautifulSoup(browser.page_source, "html.parser")

        # Pull the elements of interest out of the snapshot.
        titles = doc.find_all("span", {"class": "js-course-title u-hide@sm"})
        academies = doc.find_all("span", {"class": "c-ad-carousel__subtitle c-ad-carousel__subtitle--small js-course-academy"})
        places = doc.find_all("span", {"class": "c-ad-carousel__subtitle c-ad-carousel__subtitle--location c-ad-carousel__subtitle--small"})
        data_lists = doc.find_all("ul", {"class": "c-ad-carousel__data-list c-ad-carousel__data-list--not-colored p-0 mb-0"})
        # The subject is taken from the first span of the first list item.
        subjects = [data_list.find_all("li")[0].find_all("span")[0] for data_list in data_lists]
        links = doc.find_all("a", {"class": "list-inline-item mr-0 js-course-detail-link"})

        # Accumulate this page's findings.
        course.extend(titles)
        university.extend(academies)
        location.extend(places)
        subject.extend(subjects)
        url.extend(links)

        # Advance to the next page unless this was the last one.
        if page_index < no_pages - 1:
            browser.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[2]/a[2]").click()
            time.sleep(3)

    return course, university, location, subject, url

### Processing the Data and Storing it in CSV

In [40]:
# Processing all data
def process(course, university, location, subject, url):
    """Turn the scraped elements into plain strings ready for the CSV.

    Args:
        course: Elements whose ``.text`` is the course title.
        university: Elements whose ``.text`` starts with the university name;
            anything after the first "•" separator is dropped.
        location: Elements whose ``.text`` is the location.
        subject: Elements whose ``.text`` is the subject.
        url: Elements whose ``"href"`` item is the link to the course page.

    Returns:
        tuple: Five lists of strings ``(course, university, location,
        subject, url)``, each with one entry per element in ``course``.
        Text values are stripped of surrounding whitespace. Links are
        absolute ``https://`` URLs.

    Raises:
        IndexError: If any of the other lists is shorter than ``course``.
    """
    # The scheme is required: without it the value written to the CSV is not
    # a valid URL and cannot be opened as a link.
    base_url = "https://www2.daad.de"

    course_final = []
    university_final = []
    location_final = []
    subject_final = []
    url_final = []

    # ``course`` drives the row count; the other lists are indexed in step.
    for i in range(len(course)):
        course_final.append(course[i].text.strip())
        university_final.append(university[i].text.split("•")[0].strip())
        location_final.append(location[i].text.strip())
        subject_final.append(subject[i].text.strip())

        href = url[i]["href"]
        # Site-relative links get the host prepended; a link that is already
        # absolute is kept untouched instead of being corrupted.
        if href.startswith(("http://", "https://")):
            url_final.append(href)
        else:
            url_final.append(base_url + href)

    return course_final, university_final, location_final, subject_final, url_final

In [46]:
# Writing to CSV
def write_csv(course_final, university_final, location_final, subject_final, url_final, keyword):
    """Write the processed course data to a CSV file in the working directory.

    The file is named ``All DAAD Courses - <keyword>.csv``, is encoded as
    UTF-8 and starts with a header row.

    Args:
        course_final: Course titles.
        university_final: University names.
        location_final: Locations.
        subject_final: Subjects.
        url_final: Links to the course pages.
        keyword: Search keyword used in the file name. Characters that are
            not allowed in file names or act as path separators are replaced
            with ``_``.

    Raises:
        IndexError: If any of the other lists is shorter than
            ``course_final``.
    """
    print("Writing Data to CSV")

    # A keyword such as "C/C++" must not be interpreted as a path.
    safe_keyword = "".join("_" if ch in '\\/:*?"<>|' else ch for ch in keyword)

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the non-ASCII characters that occur in course and university names.
    with open(f"All DAAD Courses - {safe_keyword}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Course", "University", "Location", "Subject", "Link"])
        for i in range(len(course_final)):
            writer.writerow([course_final[i], university_final[i], location_final[i], subject_final[i], url_final[i]])

### Calling all the Functions

In [38]:
# Merging all functions into one
def get_courses():
    """Run the whole scrape: ask for a keyword, search, collect, write CSV.

    Steps, in order: prompt for the keyword, open the site in Chrome, click
    through the cookie pop-up, choose the programme criteria, search for the
    keyword, apply the result filters, walk all result pages, then process
    the collected elements and write them to a CSV file.

    The browser is always closed once scraping is finished or has failed, so
    no Chrome process is left running.
    """
    keyword = get_keyword()
    browser = fetch_url()
    try:
        # Selecting criteria for the program
        cookie_popup(browser)
        select_program(browser)
        search_keyword(browser, keyword)
        select_filters(browser)

        # Getting all data
        no_pages = get_no_pages(browser)
        first_page(browser)
        course, university, location, subject, url = get_data(browser, no_pages)
    finally:
        # The collected elements come from parsed page snapshots, so the
        # live browser is no longer needed past this point.
        browser.quit()

    # Processing data and storing it to CSV
    course_final, university_final, location_final, subject_final, url_final = process(course, university, location, subject, url)
    write_csv(course_final, university_final, location_final, subject_final, url_final, keyword)

In [47]:
# Run the scraper: prompts for a keyword, drives Chrome through the DAAD
# search, and writes the results to a CSV file named after the keyword.
get_courses()

Enter the keyword with you want to search courses:  Computer


Scraping page 1 of 7
Scraping page 2 of 7
Scraping page 3 of 7
Scraping page 4 of 7
Scraping page 5 of 7
Scraping page 6 of 7
Scraping page 7 of 7
Writing Data to CSV
