## Scraping DAAD Website for German Universities

### Getting Dynamic Webpage using Selenium

In [40]:
# Importing Dependencies
import csv
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

In [41]:
# Fetching URL using Chrome Browser
url = "https://www2.daad.de/deutschland/studienangebote/international-programmes/en/"

# Set HEADLESS to True to run Chrome without opening a visible window.
# The default (False) launches a regular Chrome window.
HEADLESS = False

options = webdriver.ChromeOptions()
if HEADLESS:
    options.add_argument("--headless")

browser = webdriver.Chrome(options=options)
browser.get(url)
browser.maximize_window()
# Fixed pause before the following cells start interacting with the page.
time.sleep(3)

In [42]:
# Handling Cookie Pop Up
# Wait (up to 10 seconds) until the consent button is clickable instead of assuming
# it is already rendered; WebDriverWait raises TimeoutException if it never is.
cookie_button_xpath = "/html/body/div[2]/div[2]/div[2]/div/div/div[2]/button"
accept_element = WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.XPATH, cookie_button_xpath))
)
accept_element.click()

In [43]:
# Selecting Degree Program
# Step 1: click the menu button of the degree filter.
degree_menu_xpath = '//*[@id="search-form-homepage"]/div[2]/div[2]/div/div[2]/form/div[2]/div[1]/div/div/fieldset/div/button'
course_menu_element = browser.find_element(by=By.XPATH, value=degree_menu_xpath)
course_menu_element.click()

# Step 2: click the input inside the second list item of that filter.
# NOTE(review): which degree the second item stands for depends on the live page — confirm.
degree_option_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[1]/div/div/fieldset/div/ul/li[2]/label/input'
course_element = browser.find_element(by=By.XPATH, value=degree_option_xpath)
course_element.click()

In [44]:
# Selecting Language of the Program
# Step 1: click the menu button of the language filter.
language_menu_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[2]/div/div/fieldset/div/button'
language_menu_element = browser.find_element(by=By.XPATH, value=language_menu_xpath)
language_menu_element.click()

# Step 2: click the input inside the second list item of that filter.
# NOTE(review): which language the second item stands for depends on the live page — confirm.
language_option_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[2]/div/div/fieldset/div/ul/li[2]/label/input'
language_element = browser.find_element(by=By.XPATH, value=language_option_xpath)
language_element.click()

In [45]:
# Selecting Field of Study
# Click the element with id "filterFos" first, then its fourth <option>.
# NOTE(review): which field of study the fourth option stands for depends on the live page — confirm.
field_menu_element = browser.find_element(by=By.ID, value="filterFos")
field_menu_element.click()

field_option_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[2]/div[3]/div/div/select/option[4]'
field_element = browser.find_element(by=By.XPATH, value=field_option_xpath)
field_element.click()

In [46]:
# Selecting Search Menu
# Type the search term into the element with id "suggest".
search_term = "Computer Science"
search_element = browser.find_element(by=By.ID, value="suggest")
search_element.send_keys(search_term)

# Closing Search Menu
# Clicking this element dismisses whatever the typing opened.
# NOTE(review): presumably a suggestion overlay — confirm.
closing_xpath = '/html/body/div[2]/main/div[2]/div/aside/div/div/ul/li[2]/span[1]'
closing_element = browser.find_element(by=By.XPATH, value=closing_xpath)
closing_element.click()

In [47]:
# Searching Universities based on the above Criteria
# Click the form's button to submit the filters chosen in the previous cells.
search_button_xpath = '/html/body/div[2]/main/div[1]/div[2]/div[2]/div/div[2]/form/div[3]/button'
search_element = browser.find_element(by=By.XPATH, value=search_button_xpath)
search_element.click()

In [48]:
# Selecting Tuition Fees for the program
# Click the element with id "filterFee" first, then its second <option>.
# NOTE(review): which fee range the second option stands for depends on the live page — confirm.
fees_menu_element = browser.find_element(by=By.ID, value="filterFee")
fees_menu_element.click()

fees_option_xpath = '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[10]/div/div/select/option[2]'
fees_element = browser.find_element(by=By.XPATH, value=fees_option_xpath)
fees_element.click()

In [49]:
# Selecting Beginning of the Program
# Step 1: click the menu button of the programme-start filter.
beginning_menu_xpath = '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[11]/div/div[2]/div/button'
beginning_menu_element = browser.find_element(by=By.XPATH, value=beginning_menu_xpath)
beginning_menu_element.click()

# Step 2: click the input inside the second list item of that filter.
# NOTE(review): which start term the second item stands for depends on the live page — confirm.
beginning_option_xpath = '/html/body/div[2]/main/form/div[1]/div/div/div/div[1]/div[2]/div[11]/div/div[2]/div/ul/li[2]/label/input'
beginning_element = browser.find_element(by=By.XPATH, value=beginning_option_xpath)
beginning_element.click()

### Parsing the Webpage using BeautifulSoup and Getting all Information for the First Page

In [72]:
# Getting the webpage and Parsing it
# Take the HTML the browser currently holds and parse it with Python's built-in parser.
page = browser.page_source
doc = BeautifulSoup(markup=page, features="html.parser")

In [73]:
# Getting the tag for the name of the Course
# Every <span> whose class attribute matches this string is collected.
course_title_class = "js-course-title u-hide@sm"
course_name = doc.find_all(name="span", attrs={"class": course_title_class})
len(course_name)

10

In [74]:
# Spot-check: text of the first matched course tag, surrounding whitespace removed.
course_name[0].text.strip()

'Master of Science in Computer Engineering'

In [75]:
# Getting the tag for the name of the University
# Every <span> whose class attribute matches this string is collected.
university_class = "c-ad-carousel__subtitle c-ad-carousel__subtitle--small js-course-academy"
university_name = doc.find_all(name="span", attrs={"class": university_class})
len(university_name)

10

In [76]:
# Spot-check: keep only the text before the first "•" in the first university tag, then trim whitespace.
university_name[0].text.split("•")[0].strip()

'Paderborn University'

In [77]:
# Getting the tag for the location of the University
# Every <span> whose class attribute matches this string is collected.
location_class = "c-ad-carousel__subtitle c-ad-carousel__subtitle--location c-ad-carousel__subtitle--small"
location_name = doc.find_all(name="span", attrs={"class": location_class})
len(location_name)

10

In [78]:
# Spot-check: text of the first matched location tag, surrounding whitespace removed.
location_name[0].text.strip()

'Paderborn'

In [79]:
# Getting the tag for the Subject of the Course
# Each matching <ul> contributes one tag: the first <span> inside its first <li>.
data_list_class = "c-ad-carousel__data-list c-ad-carousel__data-list--not-colored p-0 mb-0"
ul_name = doc.find_all(name="ul", attrs={"class": data_list_class})

subject_name = []
for data_list in ul_name:
    first_item = data_list.find_all("li")[0]
    subject_name.append(first_item.find_all("span")[0])

len(subject_name)

10

In [80]:
# Spot-check: text of the first collected subject tag, surrounding whitespace removed.
subject_name[0].text.strip()

'Electrical Engineering'

### Going to the Next Page and Checking all Tags

In [81]:
# Click the second link in this container to move to the following result page.
# NOTE(review): the XPath is position-based, so confirm it still targets the "next" link.
next_page_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[2]/a[2]"
next_page_element = browser.find_element(by=By.XPATH, value=next_page_xpath)
next_page_element.click()

In [82]:
# Getting the webpage and Parsing it
# Re-read the browser's current HTML (after the page change) and parse it again.
page = browser.page_source
doc = BeautifulSoup(markup=page, features="html.parser")

In [83]:
# Getting the tag for the name of the Course
# Same lookup as on the first page, run against the freshly parsed document.
course_title_class = "js-course-title u-hide@sm"
course_name = doc.find_all(name="span", attrs={"class": course_title_class})
len(course_name)

10

In [97]:
# Spot-check: text of the tenth (index 9) matched course tag, surrounding whitespace removed.
course_name[9].text.strip()

"International Master's Programme in Advanced Computational and Civil Engineering Structural Studies (ACCESS)"

In [85]:
# Getting the tag for the name of the University
# Same lookup as on the first page, run against the freshly parsed document.
university_class = "c-ad-carousel__subtitle c-ad-carousel__subtitle--small js-course-academy"
university_name = doc.find_all(name="span", attrs={"class": university_class})
len(university_name)

10

In [96]:
# Spot-check: keep only the text before the first "•" in the tenth (index 9) university tag, then trim whitespace.
university_name[9].text.split("•")[0].strip()

'Dresden University of Technology'

In [87]:
# Getting the tag for the location of the University
# Same lookup as on the first page, run against the freshly parsed document.
location_class = "c-ad-carousel__subtitle c-ad-carousel__subtitle--location c-ad-carousel__subtitle--small"
location_name = doc.find_all(name="span", attrs={"class": location_class})
len(location_name)

10

In [95]:
# Spot-check: text of the tenth (index 9) matched location tag, surrounding whitespace removed.
location_name[9].text.strip()

'Dresden'

In [89]:
# Getting the tag for the Subject of the Course
# Each matching <ul> contributes one tag: the first <span> inside its first <li>.
data_list_class = "c-ad-carousel__data-list c-ad-carousel__data-list--not-colored p-0 mb-0"
ul_name = doc.find_all(name="ul", attrs={"class": data_list_class})

subject_name = []
for data_list in ul_name:
    first_item = data_list.find_all("li")[0]
    subject_name.append(first_item.find_all("span")[0])

len(subject_name)

10

In [94]:
# Spot-check: text of the tenth (index 9) collected subject tag, surrounding whitespace removed.
subject_name[9].text.strip()

'Civil Engineering'

### Getting all Tags for all Pages

In [70]:
# Getting number of webpages
# Locate the element whose text holds the page count, then convert that text to an int.
page_count_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/span/span"
page_count_element = browser.find_element(by=By.XPATH, value=page_count_xpath)
no_pages = int(page_count_element.text)
no_pages

17

In [131]:
# Getting to the first page
# Click the page <select>, then its first <option>, so the full scrape starts from page 1.
page_select_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[1]/div/select"
page_dropdown_element = browser.find_element(by=By.XPATH, value=page_select_xpath)
page_dropdown_element.click()

first_page_option_xpath = "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[1]/div/select/option[1]"
page_number_element = browser.find_element(by=By.XPATH, value=first_page_option_xpath)
page_number_element.click()

# Fixed pause after the click before the next cell reads the page.
time.sleep(2)

In [132]:
# Getting all data: walk every result page and collect the raw tags for each field.
# The four lists are meant to stay index-aligned (entry k of each list belongs to
# the same search result), because the later cells combine them by position.
course = []
university = []
location = []
subject = []

for i in range(no_pages):
    print(f"Scraping page {i + 1}")
    # Getting the webpage and Parsing it
    page = browser.page_source
    doc = BeautifulSoup(page, "html.parser")

    course_name = doc.find_all("span", {"class": "js-course-title u-hide@sm"})
    university_name = doc.find_all("span", {"class": "c-ad-carousel__subtitle c-ad-carousel__subtitle--small js-course-academy"})
    location_name = doc.find_all("span", {"class": "c-ad-carousel__subtitle c-ad-carousel__subtitle--location c-ad-carousel__subtitle--small"})
    ul_name = doc.find_all("ul", {"class": "c-ad-carousel__data-list c-ad-carousel__data-list--not-colored p-0 mb-0"})

    # The subject tag is the first <span> inside the first <li> of each data list.
    subject_name = [ul.find_all("li")[0].find_all("span")[0] for ul in ul_name]

    # Fail loudly if this page yields a different number of tags per field:
    # appending them anyway would silently shift every later record.
    counts = (len(course_name), len(university_name), len(location_name), len(subject_name))
    if len(set(counts)) != 1:
        raise RuntimeError(
            f"Page {i + 1}: mismatched tag counts "
            f"(course, university, location, subject) = {counts}"
        )

    course.extend(course_name)
    university.extend(university_name)
    location.extend(location_name)
    subject.extend(subject_name)

    # Move on to the next page, except after the last one.
    if i < no_pages - 1:
        next_page_element = browser.find_element(By.XPATH, "/html/body/div[2]/main/div[2]/div[3]/div/div/div[1]/div[2]/a[2]")
        next_page_element.click()
        # Fixed pause after the click before the next iteration reads the page.
        time.sleep(2)

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17


In [133]:
# Total number of subject tags collected across all scraped pages.
len(subject)

164

In [136]:
# Processing all the data
# The four tag lists are combined by position later on, so they must have the
# same length; stop here with a clear message if they do not.
if not (len(course) == len(university) == len(location) == len(subject)):
    raise ValueError(
        "Scraped lists are misaligned: "
        f"course={len(course)}, university={len(university)}, "
        f"location={len(location)}, subject={len(subject)}"
    )

# Reduce every tag to its cleaned text.
course_final = [tag.text.strip() for tag in course]
# For universities, keep only the text before the first "•" separator.
university_final = [tag.text.split("•")[0].strip() for tag in university]
location_final = [tag.text.strip() for tag in location]
subject_final = [tag.text.strip() for tag in subject]

In [141]:
# Number of processed course records.
len(course_final)

164

In [140]:
# Writing to CSV
# newline="" lets the csv module control line endings itself (without it, extra
# blank rows can appear on Windows). The explicit utf-8 encoding keeps non-ASCII
# characters in names intact regardless of the platform's default encoding.
with open("University.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Course", "University", "Location", "Subject"])
    # One row per record, combining the four processed lists by position.
    writer.writerows(zip(course_final, university_final, location_final, subject_final))