In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By 
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException
import pandas as pd
from bs4 import BeautifulSoup
import glob
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


def get_browser() -> webdriver.Chrome:
    """Create a headless, incognito Chrome driver with a 1920x1200 window.

    Returns:
        A new ``webdriver.Chrome`` instance; the caller is responsible for
        shutting it down.
    """
    options = Options()
    # Every Chrome switch needs its own add_argument() call. The previous
    # single space-separated string was handed to Chrome as ONE argument, so
    # the individual switches were not registered as separate options.
    for switch in ("--incognito", "--window-size=1920,1200", "--headless"):
        options.add_argument(switch)
    browser = webdriver.Chrome(options=options)
    return browser


def get_page_list(browser: webdriver.Chrome, url: str, max_page: int=1) -> list:
    """Build the list of paginated calendar URLs to crawl.

    Loads ``url``, looks up the anchor(s) that enclose a node whose text
    contains "totaal aantal pagina", and reads the page total from the text
    following the first ``</span>`` in the first anchor's innerHTML.

    Args:
        browser: Driver used to load the page.
        url: Page that contains the pager.
        max_page: Upper bound on the number of listing pages to return.

    Returns:
        URLs ``.../calendar/p1`` .. ``.../calendar/pN`` where N is the smaller
        of the page total and ``max_page``. The URLs are built from a fixed
        base address, not from ``url``.

    Raises:
        IndexError: If no pager anchor is found, or its innerHTML contains
            no ``</span>``.
    """
    browser.get(url)
    marker = "totaal aantal pagina"
    pager_anchors = browser.find_elements(By.XPATH,"//*[contains(text(), '%s')]//ancestor::a[1]" % marker)

    page_counts = []
    for anchor in pager_anchors:
        # The count is the text right after the first closing </span>
        page_counts.append(anchor.get_attribute('innerHTML').split('</span>')[1])

    # Cap the number of pages at max_page
    last_page = min(int(page_counts[0]), max_page)

    prefix = "https://www.vaia.be/en/calendar/p"
    suffix = "&language[0]=381"
    page_list = []
    for number in range(1, last_page + 1):
        page_list.append(prefix + str(number) + suffix)
    return page_list


def format_date(input: str) -> str:
    """Convert ``'<day> <Month> <year>'`` into ``'<day>/<MM>/<year>'``.

    The string is split on single spaces and only the first three tokens are
    used. The month must be a full English month name; the day and year are
    copied through unchanged (the day is not zero-padded).

    Raises:
        IndexError: If fewer than three space-separated tokens are present.
        KeyError: If the month token is not a full English month name.
    """
    month_names = ('January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December')
    # 'January' -> '01', ..., 'December' -> '12'
    month_dict = {name: '%02d' % number for number, name in enumerate(month_names, start=1)}

    tokens = input.split(' ')
    day, month, year = tokens[0], tokens[1], tokens[2]
    return '/'.join((day, month_dict[month], year))


def clean_meta_data(input: str) -> str:
    """Normalise a course meta-data HTML snippet into one compact line.

    Applied in order: commas become semicolons, spaced hyphens are tightened
    to a bare "-", newlines and tabs are removed, and ``<p>``, ``</p>`` and
    ``<br>`` tags are stripped.
    """
    # Order matters: " - " has to be handled before the shorter " -".
    substitutions = (
        (",", ";"),
        (" - ", "-"),
        (" -", "-"),
        ("\n", ""),
        ("\t", ""),
        ("<p>", ""),
        ("</p>", ""),
        ("<br>", ""),
    )
    cleaned = input
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def get_index_page_details(browser: webdriver.Chrome, url: str) -> tuple[list, list, list, list]:
    """Collect the per-course data shown on one calendar listing page.

    Args:
        browser: Driver used to load the page.
        url: Listing page to scrape.

    Returns:
        ``(links, titles, course_date, course_meta_data)``:
        hrefs and innerHTML of the ``link--extended`` elements, the
        ``py-1`` innerHTML values run through ``format_date``, and the
        ``redactor`` innerHTML values run through ``clean_meta_data`` with the
        first one removed. The four lists are not guaranteed to have equal
        lengths.

    Raises:
        IndexError: If the page has no ``redactor`` element at all.
    """
    browser.get(url)

    def inner_html(element) -> str:
        return element.get_attribute('innerHTML')

    # Course anchors supply both the detail link and the title
    course_anchors = browser.find_elements(By.CLASS_NAME, "link--extended")
    links = []
    for anchor in course_anchors:
        links.append(anchor.get_attribute('href'))
    titles = []
    for anchor in course_anchors:
        titles.append(inner_html(anchor))

    # Dates
    course_date = []
    for date_element in browser.find_elements(By.CLASS_NAME, "py-1"):
        course_date.append(format_date(inner_html(date_element).strip()))

    # Meta data blocks
    course_meta_data = []
    for meta_element in browser.find_elements(By.CLASS_NAME, "redactor"):
        course_meta_data.append(clean_meta_data(inner_html(meta_element)))
    # The first "redactor" block is discarded
    # (presumably it is page-level text rather than a course entry — verify against the site)
    course_meta_data.pop(0)

    return links, titles, course_date, course_meta_data
    
    
def get_details(browser: webdriver.Chrome, url: str) -> tuple[str, str, str, str, str, str, str, str, str, str, str]:
    """Scrape the fields of a single course detail page.

    Loads ``url`` (and prints it), then reads the sub title, intro, the two
    "practical-info" tables, the external detail link and the body text.
    Any field that is not found is returned as ``""``.

    Args:
        browser: Driver used to load the page.
        url: Course detail page.

    Returns:
        ``(start_time, location, language, target_group, subscription,
        constraint, price, details, sub_title, intro, full_body)``.
        All items are raw innerHTML/href strings except ``full_body``, which
        is a *list* of innerHTML strings (possibly empty) despite the
        annotation.
    """
    browser.get(url)
    print(url)

    def attribute_values(xpath: str, attribute: str = 'innerHTML') -> list:
        # Read one attribute from every element matched by the XPath
        matches = browser.find_elements(By.XPATH, xpath)
        return [node.get_attribute(attribute) for node in matches]

    def first_or_blank(values: list):
        # First entry, or "" when nothing matched
        return values[0] if len(values) > 0 else ""

    # Sub title and intro are located purely by position inside page-main
    # NOTE(review): on pages with a different layout these positions may hold other content — verify
    sub_title = first_or_blank(attribute_values("//*[contains(@class,'page-main')]/div/div/div/div[2]"))
    intro = first_or_blank(attribute_values("//*[contains(@class,'page-main')]/div/div/div/div[4]"))

    # Left table: each entry is recognised by the icon it references.
    # Markers are tested in insertion order and a later entry overrides an earlier one.
    by_icon = {"svg#groups": "", "svg#date-range": "", "svg#place": "", "svg#globe": ""}
    for fragment in attribute_values("//div[contains(@class,'practical-info')]/div/div/div/div"):
        for icon in by_icon:
            if icon in fragment:
                by_icon[icon] = fragment
                break
    target_group = by_icon["svg#groups"].replace("Doelgroep: ","")
    start_time = by_icon["svg#date-range"]
    location = by_icon["svg#place"]
    language = by_icon["svg#globe"]

    # Right table: each entry is recognised by its label, which is then stripped
    by_label = {"Register until": "", "Prerequisites": "", "Price": ""}
    for fragment in attribute_values("//div[contains(@class,'practical-info')]/div/div/div[2]/div/div[1]/div/ul/li"):
        for label in by_label:
            if label in fragment:
                by_label[label] = fragment.replace(label + ": ", "")
                break
    subscription = by_label["Register until"]
    constraint = by_label["Prerequisites"]
    price = by_label["Price"]

    # Detail URL
    details = first_or_blank(
        attribute_values("//div[contains(@class,'practical-info')]/div/div/div[2]/div/div[1]/div/a", 'href')
    )

    # Full body, with a fallback to a container nested one level less deep
    full_body = attribute_values("//div[contains(@class,'section--default')]/div[contains(@class,'container')]/div/div/div[contains(@class,'redactor')]")
    if not full_body:
        full_body = attribute_values("//div[contains(@class,'section--default')]/div[contains(@class,'container')]/div/div[contains(@class,'redactor')]")

    return start_time, location, language, target_group, subscription, constraint, price, details, sub_title, intro, full_body


def clean_data1(input: str) -> str:
    """Return the trimmed text that follows the first ``</svg>`` tag.

    Used for table entries that start with an inline SVG icon. When the input
    contains no ``</svg>`` the trimmed input is returned unchanged.

    Args:
        input: Raw innerHTML of a table entry (may be empty).

    Returns:
        The text after the icon markup, without surrounding whitespace.
    """
    closing_tag = '</svg>'
    text = input.strip()
    tag_start = text.find(closing_tag)
    if tag_start == -1:
        # find() returns -1 when the tag is absent; slicing from -1 + 6 used
        # to cut off the first five characters of icon-less values.
        return text
    return text[tag_start + len(closing_tag):].strip()


def clean_data2(input: str) -> str:
    """Trim surrounding whitespace, then turn every remaining newline into a space."""
    return input.strip().replace("\n", " ")


def get_clean_data(html: str) -> str:
    """Return the text content of an HTML fragment on a single line.

    The fragment is parsed with BeautifulSoup's ``html.parser``; newlines and
    tabs in the extracted text are replaced by spaces.

    Args:
        html: HTML fragment. ``None`` and NaN (how pandas represents an empty
            CSV cell) are treated as "no content".

    Returns:
        The extracted text, or ``""`` for a missing value.
    """
    # Missing cells arrive as float NaN (NaN != NaN); parsing them would raise.
    if html is None or (isinstance(html, float) and html != html):
        return ""
    text = BeautifulSoup(html, 'html.parser').get_text()
    return text.replace("\n", " ").replace("\t", " ")


def create_NER_column(data: pd.DataFrame) -> pd.Series:
    """Build the combined free-text column for a frame of scraped courses.

    Side effect: the ``full_body`` and ``intro`` columns of ``data`` are
    overwritten with their ``get_clean_data`` text.

    Returns:
        A Series joining title, full_body, intro, details, sub_title and
        course_info with single spaces, in that order.
    """
    # Replace the HTML columns by their cleaned text (mutates the caller's frame)
    for html_column in ('full_body', 'intro'):
        data[html_column] = data[html_column].apply(get_clean_data)

    combined = data['title']
    for column in ('full_body', 'intro', 'details', 'sub_title', 'course_info'):
        combined = combined + ' ' + data[column]
    return combined


def init_dataframe() -> pd.DataFrame:
    """Return an empty DataFrame that already has all result columns, in order."""
    column_names = (
        'title',
        'url',
        'date',
        'course_info',
        'start_time',
        'language',
        'location_detail',
        'target_group',
        'subscription_limit',
        'constraints',
        'price',
        'details',
        'sub_title',
        'intro',
        'full_body',
        'data',
    )
    df = pd.DataFrame()
    for name in column_names:
        # Assigning a scalar to an empty frame creates an empty column
        df[name] = ""
    return df


def get_index(page_list: list, browser: webdriver.Chrome, url: str, df: pd.DataFrame) -> pd.DataFrame:
    """Scrape every listing page and append one row per course to ``df``.

    For each page the URL and the number of links, titles, dates and
    meta-data entries found are printed as a progress log.

    Args:
        page_list: Listing page URLs to visit.
        browser: Driver used to load the pages.
        url: Unused; kept so existing calls keep working.
        df: Frame the new rows are appended to (it is not modified).

    Returns:
        A new frame holding the rows of ``df`` followed by the scraped rows
        (columns ``title``, ``url``, ``date``, ``course_info``), with a fresh
        index. ``df`` itself is returned when nothing was scraped.

    Raises:
        IndexError: If a page yields fewer meta-data entries than links.
    """
    # Collect plain dicts and concatenate once; growing the frame with
    # pd.concat per row copies it every time (quadratic).
    records = []
    for page_url in page_list:
        links, titles, course_date, course_meta_data = get_index_page_details(browser, page_url)

        print(page_url, len(links), len(titles), len(course_date), len(course_meta_data))

        for i, link in enumerate(links):
            # Some pages list fewer dates than courses; courses beyond the
            # last date get an empty date.
            # NOTE(review): dates are paired with courses by position, so on
            # such pages a date may belong to a different course — confirm.
            local_date = course_date[i] if i < len(course_date) else ""
            records.append({
                'title': titles[i],
                'url': link,
                'date': local_date,
                'course_info': course_meta_data[i],
            })

    if not records:
        return df
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)


def get_detail_pages(df: pd.DataFrame, browser: webdriver.Chrome, start:int = 0, stop:int = 40) -> pd.DataFrame:
    """Scrape the detail page of every row whose index label is in ``[start, stop)``.

    The scraped values are cleaned and written into the detail columns of
    ``df`` in place; rows outside the range are left untouched.

    Args:
        df: Frame with a ``url`` column; modified in place.
        browser: Driver used to load the detail pages.
        start: First index label to process (inclusive).
        stop: Index label at which to stop (exclusive).

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    detail_columns = [
        'start_time', 'location_detail', 'language', 'target_group',
        'subscription_limit', 'constraints', 'price', 'details',
        'sub_title', 'intro', 'full_body',
    ]
    # Columns that are still completely empty come back from read_csv as
    # float64. Writing strings into them is a deprecated incompatible-dtype
    # assignment, so make them object columns first.
    for column in detail_columns:
        if column in df.columns:
            df[column] = df[column].astype(object)

    for index, row in df.iterrows():
        if not (start <= index < stop):
            continue

        (start_time, location, language, target_group, subscription_limit,
         constraints, price, details, sub_title, intro, full_body) = get_details(browser, row['url'])

        # Icon-prefixed entries of the left table
        df.loc[index, 'start_time'] = clean_data1(start_time)
        df.loc[index, 'location_detail'] = clean_data1(location)
        df.loc[index, 'language'] = clean_data1(language)
        df.loc[index, 'target_group'] = clean_data1(target_group)
        # Plain text entries
        df.loc[index, 'subscription_limit'] = clean_data2(subscription_limit)
        df.loc[index, 'constraints'] = clean_data2(constraints)
        df.loc[index, 'price'] = clean_data2(price)
        df.loc[index, 'details'] = details
        df.loc[index, 'sub_title'] = clean_data2(sub_title)
        df.loc[index, 'intro'] = clean_data2(intro)
        # full_body is a list of HTML fragments; join them and drop commas
        df.loc[index, 'full_body'] = "".join(full_body).replace(","," ")

    return df


def delete_csv_files():
    """Delete every file matching ``*.csv`` in the current working directory."""
    for csv_path in glob.glob('*.csv'):
        os.remove(csv_path)

In [3]:
# Manual prerequisite: disable VPN

# Initiate: entry URL, output location, browser session, listing pages, empty result frame
url = "https://www.vaia.be/en/calendar&language%5B%5D=381"
output_file = 'vaia-data.csv'
browser = get_browser()
# The entry URL itself is crawled first, followed by up to 46 numbered listing pages
page_list = [url] + get_page_list(browser, url, 46)
df = init_dataframe()

page_list

['https://www.vaia.be/en/calendar&language%5B%5D=381',
 'https://www.vaia.be/en/calendar/p1&language[0]=381',
 'https://www.vaia.be/en/calendar/p2&language[0]=381',
 'https://www.vaia.be/en/calendar/p3&language[0]=381',
 'https://www.vaia.be/en/calendar/p4&language[0]=381',
 'https://www.vaia.be/en/calendar/p5&language[0]=381',
 'https://www.vaia.be/en/calendar/p6&language[0]=381',
 'https://www.vaia.be/en/calendar/p7&language[0]=381',
 'https://www.vaia.be/en/calendar/p8&language[0]=381',
 'https://www.vaia.be/en/calendar/p9&language[0]=381',
 'https://www.vaia.be/en/calendar/p10&language[0]=381',
 'https://www.vaia.be/en/calendar/p11&language[0]=381',
 'https://www.vaia.be/en/calendar/p12&language[0]=381',
 'https://www.vaia.be/en/calendar/p13&language[0]=381',
 'https://www.vaia.be/en/calendar/p14&language[0]=381',
 'https://www.vaia.be/en/calendar/p15&language[0]=381',
 'https://www.vaia.be/en/calendar/p16&language[0]=381',
 'https://www.vaia.be/en/calendar/p17&language[0]=381',
 '

In [4]:
#Get Index Page Details
try:
    df = get_index(page_list, browser, url, df)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # window), and the finally clause releases it even if scraping raises.
    browser.quit()
df

https://www.vaia.be/en/calendar&language%5B%5D=381 8 8 8 8
https://www.vaia.be/en/calendar/p1&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p2&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p3&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p4&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p5&language[0]=381 8 8 3 8
https://www.vaia.be/en/calendar/p6&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p7&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p8&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p9&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p10&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p11&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p12&language[0]=381 8 8 0 8
https://www.vaia.be/en/calendar/p13&language[0]=381 8 8 7 8
https://www.vaia.be/en/calendar/p14&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p15&language[0]=381 8 8 8 8
https://www.vaia.be/en/calendar/p16&language[0]=38

Unnamed: 0,title,url,date,course_info,start_time,language,location_detail,target_group,subscription_limit,constraints,price,details,sub_title,intro,full_body,data
0,Keeping things private: Exploring open-source ...,https://www.vaia.be/en/courses/keeping-things-...,15/04/2024,lecture &amp; workshop-Antwerp-TEXTUA; UAntwer...,,,,,,,,,,,,
1,Machine Learning with Python,https://www.vaia.be/en/courses/module-12-machi...,15/04/2024,course-Ghent-UGent,,,,,,,,,,,,
2,Current Trends in AI,https://www.vaia.be/en/courses/current-trends-...,17/04/2024,lezingenreeks-Brugge-KU Leuven Postuniversitai...,,,,,,,,,,,,
3,Digital Ethics,https://www.vaia.be/en/courses/digital-ethics-...,18/04/2024,course-hybrid-KU Leuven,,,,,,,,,,,,
4,AI &amp; Digital Technologies,https://www.vaia.be/en/courses/ai-digital-tech...,19/04/2024,webinar-online-Vlerick Business School,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,Introduction to AI and Machine Learning for Bi...,https://www.vaia.be/en/courses/introduction-to...,11/10/2021,introduction course-online-VAIA,,,,,,,,,,,,
372,International School on Big Data,https://www.vaia.be/en/courses/international-s...,10/10/2021,Research Training Event-Beersheba; Israel-BigD...,,,,,,,,,,,,
373,"Siri, what’s your advice? On AI and moral judg...",https://www.vaia.be/en/courses/siri-wat-advise...,30/09/2021,seminar-Sense &amp; Sensibility of AI,,,,,,,,,,,,
374,Bioinformatics and AI seminar series,https://www.vaia.be/en/courses/bioinformatics-...,8/09/2021,monthly seminar-KU Leuven,,,,,,,,,,,,


In [5]:
# Start clean: this removes every *.csv file in the working directory
delete_csv_files()

# Two records are duplicates when title, url and date all match;
# the first occurrence is kept
columns = ['title', 'url', 'date']
df = df.drop_duplicates(subset=columns, keep='first')

# Save the index listing to disk as a pipe-separated backup
df.to_csv(output_file, index=False, sep='|', encoding='utf-8')

In [15]:
#Get Details Page Details by batch
df = pd.read_csv(output_file,delimiter='|')

browser = get_browser()
try:
    # Scraping is split into batches of index labels [start, stop): rerun this
    # cell with the next range (0-40, 40-80, ...). The CSV is rewritten after
    # every batch and reloaded at the top of the cell, so progress is kept.
    #368 is the last record
    df = get_detail_pages(df, browser, 0, 50)

    df.to_csv(output_file, sep='|', index=False, encoding='utf-8')
finally:
    # quit() ends the WebDriver session even when a detail page fails to load
    browser.quit()
df

https://www.vaia.be/en/courses/de-zoektocht-naar-anonieme-data


Unnamed: 0,title,url,date,course_info,start_time,language,location_detail,target_group,subscription_limit,constraints,price,details,sub_title,intro,full_body,data
0,Keeping things private: Exploring open-source ...,https://www.vaia.be/en/courses/keeping-things-...,15/04/2024,lecture &amp; workshop-Antwerp-TEXTUA; UAntwer...,15 Apr 2024 10:00 - 16:00,English,"City Campus of the University of Antwerp, Buil...",Target audience: researchers from academia and...,12 Apr 2024,for the lecture there are no prerequisites; fo...,€50-€150 (see Practical information below),https://www.uantwerpen.be/en/research-faciliti...,15 Apr 2024 10:00 - 16:00,"<a href=""https://www.uantwerpen.be/en/research...",\n <h4>Target audience<br></h4>\n<p...,
1,Machine Learning with Python,https://www.vaia.be/en/courses/module-12-machi...,15/04/2024,course-Ghent-UGent,15 Apr 2024 - 3 Jun 2024,English,"Krijgslaan 281, 9000 Gent",Target audience: This course targets professio...,,Participants are expected to be familiar with ...,€600 - €1470,https://beta-academy.ugent.be/en/program/short...,Tackle the analytical part of data mining proj...,Tackle the analytical part of data mining proj...,\n\t\tTackle the analytical part of data minin...,
2,Current Trends in AI,https://www.vaia.be/en/courses/current-trends-...,17/04/2024,lezingenreeks-Brugge-KU Leuven Postuniversitai...,17 Apr 2024 - 25 Apr 2024,English,"Spoorwegstraat 12, 8200 Brugge",Target audience: AI-professionals who wish to ...,10 Apr 2024,,€240-€800,https://puc.kuleuven.be/nl/opleiding/current_t...,Bring AI-professionals up to speed with the la...,<p>The world of artificial intelligence evolve...,\n <h4><strong>Advanced GenAI (17 A...,
3,Digital Ethics,https://www.vaia.be/en/courses/digital-ethics-...,18/04/2024,course-hybrid-KU Leuven,18 Apr 2024 - 20 Jun 2024,English,Hybrid,Target audience: professionals looking to expa...,31 Mar 2024,,€ 1500,https://www.kuleuven.be/digisoc/education-and-...,18 Apr 2024 - 20 Jun 2024,"<a href=""https://www.kuleuven.be/digisoc/educa...",\n <p>As our reliance on technology...,
4,AI &amp; Digital Technologies,https://www.vaia.be/en/courses/ai-digital-tech...,19/04/2024,webinar-online-Vlerick Business School,19 Apr 2024 12:30 - 13:30,English,Online,Target audience: business professionals,,,free,https://www.vlerick.com/en/events/online-info-...,Business value with artificial intelligence: e...,This session aims to demystify AI for business...,\n <p>AI technology can have a huge...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,Introduction to AI and Machine Learning for Bi...,https://www.vaia.be/en/courses/introduction-to...,11/10/2021,introduction course-online-VAIA,11 Oct 2021 - 15 Nov 2021,English,Online,Target audience: PhD,30 Sept 2021,No prior knowledge is expected,free,https://www.vlaamse-ai-academie.be/calendar/in...,11 Oct 2021 - 15 Nov 2021,"<a href=""https://www.vlaamse-ai-academie.be/ca...",\n <p>Huge amounts of data are avai...,
364,International School on Big Data,https://www.vaia.be/en/courses/international-s...,10/10/2021,Research Training Event-Beersheba; Israel-BigD...,10 Oct 2021 - 14 Oct 2021,English,Abroad,"Target audience: students, researchers,...",,,€100 - 500,https://irdta.eu/bigdat2021a/,10 Oct 2021 - 14 Oct 2021,"<a href=""https://irdta.eu/bigdat2021a/"" class=...",\n <p>Big data is a broad field cov...,
365,"Siri, what’s your advice? On AI and moral judg...",https://www.vaia.be/en/courses/siri-wat-advise...,30/09/2021,seminar-Sense &amp; Sensibility of AI,30 Sept 2021 14:00 - 15:00,English,Online,Target audience: researchers with knowledge of...,,master's degree,free,,30 Sept 2021 14:00 - 15:00,"<a href=""https://twitter.com/intent/tweet?text...",\n <p>Giubilini and Savulescu (2018...,
366,Bioinformatics and AI seminar series,https://www.vaia.be/en/courses/bioinformatics-...,8/09/2021,monthly seminar-KU Leuven,8 Sept 2021 - 8 Jun 2022,English,ON5 Aula (04.112) in Leuven or online,Target audience: KU Leuven students,09 Mar 2022,,Free,https://gbiomed.kuleuven.be/english/networks/b...,8 Sept 2021 - 8 Jun 2022,"<a href=""https://gbiomed.kuleuven.be/english/n...",\n <p><strong>Wednesday 13 April 2...,


In [10]:
output_file = 'vaia-data.csv'
#Load the scraped details
df = pd.read_csv(output_file,delimiter='|')

#Only take English courses; copy so the column assignments below act on an
#independent frame instead of a slice of the loaded one
df = df[df['language'] == 'English'].copy()

#Remove invalid records: rows whose body or intro is missing (NaN after read_csv)
missing_text = df['full_body'].isna() | df['intro'].isna()
for index, record_url in df.loc[missing_text, 'url'].items():
    print(index, record_url)
df = df.loc[~missing_text].copy()

#When all data is available: build the combined text column used for NER
df['data'] = create_NER_column(df)

#Remove invalid records: the combined text is NaN when any of its parts is missing
missing_data = df['data'].isna()
for index, record_url in df.loc[missing_data, 'url'].items():
    print(index, record_url)
df = df.loc[~missing_data]

df.to_csv(output_file,sep='|',index=False, encoding='utf-8')

35 https://www.vaia.be/en/courses/business-process-analytics
36 https://www.vaia.be/en/courses/social-network-analytics
37 https://www.vaia.be/en/courses/legal-protection-by-design-for-ai-systems
41 https://www.vaia.be/en/courses/ethics-and-ai
43 https://www.vaia.be/en/courses/scaling-success-how-generative-ai-is-revolutionizing-customer-experience-cx
46 https://www.vaia.be/en/courses/the-turing-lectures
47 https://www.vaia.be/en/courses/standard-for-public-code
48 https://www.vaia.be/en/courses/impact-ai-in-journalism
52 https://www.vaia.be/en/courses/text-analytics
55 https://www.vaia.be/en/courses/solving-public-problems-with-data
56 https://www.vaia.be/en/courses/quantum-machine-learning
57 https://www.vaia.be/en/courses/tensorlab
58 https://www.vaia.be/en/courses/lifelong-learning-program
59 https://www.vaia.be/en/courses/machine-learning-bias-in-bias-out
60 https://www.vaia.be/en/courses/fraud-analytics
62 https://www.vaia.be/en/courses/advanced-credit-risk-modeling-for-basel-ifr