In [36]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [27]:
# Scrape the JUPAS website
url_JUPAS = 'https://www.jupas.edu.hk/en/programmes-offered/by-funding-category/'
response_JUPAS = requests.get(url_JUPAS)
soup_JUPAS = BeautifulSoup(response_JUPAS.content, 'html.parser')

# Extract all the university abbreviations from the class 'schools_container'
# The "a" elements are inside the div with the class 'schools_container'
# The university abbreviations are the last part of the URL
university_abbrs = []
for div in soup_JUPAS.find_all('div', class_='schools_container'):
    for a in div.find_all('a'):
        university_abbrs.append(a['href'].split('/')[-1])

# Replace 'programme-information' with 'sssdp'
university_abbrs = [abbr.replace('programme-information', 'sssdp') for abbr in university_abbrs]


# Scrape the programme information for each university
# The URL for the programme list is https://www.jupas.edu.hk/en/programme/{university_abbr}/
# Loop through each university abbreviation

university_data = []

for university in university_abbrs:
    # Create the URL
    url_uni = 'https://www.jupas.edu.hk/en/programme/' + university + '/'

    # Scrape the website
    response_uni = requests.get(url_uni)
    soup_uni = BeautifulSoup(response_uni.content, 'html.parser')

    # Extract the table from the website
    table = soup_uni.find('table', class_='program_table program_table-hasFC')

    # Extract the column names from the table
    column_names = [th.text for th in table.find_all('th')]

    # Add the column names for the url and Chinese name
    column_names.append('chinese_name')
    column_names.append('url')

    # Create a list of dictionaries to store the data
    datalist = []

    # Extract the data from the table
    # Include the url of the programme in the DataFrame in url column. The url is in the <a> tag within the <td> tage with class 'c-no'

    for tr in table.find_all('tr'):
        # Ignore the first row of the table, which is the column names
        if tr.find('th'):
            continue
        
        # For the column "Programme Full Title", the English name is the text of the <td> tag with class="c-ft". The Chinese name is the text of the <span> tag with class="tname-cn". Ignore the class="label" <span> tag.
        # Separate the text into Chinese and English into two columns, with the English name in the "Programme Full Title" column and the Chinese name in the "Chinese Name" column.
        
        # Extract the data from each row
        row = [td.text for td in tr.find_all('td')]

        # Extract the English name of the programme and replace the English name with the Full Title in the row
        english_name = tr.find('td', class_='c-ft').contents[0].strip()
        row[-1] = english_name

        # Extract the url of the programme
        url = tr.find('td', class_='c-no').find('a')['href']

        # Extract the Chinese name of the programme
        chinese_name = tr.find('td', class_='c-ft').find('span', class_='tname-cn').text

        # Add the url and Chinese name to the row
        row.append(chinese_name)
        row.append(url)

        # Create a dictionary from the row
        data = dict(zip(column_names, row))

        # Add the dictionary to the list
        datalist.append(data)

    # Add the list of dictionaries to the university_data list
    university_data.extend(datalist)

    # Add a progress counter to check the progress
    print(f'Progress: {len(university_data)}/376', end='\r')

df_all = pd.DataFrame(university_data).set_index('JUPAS Catalogue No.')

Progress: 376/376

In [24]:
df_all.to_excel('2024 JUPAS Program Overview.xlsx')

In [None]:
offer_table = pd.DataFrame()
for program in university_data:

    print(f'Currently Scraping: {index}, Progress: {count+1}/{df_all.shape[0]}', end='\r')
    
    # URL to scape:
    url_programme = 'https://www.jupas.edu.hk/' + row['url']

    # Which school
    school = row['Institution / Scheme']

    # Scrape the website
    response_programme = requests.get(url_programme)
    soup_programme = BeautifulSoup(response_programme.content, 'html.parser')

    # Getting the quota
    quota = soup_programme.find('div', class_='programInfo_block programInfo_block-firstyear').text.strip()
    quota = re.sub(r'\D', '', quota)


    title = soup_programme.find('p', class_='strokeBar_title', string="Statistics")

    if title:
        div = title.find_parent('div', class_='strokeBar_box')   

        tables = div.find_all('table', class_='js-swrapTable program_brand_table js-swiptable statistic-table')

        a_stat = []
        o_stat = []

        for table in tables:

            # Extract the table body
            table_body = table.find('tbody').find_all('tr')
            table_rows = [[item.text.strip() for item in rows.find_all('td')] for rows in table_body]

            header = table_rows[0]
            formatted_data = []

            for row in table_rows[1:]:
                formatted_row = dict(zip(header, row))
                formatted_data.append(formatted_row)
            
            # In this loop, we will have two tables
            # Append the first table to the Application Statistics
            # Append the second table to the Offer Statistics
            if tables.index(table) == 0:
                a_stat = formatted_data
            else:
                o_stat= formatted_data

        a_df=pd.DataFrame.from_dict(a_stat)
        a_df['JUPAS'] = index
        a_df['Type'] = "Application"
        a_df['School'] = school
        a_df['Quota'] = quota

        o_df=pd.DataFrame.from_dict(o_stat)
        o_df['JUPAS'] = index
        o_df['Type'] = "Offer"
        o_df['School'] = school
        o_df['Quota'] = quota

        combined_df = pd.concat([a_df,o_df], ignore_index=True)
        offer_table = pd.concat([offer_table,combined_df], ignore_index=True)
    else:
        new_row = {'JUPAS':index, 'School':school,'Quota':quota}
        
        offer_table= pd.concat([offer_table, pd.DataFrame([new_row])], ignore_index=True)

#offer_table.to_excel('2024 JUPAS Offer Table.xlsx', index=False)
offer_table

In [58]:
offer_table = pd.DataFrame()
for count,program in enumerate(university_data):

    print(f'Currently Scraping: {program["JUPAS Catalogue No."]}, Progress: {count+1}/{len(university_data)}', end='\r')
    
    # URL to scape:
    url_programme = 'https://www.jupas.edu.hk/' + program['url']

    # Which school
    school = program['Institution / Scheme']

    # Scrape the website
    response_programme = requests.get(url_programme)
    soup_programme = BeautifulSoup(response_programme.content, 'html.parser')

    # Getting the quota
    quota = soup_programme.find('div', class_='programInfo_block programInfo_block-firstyear').text.strip()
    quota = re.sub(r'\D', '', quota)


    title = soup_programme.find('p', class_='strokeBar_title', string="Statistics")

    if title:
        div = title.find_parent('div', class_='strokeBar_box')   

        tables = div.find_all('table', class_='js-swrapTable program_brand_table js-swiptable statistic-table')

        a_stat = []
        o_stat = []

        for table in tables:

            # Extract the table body
            table_body = table.find('tbody').find_all('tr')
            table_rows = [[item.text.strip() for item in rows.find_all('td')] for rows in table_body]

            header = table_rows[0]
            formatted_data = []

            for row in table_rows[1:]:
                formatted_row = dict(zip(header, row))
                formatted_data.append(formatted_row)
            
            # In this loop, we will have two tables
            # Append the first table to the Application Statistics
            # Append the second table to the Offer Statistics
            if tables.index(table) == 0:
                a_stat = formatted_data
            else:
                o_stat= formatted_data

        a_df=pd.DataFrame.from_dict(a_stat)
        a_df['JUPAS'] = program["JUPAS Catalogue No."]
        a_df['Type'] = "Application"
        a_df['School'] = school
        a_df['Quota'] = quota

        o_df=pd.DataFrame.from_dict(o_stat)
        o_df['JUPAS'] = program["JUPAS Catalogue No."]
        o_df['Type'] = "Offer"
        o_df['School'] = school
        o_df['Quota'] = quota

        combined_df = pd.concat([a_df,o_df], ignore_index=True)
        offer_table = pd.concat([offer_table,combined_df], ignore_index=True)
    else:
        new_row = {'JUPAS':program["JUPAS Catalogue No."], 'School':school,'Quota':quota}
        
        offer_table= pd.concat([offer_table, pd.DataFrame([new_row])], ignore_index=True)

offer_table.to_excel('2024 JUPAS Offer Table.xlsx', index=False)

Currently Scraping: JS1108, Progress: 30/376

AttributeError: 'NoneType' object has no attribute 'text'