In [None]:
import io
import re
import urllib.parse

import pandas
import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

In [None]:
def get_is_academia_page(page, params=None, timeout=30):
    """Download one page of the IS-Academia public report and return its text.

    Parameters
    ----------
    page : str
        Name of the page to fetch: 'filter' or 'results'.
    params : dict, optional
        Extra query-string parameters. They are merged over the report's
        base parameters, so a key given here overrides the base value.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that an unresponsive server cannot
        block the notebook forever.

    Returns
    -------
    str
        The body of the HTTP response, as decoded by ``requests``.

    Raises
    ------
    ValueError
        If ``page`` is not one of the allowed page names.
    requests.exceptions.HTTPError
        If the server answers with an error status code.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout``.
    """
    BASE_URL = 'https://isa.epfl.ch/imoniteur_ISAP/'
    PAGES = {'filter': '!GEDPUBLICREPORTS.filter', 'results': '!GEDPUBLICREPORTS.html'}
    BASE_PARAMS = {'ww_i_reportmodel': '133685247', # Registered students by section and semester
                   'ww_i_reportModelXsl': '133685270'}

    # The isinstance check comes first so that unhashable values (e.g. a list)
    # are rejected with ValueError instead of failing inside the dict lookup.
    if not isinstance(page, str) or page not in PAGES:
        allowed_pages = ', '.join(PAGES.keys())
        raise ValueError('page argument must be: {}'.format(allowed_pages))

    # Build a fresh dict on every call; caller-supplied values win over the base ones.
    query = dict(BASE_PARAMS)
    if params:
        query.update(params)

    url = urllib.parse.urljoin(BASE_URL, PAGES[page])
    r = requests.get(url, params=query, timeout=timeout)
    r.raise_for_status() # Raise an exception if we can't get the page

    return r.text

In [None]:
def construct_dict_from_select(soup, match_name):
    """Return a {label: value} dict for the options of one <select> element.

    ``soup`` is queried with a CSS selector for the direct <option> children
    of the <select> whose ``name`` attribute equals ``match_name``. Options
    whose ``.string`` is empty or None are left out.
    """
    options = soup.select('select[name="{}"] > option'.format(match_name))

    mapping = {}
    for option in options:
        label = option.string
        if not label:
            # Skip options without a usable label
            continue
        mapping[label] = option['value']
    return mapping

In [None]:
# Fetch the filter page and parse only its <select> elements, which is all
# construct_dict_from_select needs.
strainer = SoupStrainer('select')
page = get_is_academia_page('filter')
soup = BeautifulSoup(page, 'html.parser', parse_only=strainer)

# Option label -> option value for two of the filter's drop-down lists.
# NOTE(review): presumably ww_x_UNITE_ACAD lists departments and
# ww_x_PERIODE_ACAD lists academic years -- confirm against the page.
departments = construct_dict_from_select(soup, 'ww_x_UNITE_ACAD')
# Raises KeyError if the page has no option labelled 'Informatique'.
cs_department = departments['Informatique']
years = construct_dict_from_select(soup, 'ww_x_PERIODE_ACAD')

# Display the year mapping as the cell output.
years

In [None]:
def parse_semesters_list(year_id, department_id):
    """Return a {link text: ww_x_GPS id} dict for one year and department.

    The filter page is requested with ``ww_b_list=1`` plus the given year and
    department ids. Every <a> element whose ``onclick`` attribute contains
    ``ww_x_GPS=<digits>`` contributes one entry: its text is the key and the
    captured digits (as a string) are the value. Links without an ``onclick``
    attribute, or whose ``onclick`` has no numeric ``ww_x_GPS``, are ignored.

    Parameters
    ----------
    year_id
        Value sent as the ``ww_x_PERIODE_ACAD`` query parameter.
    department_id
        Value sent as the ``ww_x_UNITE_ACAD`` query parameter.

    Returns
    -------
    dict
        Mapping of link text to id string; empty if no link matches.
    """
    params = {'ww_b_list': 1,
              'ww_x_PERIODE_ACAD': year_id,
              'ww_x_UNITE_ACAD': department_id}

    page = get_is_academia_page('filter', params)
    soup = BeautifulSoup(page, 'html.parser')

    # Raw string: '\d' inside a normal string literal is an invalid escape sequence.
    gps_pattern = re.compile(r'ww_x_GPS=(\d+)')

    semesters = {}
    for tag in soup.find_all('a'):
        # Not every link has an onclick handler; treat a missing one as "no id"
        # instead of letting the attribute lookup raise KeyError.
        match = gps_pattern.search(tag.get('onclick') or '')
        if match is not None:
            semesters[tag.text] = match.group(1)
    return semesters

In [None]:
# Collect the mappings returned by parse_semesters_list for every year of the
# selected department into a single dict. dict.update overwrites, so a key
# returned for several years keeps the id from the last year processed.
semesters_ids = {}
for year_id in years.values():
    semesters_ids.update(parse_semesters_list(year_id, cs_department))

# Display the combined mapping as the cell output.
semesters_ids

In [None]:
def get_people(semester_id):
    """Return the results table for one semester as a pandas DataFrame.

    The 'results' page is requested with ``ww_x_GPS=semester_id`` and the
    first HTML table found in it is parsed.

    Parameters
    ----------
    semester_id
        Value sent as the ``ww_x_GPS`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by its column at position 10, without its
        last column.
    """
    page = get_is_academia_page('results', {'ww_x_GPS': semester_id})
    # Wrap the HTML in StringIO: passing a literal HTML string to read_html is
    # deprecated in recent pandas releases.
    # header=1 takes the column names from the second table row and
    # index_col=10 uses the eleventh column as the index.
    # NOTE(review): presumably the first row is a title row -- confirm.
    df = pandas.read_html(io.StringIO(page), header=1, index_col=10)[0]
    # NOTE(review): the last column is dropped, presumably because it is
    # empty in the source table -- confirm.
    return df.drop(df.columns[-1], axis=1)

In [None]:
# Example: show the table for one semester, looked up by its label in
# semesters_ids (raises KeyError if that label was not collected).
get_people(semesters_ids['Informatique, 2016-2017, Master semestre 3'])