In [None]:
import io
import re
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer

In [None]:
def get_is_academia_page(page, params=None, timeout=30):
    """
        Return the content of an IS-Academia webpage as a string.

        Parameters
        ----------
        page : str
            Which page to fetch; must be one of the keys of PAGES
            ('filter' or 'results').
        params : dict, optional
            Extra query-string parameters. They are merged over BASE_PARAMS,
            so a key given here overrides the base value. None (the default)
            means "no extra parameters".
        timeout : float or tuple, optional
            Timeout in seconds forwarded to `requests.get`, so that a stalled
            server cannot block the notebook forever.

        Returns
        -------
        str
            The decoded body of the HTTP response.

        Raises
        ------
        ValueError
            If `page` is not a string or is not a known page name.
        requests.HTTPError
            If the server answers with an error status code.
    """

    BASE_URL = 'https://isa.epfl.ch/imoniteur_ISAP/'
    PAGES = {'filter': '!GEDPUBLICREPORTS.filter', 'results': '!GEDPUBLICREPORTS.html'}
    BASE_PARAMS = {'ww_i_reportmodel': '133685247', # Registered students by section and semester
                   'ww_i_reportModelXsl': '133685270'}

    # Check the validity of the `page` parameter (the isinstance test also
    # protects the membership test against unhashable arguments)
    if not isinstance(page, str) or page not in PAGES:
        allowed_pages = ', '.join(PAGES.keys())
        raise ValueError('page argument must be: {}'.format(allowed_pages))

    # A None default avoids sharing one mutable dict between calls
    extra_params = {} if params is None else params

    url = urllib.parse.urljoin(BASE_URL, PAGES[page]) # Create the URL
    r = requests.get(url, params={**BASE_PARAMS, **extra_params}, timeout=timeout)
    r.raise_for_status() # Raise an exception if we can't get the page

    return r.text

In [None]:
def construct_dict_from_select(soup, match_name):
    """
        Map the label of each <option> of a <select> tag to its `value` attribute.

        `soup` is queried with a CSS selector matching the <option> tags that are
        direct children of the <select> tag whose `name` attribute is `match_name`.
        Options without a (non-empty) label are left out of the result.
    """
    options = soup.select('select[name="{}"] > option'.format(match_name))

    # Dictionary mapping the displayed text to the internal ID used by IS-Academia
    label_to_id = {}
    for option in options:
        label = option.string
        if not label:
            continue
        label_to_id[label] = option['value']

    return label_to_id

In [None]:
# Download the filter form and parse only its <select> tags (the strainer
# makes BeautifulSoup ignore every other tag in the page)
strainer = SoupStrainer('select')
page = get_is_academia_page('filter')
soup = BeautifulSoup(page, 'html.parser', parse_only=strainer)

# Dictionary mapping each department label to its id (options of the
# 'ww_x_UNITE_ACAD' <select>); keep the id of the 'Informatique' entry
departments = construct_dict_from_select(soup, 'ww_x_UNITE_ACAD')
cs_department = departments['Informatique']
# Dictionary mapping each academic period label to its id (options of the
# 'ww_x_PERIODE_ACAD' <select>)
years = construct_dict_from_select(soup, 'ww_x_PERIODE_ACAD')

# Display the mapping as the cell output
years

In [None]:
def parse_semesters_list(year_id, department_id):
    """
        Return a dictionary mapping semester labels to their `ww_x_GPS` ids.

        The 'filter' page is requested for the given academic period and
        department, and every <a> tag of the answer is inspected: the id is
        read from the Javascript code found in the `onclick` attribute.

        Parameters
        ----------
        year_id
            Value sent as the `ww_x_PERIODE_ACAD` HTTP parameter.
        department_id
            Value sent as the `ww_x_UNITE_ACAD` HTTP parameter.

        Returns
        -------
        dict
            Link text -> id (as a string of digits). Links without an
            `onclick` attribute, or whose `onclick` holds no non-negative
            `ww_x_GPS` id, are skipped.
    """
    # HTTP parameters
    params = {'ww_b_list': 1,
              'ww_x_PERIODE_ACAD': year_id,
              'ww_x_UNITE_ACAD': department_id}

    page = get_is_academia_page('filter', params)
    strainer = SoupStrainer('a') # Only parse the <a> tags
    soup = BeautifulSoup(page, 'html.parser', parse_only=strainer)

    # Raw string: `\d` must reach the regex engine, not Python's string parser
    gps_pattern = re.compile(r'ww_x_GPS=(\d+)')

    # Extract the ww_x_GPS id from the Javascript code in the `onclick` attribute of links
    def extract_id(tag):
        # `.get` tolerates links that have no `onclick` attribute at all
        match = gps_pattern.search(tag.get('onclick') or '')
        # `\d+` does not match a minus sign, so negative ids yield None
        return match.group(1) if match else None

    # Create a dictionary mapping the semesters to their IDs
    semesters = {}
    for tag in soup.find_all('a'):
        gps_id = extract_id(tag)  # computed once per link
        if gps_id is not None:
            semesters[tag.text] = gps_id
    return semesters

In [None]:
# Create a dict mapping all CS semesters from all years to their internal ID used by IS-Academia
semester_ids = {}
for year_id in years.values():
    semester_ids.update(parse_semesters_list(year_id, cs_department))

semester_ids

In [None]:
def get_people(semester_id):
    """
        Return a dataframe containing the list of students for a given semester.

        Parameters
        ----------
        semester_id
            Value sent as the `ww_x_GPS` HTTP parameter of the 'results' page.

        Returns
        -------
        pandas.DataFrame or None
            The first table of the page, without its last column, or None
            if there isn't any table in the page.
    """
    page = get_is_academia_page('results', {'ww_x_GPS': semester_id})

    # Parse the HTML table using Pandas. The markup is wrapped in a StringIO
    # because passing literal HTML to read_html is deprecated in recent pandas.
    try:
        frames = pd.read_html(io.StringIO(page), header=1)
    except ValueError:
        # read_html raises ValueError when the page holds no table
        return None

    if not frames: # Returns None if we can't get any dataframe
        return None

    df = frames[0]

    # Drop the last column which is empty (due to IS-Academia formatting)
    return df.drop(df.columns[-1], axis=1)

## Do not execute the cell below!
It downloads one page per semester from IS-Academia, which takes a long time. Use the next cell to load the data from a pickle file instead.

In [None]:
# Download the student list of every CS semester (one request per semester)
frames = []
for semester, semester_id in semester_ids.items():
    df = get_people(semester_id)

    # get_people gives None when the page has no table: skip those semesters
    # so that only real dataframes reach pd.concat
    if df is None:
        continue

    # Add a column containing the semester each row comes from
    df['semester'] = semester
    frames.append(df)

# Concatenate all semesters from all years
df = pd.concat(frames)

## Load the data from a pickle file

In [None]:
# Load the dataframe from the local pickle file instead of querying IS-Academia
# (presumably a dump of the concatenated student lists -- verify its provenance).
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
df = pd.read_pickle('cs_dump.pkl')
#df.set_index(['semester', 'No Sciper'], inplace=True)
#df.sort_index(inplace=True)

In [None]:
# Keep the rows whose 'Nom Prénom' value starts with "Habegger", one whitespace
# character, then "L". The pattern is a raw string so that `\s` is handed to the
# regex engine instead of being treated as an (invalid) string escape.
tmp = df[df['Nom Prénom'].str.contains(r'^Habegger\sL')]
# Split each semester label on commas; the result is displayed as the cell output
tmp.semester.apply(lambda s: s.split(','))