In [None]:
import io
import re
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
from scipy import stats

In [None]:
def get_is_academia_page(page, params=None, timeout=30):
    """
    Return the content of an IS-Academia webpage as a string.

    Parameters
    ----------
    page : str
        Which page to fetch; must be one of the keys of PAGES
        ('filter' or 'results').
    params : dict, optional
        Extra query-string parameters. They are merged over the report
        defaults, so a key given here overrides the default of the same name.
    timeout : float or tuple, optional
        Passed to `requests.get` so that a stalled server cannot block the
        notebook forever.

    Returns
    -------
    str
        The body of the HTTP response (`Response.text`).

    Raises
    ------
    ValueError
        If `page` is not one of the allowed page names.
    requests.HTTPError
        If the server answers with an error status (via `raise_for_status`).
    """

    BASE_URL = 'https://isa.epfl.ch/imoniteur_ISAP/'
    PAGES = {'filter': '!GEDPUBLICREPORTS.filter', 'results': '!GEDPUBLICREPORTS.html'}
    BASE_PARAMS = {'ww_i_reportmodel': '133685247',  # Registered students by section and semester
                   'ww_i_reportModelXsl': '133685270'}

    # Check the validity of the `page` parameter. The isinstance test comes
    # first so that unhashable values never reach the dictionary lookup.
    if not isinstance(page, str) or page not in PAGES:
        allowed_pages = ', '.join(PAGES.keys())
        raise ValueError('page argument must be: {}'.format(allowed_pages))

    url = urllib.parse.urljoin(BASE_URL, PAGES[page])

    # `params` defaults to None instead of a shared mutable dict
    query = {**BASE_PARAMS, **(params or {})}

    r = requests.get(url, params=query, timeout=timeout)
    r.raise_for_status()  # Raise an exception if we can't get the page

    return r.text

In [None]:
def construct_dict_from_select(soup, match_name):
    """
    Map the visible text of each <option> to its `value` attribute.

    Only <option> tags that are direct children of the <select> whose `name`
    attribute equals `match_name` are considered. Options whose `.string` is
    empty or None are left out.
    """
    selector = 'select[name="{}"] > option'.format(match_name)

    # Text shown to the user -> internal ID used by IS-Academia
    mapping = {}
    for option in soup.select(selector):
        label = option.string
        if label:
            mapping[label] = option['value']
    return mapping

In [None]:
# Use a strainer to parse only the <select> tags in the page
strainer = SoupStrainer('select')
# Fetch the filter form (this performs an HTTP request)
page = get_is_academia_page('filter')
soup = BeautifulSoup(page, 'html.parser', parse_only=strainer)

# Create a dictionary mapping the departments ids
departments = construct_dict_from_select(soup, 'ww_x_UNITE_ACAD')
# Keep the ID of the 'Informatique' entry; raises KeyError if that label is absent
cs_department = departments['Informatique']
# Create a dictionary mapping the years ids
years = construct_dict_from_select(soup, 'ww_x_PERIODE_ACAD')

# Display the label -> ID mapping of the academic years
years

In [None]:
def parse_semesters_list(year_id, department_id):
    """
    Return a dict mapping each semester link's text to its `ww_x_GPS` ID.

    The filter page is requested for the given academic year and department,
    and every <a> tag is inspected. The ID is read from the Javascript in the
    link's `onclick` attribute. Links without an `onclick` attribute, or whose
    `onclick` holds no `ww_x_GPS=<digits>` value, are skipped.

    Parameters
    ----------
    year_id : value sent as `ww_x_PERIODE_ACAD`
    department_id : value sent as `ww_x_UNITE_ACAD`

    Returns
    -------
    dict
        Link text -> ID (the ID is kept as a string of digits).
    """
    # HTTP parameters
    params = {'ww_b_list': 1,
              'ww_x_PERIODE_ACAD': year_id,
              'ww_x_UNITE_ACAD': department_id}

    page = get_is_academia_page('filter', params)
    strainer = SoupStrainer('a')  # Only parse the <a> tags
    soup = BeautifulSoup(page, 'html.parser', parse_only=strainer)

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    # Only digits are captured, so an ID with a leading '-' does not match.
    gps_pattern = re.compile(r'ww_x_GPS=(\d+)')

    semesters = {}
    for tag in soup.find_all('a'):
        # .get() instead of [] so a link without `onclick` is skipped
        # rather than raising KeyError
        match = gps_pattern.search(tag.get('onclick') or '')
        if match is not None:
            semesters[tag.text] = match.group(1)
    return semesters

In [None]:
# Create a dict mapping all CS semesters from all years to their internal ID used by IS-Academia
# Accumulates semester label -> ID over every academic year
semester_ids = {}
for year_id in years.values():
    # One HTTP request per academic year; a label seen again overwrites the earlier entry
    semester_ids.update(parse_semesters_list(year_id, cs_department))

# Display the resulting mapping
semester_ids

In [None]:
def get_people(semester_id):
    """
    Return a dataframe containing the list of students for a given semester,
    or None if there isn't any table in the page.

    Parameters
    ----------
    semester_id : value sent as the `ww_x_GPS` parameter of the results page.

    Returns
    -------
    pandas.DataFrame or None
        The first table of the page, with the second row of the table used as
        header (`header=1`) and the last column removed.
    """
    page = get_is_academia_page('results', {'ww_x_GPS': semester_id})

    # Parse the HTML table using Pandas. The markup is wrapped in StringIO
    # because passing literal HTML to read_html is deprecated in recent pandas.
    try:
        frames = pd.read_html(io.StringIO(page), header=1)
    except ValueError:
        # read_html raises ValueError when the page has no table instead of
        # returning an empty list.
        # NOTE(review): other parse-time ValueErrors are mapped to None as well.
        return None

    if not frames:  # Defensive: nothing was parsed
        return None

    df = frames[0]

    # Drop the last column which is empty (due to IS-Academia formatting)
    return df.drop(df.columns[-1], axis=1)

## Do not execute the cell below!
It gets a lot of pages from IS-Academia. Use the next cell to load the data from a pickle file instead.

In [None]:
#frames = []
#for semester, semester_id in semester_ids.items():
#    df = get_people(semester_id)
#    
#    # Add a column containing the semester (if we can get the dataframe)
#    if df is not None:
#        df['semester'] = semester
#    frames.append(df)
#    
## Concatenate all semesters from all years
#df = pd.concat(frames)

## Load the data from a pickle file

In [None]:
# Load the student table from a local pickle file instead of querying IS-Academia.
# NOTE(review): unpickling can execute arbitrary code; only load a file you produced yourself.
df = pd.read_pickle('cs_dump.pkl')
# Replace the column labels with short names (the frame must have exactly 12 columns)
df.columns = ['title', 'name', 'orientation_ba', 'orientation_ma', 'specialization', 'filiere_opt', 'minor', 'status', 'exchange_type', 'exchange_school', 'sciper', 'semester']
# Disabled: the 'No Sciper' label below is no longer a column name after the
# assignment above (presumably it is now 'sciper' -- confirm before re-enabling).
#df.set_index(['semester', 'No Sciper'], inplace=True)
#df.sort_index(inplace=True)

In [None]:
# Take the 'semester' column and split it into 'section', 'years' and 'semester'.
# Each label is expected to hold exactly three parts separated by ', ';
# assigning the three column names below fails otherwise.
# The vectorised string split replaces a row-by-row apply that built one
# Series per row.
cols = df['semester'].str.split(', ', expand=True)
cols.columns = ['section', 'years', 'semester']

# Same rows as `df`, with the combined column replaced by its three parts
data = pd.concat([df.drop('semester', axis=1), cols], axis=1)

# Show only the first rows instead of the whole table
data.head()

In [None]:
# Bachelor duration per student, compared between the two values of `title`.
# `stats` (scipy) is imported in the import cell at the top of the notebook.

# Boolean masks for the rows of the first and of the last Bachelor semester
index1 = data['semester'].str.contains('Bachelor semestre 1', regex=False, na=False)
index6 = data['semester'].str.contains('Bachelor semestre 6', regex=False, na=False)

list1 = data[index1]
list6 = data[index6]

started = {}
ended = {}
total = {}  # not used in this cell; kept in case another cell reads it

# Earliest starting year of semester 1 per student. `years` is split on '-'
# and its first part is used; years are compared as strings.
for sciper, years in zip(list1['sciper'], list1['years']):
    year = years.split('-')[0]
    started[sciper] = min(started.get(sciper, year), year)

# Latest ending year of semester 6 per student (second part of `years`)
for sciper, years in zip(list6['sciper'], list6['years']):
    year = years.split('-')[1]
    ended[sciper] = max(ended.get(sciper, year), year)

# Only students seen in both semester 1 and semester 6 have a measurable duration
d = sorted(set(started) & set(ended))

# Difference between the two years, multiplied by 12 to express it in months
average = {sciper: (int(ended[sciper]) - int(started[sciper])) * 12 for sciper in d}

# Attach the title of each student; drop_duplicates keeps one row per (sciper, title)
df2 = pd.DataFrame(d, columns=['sciper'])
m = pd.merge(df2, data[['sciper', 'title']], on='sciper', how='inner').drop_duplicates()

# Share of each title among the selected students
group = m.groupby('title')
title_share = group.count() / len(m)

# Add the duration as a properly named column. The previous
# `m.rename({0: 'duration'})` renamed index labels (the default axis), so the
# column was still called 0.
df3 = pd.Series(average, name='duration')
m = m.assign(duration=m['sciper'].map(df3))

# Two-sample t-test on the durations of the two groups
s1 = m.loc[m['title'] == 'Monsieur', 'duration'].tolist()
s2 = m.loc[m['title'] == 'Madame', 'duration'].tolist()
t_stat, p_val = stats.ttest_ind(s1, s2)

# Mean duration (in months) per title; only the duration column is averaged
m.groupby('title')[['duration']].mean()

In [None]:
def find_lowest_year(table):
    """
    Return a dict mapping each `sciper` of `table` to its lowest starting year.

    The starting year is the part of the `years` value before the first '-'.
    Years are kept and compared as strings.
    """
    lowest = {}
    for _, record in table.iterrows():
        student = record.sciper
        first_year = record.years.split('-')[0]
        # Keep the smaller of the stored year and the current one
        lowest[student] = min(lowest.get(student, first_year), first_year)
    return lowest

def find_highest_year(table):
    """
    Return a dict mapping each `sciper` of `table` to its highest ending year.

    The ending year is the part of the `years` value after the first '-'.
    Years are kept and compared as strings.
    """
    highest = {}
    for _, record in table.iterrows():
        student = record.sciper
        last_year = record.years.split('-')[1]
        # Keep the larger of the stored year and the current one
        highest[student] = max(highest.get(student, last_year), last_year)
    return highest


In [None]:
# Ex 2: time spent in the Master programme

# Boolean masks for the Master semesters
index_ma1 = data['semester'].str.contains('Master semestre 1', regex=False, na=False)
index_ma2 = data['semester'].str.contains('Master semestre 2', regex=False, na=False)
# `index_ma3` was used below without ever being defined (NameError).
# NOTE(review): the label is assumed to follow the same pattern as semesters
# 1 and 2 -- check it against data.semester.unique().
index_ma3 = data['semester'].str.contains('Master semestre 3', regex=False, na=False)

# Boolean masks for the Master project semesters
index_mp1 = data['semester'].str.contains('Projet Master printemps', regex=False, na=False)
index_mp2 = data['semester'].str.contains('Projet Master automne', regex=False, na=False)

ma1 = data[index_ma1]
ma2 = data[index_ma2]
ma3 = data[index_ma3]

mp1 = data[index_mp1]
mp2 = data[index_mp2]

ma1_ma2 = pd.concat([ma1, ma2], axis=0)
mp1_mp2 = pd.concat([mp1, mp2], axis=0)

# Time spent in Master semesters 1 and 2. The end must be the highest year:
# using find_lowest_year for both ends made every duration equal to 0.
started = find_lowest_year(ma1_ma2)
ended = find_highest_year(ma1_ma2)
d = sorted(set(started) & set(ended))
# Year difference multiplied by 12 to express it in months
ma_duration = {sciper: (int(ended[sciper]) - int(started[sciper])) * 12 for sciper in d}

# Rows of Master semester 3 that have a minor
ma3_with_minor = ma3[~ma3.minor.isnull()]

# Time between ma1 and mp
started = find_lowest_year(ma1_ma2)
ended = find_highest_year(mp1_mp2)
# Only students present in both sets have a measurable duration
d = sorted(set(started) & set(ended))
average = {sciper: (int(ended[sciper]) - int(started[sciper])) * 12 for sciper in d}

# Attach title and specialization to the selected students
df2 = pd.DataFrame(d, columns=['sciper'])
m = pd.merge(df2, data[['sciper', 'title', 'specialization']], on='sciper', how='inner')
m = m.drop_duplicates()

# Display one row per student (the first one found for each sciper)
m.drop_duplicates('sciper')

In [None]:
# List the distinct values of the 'semester' column
data.semester.unique()