# Table of Contents

### Imports

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

In [2]:
import requests
from bs4 import BeautifulSoup

### Preparing Data

First, we request the data manually on [IS-Academia](http://isa.epfl.ch/imoniteur_ISAP/%21gedpublicreports.htm?ww_i_reportmodel=133685247) and inspect with Postman what the query URL looks like. With `Informatique`, `2007-2008` and `Bachelor semestre 1` selected, the complete query URL is:

`http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=&ww_x_UNITE_ACAD=249847&zz_x_PERIODE_ACAD=&ww_x_PERIODE_ACAD=978181&zz_x_PERIODE_PEDAGO=&ww_x_PERIODE_PEDAGO=249108&zz_x_HIVERETE=&ww_x_HIVERETE=null&dummy=ok`

We can see that after the base URL, all the filters are composed like:

`filter_type=filter_value`

and different filters are concatenated with `&`.

However, the `html` file received does not contain any information about students. We need to click on `Tous` on the webpage to show all the students that match the filters. This new action also generates a query URL, which is:

`http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=249108&ww_x_HIVERETE=null`

So now we start with a base URL and collect information about all the options and filters. Then we can use the filters to generate query URLs to retrieve student data.

In [3]:
# Sample URLs used throughout this section, written one query parameter per
# line (adjacent string literals are concatenated by Python).

# URL of the filter form, with one option pre-selected for every filter
sampleFilterUrl = (
    'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
    '?ww_b_list=1'
    '&ww_i_reportmodel=133685247'
    '&ww_c_langue='
    '&ww_i_reportModelXsl=133685270'
    '&zz_x_UNITE_ACAD='
    '&ww_x_UNITE_ACAD=249847'
    '&zz_x_PERIODE_ACAD='
    '&ww_x_PERIODE_ACAD=978181'
    '&zz_x_PERIODE_PEDAGO='
    '&ww_x_PERIODE_PEDAGO=249108'
    '&zz_x_HIVERETE='
    '&ww_x_HIVERETE=null'
    '&dummy=ok'
)

# URL of the student list matching the same filter values
sampleQueryUrl = (
    'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'
    '?ww_x_GPS=-1'
    '&ww_i_reportModel=133685247'
    '&ww_i_reportModelXsl=133685270'
    '&ww_x_UNITE_ACAD=249847'
    '&ww_x_PERIODE_ACAD=978181'
    '&ww_x_PERIODE_PEDAGO=249108'
    '&ww_x_HIVERETE=null'
)

In [4]:
# Fetch the filter page from the sample filter URL
r = requests.get(sampleFilterUrl, timeout=30)
# Fail fast on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page and hitting a confusing failure in a later cell
r.raise_for_status()

# Parse the returned HTML into a BeautifulSoup object
soup = BeautifulSoup(r.text, 'lxml')

In [5]:
# Every filter and its choices live inside the page's <form> element.
# Each <option> tag is one selectable choice of some filter; list them all
# to find the values we may use.
soup.find('form').find_all('option')

[<option value="null"></option>,
 <option value="942293">Architecture</option>,
 <option value="246696">Chimie et génie chimique</option>,
 <option value="943282">Cours de mathématiques spéciales</option>,
 <option value="637841336">EME (EPFL Middle East)</option>,
 <option value="942623">Génie civil</option>,
 <option value="944263">Génie mécanique</option>,
 <option value="943936">Génie électrique et électronique </option>,
 <option value="2054839157">Humanités digitales</option>,
 <option selected="" value="249847">Informatique</option>,
 <option value="120623110">Ingénierie financière</option>,
 <option value="946882">Management de la technologie</option>,
 <option value="944590">Mathématiques</option>,
 <option value="945244">Microtechnique</option>,
 <option value="945571">Physique</option>,
 <option value="944917">Science et génie des matériaux</option>,
 <option value="942953">Sciences et ingénierie de l'environnement</option>,
 <option value="945901">Sciences et technologies d

In [6]:
# Each <input> tag of the form stands for one filter; list them all to learn
# the filter names used in the query string.
soup.find('form').find_all('input')

[<input name="ww_b_list" type="hidden" value="1"/>,
 <input name="ww_i_reportmodel" type="hidden" value="133685247"/>,
 <input name="ww_c_langue" type="hidden" value=""/>,
 <input checked="" name="ww_i_reportModelXsl" type="radio" value="133685270"/>,
 <input name="ww_i_reportModelXsl" type="radio" value="133685271"/>,
 <input name="zz_x_UNITE_ACAD" type="hidden" value=""/>,
 <input name="zz_x_PERIODE_ACAD" type="hidden" value=""/>,
 <input name="zz_x_PERIODE_PEDAGO" type="hidden" value=""/>,
 <input name="zz_x_HIVERETE" type="hidden" value=""/>,
 <input name="dummy" type="submit" value="ok"/>]

In [7]:
# The previous cells return BeautifulSoup objects. To make the options easier
# to manipulate, collect each option's label and value into a DataFrame.
option_tags = soup.form.find_all('option')

# Visible label of each option (None when the tag has no single text child)
option_string = [option.string for option in option_tags]
# Identifier sent in the query URL for each option
option_value = [option['value'] for option in option_tags]

option_df = pd.DataFrame({'string': option_string, 'value': option_value})
option_df.head()

Unnamed: 0,string,value
0,,
1,Architecture,942293.0
2,Chimie et génie chimique,246696.0
3,Cours de mathématiques spéciales,943282.0
4,EME (EPFL Middle East),637841336.0


Now we can try to retrieve the first bulk of data with the sample query URL defined before. 

In [8]:
# Fetch a batch of student data from the sample query URL
rs = requests.get(sampleQueryUrl, timeout=30)
# Fail fast on an HTTP error status (4xx/5xx) instead of silently parsing an
# error page and hitting a confusing failure in a later cell
rs.raise_for_status()

# Parse the returned HTML into a BeautifulSoup object
soups = BeautifulSoup(rs.text, 'lxml')

In [9]:
# All the student data sit in the page's <table> element; convert them to a
# DataFrame the same way as the options.

# Column names: the text of every <th> cell, in document order
data_header = [header_cell.string for header_cell in soups.table.find_all('th')]

# Student data: one list per <tr>, holding the text of its <td> cells
# (rows without any <td> yield an empty list)
data_all = [
    [cell.string for cell in row.find_all('td')]
    for row in soups.table.find_all('tr')
]

# Assemble the DataFrame and preview it
data_df = pd.DataFrame(data_all, columns=data_header)
data_df.head()

Unnamed: 0,None,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,Monsieur,Arévalo Christian,,,,,,Présent,,,169569.0,
3,Monsieur,Aubelle Flavien,,,,,,Présent,,,174905.0,
4,Monsieur,Badoud Morgan,,,,,,Présent,,,173922.0,


In [10]:
# Rearrange the column names so they line up with their values.
# The first scraped header is None, which shifts every real column name one
# position to the right of its values; moving it to the end fixes that.
# The guard makes the cell idempotent: once the None header has been moved,
# re-running the cell no longer rotates (and mislabels) the columns again.
if data_header and data_header[0] is None:
    data_header.append(data_header.pop(0))

data_df.columns = data_header
data_df.head()

Unnamed: 0,Civilité,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Filière opt.,Mineur,Statut,Type Echange,Ecole Echange,No Sciper,None
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,Monsieur,Arévalo Christian,,,,,,Présent,,,169569.0,
3,Monsieur,Aubelle Flavien,,,,,,Présent,,,174905.0,
4,Monsieur,Badoud Morgan,,,,,,Présent,,,173922.0,


In [11]:
# Remove useless columns and rows.
# Positions (in the rearranged header) of the columns worth keeping:
# Civilité, Nom Prénom, Spécialisation, Mineur, No Sciper
useful_columns = [0, 1, 4, 6, 10]
cleaned_data_header = [data_header[position] for position in useful_columns]

# Keep the selected columns, drop the rows without a 'Civilité' value (the
# empty rows at the top of the table), and renumber the rows from 0.
# The re-indexed frame is stored, so the variable matches what is displayed.
cleaned_data_df = (
    data_df[cleaned_data_header]
    .dropna(subset=['Civilité'])
    .reset_index(drop=True)
)
cleaned_data_df.head()

Unnamed: 0,Civilité,Nom Prénom,Spécialisation,Mineur,No Sciper
0,Monsieur,Arévalo Christian,,,169569
1,Monsieur,Aubelle Flavien,,,174905
2,Monsieur,Badoud Morgan,,,173922
3,Monsieur,Baeriswyl Jonathan,,,179406
4,Monsieur,Barroco Michael,,,179428


At this point we understand how to get clean data from a query URL, and we have a processing pattern to follow. Now we can start retrieving data in bulk and analyzing it according to the assignments.