# 02 - Data from the Web

In [1]:
# Import libraries
import threading

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

You will need requests-futures which you can get by   
$ ``pip install requests-futures``

### Task 1.0 Retrieve the Data
Obtain the 200 top-ranking universities : name, rank, country and region, number of faculty members (international and total) and number of students (international and total)

In [2]:
# Download the ranking data as JSON.
request_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1507975779496'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(request_url, timeout=30)
# Fail here, with a clear HTTP error, rather than later when the body is decoded as JSON.
r.raise_for_status()

In [3]:
univ_vals = r.json()['data']
univ_vals[0]

{'cc': 'US',
 'core_id': '410',
 'country': 'United States',
 'guide': '<a href="/where-to-study/north-america/united-states/guide" class="guide-link" target="_blank">United States</a>',
 'logo': '<img src="https://www.topuniversities.com/sites/default/files/massachusetts-institute-of-technology-mit_410_small_0.jpg" alt="Massachusetts Institute of Technology (MIT)  Logo">',
 'nid': '294850',
 'rank_display': '1',
 'region': 'North America',
 'score': '100',
 'stars': '6',
 'title': 'Massachusetts Institute of Technology (MIT)',
 'url': '/universities/massachusetts-institute-technology-mit'}

In [4]:
univ_df = pd.DataFrame(univ_vals)
univ_df.head()

Unnamed: 0,cc,core_id,country,guide,logo,nid,rank_display,region,score,stars,title,url
0,US,410,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294850,1,North America,100.0,6,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...
1,US,573,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",297282,2,North America,98.7,5,Stanford University,/universities/stanford-university
2,US,253,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294270,3,North America,98.4,5,Harvard University,/universities/harvard-university
3,US,94,United States,"<a href=""/where-to-study/north-america/united-...","<img src=""https://www.topuniversities.com/site...",294562,4,North America,97.7,5,California Institute of Technology (Caltech),/universities/california-institute-technology-...
4,GB,95,United Kingdom,"<a href=""/where-to-study/europe/united-kingdom...","<img src=""https://www.topuniversities.com/site...",294561,5,Europe,95.6,5,University of Cambridge,/universities/university-cambridge


keep only : 
1. name, rank, country and region
2. number of faculty members (international and total) and number of students (international and total)

The second part must be retrieved through a separate request.

In [5]:
# Remove the columns that are not needed for the analysis in a single call.
# The `columns=` keyword replaces the positional `axis` argument (`drop(name, 1)`),
# which recent pandas releases no longer accept.
univ_dr_df = univ_df.drop(
    columns=['guide', 'logo', 'stars', 'url', 'score', 'cc', 'core_id', 'nid']
)
univ_dr_df.head()

Unnamed: 0,country,rank_display,region,title
0,United States,1,North America,Massachusetts Institute of Technology (MIT)
1,United States,2,North America,Stanford University
2,United States,3,North America,Harvard University
3,United States,4,North America,California Institute of Technology (Caltech)
4,United Kingdom,5,Europe,University of Cambridge


In [6]:
url_caltech = 'https://www.topuniversities.com/universities/california-institute-technology-caltech#wurs'
r_caltech = requests.get(url_caltech)
soup_caltech = BeautifulSoup(r_caltech.text, 'html.parser')
soup_caltech.prettify()[0:600]

'<!DOCTYPE html>\n<html dir="ltr" version="XHTML+RDFa 1.0" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:article="http://ogp.me/ns/article#" xmlns:book="http://ogp.me/ns/book#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:og="http://ogp.me/ns#" xmlns:product="http://ogp.me/ns/product#" xmlns:profile="http://ogp.me/ns/profile#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:schema="http://schema.org/" xmlns:sioc="http://rdfs.org/sioc/ns#" xmlns:sioct="http://rdfs.org/sioc/types#" xm'

From the soup, with a quick ctrl+f on the number displayed on the corresponding page, I was able to identify the elements which contained the value I was interested in:
1. Number of international students ``<div class="int-students-main wrapper col-md-4"> \ <div class="number">``
    
2. Number of students ``<div class="students-main wrapper col-md-4"> \ <div class="number">``

3. Number of faculty staff in total ``<div class="faculty-main wrapper col-md-4"> \ <div class="number">``

4. Number of faculty staff international ``<div class="inter faculty"> \ <div class="number">``

In [7]:
caltech_inter_st = soup_caltech.find('div', class_='int-students-main wrapper col-md-4').find('div', class_='number').text
caltech_inter_st[1:]

'647 '

In [8]:
caltech_st = soup_caltech.find('div', class_ = 'students-main wrapper col-md-4').find('div', class_='number').text
caltech_st[1:]

'2,255 '

Now we can test with MIT to see whether the parsing works the same (which is probably the case). If it works, then we will iterate over the links in the dataframe to parse the values of interest for each university

In [9]:
url_mit = 'https://www.topuniversities.com/universities/massachusetts-institute-technology-mit#wurs'
r_mit = requests.get(url_mit)
soup_mit = BeautifulSoup(r_mit.text, 'html.parser')
soup_mit.find('div', class_='int-students-main wrapper col-md-4').find('div', class_='number').text[1:]


'3,717 '

In [83]:
def retrieve(soup, block_p, class_p, block_c, class_c):
    """Read the integer stored in a child element nested inside a parent element.

    Looks up the first <block_p class="class_p"> element in `soup`, then the first
    <block_c class="class_c"> element inside it, and parses that element's text as
    an integer (surrounding whitespace and thousands separators are ignored).

    Parameters:
        soup: parsed document (or element) exposing a `find(name, class)` method.
        block_p, class_p: tag name and class of the parent element.
        block_c, class_c: tag name and class of the child element.

    Returns:
        The parsed int, or the string 'failed' when either element is missing or
        the child's text is not a number.
    """
    parent = soup.find(block_p, class_p)
    if parent is None:
        return 'failed'

    child = parent.find(block_c, class_c)
    if child is None:
        return 'failed'

    # strip() instead of text[1:]: the slice silently ate the first digit whenever
    # the text did not start with a whitespace character.
    digits = child.text.strip().replace(',', '')
    try:
        return int(digits)
    except ValueError:
        # Empty or non-numeric content is reported like a missing element.
        return 'failed'
    

In [11]:
retrieve(soup_mit, 'div', 'int-students-main wrapper col-md-4', 'div', 'number')

3717

In [12]:
retrieve(soup_mit, 'div', 'int-st-main wrapper col-md-4', 'div', 'number')

nan

In [13]:
retrieve(soup_mit, 'div', 'int-students-main wrapper col-md-4', 'div', 'nber')

nan

In [14]:
def retrieve_div_number(soup, class_p):
    """Shortcut for `retrieve` when both elements are <div>s and the child has class 'number'.

    `class_p` is the class string of the enclosing <div>; the return value is
    whatever `retrieve` returns for that lookup.
    """
    tag = 'div'
    value_class = 'number'
    return retrieve(soup, tag, class_p, tag, value_class)

In [15]:
retrieve_div_number(soup_mit, 'int-students-main wrapper col-md-4')

3717

In [16]:
# Build the absolute profile-page URL of every university from its relative 'url' field.
u_urls = np.array(univ_df[['url']].values)
main_url = 'https://www.topuniversities.com'
# Each row of u_urls is a one-element array holding the relative path.
univ_urls = [main_url + row[0] + '#wurs' for row in u_urls]
univ_urls[:5]

['https://www.topuniversities.com/universities/massachusetts-institute-technology-mit#wurs',
 'https://www.topuniversities.com/universities/stanford-university#wurs',
 'https://www.topuniversities.com/universities/harvard-university#wurs',
 'https://www.topuniversities.com/universities/california-institute-technology-caltech#wurs',
 'https://www.topuniversities.com/universities/university-cambridge#wurs']

In [17]:
len(univ_urls)

959

Since doing the 959 requests in one go didn't work, I will manually split the requests in chunks as big as possible to minimize the merge step

In [19]:
def get_missing_data(urls):
    """Download each university page in `urls` sequentially and scrape its four counts.

    Parameters:
        urls: iterable of absolute profile-page URLs.

    Returns:
        A 4-tuple of lists, each in the same order as `urls`:
        (international students, total students, international faculty, total faculty).
        Each entry is whatever `retrieve_div_number` returned for that page.
    """
    international_students = []
    students = []
    international_faculty = []
    faculty = []

    for url in urls:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        international_students.append(
            retrieve_div_number(soup, 'int-students-main wrapper col-md-4'))
        students.append(
            retrieve_div_number(soup, 'students-main wrapper col-md-4'))
        # 'inter faculty' holds the international staff and 'faculty-main ...' the total;
        # the two selectors were previously assigned to the opposite lists.
        international_faculty.append(
            retrieve_div_number(soup, 'inter faculty'))
        faculty.append(
            retrieve_div_number(soup, 'faculty-main wrapper col-md-4'))

    return international_students, students, international_faculty, faculty

In [20]:
inter_s, s, inter_f, f = get_missing_data(univ_urls[:5])

In [21]:
inter_s

[3717, 3611, 5266, 647, 6699]

Since this is very slow, we will use the requests-futures package, which makes non-blocking requests. While waiting, I will process only a small part of the data. When the requests complete, the results are stored in module-level variables. This is the reason for the small initialisation block before the function.

In [137]:
# Shared buffers filled by the request callback: one list per scraped metric,
# the soups that could not be fully scraped, and the number of handled responses.
international_st, students, international_fac, fac, bad_soups = [], [], [], [], []
count = 0

In [209]:
def reset_arrays():
    """Rebind every shared result buffer to a fresh empty list and zero the counter.

    Affects the module-level names international_st, students, international_fac,
    fac, bad_soups and count. Always returns 0.
    """
    global international_st, students, international_fac, fac, bad_soups, count

    # The generator yields a distinct list object for each name.
    international_st, students, international_fac, fac, bad_soups = (
        [] for _ in range(5)
    )
    count = 0
    return 0

In [238]:
# Done-callbacks may run concurrently on the session's worker threads; every update of
# the shared buffers goes through this lock so the four lists stay row-aligned.
_callback_lock = threading.Lock()


def callback(future):
    """Parse one finished request and append its four counts to the shared result lists.

    Parameters:
        future: completed future whose result is the HTTP response of a profile page.

    Side effects (module-level names):
        - increments `count`; when it exceeds `num_request` the four result lists
          and `count` are reset before the new values are stored;
        - appends one value to each of international_st, students,
          international_fac and fac;
        - appends the page's soup to `bad_soups` when any of the four values is 'failed'.

    NOTE(review): values are appended in the order the callbacks run, which presumably
    follows request completion rather than submission order — confirm before aligning
    these lists with the ranking dataframe by position.
    """
    global international_st
    global students
    global international_fac
    global fac
    global num_request
    global count
    global bad_soups

    r = future.result()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Parsing touches no shared state, so it stays outside the lock.
    inter_st = retrieve_div_number(soup, 'int-students-main wrapper col-md-4')
    st = retrieve_div_number(soup, 'students-main wrapper col-md-4')
    inter_fac = retrieve_div_number(soup, 'inter faculty')
    f = retrieve_div_number(soup, 'faculty-main wrapper col-md-4')
    if f == 'failed':
        # Some pages expose the total staff under a different class.
        f = retrieve_div_number(soup, 'total faculty')

    with _callback_lock:
        count = count + 1
        if count > num_request:
            # More responses than requested: leftovers from a previous batch, start over.
            # NOTE(review): bad_soups is not cleared here, unlike in reset_arrays — confirm intent.
            international_st = []
            students = []
            international_fac = []
            fac = []
            count = 0

        if inter_st == 'failed' or st == 'failed' or inter_fac == 'failed' or f == 'failed':
            bad_soups.append(soup)
        international_st.append(inter_st)
        students.append(st)
        international_fac.append(inter_fac)
        fac.append(f)

Now we can do our 959 requests in the background. For this to work, the cell that initialises the arrays and the cell that defines the callback function must be re-run first.

The following block will play a small sound when the requests are done. For it to work you need to install python-vlc   
$ pip install python-vlc

In [24]:
import vlc
def play_sound(future):
    """Done-callback that plays the notification sound; `future` is ignored. Returns 0."""
    sound_dir, sound_file = 'fancy_sounds/', 'zelda_small_item.wav'
    player = vlc.MediaPlayer(sound_dir + sound_file)
    player.play()
    return 0

In [148]:
from requests_futures.sessions import FuturesSession
session = FuturesSession()

In [163]:
import threading
from IPython.display import display
import ipywidgets as widgets
import time
import sys
progress = widgets.FloatProgress(value=0.0, min=0.0, max=1.0)

def work(progress):
    """Keep `progress` in sync with the share of requests whose result is stored in `fac`.

    Polls the module-level `fac` list and `num_request` counter for as long as
    len(fac) <= num_request and writes len(fac) / num_request into `progress.value`.
    """
    global fac
    while(len(fac) <= num_request):
        # Skip the update while no request is scheduled to avoid dividing by zero.
        if num_request > 0:
            progress.value = len(fac) / num_request
        # Poll a few times per second instead of spinning a CPU core at 100%.
        time.sleep(0.1)

# Daemon thread: the polling loop must not keep the interpreter alive on shutdown.
thread = threading.Thread(target=work, args=(progress,), daemon=True)
display(progress)
thread.start()

In [292]:
# Launch one background request per URL and register the callbacks that collect the results.
url_slice = univ_urls[:]
num_request = len(url_slice)
url_last_idx = len(url_slice) - 1
reset_arrays()
# Handles of the pending requests, used by the cancellation cell below. The list must be
# created here: nothing else defines `futures` on a fresh kernel.
futures = []
last_future = 0
for i_url, url in enumerate(url_slice):
    # Send the GET request without blocking.
    future = session.get(url)
    # The last request is deliberately left out of `futures`, so cancelling the
    # tracked requests still lets it complete and trigger the sound.
    if i_url != url_last_idx:
        futures.append(future)
    # Register the callback to run when the response arrives.
    future.add_done_callback(callback)
    if i_url == (url_last_idx):
        future.add_done_callback(play_sound)

missing data (NaN fields):
    1. [400:500] Universidad Externado de Colombia missing international students (can put 0)
    2. [500:600] Iran University of Science and Technology missing international fac/students (can put 0)
    3. [600:700]
        1. Jadavpur University:             missing international fac (can put 0)
        2. Universidad Iberoamericana IBERO missing international fac/students (can put 0)
        3. University of Haifa              "--------------------------------------------"
        4. University of Hyderabad          missing international fac (can put 0)
        5. Anna University                  missing international fac/students (can put 0)
        6. Universidad del Pais Vasco       missing international fac (can put 0)
        7. University of Minho              missing international fac/students (can put 0)
        8. University of Minho              "------------------------------------------"
    4. [700:800]
        1. Comenius University in Bratislava   missing international fac (can put 0)
        2. Memorial University of Newfoundland "------------------------------------"
        3. Rhodes University                   "------------------------------------"
        4. Paris Lodron University of Salzburg "------------------------------------"
        5. University of Colombo               "------------------------------------"

it seems the NaN values can be replaced by 0, I won't check the last ones since there are 15 in the next category and 13 in the last respectively [800:900] and [900:]

In [285]:
len(bad_soups)

13

### <b> Warning: Running the following cell will cancel all pending requests, corrupting the data in the process </b>

In [286]:
# Attempts to cancel the requests that are still waiting; if the attempt succeeds, the
# progress bar stops moving. The goal is to cancel every request tracked in `futures`
# (all but the last one) so that only one request completes and the sound plays right away.
# The `if False:` guard keeps this cell inert; change it to True to actually cancel.
if False:
    for fu in futures:
        # Cancel the request behind this future.
        fu.cancel()
    # Empty the result buffers and forget the cancelled handles.
    reset_arrays()
    futures = []

#Last try it seemed to work. We reset arrays twice since the canceling of
#asynchronous requests is asynchronous, the reset might happen before all requests are canceled
#and some requests might finish in the meantime (after reset)
;

''

#### <b> Next block will close background requests, uncomment and use carefully </b>

In [118]:
# bad_soups holds every soup from which at least one value could not be scraped.
# NOTE(review): the original note expects this to evaluate to 0 once all requests have finished — confirm.
len(bad_soups)

60

If you launch new requests before the previous ones have finished, the data is very likely to be corrupted. In that case, wait for the running requests to finish, then re-run the loop above.

You can see the requests progress here. If you want to do something else, a sound will be played as the requests complete

Now that you finished your coffee and were alerted by the fancy sounds that you could pursue the compilation, we can finally use our well-deserved data

In [290]:
# Assemble the scraped counts into one frame, one column per metric.
missing_df = pd.DataFrame(data=[international_st, students, international_fac, fac],
                          index=['inter_st', 'students', 'inter_fac', 'fac']).transpose()
# Values that could not be scraped are stored as the string 'failed'; turn them into NaN
# (and the columns into numeric dtypes) so that fillna/dropna and the ratios below work.
missing_df = missing_df.apply(pd.to_numeric, errors='coerce')
missing_df.head()

Unnamed: 0,inter_st,students,inter_fac,fac


In [291]:
# Put the ranking columns and the scraped counts side by side, then index the result by rank.
# NOTE(review): pd.concat(axis=1) pairs rows by index label, so this assumes row i of
# missing_df describes the university in row i of univ_dr_df. The scraped lists are filled
# by the request callback as responses arrive — confirm that order matches the URL order.
univ_tot_df = pd.concat([univ_dr_df, missing_df], axis = 1)
univ_rk_df = univ_tot_df.set_index(['rank_display'])
univ_rk_df.head()

Unnamed: 0_level_0,country,region,title,inter_st,students,inter_fac,fac
rank_display,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,United States,North America,Massachusetts Institute of Technology (MIT),,,,
2,United States,North America,Stanford University,,,,
3,United States,North America,Harvard University,,,,
4,United States,North America,California Institute of Technology (Caltech),,,,
5,United Kingdom,Europe,University of Cambridge,,,,


### Task 1.1 
Which are the best universities in term of: (a) ratio between faculty members and students, (b) ratio of international students?

(a) ratio between faculty members and students

#### Warning
The dataframe contains NaN wherever the request failed to retrieve a value. The choice was made to drop the universities that contain NaN. To be discussed.

In [289]:
# Faculty-to-student ratio for every university, highest ratio first.
univ_ratio_f = univ_tot_df.copy()
# The scraped counts may hold NaN or the 'failed' marker; coerce them to numbers first.
fac_total = pd.to_numeric(univ_ratio_f['fac'], errors='coerce').fillna(0)
students_total = pd.to_numeric(univ_ratio_f['students'], errors='coerce')
univ_ratio_f = univ_ratio_f.fillna(0)
# A missing or zero student count gives NaN instead of inf (or a ZeroDivisionError),
# and sort_values places NaN last, so such rows cannot top the ranking.
univ_ratio_f['ratio'] = fac_total / students_total.where(students_total > 0)
univ_ratiof_sorted = univ_ratio_f.sort_values(['ratio'], ascending=False)
univ_ratiof_sorted.head(10)

Unnamed: 0,country,rank_display,region,title,inter_st,students,inter_fac,fac,ratio
3,United States,4,North America,California Institute of Technology (Caltech),647.0,2255.0,953.0,350.0,0.155211
0,United States,1,North America,Massachusetts Institute of Technology (MIT),3717.0,11067.0,2982.0,1679.0,0.151712
5,United Kingdom,6,Europe,University of Oxford,7353.0,19720.0,6750.0,2964.0,0.150304
15,United States,16,North America,Yale University,2469.0,12402.0,4940.0,1708.0,0.137720
7,United Kingdom,8,Europe,Imperial College London,8746.0,16090.0,3930.0,2071.0,0.128713
1,United States,2,North America,Stanford University,3611.0,15878.0,4285.0,2042.0,0.128606
11,Switzerland,12,Europe,Ecole Polytechnique Fédérale de Lausanne (EPFL),5896.0,10343.0,1695.0,1300.0,0.125689
4,United Kingdom,5,Europe,University of Cambridge,6699.0,18770.0,5490.0,2278.0,0.121364
10,Singapore,11,Asia,"Nanyang Technological University, Singapore (NTU)",7251.0,25738.0,4338.0,2993.0,0.116287
452,United Arab Emirates,451-460,Asia,Khalifa University,207.0,1290.0,158.0,149.0,0.115504


There seems to be a strong correlation between the ranking and the number of faculty members per student.

(b) ratio of international students

In [64]:
# Share of international students for every university with complete data, highest first.
univ_ratio_i = univ_tot_df.copy()
count_cols = ['inter_st', 'students', 'inter_fac', 'fac']
# Turn the 'failed' markers into NaN so that dropna() really removes the incomplete rows.
univ_ratio_i[count_cols] = univ_ratio_i[count_cols].apply(pd.to_numeric, errors='coerce')
univ_ratio_i = univ_ratio_i.dropna()
univ_ratio_i['ratio'] = univ_ratio_i['inter_st'] / univ_ratio_i['students']
# Descending order: the question asks for the best (largest) ratios.
univ_ratioi_sorted = univ_ratio_i.sort_values(['ratio'], ascending=False)
univ_ratioi_sorted.head()

Unnamed: 0,country,rank_display,region,title,inter_st,students,inter_fac,fac,ratio
602,United Arab Emirates,601-650,Asia,American University in Dubai,1861.0,2151.0,147.0,147.0,0.865179
411,Bahrain,411-420,Asia,Arabian Gulf University,4590.0,5493.0,387.0,385.0,0.835609
34,United Kingdom,35,Europe,London School of Economics and Political Scien...,6748.0,9760.0,1088.0,687.0,0.691393
410,United Arab Emirates,411-420,Asia,American University of Sharjah,1016.0,1475.0,187.0,88.0,0.688814
754,United Arab Emirates,751-800,Asia,Abu Dhabi University,2389.0,3510.0,175.0,173.0,0.680627


Even when changing the ascending order to 1, there seems to be no correlation between the ranking and the ratio of international students.

In [237]:
univ_country = univ_ratiof_sorted.set_index(['country'])
univ_country.head()

Unnamed: 0_level_0,rank_display,region,title,inter_st,students,inter_fac,fac,ratio
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
United States,4,North America,California Institute of Technology (Caltech),647.0,2255.0,953.0,350.0,0.155211
United States,1,North America,Massachusetts Institute of Technology (MIT),3717.0,11067.0,2982.0,1679.0,0.151712
United Kingdom,6,Europe,University of Oxford,7353.0,19720.0,6750.0,2964.0,0.150304
United States,16,North America,Yale University,2469.0,12402.0,4940.0,1708.0,0.13772
United Kingdom,8,Europe,Imperial College London,8746.0,16090.0,3930.0,2071.0,0.128713
