Import libraries and define main constants

In [84]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

TOP_UNIVERSITIES_BASE_URL = "https://www.topuniversities.com/"
TIMES_BASE_URL = "https://www.timeshighereducation.com/"
# Max rank to load for each ranking
MAX_RANK=10 # 200

We define a few functions used to read the data
- `read_rank` allows to convert a string of the form `"=40"` to `40`, `"10-12"` to `10` (TODO: give example with em dash)
- `read_us_formatted_integer` allows to convert string representation with a comma used used as the thousands separtor.

In [85]:
def read_rank(raw_rank):
    """
    :param raw_rank str: Rank string, as found by scrapping
    :return int: Integer representing the rank of the university
    """
    if '-' in raw_rank:
        return int(raw_rank[:raw_rank.index('-')])
    if '–' in raw_rank:
        return int(raw_rank[:raw_rank.index('–')])
    return int(raw_rank.replace('=', ''))

In [86]:
def read_us_formatted_integer(raw_integer):
    """
    Reads a string representing an integer properly formatted using US convention.
    
    The US format uses commas as the thousands separator.
    
    Examples:
    >>> read_us_formatted_integer("0")
    0
    >>> read_us_formatted_integer("1,000")
    1000
    >>> read_us_formatted_integer("999,999,999")
    999999999
    
    :param raw_integer str: US Formatted string for an integer
    :return int: Integer corresponding to the provided value.
    """
    return int(raw_integer.replace(",", ""))

In [87]:
def read_percentage(raw_percentage):
    """
    :param raw_percentage str: Percentage string
    :return float: Ratio corresponding to the percentage (between 0 and 1)
    """ 
    return float(raw_percentage.replace("%", "")) / 100

In [88]:
def make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international, **extra_keys):
    return {
        "name": name,
        "rank": rank,
        "country": country,
        "region": region,
        "faculty_total": faculty_total,
        "faculty_international": faculty_international,
        "students_total": students_total,
        "students_international": students_international,
        **extra_keys,
    }

In [89]:
def get_top(rawData, limit, top_uni):
    # TODO: Replace this function if we need to be flexible
    # TODO: Define "flexible"
    # TODO: Convert this function to a generator and let the consumer decide if he wants to allocate an array
    top = []
    for r in rawData:
#         print r
        tempRank = read_rank(r["rank_display"] if top_uni else r["rank"])
        if tempRank > limit:
            break
        top.append(r)
    return top

Fetch data from Top Universities

In [90]:
ranking_response = requests.get("https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt")
raw_ranking_data = ranking_response.json()['data']

def read_top_universities_entry(raw_entry, html_details):
    name = raw_entry["title"]
    rank = read_rank(raw_entry["rank_display"])
    country = raw_entry["country"]
    region = raw_entry["region"]
    faculty_total_str = html_details.find("div", {"class": "total faculty"}).find("div", {"class": "number"}).text.strip()
    faculty_total = read_us_formatted_integer(faculty_total_str)
    faculty_international = None
    students_total_str  = html_details.find("div", {"class": "total student"}).find("div", {"class": "number"}).text.strip()
    students_total = read_us_formatted_integer(students_total_str)
    students_international_str  = html_details.find("div", {"class": "total inter"}).find("div", {"class": "number"}).text.strip()
    students_international = read_us_formatted_integer(students_international_str)
    
    return make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international)


def get_top_universities_data(raw_ranking_data, max_rank):
    """
    Return the list of parsed enries (with details) of the top universities ranking.
    It may perform network requests.
    """
    for raw_entry in raw_ranking_data:
        entry = {}
        
        rank = read_rank(raw_entry["rank_display"])
        if (rank > max_rank):
            # Assume sorted raw ranks to break (instead of `continue`-ing)
            break
        
        details_reponse = requests.get(TOP_UNIVERSITIES_BASE_URL + raw_entry["url"])
        raw_details_data = BeautifulSoup(details_reponse.text, 'html.parser')

        yield read_top_universities_entry(raw_entry, raw_details_data)


def get_top_universities_df(raw_ranking_data, max_rank):
    return pd.DataFrame([*get_top_universities_data(raw_ranking_data, max_rank)])

top_universities_df = get_top_universities_df(raw_ranking_data, MAX_RANK)
top_universities_df

Unnamed: 0,country,faculty_international,faculty_total,name,rank,region,students_international,students_total
0,United States,,2982,Massachusetts Institute of Technology (MIT),1,North America,3717,11067
1,United States,,4285,Stanford University,2,North America,3611,15878
2,United States,,4350,Harvard University,3,North America,5266,22429
3,United States,,953,California Institute of Technology (Caltech),4,North America,647,2255
4,United Kingdom,,5490,University of Cambridge,5,Europe,6699,18770
5,United Kingdom,,6750,University of Oxford,6,Europe,7353,19720
6,United Kingdom,,6345,UCL (University College London),7,Europe,14854,31080
7,United Kingdom,,3930,Imperial College London,8,Europe,8746,16090
8,United States,,2449,University of Chicago,9,North America,3379,13557
9,Switzerland,,2477,ETH Zurich - Swiss Federal Institute of Techno...,10,Europe,7563,19815


In [92]:
times_ranking_response = requests.get("https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json")
times_raw_ranking_data = times_ranking_response.json()['data']

def read_times_entry(raw_entry):
    from math import floor
    
    name = raw_entry["name"]
    rank = read_rank(raw_entry["rank"])
    # TODO: Assert this is always a country
    country = raw_entry["location"]
    # No region found
    region = None
    students_total = read_us_formatted_integer(raw_entry["stats_number_students"])
    international_students_ratio = read_percentage(raw_entry["stats_pc_intl_students"])
    students_international = floor(students_total * international_students_ratio)
    faculty_to_students_ratio = read_percentage(raw_entry["stats_student_staff_ratio"])
    faculty_total = floor(students_total * faculty_to_students_ratio)
    faculty_international = None
    
    
    return make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international)
    
#   uniProp['internStudentRatio']=t['stats_pc_intl_students']
#   uniProp['internStudent']=math.floor(float(t['stats_pc_intl_students'].replace("%",""))/100*int(t['stats_number_students'].replace(",","")))#Number looks like this 20,409
#   uniProp['totalFacultyRatio']=t['stats_student_staff_ratio']#TODO define which one we want to use
#   uniProp['totalFaculty']=math.floor(int(t['stats_number_students'].replace(",",""))/float(t['stats_student_staff_ratio']))
#   topBAsObjects.append(uniProp)
    
    result["intern_student"] = None
    
    # TODO define which one we want to use
    result["total_faculty_ratio"] = read_percentage(raw_entry["stats_student_staff_ratio"])
    
    result["total_faculty"] = None
    
    return result

def get_times_data(raw_ranking_data, max_rank):
    """
    Return the list of parsed enries (with details) of the top universities ranking.
    It may perform network requests.
    """
    for raw_entry in raw_ranking_data:
        entry = {}
        
        rank = read_rank(raw_entry["rank"])
        if (rank > max_rank):
            # Assume sorted raw ranks to break (instead of `continue`-ing)
            break

        yield read_times_entry(raw_entry)


def get_times_df(raw_ranking_data, max_rank):
    return pd.DataFrame([*get_times_data(raw_ranking_data, max_rank)])


times_df = get_times_df(times_raw_ranking_data, MAX_RANK)
times_df

Unnamed: 0,country,faculty_international,faculty_total,name,rank,region,students_international,students_total
0,United Kingdom,,2285,University of Oxford,1,,7755,20409
1,United Kingdom,,2004,University of Cambridge,2,,6436,18389
2,United States,,143,California Institute of Technology,3,,596,2209
3,United States,,1188,Stanford University,3,,3485,15845
4,United States,,972,Massachusetts Institute of Technology,5,,3800,11177
5,United States,,1809,Harvard University,6,,5284,20326
6,United States,,660,Princeton University,7,,1909,7955
7,United Kingdom,,1807,Imperial College London,8,,8721,15857
8,United States,,838,University of Chicago,9,,3381,13525
9,Switzerland,,2808,ETH Zurich – Swiss Federal Institute of Techno...,10,,7308,19233
