Import libraries and define main constants

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

TOP_UNIVERSITIES_BASE_URL = "https://www.topuniversities.com"
TIMES_BASE_URL = "https://www.timeshighereducation.com/"
# Max rank to load for each ranking
MAX_RANK=200 # 200

## Cached requests

The following method adds a cache for simple HTTP GET requests and ensures that at most one request will be sent per unique URL.

**Re-evalute the cell with the function to flush the cache.**

In [2]:
CACHE = {}
def cached_get_request(url, cache=CACHE):
    """
    Perform a cached HTTP GET request for `url`.
    
    The first time you send a request for an `url`, it will effectivelly
    send an HTTP request. The response will be cached in `cache` and each
    subsequent call for a cached URL will use a cache lookup using an exact
    match on the url instead of sending a new HTTP request.
    
    This function will use a shared cache by default, you can provide your
    own dictionary if you want to override this behavior.
    Reevalute this function to flush the shared cache.
    
    :param url str: URL to GET
    :cache: A dictionary from urls to HTTP responses. Mutated if it did not contain the key `url`.
    :return: _requests_' response for the provided URL.
    """
    
    if url not in cache:
        print("send request : " + url)
        cache[url] = requests.get(url)
    return cache[url]

We define a few functions used to read the data
- `read_rank` allows to convert a string of the form `"=40"` to `40`, `"10-12"` to `10` (TODO: give example with em dash)
- `read_us_formatted_integer` allows to convert string representation with a comma used used as the thousands separtor.

In [3]:
def read_rank(raw_rank):
    """
    :param raw_rank str: Rank string, as found by scrapping
    :return int: Integer representing the rank of the university
    """
    if '-' in raw_rank:
        return int(raw_rank[:raw_rank.index('-')])
    if '–' in raw_rank:
        return int(raw_rank[:raw_rank.index('–')])
    return int(raw_rank.replace('=', ''))

In [4]:
def read_us_formatted_integer(raw_integer):
    """
    Reads a string representing an integer properly formatted using US convention.
    
    The US format uses commas as the thousands separator.
    
    Examples:
    ```
    >>> read_us_formatted_integer("0")
    0
    >>> read_us_formatted_integer("1,000")
    1000
    >>> read_us_formatted_integer("999,999,999")
    999999999
    
    ```
    :param raw_integer str: US Formatted string for an integer
    :return int: Integer corresponding to the provided value.
    """
    return int(raw_integer.replace(",", ""))

In [5]:
def read_percentage(raw_percentage):
    """
    :param raw_percentage str: Percentage string
    :return float: Ratio corresponding to the percentage (between 0 and 1)
    """ 
    return float(raw_percentage.replace("%", "")) / 100

TODO : Describe data

In [6]:
def make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international, **extra_keys):
    return {
        "name": name,
        "rank": rank,
        "country": country,
        "region": region,
        "faculty_total": faculty_total,
        "faculty_international": faculty_international,
        "students_total": students_total,
        "students_international": students_international,
        **extra_keys
    }

In [7]:
def get_top(rawData, limit, top_uni):
    # TODO: Replace this function if we need to be flexible
    # TODO: Define "flexible"
    # TODO: Convert this function to a generator and let the consumer decide if he wants to allocate an array
    top = []
    for r in rawData:
        #print r
        tempRank = read_rank(r["rank_display"] if top_uni else r["rank"])
        if tempRank > limit:
            break
        top.append(r)
    return top

Fetch data from Top Universities
TODO : try catch beautiful soup
Make function for scrapping

In [10]:
def find_value_html(custom_class, html_to_parse):
    try:
        value_inside = html_to_parse.find("div", {"class": custom_class}).find("div", {"class": "number"}).text.strip()
    except Exception as e:
        #print("Error parsing html : ")
        #print(html_to_parse)
        #print(e)
        return None
    return value_inside

In [11]:
ranking_response = cached_get_request("https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt")
raw_ranking_data = ranking_response.json()['data']

def read_top_universities_entry(raw_entry, html_details):
    name = raw_entry["title"]
    rank = read_rank(raw_entry["rank_display"])
    country = raw_entry["country"]
    region = raw_entry["region"]
    
    faculty_total_str = find_value_html("total faculty",html_details)
    faculty_total = read_us_formatted_integer(faculty_total_str) if faculty_total_str is not None else None
    
    faculty_international_str = find_value_html("inter faculty",html_details)
    faculty_international = read_us_formatted_integer(faculty_international_str) if faculty_international_str is not None else None
    
    students_total_str  = find_value_html("total student",html_details)
    students_total = read_us_formatted_integer(students_total_str) if students_total_str is not None else None
    
    students_international_str  = find_value_html("total inter",html_details)
    students_international = read_us_formatted_integer(students_international_str) if students_international_str is not None else None
    
    return make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international)


def get_top_universities_data(raw_ranking_data, max_rank):
    """
    Return the list of parsed enries (with details) of the top universities ranking.
    It may perform network requests.
    """
    for raw_entry in raw_ranking_data:
        entry = {}
        
        rank = read_rank(raw_entry["rank_display"])
        if (rank > max_rank):
            # Assume sorted raw ranks to break (instead of `continue`-ing)
            break
        
        details_reponse = cached_get_request(TOP_UNIVERSITIES_BASE_URL + raw_entry["url"])
        raw_details_data = BeautifulSoup(details_reponse.text, 'html.parser')

        yield read_top_universities_entry(raw_entry, raw_details_data)


def get_top_universities_df(raw_ranking_data, max_rank):
    return pd.DataFrame([*get_top_universities_data(raw_ranking_data, max_rank)])

top_universities_df = get_top_universities_df(raw_ranking_data, MAX_RANK)
top_universities_df[40:80]

send request : https://www.topuniversities.com/universities/stanford-university
send request : https://www.topuniversities.com/universities/harvard-university
send request : https://www.topuniversities.com/universities/california-institute-technology-caltech
send request : https://www.topuniversities.com/universities/university-cambridge
send request : https://www.topuniversities.com/universities/university-oxford
send request : https://www.topuniversities.com/universities/ucl-university-college-london
send request : https://www.topuniversities.com/universities/imperial-college-london
send request : https://www.topuniversities.com/universities/university-chicago
send request : https://www.topuniversities.com/universities/eth-zurich-swiss-federal-institute-technology
send request : https://www.topuniversities.com/universities/nanyang-technological-university-singapore-ntu
send request : https://www.topuniversities.com/universities/ecole-polytechnique-f%C3%A9d%C3%A9rale-de-lausanne-epfl


send request : https://www.topuniversities.com/universities/university-geneva
send request : https://www.topuniversities.com/universities/kth-royal-institute-technology
send request : https://www.topuniversities.com/universities/washington-university-st-louis
send request : https://www.topuniversities.com/universities/university-leeds
send request : https://www.topuniversities.com/universities/university-southampton
send request : https://www.topuniversities.com/universities/university-helsinki
send request : https://www.topuniversities.com/universities/eindhoven-university-technology
send request : https://www.topuniversities.com/universities/purdue-university
send request : https://www.topuniversities.com/universities/yonsei-university
send request : https://www.topuniversities.com/universities/kit-karlsruhe-institute-technology
send request : https://www.topuniversities.com/universities/sungkyunkwan-university-skku
send request : https://www.topuniversities.com/universities/utrecht-

send request : https://www.topuniversities.com/universities/keio-university-cems-mim
send request : https://www.topuniversities.com/universities/stockholm-university
send request : https://www.topuniversities.com/universities/universitat-aut%C3%B2noma-de-barcelona
send request : https://www.topuniversities.com/universities/texas-am-university
send request : https://www.topuniversities.com/universities/instituto-tecnol%C3%B3gico-y-de-estudios-superiores-de-monterrey
send request : https://www.topuniversities.com/universities/maastricht-university


Unnamed: 0,country,faculty_international,faculty_total,name,rank,region,students_international,students_total
40,South Korea,147.0,1250.0,KAIST - Korea Advanced Institute of Science & ...,41,Asia,584.0,9826.0
41,Australia,1477.0,3311.0,The University of Melbourne,41,Oceania,18030.0,42182.0
42,France,75.0,178.0,"Ecole normale supérieure, Paris",43,Europe,374.0,1907.0
43,United Kingdom,942.0,2870.0,University of Bristol,44,Europe,5099.0,20630.0
44,Australia,1612.0,2924.0,The University of New South Wales (UNSW Sydney),45,Oceania,14292.0,39784.0
45,Hong Kong,1074.0,2208.0,The Chinese University of Hong Kong (CUHK),46,Asia,4824.0,18037.0
46,Australia,1870.0,3158.0,The University of Queensland,47,Oceania,10420.0,37497.0
47,United States,425.0,1342.0,Carnegie Mellon University,47,North America,6385.0,13356.0
48,Hong Kong,1027.0,1349.0,City University of Hong Kong,49,Asia,3273.0,9240.0
49,Australia,1829.0,3360.0,The University of Sydney,50,Oceania,17030.0,46678.0


In [12]:
times_ranking_response = cached_get_request("https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json")
times_raw_ranking_data = times_ranking_response.json()['data']

def read_times_entry(raw_entry):
    from math import floor
    
    name = raw_entry["name"]
    rank = read_rank(raw_entry["rank"])
    # TODO: Assert this is always a country
    country = raw_entry["location"]
    # No region found
    region = None
    students_total = read_us_formatted_integer(raw_entry["stats_number_students"])
    
    international_students_ratio = read_percentage(raw_entry["stats_pc_intl_students"])
    students_international = floor(students_total * international_students_ratio)
    
    faculty_to_students_ratio = read_percentage(raw_entry["stats_student_staff_ratio"])
    faculty_total = floor(students_total * faculty_to_students_ratio)
    faculty_international = None
    
    
    return make_entry(name, rank, country, region, faculty_total, faculty_international, students_total, students_international)
    
#   uniProp['internStudentRatio']=t['stats_pc_intl_students']
#   uniProp['internStudent']=math.floor(float(t['stats_pc_intl_students'].replace("%",""))/100*int(t['stats_number_students'].replace(",","")))#Number looks like this 20,409
#   uniProp['totalFacultyRatio']=t['stats_student_staff_ratio']#TODO define which one we want to use
#   uniProp['totalFaculty']=math.floor(int(t['stats_number_students'].replace(",",""))/float(t['stats_student_staff_ratio']))
#   topBAsObjects.append(uniProp)
    
    result["intern_student"] = None
    
    # TODO define which one we want to use
    result["total_faculty_ratio"] = read_percentage(raw_entry["stats_student_staff_ratio"])
    
    result["total_faculty"] = None
    
    return result

def get_times_data(raw_ranking_data, max_rank):
    """
    Return the list of parsed enries (with details) of the top universities ranking.
    It may perform network requests.
    """
    for raw_entry in raw_ranking_data:
        entry = {}
        
        rank = read_rank(raw_entry["rank"])
        if (rank > max_rank):
            # Assume sorted raw ranks to break (instead of `continue`-ing)
            break

        yield read_times_entry(raw_entry)


def get_times_df(raw_ranking_data, max_rank):
    return pd.DataFrame([*get_times_data(raw_ranking_data, max_rank)])


times_df = get_times_df(times_raw_ranking_data, MAX_RANK)
times_df

send request : https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json


Unnamed: 0,country,faculty_international,faculty_total,name,rank,region,students_international,students_total
0,United Kingdom,,2285,University of Oxford,1,,7755,20409
1,United Kingdom,,2004,University of Cambridge,2,,6436,18389
2,United States,,143,California Institute of Technology,3,,596,2209
3,United States,,1188,Stanford University,3,,3485,15845
4,United States,,972,Massachusetts Institute of Technology,5,,3800,11177
5,United States,,1809,Harvard University,6,,5284,20326
6,United States,,660,Princeton University,7,,1909,7955
7,United Kingdom,,1807,Imperial College London,8,,8721,15857
8,United States,,838,University of Chicago,9,,3381,13525
9,Switzerland,,2808,ETH Zurich – Swiss Federal Institute of Techno...,10,,7308,19233


In [14]:
import string, unidecode
from math import sqrt

def text_synonyms(text):
    synonyms = {
        'universiti':'university', 
        'universite':'university', 
        'universitat':'university', 
        'universitaet':'university', 
        'universidad':'university', 
        'universidade':'university', 
        'universiteit':'university',
        'king\'s':'kings'
    }
    return [word if not(word in synonyms) else synonyms[word] for word in text]

def text_bigrams(text):
    """
    Return the bigrams of the list of tokens
    """
    bigrams  = []
    for index, word in enumerate(text):
        if index + 1 < len(text):
            bigrams += [word + " " + text[index + 1]]
    return bigrams

def text_preprocessing(df, columns="name", suffix="_preprocessed", stopwords=set(["le", "la", "the", "of", "de", "and", "at", "et"])):
    """
    For all selected textual columns of dataframe, 
    append a new preprocessed column which consist of a set of tokens and bigrams
    extracted from text, without any punctuations/accents/uppercases/stopwords.
    """
    data = df[[columns]]
    data = data.applymap(lambda text: unidecode.unidecode(text.lower().translate(str.maketrans('-–', '  ')).translate(str.maketrans('', '', string.punctuation))))
    data = data.applymap(lambda text: [token for token in text.split() if token not in stopwords])
    data = data.applymap(lambda text: text_synonyms(text))
    data = data.applymap(lambda text: set(text + text_bigrams(text)))
    data = data.add_suffix(suffix)
    return df.join(data)

def dataframe_crossjoin(A, B, suffixes=('_A', '_B')):
    """
    Return the cartesian product between the two DataFrames
    """
    A['_tmpkey'] = 1
    B['_tmpkey'] = 1

    AB = pd.merge(A, B, on='_tmpkey', suffixes=suffixes).drop('_tmpkey', axis=1)
    AB.index = pd.MultiIndex.from_product((A.index, B.index))

    A.drop('_tmpkey', axis=1, inplace=True)
    B.drop('_tmpkey', axis=1, inplace=True)

    return AB

def set_distance(A, B):
    """
    Return the distance between two sets, card(A & B) / card(A | B)
    """
    inter = A.intersection(B)
    union = A.union(B)
    return (len(inter) / len(union))

def merge_data(A , B, left_on="name", right_on="name", suffixes=('_A', '_B'), similarity=0.50):
    """
    Merge the data according to the similarity between the two sets of words
    """
    #Exact macth
    exact_matches = A.merge(B, left_on=left_on, right_on=right_on, how="inner", suffixes=suffixes)
    
    #Select doesn't match
    A_no_matches  = A[~A[left_on ].isin(exact_matches[left_on ])]
    B_no_matches  = B[~B[right_on].isin(exact_matches[right_on])]
    
    A = text_preprocessing(A_no_matches, left_on )
    B = text_preprocessing(B_no_matches, right_on)
    
    #Select amongst all possibles pairs, the best one, with a similarity at least higher than the threshold
    cross = dataframe_crossjoin(A, B, suffixes)
    
    selected_cross = cross[[left_on  + "_preprocessed" + suffixes[0], right_on + "_preprocessed" + suffixes[1]]]
    selected_cross = pd.DataFrame(selected_cross.apply(lambda x: set_distance(x[0], x[1]), axis=1))
    selected_cross = selected_cross.loc[selected_cross.groupby(level=0).idxmax()[0]]
    selected_cross = selected_cross.loc[selected_cross.groupby(level=1).idxmax()[0]]
    selected_cross = (selected_cross >= similarity)[0]
    selected_cross = selected_cross[selected_cross].index.tolist()
    
    #We select the first name as the reference
    fuzzy_matches = cross.loc[selected_cross][[left_on + suffixes[0]] + exact_matches.columns[1:].tolist()].reindex()
    fuzzy_matches.columns = exact_matches.columns
    
    A_no_matches = A_no_matches[~A_no_matches.index.isin([x[0] for x in fuzzy_matches.index.values])]
    B_no_matches = B_no_matches[~B_no_matches.index.isin([x[1] for x in fuzzy_matches.index.values])]
    
    matches = exact_matches.append(fuzzy_matches)
    matches.index = [i for i in range(0, matches.shape[0])]
    
    #return cross.loc[selected_cross], A_no_matches, B_no_matches
    return matches, A_no_matches, B_no_matches
    
m, a, b = merge_data(times_df[["name", "rank"]], top_universities_df[["name", "rank"]], suffixes=('_times','_top_universities'))
m

Unnamed: 0,name,rank_times,rank_top_universities
0,University of Oxford,1,6
1,University of Cambridge,2,5
2,Stanford University,3,2
3,Harvard University,6,3
4,Princeton University,7,13
5,Imperial College London,8,8
6,University of Chicago,9,9
7,University of Pennsylvania,10,19
8,Yale University,12,16
9,Johns Hopkins University,13,17


TODO adrien dans le bus, amuse toi bien
Question 3
Commencer par faire un match exact sur le nom et afficher celles qui ne matchent pas -> les garder en exemple et commenter
On ne doit pas aggreger les nombres etc (voir faq http://go.epfl.ch/ada_faq_hw2)
en gros Nom | Classement 1 | Classement 2

Question 4 : 
Calculer tous les pearsons coefficients pour essayer de trouver des corrélations.
A voir si tu trouves/penses à d'autres méthodes
Question 5 : 
Pour le classement global : tester min max mean et regarder les différences, surtout dire pourquoi un est mieux que l'autre
Eventuellement -> il y a pour chaque université des sous classements mais il faut repaser je ferai ça demain

TODO : 
    - Plot question 1 et 2
    - Global ranking différents sous classements 
    - bien commenter
    - analayse textuelle des résultats et de nos idées

# Part 5 
** Que j'auais bien voulu faire mais malheureusement c'est pas merge.......**

First idea : 
Mean of rank numbers.
TODO : Show results and analyse them. Maybe too much equalities or wierd results ?


Second idea : 
Use more precise scores : 
For example we have following scores : 
**Times** 
* overall score
* teaching
* research
* citations
* industry income
* international outlook

**top**
* academic reputation
* citations per faculty
* employer reputation
* faculty student
* international faculty
* international student

We can notice similar categories : 
The most important is overall score. 
We can take just the mean of both of them and make a new ranking.

Scores that are also presents : 
* citations 
    * times : 30% 
    * top : 20%
    * times : We examine research influence by capturing the number of times a university’s published work is cited by scholars globally.
    * Top : To calculate it, we the total number of citations received by all papers produced by an institution across a five-year period by the number of faculty members at that institution.
    * Both are normalized
* international outlook and mean of (international faculty and international student)
    * times : 7,5%
    * Top 2* 5% 
    * Times : calculated like this :
        * International-to-domestic-student ratio: 2.5%
        * International-to-domestic-staff ratio: 2.5%
        * International collaboration: 2.5%
Maybe we can use mean of overall score and take more into account citations and international outlook at it is present in both ranking 


We can also think to electre methods, for example electre III is particulary suited to build a ranking

In [13]:
testPercentageScore = cached_get_request("https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt")
data_percentage = testPercentageScore.json()['data']
#Note we might want to use either uni href property or overall_rank to associate this to already created structure
#Rank is set as =21 but in the overall_rank key it's only 21. The equal is present when parsing the overall_rank_dis key
def read_figure_percentage(raw_txt):
    beau_txt = BeautifulSoup(raw_txt, 'html.parser')
    return float(beau_txt.find("div", {"class": "td-wrap-in"}).text.strip())
    
def read_raw_entry_percentage(raw_entry):
    overall = read_figure_percentage(raw_entry['overall'])
    academic_reputation = read_figure_percentage(raw_entry['2971069']) if '2971069' in raw_entry else None
    citations = read_figure_percentage(raw_entry['2971070']) if '2971070' in raw_entry else None
    employer_reputation= read_figure_percentage(raw_entry['2971071']) if '2971071' in raw_entry else None
    faculty_student = read_figure_percentage(raw_entry['2971072']) if '2971072' in raw_entry else None
    inter_faculty= read_figure_percentage(raw_entry['2971073']) if '2971073' in raw_entry else None
    inter_student = read_figure_percentage(raw_entry['2971074']) if '2971074' in raw_entry else None
    return (overall,academic_reputation,citations,employer_reputation,faculty_student,inter_faculty,inter_faculty,inter_student)

for i in range(250):
    print(read_raw_entry_percentage(data_percentage[i]))

(100.0, 100.0, 99.9, 100.0, 100.0, 100.0, 100.0, 96.1)
(98.7, 100.0, 99.4, 100.0, 100.0, 99.6, 99.6, 72.7)
(98.4, 100.0, 99.9, 100.0, 98.3, 96.5, 96.5, 75.2)
(97.7, 99.5, 100.0, 85.4, 100.0, 93.4, 93.4, 89.2)
(95.6, 100.0, 78.3, 100.0, 100.0, 97.4, 97.4, 97.7)
(95.3, 100.0, 76.3, 100.0, 100.0, 98.6, 98.6, 98.5)
(94.6, 99.7, 74.7, 99.5, 99.1, 96.6, 96.6, 100.0)
(93.7, 99.4, 68.7, 100.0, 100.0, 100.0, 100.0, 100.0)
(93.5, 99.9, 85.9, 92.9, 96.5, 71.9, 71.9, 79.8)
(93.3, 99.6, 98.7, 99.4, 68.2, 100.0, 100.0, 98.8)
(92.2, 93.9, 83.3, 96.6, 93.6, 100.0, 100.0, 88.2)
(91.2, 83.0, 99.2, 95.5, 92.0, 100.0, 100.0, 100.0)
(91.0, 100.0, 100.0, 97.3, 70.9, 67.4, 67.4, 70.8)
(90.7, 99.6, 96.2, 93.7, 67.4, 92.2, 92.2, 79.2)
(90.5, 100.0, 66.2, 99.9, 88.8, 100.0, 100.0, 86.1)
(90.4, 100.0, 63.2, 99.8, 100.0, 90.7, 90.7, 61.7)
(89.8, 94.3, 83.9, 66.4, 100.0, 87.9, 87.9, 81.3)
(88.9, 99.9, 62.3, 98.1, 100.0, 34.7, 34.7, 94.9)
(88.7, 97.4, 67.4, 94.9, 100.0, 67.1, 67.1, 64.5)
(87.1, 99.3, 85.6, 90.4, 55