In [120]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

In [121]:
def soup_from_url(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with Python's built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would freeze a whole scraping loop.
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

def get_json_ranks_dataframe(url, n):
    """Fetch a ranking JSON document and return its first ``n`` entries.

    The payload is decoded directly from the response bytes. Parsing it as
    HTML and serialising it back to text before ``json.loads`` would
    HTML-escape characters such as ``&`` inside the values and could rewrite
    any tag-like text, so the HTML parser is deliberately not involved here.

    Parameters
    ----------
    url : str
        Address of a JSON document whose top-level object has a ``"data"``
        entry holding the records.
    n : int
        Number of leading records to keep.

    Returns
    -------
    pandas.DataFrame
        The first ``n`` rows built from the ``"data"`` entry.

    Raises
    ------
    ValueError
        If the body is not valid JSON (``json.JSONDecodeError`` is a subclass).
    KeyError
        If the decoded object has no ``"data"`` entry.
    """
    response = requests.get(url)
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark,
    # which json.loads would otherwise reject.
    payload = json.loads(response.content.decode("utf-8-sig"))
    return pd.DataFrame.from_dict(payload["data"]).head(n)

In [147]:
# Direct links to the JSON payloads behind the two ranking sites
# (url1: topuniversities.com, url2: timeshighereducation.com)
url1 = (
    "https://www.topuniversities.com/sites/default/files/"
    "qs-rankings-data/357051.txt?_=1508356628355"
)
url2 = (
    "https://www.timeshighereducation.com/sites/default/files/the_data_rankings/"
    "world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json"
)

# One dataframe per ranking, limited to the first 200 rows of each JSON list
df1, df2 = (get_json_ranks_dataframe(link, 200) for link in (url1, url2))

In [148]:
def get_missing_values_tu(url):
    """Scrape staff and student counts from a university detail page.

    Parameters
    ----------
    url : str
        Full address of the page to scrape.

    Returns
    -------
    list
        ``[num_staff_total, num_staff_inter, num_student_total,
        num_student_inter, url]``. Each count is a string made only of
        digits, or ``np.nan`` when it cannot be read from the page.
    """
    soup = soup_from_url(url)

    def get_div_number(c):
        """Return the digits of the ``number`` div nested in the div of class ``c``."""
        container = soup.find("div", {"class": c})
        if not container:
            return np.nan
        number_div = container.find("div", {"class": "number"})
        # The outer block can exist without a populated counter inside it;
        # report that as missing instead of failing on ``None.contents``.
        if number_div is None or not number_div.contents:
            return np.nan
        num_str = number_div.contents[0]
        # Keep digits only: drops separators and surrounding whitespace.
        num = ''.join(ch for ch in num_str if ch.isdigit())
        # No digit at all means there is no value; use the same missing
        # marker as the other failure paths rather than an empty string.
        return num if num else np.nan

    # total faculty staff
    num_staff_total = get_div_number("total faculty")

    # international faculty staff
    num_staff_inter = get_div_number("inter faculty")

    # total students
    num_student_total = get_div_number("total student")

    # international students
    num_student_inter = get_div_number("total inter")

    return [num_staff_total, num_staff_inter, num_student_total, num_student_inter, url]

def get_missing_values_the(url):
    """Read a location string from the Open Graph meta tags of a page.

    The ``og:locality`` meta tag is preferred. When it is absent the
    ``og:street_address`` tag is used instead, with commas removed and
    surrounding whitespace stripped. When neither tag exists the value is
    ``np.nan``.

    Returns
    -------
    list
        ``[value, url]``.
    """
    page = soup_from_url(url)

    locality_tag = page.find("meta", property="og:locality")
    street_tag = page.find("meta", property="og:street_address")

    if locality_tag:
        return [locality_tag["content"], url]
    if street_tag:
        return [street_tag["content"].replace(',', '').strip(), url]
    return [np.nan, url]

def get_missing(col, domain, urls, f):
    """Call a scraper on every URL and gather the results in a DataFrame.

    Parameters
    ----------
    col : list of str
        Column names of the resulting frame; each row returned by ``f`` must
        have one value per column.
    domain : str
        Prefix put in front of every entry of ``urls``.
    urls : iterable of str
        Paths to visit, in the order the rows should appear.
    f : callable
        Called as ``f(domain + url)``; returns the list of values for one row.

    Returns
    -------
    pandas.DataFrame
        One row per URL, indexed 0..len(urls)-1, with columns ``col``. An
        empty ``urls`` gives an empty frame that still has the columns.
    """
    # Gather every row first and build the frame once: enlarging a DataFrame
    # with ``.loc`` inside the loop copies the accumulated data on each step.
    rows = [f(domain + url) for url in urls]
    return pd.DataFrame(rows, columns=col)

In [149]:
# Values scraped from each university's own page on topuniversities.com
tu_columns = ['num_staff_total', 'num_staff_inter',
              'num_student_total', 'num_student_inter', 'url']
df1_m = get_missing(
    col=tu_columns,
    domain="https://www.topuniversities.com",
    urls=df1['url'],
    f=get_missing_values_tu,
)

# Values scraped from each university's own page on timeshighereducation.com
the_columns = ['region', 'url']
df2_m = get_missing(
    col=the_columns,
    domain="https://www.timeshighereducation.com",
    urls=df2['url'],
    f=get_missing_values_the,
)

In [159]:
# Put the scraped columns next to the ranking columns (rows are matched on the index)
top_uni = pd.concat([df1, df1_m], axis=1)
high_educ = pd.concat([df2, df2_m], axis=1)


def _without(series, char):
    """Return ``series`` with ``char`` removed from every (string) value."""
    return series.apply(lambda text: text.replace(char, ''))


# First ranking: keep the columns of interest, then give them the shared names
top_uni_clean = top_uni[[
    'title', 'rank_display', 'country', 'region',
    'num_staff_total', 'num_staff_inter',
    'num_student_total', 'num_student_inter',
]].copy()
top_uni_clean.columns = [
    'name', 'rank_top', 'country', 'region',
    'numb_staff_total', 'num_staff_inter',
    'num_student_total', 'num_student_inter',
]

# Second ranking: same idea; the ratio and percentage columns are converted further down
high_educ_clean = high_educ[[
    'name', 'rank', 'location', 'region',
    'stats_student_staff_ratio',
    'stats_number_students', 'stats_pc_intl_students',
]].copy()
high_educ_clean.columns = [
    'name', 'rank_high', 'country', 'region',
    'numb_staff_total',
    'num_student_total', 'num_student_inter',
]

# Remove the decorations ('=' on ranks, ',' and '%' on numbers) that would
# stop the columns from being parsed as numbers
top_uni_clean['rank_top'] = _without(top_uni_clean['rank_top'], '=')
high_educ_clean['num_student_total'] = _without(high_educ_clean['num_student_total'], ',')
high_educ_clean['num_student_inter'] = _without(high_educ_clean['num_student_inter'], '%')
high_educ_clean['rank_high'] = _without(high_educ_clean['rank_high'], '=')

# Convert every column that parses as numeric; the others are left unchanged
top_uni_clean = top_uni_clean.apply(pd.to_numeric, errors="ignore")
high_educ_clean = high_educ_clean.apply(pd.to_numeric, errors="ignore")

# students / (students per staff member) -> number of staff
high_educ_clean['numb_staff_total'] = (
    high_educ_clean['num_student_total'] / high_educ_clean['numb_staff_total']
).round()
# percentage of international students -> number of international students
high_educ_clean['num_student_inter'] = (
    high_educ_clean['num_student_inter'] * high_educ_clean['num_student_total'] / 100
).round()

In [160]:
# Use the fully qualified option name: the bare 'precision' pattern also
# matches 'styler.format.precision' on recent pandas, which makes
# set_option raise instead of changing the display setting.
pd.set_option('display.precision', 0)
# Stack both rankings and sort by name so that entries for the same
# university end up next to each other.
pd.concat([top_uni_clean, high_educ_clean]).sort_values(['name'])

Unnamed: 0,country,name,num_staff_inter,num_student_inter,num_student_total,numb_staff_total,rank_high,rank_top,region
189,Finland,Aalto University,,2549,12744,631,190,,PO Box/PL 96081
137,Finland,Aalto University,370,1831,12147,1257,,137,Europe
118,Denmark,Aarhus University,602,3762,26226,2316,,119,Europe
108,Denmark,Aarhus University,,3020,25167,1760,109,,DK - Denmark Nordre Ringgade 1 8000 Aarhus C
170,Germany,Albert-Ludwigs-Universitaet Freiburg,413,3897,23214,1966,,171,Europe
187,Italy,Alma Mater Studiorum - University of Bologna,153,4195,63399,2990,,188,Europe
125,United States,Arizona State University,,8450,44475,2213,126,,Tempe
47,Australia,Australian National University,,5595,15986,828,48,,Acton
146,Spain,Autonomous University of Barcelona,,5169,32309,2505,147,,Campus de la UAB Placa Civica
80,United States,Boston University,379,7041,25662,3157,,81,North America


In [155]:
from difflib import SequenceMatcher

def similar(a, b):
    """Score how alike ``a`` and ``b`` are using difflib's similarity ratio.

    Returns a float between 0 and 1, where 1.0 means the two sequences match
    completely.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

similar("epfl", "the epfl")

0.6666666666666666

In [None]:
def diff_rows(r1, r2):
    """gives the overall difference between 2 rows"""