In [1]:
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import requests
from bs4 import BeautifulSoup

# 1. Scrape the site www.topuniversities.com

We found the JSON data source using Postman:

https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508252855868

We want:
    - name 
    - rank
    - country
    - region 
    - number of faculty members (international and total)
    - number of students (international and total)


In [2]:
# Load the ranking JSON file into a DataFrame with pandas.
# The context manager closes the file handle (a bare open() left it open),
# and the explicit encoding avoids depending on the platform default.
with open('TopUniRanking.json', encoding='utf-8') as ranking_file:
    json_data = json.load(ranking_file)
df = json_normalize(json_data['data'])
# The universities are sorted by rank, so we only need the first 200 rows;
# the columns dropped here are not needed for the analysis.
df_top200 = df.head(200).drop(["cc", "core_id", "logo", "guide"], axis=1)
df_top200.head(10)

Unnamed: 0,country,nid,rank_display,region,score,stars,title,url
0,United States,294850,1,North America,100.0,6.0,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...
1,United States,297282,2,North America,98.7,5.0,Stanford University,/universities/stanford-university
2,United States,294270,3,North America,98.4,5.0,Harvard University,/universities/harvard-university
3,United States,294562,4,North America,97.7,5.0,California Institute of Technology (Caltech),/universities/california-institute-technology-...
4,United Kingdom,294561,5,Europe,95.6,5.0,University of Cambridge,/universities/university-cambridge
5,United Kingdom,294654,6,Europe,95.3,5.0,University of Oxford,/universities/university-oxford
6,United Kingdom,294014,7,Europe,94.6,,UCL (University College London),/universities/ucl-university-college-london
7,United Kingdom,294030,8,Europe,93.7,,Imperial College London,/universities/imperial-college-london
8,United States,294536,9,North America,93.5,5.0,University of Chicago,/universities/university-chicago
9,Switzerland,294432,10,Europe,93.3,,ETH Zurich - Swiss Federal Institute of Techno...,/universities/eth-zurich-swiss-federal-institu...


We now have the general data for the first 200 universities. Next, we need to scrape each university's own page to retrieve these 4 pieces of information:
* number of total faculty members 
* number of international faculty members
* number of total students 
* number of international students

The URL of a university is contained in our previous DataFrame, and the page addresses are built this way:
 **www.topuniversities.com+url** 



In [3]:
listUrl = df_top200['url']
# Empty frame that will receive the scraped counts, one row per university.
# The row count is derived from listUrl rather than a hard-coded 200, so it
# always matches the number of pages that will actually be scraped.
index = range(len(listUrl))
columns = [
    "nbr_faculty_members",
    "nbr_international_faculty_members",
    "nbr_total_students",
    "nbr_international_total_students",
]
temp_df = pd.DataFrame(index=index, columns=columns)

def findNumberOf(soup, htmlClass):
    """Return the integer displayed in the first ``div`` matching ``htmlClass``.

    The function looks up the first ``div`` in ``soup`` whose class matches
    ``htmlClass``, reads the text of its nested ``div`` with class ``number``
    and converts that text to an ``int`` after removing all whitespace and
    the ``,`` thousands separators.

    Parameters
    ----------
    soup
        Parsed page exposing ``find_all`` (a BeautifulSoup object in this
        notebook).
    htmlClass : str
        Class string handed to ``find_all(..., class_=htmlClass)``.

    Returns
    -------
    int
        The parsed number, or ``-1`` when the information is unavailable:
        no matching ``div``, no nested ``number`` element, or text that is
        not an integer.
    """
    matches = soup.find_all('div', class_=htmlClass)
    # Case where the field is not given on the page at all.
    if not matches:
        return -1
    number_div = matches[0].find('div', class_='number')
    # The wrapper exists but holds no number element: report it as missing
    # instead of crashing on ``None.text``.
    if number_div is None:
        return -1
    # split()/join removes every kind of whitespace (spaces, newlines, tabs),
    # then the thousands separators are dropped before the conversion.
    cleaned = ''.join(number_div.text.split()).replace(',', '')
    try:
        return int(cleaned)
    except ValueError:
        # Non-numeric content is treated like a missing field so that one odd
        # page cannot abort a long scraping run.
        return -1

# Retrieve the 4 pieces of information from each university's page.
# Maps every temp_df column to the class string of the <div> that holds it,
# so the lookup is written once instead of four copy-pasted lines.
scraped_fields = {
    "nbr_faculty_members": 'total faculty',
    "nbr_international_faculty_members": 'inter faculty',
    "nbr_total_students": 'total student',
    "nbr_international_total_students": 'total inter',
}
for i, url in enumerate(listUrl):
    finalUrl = "https://www.topuniversities.com" + url
    # A timeout keeps a single unresponsive page from hanging the whole run.
    r = requests.get(finalUrl, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    for column, htmlClass in scraped_fields.items():
        # Write through .loc in a single indexing step: the former chained
        # form (temp_df.<column>[i] = ...) assigns into an intermediate Series
        # and is not guaranteed to update temp_df itself.
        temp_df.loc[i, column] = findNumberOf(soup, htmlClass)

In [5]:
# Attach the scraped counts to the ranking table; join() aligns the two
# frames on their row index and keeps every row of df_top200.
df_top200_merged = df_top200.join(other=temp_df, how='left')
df_top200_merged.head(10)

Unnamed: 0,country,nid,rank_display,region,score,stars,title,url,nbr_faculty_members,nbr_international_faculty_members,nbr_total_students,nbr_international_total_students
0,United States,294850,1,North America,100.0,6.0,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...,2982,1679,11067,3717
1,United States,297282,2,North America,98.7,5.0,Stanford University,/universities/stanford-university,4285,2042,15878,3611
2,United States,294270,3,North America,98.4,5.0,Harvard University,/universities/harvard-university,4350,1311,22429,5266
3,United States,294562,4,North America,97.7,5.0,California Institute of Technology (Caltech),/universities/california-institute-technology-...,953,350,2255,647
4,United Kingdom,294561,5,Europe,95.6,5.0,University of Cambridge,/universities/university-cambridge,5490,2278,18770,6699
5,United Kingdom,294654,6,Europe,95.3,5.0,University of Oxford,/universities/university-oxford,6750,2964,19720,7353
6,United Kingdom,294014,7,Europe,94.6,,UCL (University College London),/universities/ucl-university-college-london,6345,2554,31080,14854
7,United Kingdom,294030,8,Europe,93.7,,Imperial College London,/universities/imperial-college-london,3930,2071,16090,8746
8,United States,294536,9,North America,93.5,5.0,University of Chicago,/universities/university-chicago,2449,635,13557,3379
9,Switzerland,294432,10,Europe,93.3,,ETH Zurich - Swiss Federal Institute of Techno...,/universities/eth-zurich-swiss-federal-institu...,2477,1886,19815,7563


* Which are the best universities in terms of: (a) ratio between faculty members and students, (b) ratio of international students?
* Answer the previous question aggregating the data by (c) country and (d) region.

In [None]:
#drop universities that don't have the required information


# 2. Scrape the site www.timeshighereducation.com

We found the JSON data source using Postman again:

https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json

We want:
    - name 
    - rank
    - country
    - region 
    - number of faculty members (international and total)
    - number of students (international and total)

In [None]:
# Load the Times Higher Education ranking JSON file into a DataFrame.
# The context manager closes the file handle (a bare open() left it open),
# and the explicit encoding avoids depending on the platform default.
with open('TimesRanking.json', encoding='utf-8') as ranking_file:
    json_data2 = json.load(ranking_file)
df2 = json_normalize(json_data2['data'])
# List the available columns; df2.columns holds the same labels as
# df2.T.index without transposing the whole frame.
print(df2.columns)
df2.head(10)

# 3. Merge the two rankings

In [None]:
# TODO: combine the 2 DataFrames into 1 (placeholder, not implemented yet)
pass





# 4. Do correlation?

In [None]:
# TODO: placeholder cell, the work for this section is not implemented yet
pass





# 5. Best university? (not EPFL)

In [None]:
# TODO: placeholder cell, nothing implemented yet (original note: "Harvard or similar")
pass


