# Homework 2

In [1]:
# Imports
import json
import difflib 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

### Question 1
Obtain the 200 top-ranking universities in www.topuniversities.com (ranking 2018). In particular, extract the following fields for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). Some information is not available in the main list and you have to find them in the details page. Store the resulting dataset in a pandas DataFrame and answer the following questions:

In [2]:
# Load the data json file
top_universities_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt'
university_url = 'https://www.topuniversities.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
request = Request(top_universities_url, headers=hdr)
page = urlopen(request)
data = json.loads(page.read().decode())

# Create an empty dataframe
cols = ['University','Rank', 'Country', 'Region', 'International Faculty', 'Total Faculty', 'International Students', 'Total Students']
df = pd.DataFrame(columns=cols)

# Parse json for each ranking
for item in range(200):
#   Obtain rank, country, region, and university name information
    rank = int(data['data'][item]['overall_rank'].strip('='))
    country = data['data'][item]['location']
    region = data['data'][item]['region']
    name = data['data'][item]['uni']
    soup = BeautifulSoup(name, "lxml")
    link = soup.find_all('a')[0]['href']
    name = soup.find_all('a')[0].text
    
#   Retrieve faculty and student data from the details page
    request = Request(university_url+link, headers=hdr)
    page = urlopen(request)
    soup = BeautifulSoup(page, "lxml")

    faculty_total = 0
    inter_faculty_num = 0
    total_students_num = 0
    inter_students_num = 0

    faculty = soup.find('div', attrs={'class':'total faculty'})
    if faculty:
        faculty_total = (faculty.find('div', attrs={'class':'number'}).text).strip('\t\r\n ')
        faculty_total = int(faculty_total.replace(',', ''))

    inter_faculty = soup.find('div', attrs={'class':'inter faculty'})
    if inter_faculty:
        inter_faculty_num = (inter_faculty.find('div', attrs={'class':'number'}).text).replace('\t\r\n ', '')
        inter_faculty_num = int(inter_faculty_num.replace(',', ''))

    total_students = soup.find('div', attrs={'class':'total student'})
    if total_students:
        total_students_num = (total_students.find('div', attrs={'class':'number'}).text).replace('\t\r\n ', '')
        total_students_num = int(total_students_num.replace(',', ''))

    inter_students = soup.find('div', attrs={'class':'total inter'})
    if inter_students:
        inter_students_num = (inter_students.find('div', attrs={'class':'number'}).text).replace('\t\r\n ', '')
        inter_students_num = int(inter_students_num.replace(',', ''))

#   Append each row of data to the dataframe
    df2 = pd.DataFrame([[name, rank, country, region, inter_faculty_num, faculty_total, inter_students_num, total_students_num]], columns=cols)
    df = df.append(df2, ignore_index=True)

# Create a copy of the dataframe for question 3
t1 = df.copy()

df.head(n=10)

Unnamed: 0,University,Rank,Country,Region,International Faculty,Total Faculty,International Students,Total Students
0,Massachusetts Institute of Technology (MIT),1,United States,North America,1679,2982,3717,11067
1,Stanford University,2,United States,North America,2042,4285,3611,15878
2,Harvard University,3,United States,North America,1311,4350,5266,22429
3,California Institute of Technology (Caltech),4,United States,North America,350,953,647,2255
4,University of Cambridge,5,United Kingdom,Europe,2278,5490,6699,18770
5,University of Oxford,6,United Kingdom,Europe,2964,6750,7353,19720
6,UCL (University College London),7,United Kingdom,Europe,2554,6345,14854,31080
7,Imperial College London,8,United Kingdom,Europe,2071,3930,8746,16090
8,University of Chicago,9,United States,North America,635,2449,3379,13557
9,ETH Zurich - Swiss Federal Institute of Techno...,10,Switzerland,Europe,1886,2477,7563,19815


In [3]:
# Map country to region for question 2
map_country_region = {t1['Country'][k]: t1['Region'][k] for k in range(200)}

# Handle exceptions
map_country_region['Luxembourg'] = 'Europe'
map_country_region['Russian Federation'] = 'Europe'

## Hint: Plot your data below using bar charts and describe briefly what you observed. Assume 'best universities' to refer to a top-k. Here our k = 10.

#### (a) Which are the best universities in terms of ratio between faculty members and students?

In [4]:
# Replace all zero's with NaN values so ratio calculations can be made
t1 = t1.replace(0, np.NaN)
ratios = t1.copy()

# Calculate the ratio of total faculty to total students
ratios['Faculty-Student Ratio'] = ratios['Total Faculty']/ratios['Total Students']

# Sort the ratios by ascending order
fs_ratios = ratios.sort_values('Faculty-Student Ratio', ascending=False)
fs_ratios_a = fs_ratios[['University', 'Faculty-Student Ratio']].head(n=10)

The top 10 universities in terms of faculty-student ratio are:

In [5]:
fs_ratios_a

Unnamed: 0,University,Faculty-Student Ratio
3,California Institute of Technology (Caltech),0.422616
15,Yale University,0.398323
5,University of Oxford,0.342292
4,University of Cambridge,0.292488
16,Johns Hopkins University,0.276353
1,Stanford University,0.26987
0,Massachusetts Institute of Technology (MIT),0.26945
185,University of Rochester,0.266604
18,University of Pennsylvania,0.266437
17,Columbia University,0.247115


#### (b) Which are the best universities in terms of ratio of international students?

In [6]:
# Calculate the ratio of international students to total students
ratios['International Student Ratio'] = ratios['International Students']/ratios['Total Students']

# Sort the ratios by ascending order
inter_ratio = ratios.sort_values('International Student Ratio', ascending=False)
inter_ratio_b = inter_ratio[['University', 'International Student Ratio']].head(n=10)

The top 10 universities in terms of international student ratio are:

In [7]:
inter_ratio_b

Unnamed: 0,University,International Student Ratio
34,London School of Economics and Political Scien...,0.691393
11,Ecole Polytechnique Fédérale de Lausanne (EPFL),0.570047
7,Imperial College London,0.543567
199,Maastricht University,0.502533
46,Carnegie Mellon University,0.478062
6,UCL (University College London),0.477928
91,University of St Andrews,0.457955
41,The University of Melbourne,0.427434
126,Queen Mary University of London,0.421816
25,The University of Hong Kong,0.407144


#### (c) Aggregate answers (a) and (b) by country

In [8]:
fs_ratios_c = fs_ratios[['University', 'Country', 'Faculty-Student Ratio']].head(n=10)

# Aggregate faculty-student ratios by country and university
fs_ratios_c = fs_ratios_c.set_index(['Country', 'University'])
fs_ratios_c = fs_ratios_c.sort_index()

inter_ratios_c = inter_ratio[['University', 'Country', 'International Student Ratio']].head(n=10)

# Aggregate international student ratios by country and university
inter_ratios_c = inter_ratios_c.set_index(['Country', 'University'])
inter_ratios_c = inter_ratios_c.sort_index()

The top 10 universities for faculty-student ratio agreggated by country are:

In [9]:
fs_ratios_c

Unnamed: 0_level_0,Unnamed: 1_level_0,Faculty-Student Ratio
Country,University,Unnamed: 2_level_1
United Kingdom,University of Cambridge,0.292488
United Kingdom,University of Oxford,0.342292
United States,California Institute of Technology (Caltech),0.422616
United States,Columbia University,0.247115
United States,Johns Hopkins University,0.276353
United States,Massachusetts Institute of Technology (MIT),0.26945
United States,Stanford University,0.26987
United States,University of Pennsylvania,0.266437
United States,University of Rochester,0.266604
United States,Yale University,0.398323


The top 10 universities for international student ratio agreggated by country are:

In [10]:
inter_ratios_c

Unnamed: 0_level_0,Unnamed: 1_level_0,International Student Ratio
Country,University,Unnamed: 2_level_1
Australia,The University of Melbourne,0.427434
Hong Kong,The University of Hong Kong,0.407144
Netherlands,Maastricht University,0.502533
Switzerland,Ecole Polytechnique Fédérale de Lausanne (EPFL),0.570047
United Kingdom,Imperial College London,0.543567
United Kingdom,London School of Economics and Political Science (LSE),0.691393
United Kingdom,Queen Mary University of London,0.421816
United Kingdom,UCL (University College London),0.477928
United Kingdom,University of St Andrews,0.457955
United States,Carnegie Mellon University,0.478062


#### (d) Aggregate answers (a) and (b) by region

In [11]:
fs_ratios_d = fs_ratios[['University', 'Region', 'Faculty-Student Ratio']].head(n=10)

# Aggregate faculty-student ratios by region and university
fs_ratios_d = fs_ratios_d.set_index(['Region', 'University'])
fs_ratios_d = fs_ratios_d.sort_index()

inter_ratios_d = inter_ratio[['University', 'Region', 'International Student Ratio']].head(n=10)

# Aggregate international student ratios by region and university
inter_ratios_d = inter_ratios_d.set_index(['Region', 'University'])
inter_ratios_d = inter_ratios_d.sort_index()

The top 10 universities for faculty-student ratio agreggated by region are:

In [12]:
fs_ratios_d

Unnamed: 0_level_0,Unnamed: 1_level_0,Faculty-Student Ratio
Region,University,Unnamed: 2_level_1
Europe,University of Cambridge,0.292488
Europe,University of Oxford,0.342292
North America,California Institute of Technology (Caltech),0.422616
North America,Columbia University,0.247115
North America,Johns Hopkins University,0.276353
North America,Massachusetts Institute of Technology (MIT),0.26945
North America,Stanford University,0.26987
North America,University of Pennsylvania,0.266437
North America,University of Rochester,0.266604
North America,Yale University,0.398323


The top 10 universities for international student ratio agreggated by region are:

In [13]:
inter_ratios_d

Unnamed: 0_level_0,Unnamed: 1_level_0,International Student Ratio
Region,University,Unnamed: 2_level_1
Asia,The University of Hong Kong,0.407144
Europe,Ecole Polytechnique Fédérale de Lausanne (EPFL),0.570047
Europe,Imperial College London,0.543567
Europe,London School of Economics and Political Science (LSE),0.691393
Europe,Maastricht University,0.502533
Europe,Queen Mary University of London,0.421816
Europe,UCL (University College London),0.477928
Europe,University of St Andrews,0.457955
North America,Carnegie Mellon University,0.478062
Oceania,The University of Melbourne,0.427434


### Question 2
Obtain the 200 top-ranking universities in www.timeshighereducation.com (ranking 2018). Repeat the analysis of the previous point and discuss briefly what you observed.

In [14]:
# Load the data json file
top_universities_url = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
hdr = {'User-Agent': 'Mozilla/5.0'}
request = Request(top_universities_url, headers=hdr)
page = urlopen(request)
data = json.loads(page.read().decode())['data']

# Create an empty dataframe
cols = ['University','Rank', 'Country', 'Region', 'Faculty-Student Ratio', 'International Student Ratio']
df = pd.DataFrame(columns=cols)

# Parse json for each ranking
for item in range(200):
#   Obtain university name, rank, country, student-staff ratio, and international student percent
    name = data[item]['name']
    rank = int(data[item]['rank'].strip('='))
    country = data[item]['location']
    stud_staff_ratio = float(data[item]['stats_student_staff_ratio'])
    inter_stud_percent = int(data[item]['stats_pc_intl_students'].strip('%'))/100.0
    
#   Append each row of data to the dataframe
    df2 = pd.DataFrame([[name, rank, country, map_country_region[country], stud_staff_ratio, inter_stud_percent]], columns=cols)
    df = df.append(df2, ignore_index=True)
    
# Create a copy of the dataframe for question 3
t2 = df.copy()

df.head(n=10)

Unnamed: 0,University,Rank,Country,Region,Faculty-Student Ratio,International Student Ratio
0,University of Oxford,1,United Kingdom,Europe,11.2,0.38
1,University of Cambridge,2,United Kingdom,Europe,10.9,0.35
2,California Institute of Technology,3,United States,North America,6.5,0.27
3,Stanford University,3,United States,North America,7.5,0.22
4,Massachusetts Institute of Technology,5,United States,North America,8.7,0.34
5,Harvard University,6,United States,North America,8.9,0.26
6,Princeton University,7,United States,North America,8.3,0.24
7,Imperial College London,8,United Kingdom,Europe,11.4,0.55
8,University of Chicago,9,United States,North America,6.2,0.25
9,ETH Zurich – Swiss Federal Institute of Techno...,10,Switzerland,Europe,14.6,0.38


#### (a) Which are the best universities in terms of ratio between faculty members and students?

In [15]:
# Replace all zero's with NaN values so ratio calculations can be made
t2 = t2.replace(0, np.NaN)
ratios = t2.copy()

# Sort the faculty-student ratios by ascending order
fs_ratios = ratios.sort_values('Faculty-Student Ratio', ascending=False)
fs_ratios_a = fs_ratios[['University', 'Faculty-Student Ratio']].head(n=10)

The top 10 universities in terms of faculty-student ratio are:

In [16]:
fs_ratios_a

Unnamed: 0,University,Faculty-Student Ratio
99,University of Bonn,77.9
124,University of Mannheim,74.5
91,Technical University of Berlin,64.3
163,University of Erlangen-Nuremberg,58.5
78,RWTH Aachen University,57.7
87,Free University of Berlin,57.0
61,Humboldt University of Berlin,56.5
40,Technical University of Munich,54.4
144,University of Cologne,50.9
174,Université Libre de Bruxelles,43.4


#### (b) Which are the best universities in terms of ratio of international students?

In [17]:
# Sort the international student ratios by ascending order
inter_ratio = ratios.sort_values('International Student Ratio', ascending=False)
inter_ratio_b = inter_ratio[['University', 'International Student Ratio']].head(n=10)

The top 10 universities in terms of international student ratio are:

In [18]:
inter_ratio_b

Unnamed: 0,University,International Student Ratio
24,London School of Economics and Political Science,0.71
178,University of Luxembourg,0.57
37,École Polytechnique Fédérale de Lausanne,0.55
7,Imperial College London,0.55
102,Maastricht University,0.5
15,University College London,0.49
143,University of St Andrews,0.48
23,Carnegie Mellon University,0.45
120,Queen Mary University of London,0.45
39,University of Hong Kong,0.42


#### (c) Aggregate answers (a) and (b) by country

In [19]:
fs_ratios_c = fs_ratios[['University', 'Country', 'Faculty-Student Ratio']].head(n=10)

# Aggregate faculty-student ratios by country and university
fs_ratios_c = fs_ratios_c.set_index(['Country', 'University'])
fs_ratios_c = fs_ratios_c.sort_index()

inter_ratios_c = inter_ratio[['University', 'Country', 'International Student Ratio']].head(n=10)

# Aggregate international student ratios by country and university
inter_ratios_c = inter_ratios_c.set_index(['Country', 'University'])
inter_ratios_c = inter_ratios_c.sort_index()

The top 10 universities for faculty-student ratio agreggated by country are:

In [20]:
fs_ratios_c

Unnamed: 0_level_0,Unnamed: 1_level_0,Faculty-Student Ratio
Country,University,Unnamed: 2_level_1
Belgium,Université Libre de Bruxelles,43.4
Germany,Free University of Berlin,57.0
Germany,Humboldt University of Berlin,56.5
Germany,RWTH Aachen University,57.7
Germany,Technical University of Berlin,64.3
Germany,Technical University of Munich,54.4
Germany,University of Bonn,77.9
Germany,University of Cologne,50.9
Germany,University of Erlangen-Nuremberg,58.5
Germany,University of Mannheim,74.5


The top 10 universities for international student ratio agreggated by country are:

In [21]:
inter_ratios_c

Unnamed: 0_level_0,Unnamed: 1_level_0,International Student Ratio
Country,University,Unnamed: 2_level_1
Hong Kong,University of Hong Kong,0.42
Luxembourg,University of Luxembourg,0.57
Netherlands,Maastricht University,0.5
Switzerland,École Polytechnique Fédérale de Lausanne,0.55
United Kingdom,Imperial College London,0.55
United Kingdom,London School of Economics and Political Science,0.71
United Kingdom,Queen Mary University of London,0.45
United Kingdom,University College London,0.49
United Kingdom,University of St Andrews,0.48
United States,Carnegie Mellon University,0.45


#### (d) Aggregate answers (a) and (b) by region

In [22]:
fs_ratios_d = fs_ratios[['University', 'Region', 'Faculty-Student Ratio']].head(n=10)

# Aggregate faculty-student ratios by region and university
fs_ratios_d = fs_ratios_d.set_index(['Region', 'University'])
fs_ratios_d = fs_ratios_d.sort_index()

inter_ratios_d = inter_ratio[['University', 'Region', 'International Student Ratio']].head(n=10)

# Aggregate international student ratios by region and university
inter_ratios_d = inter_ratios_d.set_index(['Region', 'University'])
inter_ratios_d = inter_ratios_d.sort_index()

The top 10 universities for faculty-student ratio agreggated by region are:

In [23]:
fs_ratios_d

Unnamed: 0_level_0,Unnamed: 1_level_0,Faculty-Student Ratio
Region,University,Unnamed: 2_level_1
Europe,Free University of Berlin,57.0
Europe,Humboldt University of Berlin,56.5
Europe,RWTH Aachen University,57.7
Europe,Technical University of Berlin,64.3
Europe,Technical University of Munich,54.4
Europe,University of Bonn,77.9
Europe,University of Cologne,50.9
Europe,University of Erlangen-Nuremberg,58.5
Europe,University of Mannheim,74.5
Europe,Université Libre de Bruxelles,43.4


The top 10 universities for international student ratio agreggated by region are:

In [24]:
inter_ratios_d

Unnamed: 0_level_0,Unnamed: 1_level_0,International Student Ratio
Region,University,Unnamed: 2_level_1
Asia,University of Hong Kong,0.42
Europe,Imperial College London,0.55
Europe,London School of Economics and Political Science,0.71
Europe,Maastricht University,0.5
Europe,Queen Mary University of London,0.45
Europe,University College London,0.49
Europe,University of Luxembourg,0.57
Europe,University of St Andrews,0.48
Europe,École Polytechnique Fédérale de Lausanne,0.55
North America,Carnegie Mellon University,0.45


## Question 3

#### Merge the two DataFrames created in questions 1 and 2 using university names. Match universities' names as well as you can, and explain your strategy. Keep track of the original position in both rankings.

Our strategy is to create an index on univerity names and perform an approximated matching to homogenize the university names. We conduct the matching using the [difflib.get_close_matches](https://docs.python.org/2/library/difflib.html) functionality which finds the best matches of a value from a list. In this case, the university name in Table 1 is matched from a list of close values of university names from Table 2. Then an outer join is performed. 

In [25]:
t2.index = t2['University']
# Get closest match on university name in both tables
t1['University'] = t1['University'].map(lambda x: (difflib.get_close_matches(x, t2.index)[:1] or [None])[0])

# Rename the rank columns for each dataset accordingly
t1 = t1.rename(columns={'Rank':'Rank_QS'})
t2 = t2.rename(columns={'Rank':'Rank_THE'})

# Perform an outer merge on the tables
merged_df = pd.merge(t1, t2, how='outer')
merged_df.head(n=10)

Unnamed: 0,University,Rank_QS,Country,Region,International Faculty,Total Faculty,International Students,Total Students,Rank_THE,Faculty-Student Ratio,International Student Ratio
0,Massachusetts Institute of Technology,1.0,United States,North America,1679.0,2982.0,3717.0,11067.0,5.0,8.7,0.34
1,Stanford University,2.0,United States,North America,2042.0,4285.0,3611.0,15878.0,3.0,7.5,0.22
2,Harvard University,3.0,United States,North America,1311.0,4350.0,5266.0,22429.0,6.0,8.9,0.26
3,California Institute of Technology,4.0,United States,North America,350.0,953.0,647.0,2255.0,3.0,6.5,0.27
4,University of Cambridge,5.0,United Kingdom,Europe,2278.0,5490.0,6699.0,18770.0,2.0,10.9,0.35
5,University of Oxford,6.0,United Kingdom,Europe,2964.0,6750.0,7353.0,19720.0,1.0,11.2,0.38
6,University College London,7.0,United Kingdom,Europe,2554.0,6345.0,14854.0,31080.0,16.0,10.5,0.49
7,Imperial College London,8.0,United Kingdom,Europe,2071.0,3930.0,8746.0,16090.0,8.0,11.4,0.55
8,University of Chicago,9.0,United States,North America,635.0,2449.0,3379.0,13557.0,9.0,6.2,0.25
9,ETH Zurich – Swiss Federal Institute of Techno...,10.0,Switzerland,Europe,1886.0,2477.0,7563.0,19815.0,10.0,14.6,0.38


## Question 4
#### Find useful insights in the data by performing an exploratory analysis. Can you find a strong correlation between any pair of variables in the dataset you just created? Example: when a university is strong in its international dimension, can you observe a consistency both for students and faculty members?

In [26]:
# Print the correlations between the columns of the dataframe
merged_df.corr()

Unnamed: 0,Rank_QS,International Faculty,Total Faculty,International Students,Total Students,Rank_THE,Faculty-Student Ratio,International Student Ratio
Rank_QS,1.0,-0.520992,-0.292365,-0.29905,-0.03826,0.672412,0.257678,-0.314326
International Faculty,-0.520992,1.0,0.587774,0.639899,0.311591,-0.372113,-0.066348,0.341115
Total Faculty,-0.292365,0.587774,1.0,0.469991,0.767329,-0.421818,-0.098054,-0.120183
International Students,-0.29905,0.639899,0.469991,1.0,0.562732,-0.27111,0.234829,0.428936
Total Students,-0.03826,0.311591,0.767329,0.562732,1.0,-0.169122,0.31867,-0.266687
Rank_THE,0.672412,-0.372113,-0.421818,-0.27111,-0.169122,1.0,0.098513,-0.22141
Faculty-Student Ratio,0.257678,-0.066348,-0.098054,0.234829,0.31867,0.098513,1.0,-0.107818
International Student Ratio,-0.314326,0.341115,-0.120183,0.428936,-0.266687,-0.22141,-0.107818,1.0


#### Explanation:
By observing the chart above, we conclude that the strongest correlation between any two variables is between the ranks from both data sets (0.67). Apart from that, the strongest correlation under the rank from 'www.topuniversities.com' occurs with a negative correlation between the International Faculty and the Rank (-0.52). On the other hand, the strongest correlation under the rank from 'www.timeshighereducation.com' occurs with negative correlation between Total Faculty and the Rank (-0.42). However, we do not believe this is sufficient to say there is a 'strong' correlation in general between any pairs of variables in the dataset other than the ranks.

## Question 5
Can you find the best university taking in consideration both rankings? Explain your approach.

In [27]:
# Calculate the average rank from both datasets for each university
merged_df['Average Rank'] = (merged_df['Rank_QS'] + merged_df['Rank_THE'])/2.0

# Find the minimum average rank to indicate best university
merged_df.loc[[merged_df['Average Rank'].argmin()]]

Unnamed: 0,University,Rank_QS,Country,Region,International Faculty,Total Faculty,International Students,Total Students,Rank_THE,Faculty-Student Ratio,International Student Ratio,Average Rank
1,Stanford University,2.0,United States,North America,2042.0,4285.0,3611.0,15878.0,3.0,7.5,0.22,2.5


#### Explanation
In order to find the best university taking into consideration both rankings, we first calulated the average rank of each university. Then, we found the minimum of the averages to indicate the best university. In this case, 'Stanford University' has the lowest average rank.