# Homework 2

In [None]:
# Imports
import json
import difflib 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

### Question 1
Obtain the 200 top-ranking universities in www.topuniversities.com (ranking 2018). In particular, extract the following fields for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). Some information is not available in the main list and you have to find them in the details page. Store the resulting dataset in a pandas DataFrame and answer the following questions:

In [None]:
# Load the QS ranking JSON and scrape each university's details page.
top_universities_url = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt'
university_url = 'https://www.topuniversities.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
request = Request(top_universities_url, headers=hdr)
# Close the HTTP response as soon as the payload has been read
with urlopen(request) as page:
    data = json.loads(page.read().decode())

cols = ['University','Rank', 'Country', 'Region', 'International Faculty', 'Total Faculty', 'International Students', 'Total Students']


def _parse_count(details_soup, css_class):
    """Return the integer shown in the details-page box with the given class.

    The box is the <div class=css_class> element and the figure is the text of
    its nested <div class="number">. Returns 0 when either element is missing,
    which is the same "not available" marker the rest of the notebook expects.
    """
    box = details_soup.find('div', attrs={'class': css_class})
    if box is None:
        return 0
    number = box.find('div', attrs={'class': 'number'})
    if number is None:
        return 0
    # Figures are rendered with surrounding whitespace and thousands separators
    return int(number.text.strip().replace(',', ''))


# Collect one row per university and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
rows = []
for entry in data['data'][:200]:
    # Rank, country and region come straight from the ranking JSON;
    # tied ranks carry a leading '=' that has to be dropped.
    rank = int(entry['overall_rank'].strip('='))
    country = entry['location']
    region = entry['region']

    # The 'uni' field is an HTML snippet: the anchor holds both the
    # university name and the relative link to its details page.
    anchor = BeautifulSoup(entry['uni'], "lxml").find_all('a')[0]
    link = anchor['href']
    name = anchor.text

    # Retrieve faculty and student data from the details page
    request = Request(university_url+link, headers=hdr)
    with urlopen(request) as page:
        soup = BeautifulSoup(page, "lxml")

    faculty_total = _parse_count(soup, 'total faculty')
    inter_faculty_num = _parse_count(soup, 'inter faculty')
    total_students_num = _parse_count(soup, 'total student')
    inter_students_num = _parse_count(soup, 'total inter')

    rows.append([name, rank, country, region, inter_faculty_num, faculty_total, inter_students_num, total_students_num])

df = pd.DataFrame(rows, columns=cols)

# Create a copy of the dataframe for question 3
t1 = df.copy()

df.head(n=10)

In [None]:
# Map country to region for question 2.
# Pair the two columns row by row instead of indexing labels 0..199, so the
# mapping does not depend on the table having exactly 200 rows or a 0-based index.
map_country_region = dict(zip(t1['Country'], t1['Region']))

# Countries added by hand because the question 2 loader looks them up here;
# presumably they do not appear under these names in the QS table - verify.
map_country_region['Luxembourg'] = 'Europe'
map_country_region['Russian Federation'] = 'Europe'

## Hint: Plot your data below using bar charts and describe briefly what you observed. Assume 'best universities' to refer to a top-k. Here our k = 10.

#### (a) Which are the best universities in terms of ratio between faculty members and students?

In [None]:
# Replace zeros (the scraper's placeholder for a missing count) with NaN so the
# ratios below come out as NaN instead of 0 or a division by zero.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
t1 = t1.replace(0, np.nan)
ratios = t1.copy()

# Calculate the ratio of total faculty to total students
ratios['Faculty-Student Ratio'] = ratios['Total Faculty']/ratios['Total Students']

# Sort the ratios in descending order (highest ratio first) and keep the top 10
fs_ratios = ratios.sort_values('Faculty-Student Ratio', ascending=False)
fs_ratios_a = fs_ratios[['University', 'Faculty-Student Ratio']].head(n=10)

The top 10 universities in terms of faculty-student ratio are:

In [None]:
# Show the top-10 faculty-student ratio table as the cell output
fs_ratios_a

#### (b) Which are the best universities in terms of ratio of international students?

In [None]:
# Share of international students within each university's student body
ratios['International Student Ratio'] = ratios['International Students'].div(ratios['Total Students'])

# Order universities from the highest share to the lowest, then keep the first ten
inter_ratio = ratios.sort_values(by='International Student Ratio', ascending=False)
inter_ratio_b = inter_ratio.head(10)[['University', 'International Student Ratio']]

The top 10 universities in terms of international student ratio are:

In [None]:
# Show the top-10 international student ratio table as the cell output
inter_ratio_b

#### (c) Aggregate answers (a) and (b) by country

In [None]:
# Top-10 faculty-student ratios, indexed by (country, university) and
# sorted on that index so universities of the same country sit together
fs_ratios_c = (
    fs_ratios.head(10)[['University', 'Country', 'Faculty-Student Ratio']]
    .set_index(['Country', 'University'])
    .sort_index()
)

# Same layout for the top-10 international student ratios
inter_ratios_c = (
    inter_ratio.head(10)[['University', 'Country', 'International Student Ratio']]
    .set_index(['Country', 'University'])
    .sort_index()
)

The top 10 universities for faculty-student ratio aggregated by country are:

In [None]:
# Show the top-10 faculty-student ratios grouped under their country
fs_ratios_c

The top 10 universities for international student ratio aggregated by country are:

In [None]:
# Show the top-10 international student ratios grouped under their country
inter_ratios_c

#### (d) Aggregate answers (a) and (b) by region

In [None]:
# Top-10 faculty-student ratios, indexed by (region, university) and
# sorted on that index so universities of the same region sit together
fs_ratios_d = (
    fs_ratios.head(10)[['University', 'Region', 'Faculty-Student Ratio']]
    .set_index(['Region', 'University'])
    .sort_index()
)

# Same layout for the top-10 international student ratios
inter_ratios_d = (
    inter_ratio.head(10)[['University', 'Region', 'International Student Ratio']]
    .set_index(['Region', 'University'])
    .sort_index()
)

The top 10 universities for faculty-student ratio aggregated by region are:

In [None]:
# Show the top-10 faculty-student ratios grouped under their region
fs_ratios_d

The top 10 universities for international student ratio aggregated by region are:

In [None]:
# Show the top-10 international student ratios grouped under their region
inter_ratios_d

### Question 2
Obtain the 200 top-ranking universities in www.timeshighereducation.com (ranking 2018). Repeat the analysis of the previous point and discuss briefly what you observed.

In [None]:
# Load the Times Higher Education ranking JSON
top_universities_url = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
hdr = {'User-Agent': 'Mozilla/5.0'}
request = Request(top_universities_url, headers=hdr)
# Close the HTTP response as soon as the payload has been read
with urlopen(request) as page:
    data = json.loads(page.read().decode())['data']

cols = ['University','Rank', 'Country', 'Region', 'Faculty-Student Ratio', 'International Student Ratio']

# Collect one row per university and build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
rows = []
for entry in data[:200]:
    name = entry['name']
    # Tied ranks carry a leading '=' that has to be dropped
    rank = int(entry['rank'].strip('='))
    country = entry['location']

    # The source field is a student-to-staff ratio (students per staff member,
    # going by its name). The column is labelled 'Faculty-Student Ratio' and is
    # later sorted so that larger means better, so store the reciprocal.
    stud_staff_ratio = float(entry['stats_student_staff_ratio'])
    fac_stud_ratio = 1.0 / stud_staff_ratio if stud_staff_ratio else np.nan

    # Percentages arrive as strings such as '38%'; store them as fractions
    inter_stud_percent = int(entry['stats_pc_intl_students'].strip('%'))/100.0

    # The THE data has no region field, so reuse the country -> region mapping
    rows.append([name, rank, country, map_country_region[country], fac_stud_ratio, inter_stud_percent])

df = pd.DataFrame(rows, columns=cols)

# Create a copy of the dataframe for question 3
t2 = df.copy()

df.head(n=10)

#### (a) Which are the best universities in terms of ratio between faculty members and students?

In [None]:
# Replace zeros with NaN so they are treated as missing values when sorting.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
t2 = t2.replace(0, np.nan)
ratios = t2.copy()

# Sort by the 'Faculty-Student Ratio' column in descending order and keep the top 10
fs_ratios = ratios.sort_values('Faculty-Student Ratio', ascending=False)
fs_ratios_a = fs_ratios[['University', 'Faculty-Student Ratio']].head(n=10)

The top 10 universities in terms of faculty-student ratio are:

In [None]:
# Show the top-10 faculty-student ratio table as the cell output
fs_ratios_a

#### (b) Which are the best universities in terms of ratio of international students?

In [None]:
# Order universities from the highest international student ratio to the
# lowest, then keep the first ten
inter_ratio = ratios.sort_values(by='International Student Ratio', ascending=False)
inter_ratio_b = inter_ratio.head(10)[['University', 'International Student Ratio']]

The top 10 universities in terms of international student ratio are:

In [None]:
# Show the top-10 international student ratio table as the cell output
inter_ratio_b

#### (c) Aggregate answers (a) and (b) by country

In [None]:
# Top-10 faculty-student ratios, indexed by (country, university) and
# sorted on that index so universities of the same country sit together
fs_ratios_c = (
    fs_ratios.head(10)[['University', 'Country', 'Faculty-Student Ratio']]
    .set_index(['Country', 'University'])
    .sort_index()
)

# Same layout for the top-10 international student ratios
inter_ratios_c = (
    inter_ratio.head(10)[['University', 'Country', 'International Student Ratio']]
    .set_index(['Country', 'University'])
    .sort_index()
)

The top 10 universities for faculty-student ratio aggregated by country are:

In [None]:
# Show the top-10 faculty-student ratios grouped under their country
fs_ratios_c

The top 10 universities for international student ratio aggregated by country are:

In [None]:
# Show the top-10 international student ratios grouped under their country
inter_ratios_c

#### (d) Aggregate answers (a) and (b) by region

In [None]:
# Top-10 faculty-student ratios, indexed by (region, university) and
# sorted on that index so universities of the same region sit together
fs_ratios_d = (
    fs_ratios.head(10)[['University', 'Region', 'Faculty-Student Ratio']]
    .set_index(['Region', 'University'])
    .sort_index()
)

# Same layout for the top-10 international student ratios
inter_ratios_d = (
    inter_ratio.head(10)[['University', 'Region', 'International Student Ratio']]
    .set_index(['Region', 'University'])
    .sort_index()
)

The top 10 universities for faculty-student ratio aggregated by region are:

In [None]:
# Show the top-10 faculty-student ratios grouped under their region
fs_ratios_d

The top 10 universities for international student ratio aggregated by region are:

In [None]:
# Show the top-10 international student ratios grouped under their region
inter_ratios_d

## Question 3

#### Merge the two DataFrames created in questions 1 and 2 using university names. Match universities' names as well as you can, and explain your strategy. Keep track of the original position in both rankings.

Our strategy is to create an index on university names and perform approximate matching to homogenize the university names. We conduct the matching using [difflib.get_close_matches](https://docs.python.org/3/library/difflib.html#difflib.get_close_matches), which returns the best matches for a value from a list of candidates. In this case, each university name in Table 1 is matched against the list of university names in Table 2. Then an outer join is performed.

In [None]:
# Candidate names from the THE table, built once for the fuzzy matcher
the_names = t2['University'].tolist()

# Index the THE table by university name. Assigning a plain list keeps the
# index unnamed; an index level called 'University' next to a column of the
# same name makes pd.merge raise an ambiguity error.
t2.index = the_names

# Replace each QS name with its closest THE name so both tables share a key.
# When nothing is close enough, keep the QS name so the row stays identifiable
# in the outer merge instead of collapsing to None.
t1['University'] = t1['University'].map(
    lambda x: (difflib.get_close_matches(x, the_names, n=1) or [x])[0]
)

# Rename the rank columns so the original position in each ranking is kept
t1 = t1.rename(columns={'Rank':'Rank_QS'})
t2 = t2.rename(columns={'Rank':'Rank_THE'})

# Outer merge on the university name only. Country and Region exist in both
# tables and may be spelled differently per source, so they are kept side by
# side with _QS / _THE suffixes instead of being used as join keys.
merged_df = pd.merge(t1, t2, how='outer', on='University', suffixes=('_QS', '_THE'))
merged_df.head(n=10)

## Question 4
#### Find useful insights in the data by performing an exploratory analysis. Can you find a strong correlation between any pair of variables in the dataset you just created? Example: when a university is strong in its international dimension, can you observe a consistency both for students and faculty members?

In [None]:
# Answer

#### Explanation:


## Question 5
Can you find the best university taking in consideration both rankings? Explain your approach.

In [None]:
# Answer

#### Explanation of Approach