# HW2 - Data From The Web

## 1. Web Scraping 

We will first write all functions needed to extract the data from the websites and demonstrate how they work. Then we will complete the assignment using them.

In [None]:
import requests as rq
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import pickle
%matplotlib inline

# Base URLs of the two ranking websites; the request paths used below are appended to them.
TOP_UNI_URL = 'https://www.topuniversities.com'
TIMES_URL = 'https://www.timeshighereducation.com'

### 1.1 Top Universities Website

After a quick look at the website with Postman, we realised that the ranking table data is not part of the page itself: it is retrieved by a script from a JSON file and then added to the page. We therefore request this file directly to obtain the information we need for this assignment.

In [None]:
# Location of the JSON file that holds the ranking table data.
TOP_WORLD_RANKING_FILE = TOP_UNI_URL + '/sites/default/files/qs-rankings-data/357051.txt'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON decoding failure on the next line.
ranking_response = rq.get(TOP_WORLD_RANKING_FILE, timeout=30)
ranking_response.raise_for_status()
# The list of universities is stored under the 'data' key.
ranking_top_uni = ranking_response.json().get('data')

The file returned by the request is a simple JSON object; the list of universities, ordered by rank, is stored under the 'data' key.

In [None]:
# Reduce each of the first 200 ranking entries to the fields used in this notebook.
# The '=' found in 'rank_display' is removed (presumably it marks tied ranks).
top_unis = [
    {
        'Name': entry.get('title'),
        'Rank': entry.get('rank_display').replace('=', ''),
        'Country': entry.get('country'),
        'Region': entry.get('region'),
        'Url': entry.get('url'),
    }
    for entry in ranking_top_uni[:200]
]
print(top_unis[:5])

Here are the functions that extract the number of faculty members (total and international) and the number of students (total and international) from a university page.

In [None]:
def get_num_members(soup):
    """Return the faculty head counts found in a parsed university page.

    `soup` is searched for the 'total faculty' and 'inter faculty' <div>
    elements. Inside each one, the text of the nested 'number' <div> is
    stripped, its commas are removed and the result is converted to int.

    Returns a tuple ``(total, international)``.
    Raises AttributeError when a lookup yields None instead of an element.
    """
    counts = []
    for css_class in ('total faculty', 'inter faculty'):
        number_div = soup.find('div', class_=css_class).find('div', class_='number')
        counts.append(int(number_div.text.strip().replace(',', '')))
    return tuple(counts)

In [None]:
# Demonstrate get_num_members on the EPFL page.
# The timeout avoids blocking the notebook indefinitely if the site stalls.
page = rq.get(TOP_UNI_URL + '/universities/ecole-polytechnique-fédérale-de-lausanne-epfl', timeout=30)
soup = BeautifulSoup(page.text, 'html.parser')
tot, inter = get_num_members(soup)
# Label fixed so the parenthesis closes before the colon.
print('EPFL number of faculty members (total, international):', tot, inter)

In [None]:
def get_num_students(soup):
    """Return the student head counts found in a parsed university page.

    `soup` is searched for the 'total student' and 'total inter' <div>
    elements. Inside each one, the text of the nested 'number' <div> is
    stripped, its commas are removed and the result is converted to int.

    Returns a tuple ``(total, international)``.
    Raises AttributeError when a lookup yields None instead of an element.
    """
    counts = []
    for css_class in ('total student', 'total inter'):
        number_div = soup.find('div', class_=css_class).find('div', class_='number')
        counts.append(int(number_div.text.strip().replace(',', '')))
    return tuple(counts)

In [None]:
# Demonstrate get_num_students on the `soup` currently in scope.
tot, inter = get_num_students(soup)
# Label fixed: the closing parenthesis was missing.
print('EPFL number of students (total, international):', tot, inter)

In [None]:
# Visit every university's detail page and add the faculty/student counts to its record.
# Positions (in top_unis) of the universities whose page lacks the figures are collected
# in problematic_uni.
problematic_uni = []
for position, uni in enumerate(top_unis):
    # The timeout avoids blocking the whole loop on one stalled request.
    page = rq.get(TOP_UNI_URL + uni['Url'], timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    try:
        tot_mem, inter_mem = get_num_members(soup)
        tot_stud, inter_stud = get_num_students(soup)
    except AttributeError:
        # enumerate() yields the position directly; list.index() would rescan the list
        # on every failure and return the first *equal* record rather than this one.
        problematic_uni.append(position)
        print('Could not fetch data from:', uni['Name'])
    else:
        # Only store the figures once all four were extracted successfully.
        uni['Tot_Mem'] = tot_mem
        uni['Inter_Mem'] = inter_mem
        uni['Tot_Stud'] = tot_stud
        uni['Inter_Stud'] = inter_stud

As we can see, the data could not be fetched for two universities, so we add their numbers by hand after looking them up on the website ourselves.

In [None]:
# Figures entered by hand for the two records listed in problematic_uni.
# NOTE(review): this relies on problematic_uni holding exactly these two universities
# in this order (the variable names suggest NYU first, then Bangalore) — confirm
# against the "Could not fetch data from:" output before running.
nyu = top_unis[problematic_uni[0]]
nyu.update(Tot_Mem=7717, Inter_Mem=604, Tot_Stud=43860, Inter_Stud=11593)

bang = top_unis[problematic_uni[1]]
bang.update(Tot_Mem=423, Inter_Mem=0, Tot_Stud=4071, Inter_Stud=47)

In [None]:
# Show the first five records.
print(top_unis[:5])

We now save this data in a pickle file so that we can reuse it without sending the requests again every time.

In [None]:
# Write the list of records to 'topunis.pickle' (binary mode, as pickle requires).
with open('topunis.pickle', 'wb') as pickle_file:
    pickle.dump(top_unis, pickle_file)

In [None]:
# Tabulate the records: index the rows by rank and discard the page URL ...
top_uni_df = (
    pd.DataFrame.from_dict(top_unis)
    .set_index('Rank')
    .drop('Url', axis=1)
)
# ... then put the columns in a fixed order.
top_uni_df = top_uni_df[
    ['Name', 'Country', 'Region', 'Tot_Stud', 'Inter_Stud', 'Tot_Mem', 'Inter_Mem']
]

In [None]:
# Display the Top Universities table.
top_uni_df

### 1.2 Times Higher Education Website

In [None]:
# Location of the JSON file that holds the Times Higher Education ranking data.
TOP_TIMES_RANKING_FILE = TIMES_URL + '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON decoding failure on the next line.
times_response = rq.get(TOP_TIMES_RANKING_FILE, timeout=30)
times_response.raise_for_status()
ranking_top_times = times_response.json().get('data')
# Print every field of one entry (the third) to see what the file provides.
for key, value in ranking_top_times[2].items():
    print("{}: {}".format(key, value))


The file returned by the request is again a simple JSON object, with the list of universities, ordered by rank, stored under the 'data' key. This time the *Region* is not given by the website, not even on the university-specific pages. However, the file contains everything else we need, so there is no need to scrape the individual pages.

In [None]:
# Build one record per university from the first 200 entries of the Times ranking.
top_times = []
for uni in ranking_top_times[:200]:
    # The figures are stored as formatted strings ("1,234", "25%"); parse each one once.
    students = int(uni.get('stats_number_students').replace(',', ''))
    intl_percent = int(uni.get('stats_pc_intl_students').replace('%', ''))
    students_per_staff = float(uni.get('stats_student_staff_ratio'))
    record = {
        'Name': uni.get('name'),
        'Rank': uni.get('rank').replace('=', ''),
        'Country': uni.get('location'),
        # The file provides no region, so it is left empty.
        'Region': '',
        'Tot_Stud': students,
        # Derived from the percentage of international students.
        'Inter_Stud': round(students * intl_percent / 100),
        # Derived from the student/staff ratio.
        'Tot_Mem': round(students / students_per_staff),
        # No figure for international staff is read from the file.
        'Inter_Mem': None,
    }
    top_times.append(record)
# NOTE(review): this prints the raw first entry, not the record built above — confirm intended.
print(ranking_top_times[:1])

In [None]:
# Print every field of the first 200 raw entries, each entry preceded by a line of stars.
for uni in ranking_top_times[:200]:
    print("***************************************")
    for key, value in uni.items():
        print("{}: {}".format(key, value))

We now save this data in a pickle file so that we can reuse it without sending the requests again every time.

In [None]:
# Write the list of records to 'toptimes.pickle' (binary mode, as pickle requires).
with open('toptimes.pickle', 'wb') as pickle_file:
    pickle.dump(top_times, pickle_file)

In [None]:
# Tabulate the records and index the rows by rank ...
top_times_df = (
    pd.DataFrame.from_dict(top_times)
    .set_index('Rank')
)
# ... then put the columns in the same order as the Top Universities table.
top_times_df = top_times_df[
    ['Name', 'Country', 'Region', 'Tot_Stud', 'Inter_Stud', 'Tot_Mem', 'Inter_Mem']
]

In [None]:
# Display the Times Higher Education table.
top_times_df

This function prepares the names for the merge. We noticed that one of the rankings gives an abbreviation of the university name after the name itself, for example "Ecole Polytechnique Fédérale de Lausanne (EPFL)", whereas the other ranking does not include the abbreviation. We therefore remove all abbreviations of that type. We also remove some punctuation and superfluous blank spaces.

In [None]:
import re

def cleanName(name): 
    """Normalise a university name so that both rankings can be merged on it.

    Steps, in order:
      * lower-case the name;
      * strip accents, so 'é' becomes 'e';
      * remove every parenthesised part, such as "(EPFL)";
      * remove the article "the" (whole word only);
      * turn hyphens and dashes into spaces;
      * collapse runs of whitespace and trim both ends.

    Parameters:
        name: the university name as a string.

    Returns:
        The cleaned name as a string.
    """
    import unicodedata  # local import keeps this cell self-contained

    name = name.lower()
    # Decompose accented letters and drop the combining marks. Deleting 'é'
    # outright would turn "école" into "cole", which can never match a source
    # that spells it "ecole".
    name = ''.join(ch for ch in unicodedata.normalize('NFKD', name)
                   if not unicodedata.combining(ch))
    # Match each "(...)" group on its own; a greedy pattern would also swallow
    # any text standing between two parenthesised groups.
    name = re.sub(r"\([^)]*\)", ' ', name)
    # Word boundaries keep words that merely contain "the" (e.g. "northeastern") intact.
    name = re.sub(r"\bthe\b", ' ', name)
    for dash in ('-', '—', '–'):
        name = name.replace(dash, ' ')
    name = re.sub(r"\s+", ' ', name)
    return name.strip(' ')

Here we transform the names of the universities with `cleanName` and merge the two frames. We perform an outer join in order to keep the universities that have no match in the other ranking.

In [None]:
# Normalise the names in both tables so that the same university gets the same merge key.
top_uni_df['Name'] = top_uni_df['Name'].map(cleanName)
top_times_df['Name'] = top_times_df['Name'].map(cleanName)

# The outer join keeps the universities that appear in only one of the two rankings.
# NOTE(review): both frames are indexed by 'Rank', and a column-on-column merge ignores
# the indexes, so the ranks do not appear in merged_df — confirm this is intended.
merged_df = top_times_df.merge(top_uni_df, how='outer', on=['Name'])

In [None]:
# Display the first 100 rows of the merged table.
merged_df[:100]