# HW2 - Data From The Web

## 1. Web Scraping 

We will first write all functions needed to extract the data from the websites and demonstrate how they work. Then we will complete the assignment using them.

In [1]:
import requests as rq
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import seaborn
import pickle
%matplotlib inline

# Base URLs of the two ranking websites; request paths are appended to them below.
TOP_UNI_URL = 'https://www.topuniversities.com'
TIMES_URL = 'https://www.timeshighereducation.com'

### 1.1 Top Universities Website

After a quick look at the website with Postman, we realised that the ranking table data is not part of the page itself: a script retrieves it from a JSON file and adds it to the page. We therefore had to fetch this file directly to obtain the information we need for this assignment.

In [2]:
TOP_WORLD_RANKING_FILE = TOP_UNI_URL + '/sites/default/files/qs-rankings-data/357051.txt'

# Download the JSON file that backs the ranking table.
# - timeout: requests waits indefinitely by default, which would freeze the notebook
#   on an unresponsive server.
# - raise_for_status(): report an HTTP error here, not as a JSON decoding error or a
#   TypeError in a later cell.
top_uni_response = rq.get(TOP_WORLD_RANKING_FILE, timeout=30)
top_uni_response.raise_for_status()
ranking_top_uni = top_uni_response.json().get('data')

The file returned by the request is a simple JSON object; the list of universities, ordered by rank, is stored under the `'data'` key.

In [3]:
# Keep the first 200 entries of the ranking and, for each of them, only the
# fields used later on (the 'Url' is the path of the university's detail page).
top_unis = [
    {
        'Name': entry.get('title'),
        'Rank': entry.get('rank_display'),
        'Country': entry.get('country'),
        'Region': entry.get('region'),
        'Url': entry.get('url'),
    }
    for entry in ranking_top_uni[:200]
]
print(top_unis[:5])

[{'Name': 'Massachusetts Institute of Technology (MIT)', 'Rank': '1', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/massachusetts-institute-technology-mit'}, {'Name': 'Stanford University', 'Rank': '2', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/stanford-university'}, {'Name': 'Harvard University', 'Rank': '3', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/harvard-university'}, {'Name': 'California Institute of Technology (Caltech)', 'Rank': '4', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/california-institute-technology-caltech'}, {'Name': 'University of Cambridge', 'Rank': '5', 'Country': 'United Kingdom', 'Region': 'Europe', 'Url': '/universities/university-cambridge'}]


Here are the functions to obtain the number of faculty members (total & international) and students (total & international).

In [5]:
def get_num_members(soup):
    """Extract the faculty head counts from a parsed university page.

    Args:
        soup: parsed page exposing ``find(tag, class_=...)`` (a BeautifulSoup
            object in this notebook).

    Returns:
        Tuple ``(total, international)`` of ints, read from the 'number' div
        inside the 'total faculty' and 'inter faculty' divs respectively.
        Surrounding whitespace and thousands separators (',') are removed
        before the conversion.

    Raises:
        AttributeError: if a lookup yields ``None`` (expected element missing).
        ValueError: if the extracted text is not an integer.
    """
    def _read_count(css_class):
        # Same lookup chain for both figures: outer div -> 'number' div -> text.
        container = soup.find('div', class_=css_class)
        raw_text = container.find('div', class_='number').text
        return int(raw_text.strip().replace(',', ''))

    total = _read_count('total faculty')
    international = _read_count('inter faculty')
    return total, international

In [6]:
# Demo on the EPFL page: download it, parse it and extract the faculty counts.
# timeout + raise_for_status() make a network/HTTP problem show up as such, not as
# a frozen cell or as an AttributeError raised while parsing an error page.
page = rq.get(TOP_UNI_URL + '/universities/ecole-polytechnique-fédérale-de-lausanne-epfl', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')
tot, inter = get_num_members(soup)
print('EPFL number of faculty members (total, international):', tot, inter)

EPFL number of faculty members (total, international:) 1695 1300


In [7]:
def get_num_students(soup):
    """Extract the student head counts from a parsed university page.

    Args:
        soup: parsed page exposing ``find(tag, class_=...)`` (a BeautifulSoup
            object in this notebook).

    Returns:
        Tuple ``(total, international)`` of ints, read from the 'number' div
        inside the 'total student' and 'total inter' divs respectively.
        Surrounding whitespace and thousands separators (',') are removed
        before the conversion.

    Raises:
        AttributeError: if a lookup yields ``None`` (expected element missing).
        ValueError: if the extracted text is not an integer.
    """
    counts = []
    for css_class in ('total student', 'total inter'):
        number_div = soup.find('div', class_=css_class).find('div', class_='number')
        cleaned = number_div.text.strip().replace(',', '')
        counts.append(int(cleaned))
    return counts[0], counts[1]

In [8]:
# Uses the `soup` currently in scope (the EPFL page when the cells are run in order).
tot, inter = get_num_students(soup)
# Label fixed: the original string never closed its parenthesis.
print('EPFL number of students (total, international):', tot, inter)

EPFL number of students (total, international: 10343 5896


In [10]:
# Visit the detail page of every university and attach the four head counts to its
# record. Universities whose page cannot be parsed are remembered by their position
# in top_unis (problematic_uni) so that they can be completed by hand afterwards.
problematic_uni = []
for position, uni in enumerate(top_unis):  # enumerate avoids an O(n) top_unis.index() per failure
    # timeout: a single unresponsive page must not stall the whole crawl.
    page = rq.get(TOP_UNI_URL + uni['Url'], timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    try:
        tot_mem, inter_mem = get_num_members(soup)
        tot_stud, inter_stud = get_num_students(soup)
    except AttributeError:
        # Raised by the getters when an expected element is not found on the page.
        problematic_uni.append(position)
        print('Could not fetch data from:', uni['Name'])
    else:
        # Only store the values once all four were extracted successfully.
        uni['Tot_Mem'] = tot_mem
        uni['Inter_Mem'] = inter_mem
        uni['Tot_Stud'] = tot_stud
        uni['Inter_Stud'] = inter_stud

Could not fetch data from: New York University (NYU)
Could not fetch data from: Indian Institute of Science (IISc) Bangalore


As we can see, the data of two universities could not be fetched from the topuniversities website, so we will add it by hand after looking it up on the website ourselves.

In [11]:
# Values looked up by hand for the two universities the scraper could not handle.
# The records are located by name, not by position in problematic_uni: if a re-run
# produces a different set or order of failures, a positional lookup would silently
# patch the wrong university (or raise IndexError). A missing name now fails loudly
# with a KeyError.
unis_by_name = {uni['Name']: uni for uni in top_unis}

nyu = unis_by_name['New York University (NYU)']
nyu.update({
    'Tot_Mem': 7717,
    'Inter_Mem': 604,
    'Tot_Stud': 43860,
    'Inter_Stud': 11593,
})

bang = unis_by_name['Indian Institute of Science (IISc) Bangalore']
bang.update({
    'Tot_Mem': 423,
    'Inter_Mem': 0,
    'Tot_Stud': 4071,
    'Inter_Stud': 47,
})

In [12]:
# Sanity check: the first records should now also carry the four head counts.
print(top_unis[:5])

[{'Name': 'Massachusetts Institute of Technology (MIT)', 'Rank': '1', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/massachusetts-institute-technology-mit', 'Tot_Mem': 2982, 'Inter_Mem': 1679, 'Tot_Stud': 11067, 'Inter_Stud': 3717}, {'Name': 'Stanford University', 'Rank': '2', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/stanford-university', 'Tot_Mem': 4285, 'Inter_Mem': 2042, 'Tot_Stud': 15878, 'Inter_Stud': 3611}, {'Name': 'Harvard University', 'Rank': '3', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/harvard-university', 'Tot_Mem': 4350, 'Inter_Mem': 1311, 'Tot_Stud': 22429, 'Inter_Stud': 5266}, {'Name': 'California Institute of Technology (Caltech)', 'Rank': '4', 'Country': 'United States', 'Region': 'North America', 'Url': '/universities/california-institute-technology-caltech', 'Tot_Mem': 953, 'Inter_Mem': 350, 'Tot_Stud': 2255, 'Inter_Stud': 647}, {'Name': 'University of Cambridge', '

Now we will save this data in a pickle file to be able to use it without sending the requests every time.

In [13]:
# Serialise the list of university records to 'topunis.pickle'
# (path relative to the current working directory).
with open('topunis.pickle', 'wb') as out:
    pickle.dump(top_unis, out)

In [86]:
# Build the topuniversities table: indexed by rank, without the page URL,
# and with the columns in a fixed order.
top_uni_df = (
    pd.DataFrame.from_dict(top_unis)
    .set_index('Rank')
    .drop('Url', axis=1)
    .loc[:, ['Name', 'Country', 'Region', 'Tot_Stud', 'Inter_Stud', 'Tot_Mem', 'Inter_Mem']]
)

In [88]:
# Preview the first rows of the topuniversities table.
top_uni_df.head()

Unnamed: 0_level_0,Name,Country,Region,Tot_Stud,Inter_Stud,Tot_Mem,Inter_Mem
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Massachusetts Institute of Technology (MIT),United States,North America,11067,3717,2982,1679
2,Stanford University,United States,North America,15878,3611,4285,2042
3,Harvard University,United States,North America,22429,5266,4350,1311
4,California Institute of Technology (Caltech),United States,North America,2255,647,953,350
5,University of Cambridge,United Kingdom,Europe,18770,6699,5490,2278


### 1.2 Times Higher Education Website

In [69]:
TOP_TIMES_RANKING_FILE = TIMES_URL + '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

# Download the JSON file that backs the ranking table.
# - timeout: requests waits indefinitely by default, which would freeze the notebook
#   on an unresponsive server.
# - raise_for_status(): report an HTTP error here, not as a JSON decoding error or a
#   TypeError in a later cell.
times_response = rq.get(TOP_TIMES_RANKING_FILE, timeout=30)
times_response.raise_for_status()
ranking_top_times = times_response.json().get('data')
print(ranking_top_times[:1])

[{'rank_order': '10', 'rank': '1', 'name': 'University of Oxford', 'scores_overall': '94.3', 'scores_overall_rank': '10', 'scores_teaching': '86.7', 'scores_teaching_rank': '5', 'scores_research': '99.5', 'scores_research_rank': '1', 'scores_citations': '99.1', 'scores_citations_rank': '15', 'scores_industry_income': '63.7', 'scores_industry_income_rank': '169', 'scores_international_outlook': '95.0', 'scores_international_outlook_rank': '24', 'record_type': 'master_account', 'member_level': '0', 'url': '/world-university-rankings/university-oxford', 'nid': 468, 'location': 'United Kingdom', 'stats_number_students': '20,409', 'stats_student_staff_ratio': '11.2', 'stats_pc_intl_students': '38%', 'stats_female_male_ratio': '46 : 54', 'aliases': 'University of Oxford', 'subjects_offered': 'Archaeology,Art, Performing Arts & Design,Biological Sciences,Business & Management,Chemical Engineering,Chemistry,Civil Engineering,Computer Science,Economics & Econometrics,Electrical & Electronic Eng

The file returned by the request is a simple JSON object; the list of universities, ordered by rank, is stored under the `'data'` key. This time the *Region* is not given by the website, not even on the university-specific pages. However, the file contains everything else, so there is no need to scrape the specific pages.

In [94]:
# Convert the first 200 entries of the Times ranking into records with the same keys
# as the topuniversities ones. The file stores counts as formatted strings
# (e.g. '20,409', '38%', '11.2'), so they are parsed before use.
top_times = []
for uni in ranking_top_times[:200]:
    n_students = int(uni.get('stats_number_students').replace(',', ''))
    pct_international = int(uni.get('stats_pc_intl_students').replace('%', ''))
    students_per_staff = float(uni.get('stats_student_staff_ratio'))

    record = {
        'Name': uni.get('name'),
        'Rank': uni.get('rank'),
        'Country': uni.get('location'),
        # Left empty at this stage.
        'Region': '',
        'Tot_Stud': n_students,
        # Derived: total students x percentage of international students.
        'Inter_Stud': round(n_students * pct_international / 100),
        # Derived: total students / student-to-staff ratio.
        'Tot_Mem': round(n_students / students_per_staff),
        # No value is computed for international staff here.
        'Inter_Mem': None,
    }
    top_times.append(record)
print(top_times[:5])

[{'Name': 'University of Oxford', 'Rank': '1', 'Country': 'United Kingdom', 'Region': '', 'Tot_Stud': 20409, 'Inter_Stud': 7755, 'Tot_Mem': 1822, 'Inter_Mem': None}, {'Name': 'University of Cambridge', 'Rank': '2', 'Country': 'United Kingdom', 'Region': '', 'Tot_Stud': 18389, 'Inter_Stud': 6436, 'Tot_Mem': 1687, 'Inter_Mem': None}, {'Name': 'California Institute of Technology', 'Rank': '=3', 'Country': 'United States', 'Region': '', 'Tot_Stud': 2209, 'Inter_Stud': 596, 'Tot_Mem': 340, 'Inter_Mem': None}, {'Name': 'Stanford University', 'Rank': '=3', 'Country': 'United States', 'Region': '', 'Tot_Stud': 15845, 'Inter_Stud': 3486, 'Tot_Mem': 2113, 'Inter_Mem': None}, {'Name': 'Massachusetts Institute of Technology', 'Rank': '5', 'Country': 'United States', 'Region': '', 'Tot_Stud': 11177, 'Inter_Stud': 3800, 'Tot_Mem': 1285, 'Inter_Mem': None}]


Now we will save this data in a pickle file to be able to use it without sending the requests every time.

In [95]:
# Serialise the list of Times records to 'toptimes.pickle'
# (path relative to the current working directory).
with open('toptimes.pickle', 'wb') as out:
    pickle.dump(top_times, out)

In [98]:
# Build the Times table: indexed by rank, with the same column order
# as the topuniversities table.
top_times_df = (
    pd.DataFrame.from_dict(top_times)
    .set_index('Rank')
    .loc[:, ['Name', 'Country', 'Region', 'Tot_Stud', 'Inter_Stud', 'Tot_Mem', 'Inter_Mem']]
)

In [100]:
# Preview the first rows of the Times table.
top_times_df.head()

Unnamed: 0_level_0,Name,Country,Region,Tot_Stud,Inter_Stud,Tot_Mem,Inter_Mem
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,University of Oxford,United Kingdom,,20409,7755,1822,
2,University of Cambridge,United Kingdom,,18389,6436,1687,
=3,California Institute of Technology,United States,,2209,596,340,
=3,Stanford University,United States,,15845,3486,2113,
5,Massachusetts Institute of Technology,United States,,11177,3800,1285,
