In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Lab 3 SI 618: Fetching and parsing structured documents (100 points)
#
# The utf8 'magic comment' is to tell Python that this source code will
# contain unicode literals outside of the ISO-Latin-1 character set.

# Some lines of code are taken from Google's Python Class
# http://code.google.com/edu/languages/google-python-class/  and
# an earlier lab by Dr. Yuhang Wang.

# The purpose of this lab is to have you practice using some powerful
# modules for fetching and parsing content:
#    urllib3 : for fetching the content of a URL (e.g. HTML page)
#    BeautifulSoup : for parsing HTML and XML pages
#    json : for JSON reading and writing
#
# You should fill in the code for the functions below.
# main() is already set up to call each function and compare its result with
# the expected value, printing 'Test Passed' when the function is correct.

In [2]:
from bs4 import BeautifulSoup
import json
import urllib3
import re
urllib3.disable_warnings()

In [3]:
# We will get the University of Michigan Wikipedia page using urllib3. This will be the HTML used in the lab.
# NOTE(review): cert_reqs='CERT_NONE' turns off TLS certificate verification, and the
# urllib3.disable_warnings() call in the imports cell hides the resulting warnings.
# Acceptable for a classroom exercise; do not copy this into code that handles real data.
http = urllib3.PoolManager(cert_reqs='CERT_NONE')

# The page is fetched live on every run, so the hard-coded expected values checked in
# main() presumably reflect the page as it was when the lab was written -- TODO confirm
# or switch to a saved snapshot if the tests start failing.
response = http.request("GET", 'https://en.wikipedia.org/wiki/University_of_Michigan')
# response.data is the raw body as bytes; decode it to text for BeautifulSoup.
html_doc = response.data.decode('utf-8')

In [4]:
# This is the JSON string used in this lab: a tweet posted by the CDC Twitter
# account regarding COVID vaccines on 9/10/2021.
# The file is read inside a `with` block so the handle is closed as soon as the
# text has been loaded, and it is decoded explicitly as UTF-8 (the encoding JSON
# documents are expected to use) instead of the platform-dependent default.
with open("tweet.json", "r", encoding="utf-8") as tweet_file:
    json_str = tweet_file.read()

In [5]:
# Q1. get_title (5 points)
# The get_title function should process the HTML page stored in the global
# variable html_doc, and return the title of the page as a unicode string.
# get_title() should return 'University of Michigan - Wikipedia'
def get_title():
    """Return the text of the <title> element of the page held in html_doc.

    Reads the module-level ``html_doc`` string.

    Returns:
        str: the page title as a plain string.

    Raises:
        AttributeError: if the document has no <head> or no <title> element.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the same notebook
    # can behave differently on another machine. "html.parser" ships with Python.
    soup = BeautifulSoup(html_doc, "html.parser")
    # get_text() yields a plain str rather than a NavigableString that keeps a
    # reference to the whole parse tree.
    return soup.head.title.get_text()
    
# get_title()

In [6]:
# Q2. get_tweet_image_attachment_url (10 points)
# The get_tweet_image_attachment_url function should load the dictionary stored as a JSON string
# in global variable json_str, and return the secure link for the image file attached in the tweet.
# get_tweet_image_attachment_url() should return https://pbs.twimg.com/media/E-8npfCWYAMzdlC.jpg
def get_tweet_image_attachment_url():
    """Return the HTTPS URL of the first media attachment of the tweet.

    Decodes the module-level ``json_str`` and follows
    entities -> media -> first item -> media_url_https.

    Raises:
        KeyError / IndexError: if the tweet has no such entry.
    """
    tweet = json.loads(json_str)
    media_items = tweet["entities"]["media"]
    first_attachment = media_items[0]
    return first_attachment["media_url_https"]
    
# get_tweet_image_attachment_url()

In [7]:
# Q3. get_wiki_links_count (15 points)
# The get_wiki_links_count function should process the HTML page stored in the global variable
# html_doc, and return the number of links that point back to a Wikipedia page.
# get_wiki_links_count() should return 1619
def get_wiki_links_count():
    """Count the <a> elements in html_doc whose href starts with "/wiki/".

    Reads the module-level ``html_doc`` string. Every matching anchor is
    counted, so the same target linked twice contributes two to the total.

    Returns:
        int: number of anchors with an href beginning with "/wiki/".
    """
    # Name the parser explicitly so the count does not depend on which optional
    # parsers are installed; "html.parser" is part of the standard library.
    soup = BeautifulSoup(html_doc, "html.parser")
    # Anchored at the start so only site-relative /wiki/ links match, not
    # external URLs that merely contain "/wiki/" somewhere in the path.
    wiki_href = re.compile(r"^/wiki/")
    return len(soup.find_all("a", href=wiki_href))

# get_wiki_links_count()

In [8]:
# Q4. get_latin_name (10 points)
# The get_latin_name function should process the HTML page stored in the global variable
# html_doc, and return the name of the university in Latin  (can be found below the University of Michigan Seal image on the table at the start of the page) 
# get_latin_name() should return 'Universitas Michigania'
def get_latin_name():
    """Return the university's Latin name from the page held in html_doc.

    Looks up the first <i> element whose ``lang`` attribute is exactly "la"
    and returns its first child.

    Returns:
        The first child of that element (for a text-only element, its text).

    Raises:
        IndexError: if no <i lang="la"> element exists, or it is empty.
    """
    # Name the parser explicitly so behaviour does not depend on which optional
    # parsers are installed; "html.parser" is part of the standard library.
    soup = BeautifulSoup(html_doc, "html.parser")
    # Match the language code exactly. A regex search for "la" would also
    # accept unrelated codes that merely contain those letters (e.g. "gla").
    latin_tag = soup.find("i", lang="la")
    if latin_tag is None:
        # Keep the exception type callers would have seen from indexing an
        # empty result list, but say what was actually missing.
        raise IndexError('no <i lang="la"> element found in html_doc')
    return latin_tag.contents[0]

# get_latin_name()

In [9]:
# Q5. get_first_level_sub_headings (25 points)
# The get_first_level_sub_headings function should process the HTML page stored in the global variable
# html_doc, and return the first level subheadings from the text of the webpage, WITHOUT USING THE CONTENTS TABLE. 
# First level headings include 'History', 'Campus' and the like and NOT their sub level headings such as 'Historical Links', 'Central Campus', 'North Campus' etc
# Note that it should return a string, not a list. 
# get_first_level_sub_headings() should return '["History", "Campus", "Organization and administration", "Academics", "Student body", "Student life", "Athletics", "Museums", "Notable alumni", "Notes", "References", "External links"]'
def get_first_level_sub_headings():
    """Return the page's first-level section headings as a JSON array string.

    The headings are read from the <h2> elements in the page itself, not from
    the contents table, as the assignment requires.

    NOTE(review): this assumes each section title is rendered as
    ``<h2><span class="mw-headline">Title</span>...</h2>`` and that other
    <h2> elements (such as the "Contents" box) carry no such span -- confirm
    against the markup actually returned for html_doc.

    Returns:
        str: a JSON list of heading titles in document order,
        e.g. '["History", "Campus"]'.
    """
    # Name the parser explicitly so behaviour does not depend on which optional
    # parsers are installed; "html.parser" is part of the standard library.
    soup = BeautifulSoup(html_doc, "html.parser")
    headings = []
    for h2 in soup.find_all("h2"):
        headline = h2.find("span", class_="mw-headline")
        # Skip <h2> elements that are not article section titles.
        if headline is not None:
            headings.append(headline.get_text())
    # json.dumps quotes and escapes every title correctly. Rewriting the list's
    # repr by swapping quote characters breaks on any title with an apostrophe.
    # ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX.
    return json.dumps(headings, ensure_ascii=False)

# get_first_level_sub_headings()

In [10]:
# Q6. get_school_years_info (35 points)
# The get_school_years_info function should process the HTML page stored in the global variable
# html_doc, and return information from the info table under the 'Organization and Administration' section. 
# Note that it should return a string, not a list.
# get_school_years_info() should return '{"Literature, Science, andthe Arts": "1841", "Medicine": "1850", "Engineering": "1854", "Law": "1859", "Dentistry": "1875", "Pharmacy": "1876", "Music, Theatre &Dance": "1880", "Nursing": "1893", "Architecture &Urban Planning": "1906", "Graduate Studies": "1912", "Government": "1914", "Education": "1921", "Business": "1924", "Environment andSustainability": "1927", "Public Health": "1941", "Social Work": "1951", "Information": "1969", "Art & Design": "1974", "Kinesiology": "1984"}'
# HINT: construct lists/list of tuples first, and then convert it to a dictionary to turn into a JSON string.
def get_school_years_info():
    """Return the school -> founding-year table from html_doc as a JSON object string.

    Takes the first <table> whose class matches "toccolours", collects the text
    of its <td> cells with newlines removed, skips the first two cells, and
    pairs the remaining cells up as (name, year).

    Returns:
        str: JSON object mapping each school name to its year, in table order.

    Raises:
        IndexError: if html_doc contains no matching table.
    """
    # Name the parser explicitly so behaviour does not depend on which optional
    # parsers are installed; "html.parser" is part of the standard library.
    soup = BeautifulSoup(html_doc, "html.parser")
    table = soup.find("table", class_=re.compile("toccolours"))
    if table is None:
        raise IndexError("no table with a 'toccolours' class found in html_doc")

    cell_texts = [cell.get_text().replace("\n", "") for cell in table.find_all("td")]
    # The first two cells are not data -- presumably the table's title/header
    # cells; verify against the page if the table layout changes.
    data_cells = cell_texts[2:]

    # Even positions are school names, odd positions the matching years.
    # zip() stops at the shorter slice, so a trailing unpaired cell is ignored
    # rather than raising while indexing past the end of the list.
    info = dict(zip(data_cells[0::2], data_cells[1::2]))
    return json.dumps(info)

# get_school_years_info() 

In [11]:
#######################################################################
# DO NOT MODIFY ANY CODE BELOW
#######################################################################

# Provided simple test() function used in main() to print
# what each function returns vs. what it's supposed to return.
def test(got, expected):
  """Print a one-line verdict comparing `got` with `expected`.

  The line starts with ' Test Passed ' when the two values compare equal and
  '  Test Not Passed ' otherwise, followed by the repr of both values. A blank
  line is printed afterwards. Nothing is returned and nothing is raised on a
  mismatch.
  """
  if got == expected:
    prefix = ' Test Passed '
  else:
    prefix = '  Test Not Passed '
  # repr() is used so strings appear quoted and are distinguishable from numbers.
  print ('%s got: %s expected: %s' % (prefix, repr(got), repr(expected)))
  print()

# Provided main() calls the above functions with interesting inputs,
# using test() to check if each result is correct or not.

In [12]:
def main():
  """Run every lab function once and print its result against the expected value.

  For each question a label is printed, then test() reports whether the
  function's return value equals the hard-coded expected value.
  """
  print ('get_title')

  test(get_title(), 'University of Michigan - Wikipedia')
  
  print ('get_tweet_image_attachment_url')

  test(get_tweet_image_attachment_url(), "https://pbs.twimg.com/media/E-8npfCWYAMzdlC.jpg")

  print ('get_wiki_links_count')

  test(get_wiki_links_count(), 1619)

  print ('get_latin_name')

  test(get_latin_name(), 'Universitas Michigania')
  
  print ('get_first_level_sub_headings')

  test(get_first_level_sub_headings(), '["History", "Campus", "Organization and administration", "Academics", "Student body", "Student life", "Athletics", "Museums", "Notable alumni", "Notes", "References", "External links"]')

  # NOTE(review): the label printed here is 'get_school_info', but the function
  # being checked is get_school_years_info.
  print ('get_school_info')

  test(get_school_years_info(), '{"Literature, Science, andthe Arts": "1841", "Medicine": "1850", "Engineering": "1854", "Law": "1859", "Dentistry": "1875", "Pharmacy": "1876", "Music, Theatre &Dance": "1880", "Nursing": "1893", "Architecture &Urban Planning": "1906", "Graduate Studies": "1912", "Government": "1914", "Education": "1921", "Business": "1924", "Environment andSustainability": "1927", "Public Health": "1941", "Social Work": "1951", "Information": "1969", "Art & Design": "1974", "Kinesiology": "1984"}')

In [13]:
# Standard boilerplate to call the main() function.
# main() runs only when __name__ is '__main__'; the output recorded below this
# cell shows that this is the case when the cell is executed in the notebook.
if __name__ == '__main__':
  main()

get_title
 Test Passed  got: 'University of Michigan - Wikipedia' expected: 'University of Michigan - Wikipedia'

get_tweet_image_attachment_url
 Test Passed  got: 'https://pbs.twimg.com/media/E-8npfCWYAMzdlC.jpg' expected: 'https://pbs.twimg.com/media/E-8npfCWYAMzdlC.jpg'

get_wiki_links_count
 Test Passed  got: 1619 expected: 1619

get_latin_name
 Test Passed  got: 'Universitas Michigania' expected: 'Universitas Michigania'

get_first_level_sub_headings
 Test Passed  got: '["History", "Campus", "Organization and administration", "Academics", "Student body", "Student life", "Athletics", "Museums", "Notable alumni", "Notes", "References", "External links"]' expected: '["History", "Campus", "Organization and administration", "Academics", "Student body", "Student life", "Athletics", "Museums", "Notable alumni", "Notes", "References", "External links"]'

get_school_info
 Test Passed  got: '{"Literature, Science, andthe Arts": "1841", "Medicine": "1850", "Engineering": "1854", "Law": "1859