In [6]:
import urllib.request as urllib
from bs4 import BeautifulSoup
import re
import json
import os

## Function definitions

In [11]:
# Read in a webpage
def read_in_page(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait on a blocking network operation before giving up.
        Without a timeout, ``urlopen`` can wait indefinitely on a stalled
        connection, which makes a long scraping loop appear to hang.

    Returns
    -------
    BeautifulSoup
        The page parsed with the ``lxml`` parser.

    Raises
    ------
    urllib.error.URLError, OSError
        Network failures and timeouts are propagated to the caller.
    """
    # The context manager closes the HTTP response even if parsing fails,
    # so connections are not leaked when many pages are scraped in a row.
    with urllib.urlopen(url, timeout=timeout) as page:
        soup = BeautifulSoup(page, "lxml")
    return soup


# Get all relevant links in a webpage
def get_links(page):
    """Return the href of every ``icon-arrow`` anchor on ``page``.

    Everything from the first ``?`` onwards (the query string) is dropped
    from each href. The hrefs are returned in document order.
    """
    anchors = page.find_all("a", class_="icon-arrow")
    return [anchor['href'].split('?', 1)[0] for anchor in anchors]


# Returns a dictionary {study-name: az-url}
def get_links_studies(page):
    """Map each study name on an overview page to the URL of its A-Z page.

    Parameters
    ----------
    page : BeautifulSoup
        Parsed overview page; every ``<a class="icon-arrow">`` element is
        treated as a link to a study.

    Returns
    -------
    dict
        ``{link.string: az_url}`` where ``az_url`` is the study URL without
        its query string, rewritten to start with ``https://student`` and
        ending in ``/az``. Links whose href does not contain ``student``
        are skipped.
    """
    links_new = {}
    for link in page.find_all("a", class_="icon-arrow"):
        # Drop the query string.
        href = link['href'].split('?', 1)[0]

        # Cut only at the FIRST "student": a later occurrence in the path
        # (e.g. ".../students/...") must stay part of the URL.
        _, marker, tail = href.partition('student')
        if not marker:
            # Not a student-site link, so there is no A-Z page to derive.
            continue

        az_url = 'https://student' + tail
        # Make sure exactly one slash separates the study URL from "az".
        if not az_url.endswith('/'):
            az_url += '/'
        links_new[link.string] = az_url + 'az'
    return links_new


def get_az_links(az_url):
    """Return absolute URLs for every article linked from an A-Z page.

    The page at ``az_url`` is downloaded, its ``icon-arrow`` hrefs are
    collected with ``get_links``, and each one is prefixed with the
    student-site host.
    """
    host = "http://www.student.uva.nl"
    relative_links = get_links(read_in_page(az_url))
    return [host + href for href in relative_links]


# Returns a list of keywords when given a question URL.
def get_keywords(question_url):
    """Return the keywords declared in a page's ``<meta name="keywords">`` tag.

    The tag's ``content`` attribute is split on commas and returned as a
    list. An empty list is returned when the page has no such tag.
    """
    soup = read_in_page(question_url)
    meta_tags = soup.findAll("meta", {"name": 'keywords'})
    if not meta_tags:
        return []
    # Only the first keywords tag is used.
    return meta_tags[0]['content'].split(",")

# Takes URL returns text from that page
def article_text(url, timeout=30):
    """Download a page and return the plain text of its article body.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait on a blocking network operation before giving up,
        so that a stalled connection cannot block the scraper forever.

    Returns
    -------
    str
        Concatenated text of every ``<article class="eight columns">``
        element, with ``h1``/``h2``/``h3`` headings and ``<p class="meta">``
        paragraphs removed and all whitespace runs collapsed to one space.
        Empty string when the page has no matching article element.

    Raises
    ------
    urllib.error.URLError, OSError
        Network failures and timeouts are propagated to the caller.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.urlopen(url, timeout=timeout) as page:
        soup = BeautifulSoup(page, "lxml")

    # Remove headings and meta paragraphs from the tree so that their text
    # does not end up in the article body.
    for tag in soup.find_all(['h1', 'h2', 'h3']):
        tag.extract()
    for tag in soup.find_all('p', class_="meta"):
        tag.extract()

    info = "".join(
        article.text
        for article in soup.find_all("article", class_="eight columns")
    )
    # str.split() without arguments splits on any whitespace run (including
    # \n, \r and \t), so this alone normalises the text to single spaces.
    return ' '.join(info.split())

# Makes a json file for each article in a A-Z page 
def make_az_jsons_english(link, n, study_name):
    """Scrape one article and store it as ``<n>-study-fnwi.json``.

    The JSON object holds the study name(s) under ``level``, the article
    URL under ``url``, the page keywords under ``keywords`` and the article
    body under ``text``. The file is written to the current working
    directory and overwrites any existing file with the same name.
    """
    keywords = get_keywords(link)
    text = article_text(link)

    record = {
        'level': study_name,
        'url': link,
        'keywords': keywords,
        'text': text,
    }

    filename = str(n) + "-study-fnwi.json"
    with open(filename, 'w') as outfile:
        json.dump(record, outfile)

### Scraping the UvA website and creating JSON files for each article

Since Watson had a limit on the number of documents that could be uploaded, we had to be selective about which pages we would train on. We chose the general English A-Z page and the A-Z pages of some FNWI master's programmes, since these were in English, whereas all the A-Z pages of the bachelor's programmes were in Dutch.

In the cell below we take the URL of the study overview page and see which studies it contains.

In [9]:
# Maps each A-Z URL to the list of studies that share it.
unique_az_links = {}

# Read the overview page and look up the A-Z URL of every study on it.
page = read_in_page('http://student.uva.nl/opleidingen/opleidingenlijst.html?t=fnwi&t=master')
studies_dict = get_links_studies(page)

# Several studies can point at the same A-Z page, so group them by URL.
for study, az_link in studies_dict.items():
    unique_az_links.setdefault(az_link, []).append(study)

unique_az_links

{'https://student.uva.nl/ai/az': ["Artificial Intelligence (Master's)"],
 'https://student.uva.nl/bcs/az': ["Brain and Cognitive Sciences (Master's)"],
 'https://student.uva.nl/bmed/az': ["Biomedical Sciences (Master's)"],
 'https://student.uva.nl/bs/az': ["Biological Sciences (Master's)"],
 'https://student.uva.nl/chem/az': ["Chemistry (Master's)"],
 'https://student.uva.nl/cls/az': ["Computational Science (Master's)"],
 'https://student.uva.nl/es/az': ["Earth Sciences (Master's)"],
 'https://student.uva.nl/fs/az': ["Forensic Science (Master's)"],
 'https://student.uva.nl/is/az': ["Information Studies (Master's)"],
 'https://student.uva.nl/log/az': ["Logic (Master's)"],
 'https://student.uva.nl/ls/az': ["Life Sciences (Master's)"],
 'https://student.uva.nl/math/az': ["Mathematics (Master's)"],
 'https://student.uva.nl/mph/az': ["Mathematical Physics (Master's)"],
 'https://student.uva.nl/phys-astro/az': ["Astronomy and Astrophysics (Master's)",
  "Physics (Master's)",
  "Physics and A

There were some problems when running this automatically: it would create a number of JSON files and then simply stop making new ones, even though everything was still running. So a solution was to do it separately for each A-Z link.

In [13]:
# Collect the links to all articles on the A-Z page of one study.
url = 'https://student.uva.nl/ai/az'
az_links_per_study = get_az_links(url)

# Write one JSON file per article; i is the running file number.
# make_az_jsons_english downloads the article itself, so the page is not
# fetched separately here. That avoids an extra request per article and
# leaves the notebook-level `page` variable untouched.
i = 0
for link in az_links_per_study:
    make_az_jsons_english(link, i, ["Artificial Intelligence (Master's)"])
    i += 1