# Get the data from the web

The example below extracts data from the web. It uses BeautifulSoup to extract specific parts of an HTML document. In this case it searches the page https://en.wikipedia.org/wiki/Tilburg_Trappers for elements with the HTML class `wikitable`:

    tables = soup.findAll(attrs={'class': re.compile(r".*\bwikitable\b.*")})

The first matching table is selected and put into a pandas dataframe.
    

    

In [2]:
#### !/usr/bin/env python3

__author__ = "Fenna Feenstra"

import urllib.request, urllib.parse, urllib.error
import ssl
from bs4 import BeautifulSoup
import re
import pandas as pd


def hack_ssl():
    """Return an SSL context that ignores certificate errors.

    The context starts from the library defaults and then has both
    hostname checking and certificate verification switched off.
    """
    context = ssl.create_default_context()
    # Hostname checking must be disabled before verification is turned
    # off; the ssl module rejects CERT_NONE while check_hostname is set.
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context


def open_url(url):
    """Download *url* and return the raw response body.

    input: url (str), output: the response content as bytes.

    NOTE: the request is made with the context from hack_ssl(), which
    disables certificate and hostname verification. That is acceptable
    for this classroom example but should not be used for sensitive data.
    """
    ctx = hack_ssl()
    # Use the response as a context manager so the underlying connection
    # is closed as soon as the body has been read, even if read() fails.
    with urllib.request.urlopen(url, context=ctx) as response:
        return response.read()
    

def fetch_tables(html):
    """Parse the html and return the first element with class 'wikitable'.

    input: html (the page content as str or bytes)
    output: the first matching element of the parsed document

    Raises IndexError when the page contains no element whose class
    matches 'wikitable'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # find_all is the current spelling of the deprecated findAll alias and
    # matches the calls used elsewhere in this file.
    tables = soup.find_all(attrs={'class': re.compile(r".*\bwikitable\b.*")})
    if not tables:
        # Same exception type as indexing an empty result, but with a
        # message that tells the reader what was actually missing.
        raise IndexError("no element with class 'wikitable' found in html")
    return tables[0]


def table_df(table):
    """Parse an html table into a pandas dataframe.

    input: table, an element offering find_all() (e.g. the result of
           fetch_tables)
    output: a dataframe whose column names come from the first table row
            and whose records come from the remaining rows

    Both <td> and <th> cells are read, so tables that mark their header
    row (or row labels) with <th> are handled as well. Rows that are
    shorter than the widest row are padded with empty strings; a table
    without any rows yields an empty dataframe.
    """
    # Fetch the content once: one list of stripped cell texts per <tr>.
    rows = [
        [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
        for row in table.find_all('tr')
    ]
    if not rows:
        return pd.DataFrame()

    # Fetch the dimensions: pad every row to the widest one so that ragged
    # rows neither raise nor silently lose cells.
    width = max(len(cells) for cells in rows)
    matrix = [cells + [''] * (width - len(cells)) for cells in rows]

    # Put in df making the first row the header.
    header, *records = matrix
    return pd.DataFrame(records, columns=header)


def main():
    """Fetch the Tilburg Trappers page, print its first wikitable, return 0."""
    page = open_url('https://en.wikipedia.org/wiki/Tilburg_Trappers')
    first_table = fetch_tables(page)
    print(table_df(first_table))
    return 0

    
# Run the example: download the page and print the parsed table.
main()
    

     Season  GP   W OTW OTL   L  Pts   GF   GA              Finish  \
0   2018/19  65  44   6   5  10  150  323  174  1st, Oberliga Nord   
1   2017/18  59  43   8   3   5  110  265  129  1st, Oberliga Nord   
2   2016/17  60  42   1   3  14  131  263  145  4th, Oberliga Nord   
3   2015/16  57  40   3   1  11  130  284  119  2nd, Oberliga Nord   
4   2014/15  24  17   2   0   5   38  126   53     2nd, Eredivisie   
5   2013/14  36  30   2   4   4   64  216   64     1st, Eredivisie   
6   2012/13  36  25   2   3   6   82  198   93     2nd, Eredivisie   
7   2011/12  14  11   0   0   3   80   36   33  3rd, North Sea Cup   
8   2010/11  28  19   0   1   8  120   70   58  2nd, North Sea Cup   
9   2009/10  28  17   1   2   8  135   86   55     3rd, Eredivisie   
10  2008/09  24  18   1   0   5  142   78   56     1st, Eredivisie   
11  2007/08  24  20   1   0   3  132   58   62     1st, Eredivisie   
12  2006/07  20  12   1   2   5   88   55   40     2nd, Eredivisie   
13  2005/06  20   7 

0

## Assignment week 01

Check the site https://www.knmi.nl/nederland-nu/seismologie/aardbevingen

On this site you will find a table of seismic events with the following fields:

    Analysis
    Date and time (UTC)
    Place
    Magnitude
    Depth (km)
    Type of earthquake
    Details

Your job is to fetch the table with the last 15 seismic events. Select only the `'Date and Time (UTC)', 'Place', 'Magnitude', 'Depth (km)', 'Type earthquake'` columns and put them in a pandas dataframe. See the example below.

The data is not clean, since it might contain events from Belgium or Germany. Remove those rows. Furthermore, we would like some more insight into the statistics of the data. Calculate the minimum, maximum, mean and standard deviation of the Magnitude using NumPy. Plot the magnitudes using a visualisation of your choice.