# Note:

If running this file, a local mysql database is required that gives **root access**, has a **password set to MyNewPass** and contains a **database called Weather_Data**. Furthermore a table in this database called **weather is required.**

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
from zipfile import ZipFile
import io
import os.path
import mysql.connector

In [2]:
def create_connection():
    cnx2 = mysql.connector.connect(host='localhost',
                                   user='root', password='MyNewPass',
                                   database='Weather_Data')
    return cnx2

In [3]:
def insert_data(data):
    conn = create_connection()
    cursor = conn.cursor()
    
    for row in data.iterrows():
        row = row[1]
        row[0] = dt.datetime.strptime(row[0], '%d-%b-%Y %H:%M')
        query = "Insert into weather (Observation_date, rain, temp, wet_bulb_temp,dew_point_temp, vapour_pressure, relative_humidity, mean_sea_level_pressure, mean_wind_speed, predominant_wind_direction, height, latitude, longitude,station, county) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
        cursor.execute(query, [items for items in row])
        conn.commit()
    
    print('Finished Insert')
        
    cursor.close()
    conn.close()
        
    

In [4]:
def get_html(url):
    html_content = urlopen(url)
    content = BeautifulSoup(html_content, "lxml")
    html_content.close()
    return content

In [5]:
def data_type(raw_html):
    #look for all forms with name attribute = bar
    forms = raw_html.find_all('form',{'name':'bar'})
    #get all the inupt tags in the form
    inputs = forms[0].find_all('input',{'name':'stntype'})
    #manually set the text as its not wrapped in any tags in the html
    keys = ['Hourly', 'Daily', 'Monthly']
    radio_button_params = {}
    
    for index, input_tag in enumerate(inputs):
        radio_button_params[keys[index]] = input_tag.attrs['value']
    
    return radio_button_params

In [6]:
def get_counties(raw_html):
    forms = raw_html.find_all('form',{'name':'bar'})
    #get all the option tags in the form
    selects = forms[0].find_all('option')
    counties ={}
    for item in selects:
        if item.attrs['value'] != '0':
            counties[item.text] = item.attrs['value']
    
    return counties

In [7]:
def get_stations(raw_html):
    forms = raw_html.find_all('form',{'name':'bar'})
    #get all the select tags in the form
    selects = forms[0].find_all('select',{'name':'stn'})
    options = selects[0].find_all('option')
    stations = {}
    
    for opt in options:
        if opt.text != 'Name':
            stations[opt.text] = opt.attrs['value']
            
    return stations
    

In [8]:
def remove_null(weather_data):   
    weather_data = weather_data.dropna(thresh=12)
    weather_data.fillna(0, inplace=True)
    
    return weather_data

In [9]:
def read_files(data, file_name, key, station_k):
    with ZipFile(io.BytesIO(data.content)) as z:
        for file in z.infolist():
            if file.filename == file_name:
                columns = ['date', 'rain', 'temp', 'wetb', 'dewpt','vappr', 'rhum', 'msl', 'wdsp', 'wddir']

                with z.open(file) as thefile:
                    extra = pd.read_csv(thefile, nrows=2, header=None)
                    height = int(extra[0][1].split()[2])

                    with z.open(file) as thefile:
                        extra = pd.read_csv(thefile,skiprows=2, nrows=1, header=None)
                        latitude = float(extra[0][0].split(':')[1].strip())
                        longitude = float(extra[1][0].split(':')[1].strip())
                        del(extra)

                with z.open(file) as thefile:
                    #header rows start at different points.
                    try:
                        weather_data = pd.read_csv(thefile,skiprows=16,header=0,usecols=columns)
                    except:
                        with z.open(file) as thefile:
                            weather_data = pd.read_csv(thefile,skiprows=23,header=0,usecols=columns)


                    weather_data['height'] = height
                    weather_data['latitude'] = latitude
                    weather_data['longitude'] = longitude
                    weather_data['station'] = station_k
                    weather_data['county'] = key

                    #need to convert empty strings to 0
                    weather_data.replace(' ', np.nan, inplace=True)
                    weather_data = remove_null(weather_data)

                    for cols in weather_data.columns:
                        if cols not in ['date','height','station','county']:
                            weather_data[cols] = weather_data[cols].astype(np.float64)

                    print('Starting Insert')

                    weather_data.reset_index(drop=True, inplace=True)
                    insert_data(weather_data)
    

In [10]:
def get_params(url):
    base_download_url = 'https://cli.fusio.net/cli/climate_data/webdata/hly'

    raw_html = get_html(url)
    radio_buttons = data_type(raw_html)
    
    #make post request with appropriate radio buttons and get updated form html
    
    new_html = requests.post(url, data={'stntype':radio_buttons['Hourly']})
    new_html = BeautifulSoup(new_html.text, "lxml")
    
    counties = get_counties(new_html)
    
    for key, value in counties.items():        
        new_html = requests.post(url, data={'stntype':radio_buttons['Hourly'], 'countyno':counties[key]})
        new_html = BeautifulSoup(new_html.text, "lxml")
        
        stations = get_stations(new_html)
        for station_k, station_v in stations.items():
            print('Preparing insert for county: ', key ,' station: ', station_k)
            
            if str(stations[station_k]) not in ['175','1275']:
                #175 represents pheonix park which has no wind speed/direction data
                
                download_url = base_download_url + str(stations[station_k]) +'.zip'
            
                #Had to change jupyter notebooks data io rate limit from 1mb/sec to 20mb/sec
                #Also changed the time limit from 3 secs to 30 seconds in notebookapp.py

                data = requests.get(download_url)
                file_name = 'hly' + str(stations[station_k]) + '.csv'

                read_files(data, file_name, key, station_k)


In [11]:
def get_all_data():
    # To download full data series, need to send post request that says if its hourly, daily or monthly data
    base_url = 'https://cli.fusio.net/cli/climate_data/showdata.php'
    data = get_params(base_url)
    

In [12]:
%time get_all_data()

Preparing insert for county:  Carlow  station:  Oak_Park


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Starting Insert
Finished Insert
Preparing insert for county:  Cavan  station:  Ballyhaise
Starting Insert
Finished Insert
Preparing insert for county:  Clare  station:  Shannon_Airport


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Cork  station:  Cork_Airport


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Cork  station:  Moore_Park
Starting Insert
Finished Insert
Preparing insert for county:  Cork  station:  Roches_Point


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Cork  station:  SherkinIsland
Starting Insert
Finished Insert
Preparing insert for county:  Donegal  station:  Finner
Starting Insert
Finished Insert
Preparing insert for county:  Donegal  station:  Malin_head


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Dublin  station:  Casement


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Dublin  station:  Dublin_Airport


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Dublin  station:  PhoenixPark
Preparing insert for county:  Galway  station:  Athenry
Starting Insert
Finished Insert
Preparing insert for county:  Galway  station:  Mace_Head
Starting Insert
Finished Insert
Preparing insert for county:  Kerry  station:  Valentia_Observatory
Starting Insert
Finished Insert
Preparing insert for county:  Mayo  station:  Belmullet
Starting Insert
Finished Insert
Preparing insert for county:  Mayo  station:  Claremorris
Starting Insert
Finished Insert
Preparing insert for county:  Mayo  station:  Knock_Airport


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Mayo  station:  Newport
Starting Insert
Finished Insert
Preparing insert for county:  Meath  station:  Dunsany
Starting Insert
Finished Insert
Preparing insert for county:  Roscommon  station:  Mt_Dillon


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Sligo  station:  Markree
Preparing insert for county:  Tipperary  station:  Gurteen


  """Entry point for launching an IPython kernel.


Starting Insert
Finished Insert
Preparing insert for county:  Westmeath  station:  Mullingar
Starting Insert
Finished Insert
Preparing insert for county:  Wexford  station:  JohnstownII
Starting Insert
Finished Insert
CPU times: user 1h 23min 33s, sys: 8min 40s, total: 1h 32min 13s
Wall time: 4h 12min 20s


The following MySQL statement was used to create a basic table to store the data

    create table weather (Observation_date DATETIME, rain DOUBLE, temp DOUBLE, wet_bulb_temp DOUBLE, dew_point_temp DOUBLE, vapour_pressure DOUBLE, relative_humidity DOUBLE, mean_sea_level_pressure DOUBLE, mean_wind_speed DOUBLE, predominant_wind_direction DOUBLE, height int, latitude DOUBLE, longitude DOUBLE, station varchar(255), county varchar(255));