<a href="https://colab.research.google.com/github/Giffy/Mobile_footprint_AIBCN/blob/master/4_Dataset_creation_Weather_per_hour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset creation (Weather history per hour)

This notebook builds a dataset with the weather of Barcelona per day and hour, from 2015 to 2017, by scraping the historic records published on www.timeanddate.com.

In [0]:
# Load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import sys

In [0]:
# Period to query on www.timeanddate.com
years = list(range(2015, 2018))     # 2015, 2016 and 2017
months = list(range(1, 13))         # Month numbers 1 to 12 (full year)
days = list(range(1, 32))           # Day numbers 1 to 31 (all days of month)

# Weather descriptions that are counted as bad weather (rain or snow)
rain = [
    'Light rain',
    'Rain',
    'Drizzle',
    'Scattered showers',
    'Rain showers',
    'Sprinkles',
    'Thunderstorms',
    'Thundershowers',
    'Strong thunderstorms',
    'Lots of rain',
    'Mixture of precip',
    'Sleet',
    'Heavy rain',
    'Snow flurries',
    'Light snow',
    'Snow',
]

In [0]:
# Initialize an empty dataframe that will store the weather data
df = pd.DataFrame()

## Web scraping to query historical weather data per day and hour

In [26]:
# Scrape one page per calendar day and keep one record per matching table row.
#
# Records are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. `df` is rebuilt from scratch, so re-running this cell does not
# duplicate rows.

REQUEST_TIMEOUT = 30    # seconds to wait for a reply before giving up on a day
URL_TEMPLATE = ('https://www.timeanddate.com/scripts/cityajax.php'
                '?n=spain/barcelona&mode=historic&hd={date}&month={month}&year={year}')
COLUMNS = ['data', 'hour', 'period', 'description', 'rain']


def _to_24h(hour12, period):
    """Convert a 12-hour clock value and its 'am'/'pm' marker to a 0-23 hour.

    12 am maps to 0 and 12 pm maps to 12; any other pm hour gets 12 added.
    """
    return hour12 % 12 + (12 if period == 'pm' else 0)


def _parse_row(text, date, rain_descriptions):
    """Build one weather record from the text of a table row.

    Parameters
    ----------
    text : str
        Full text of the row. It is expected to start with a time label such
        as '12:00 am' (hour, colon, minutes, space, am/pm).
    date : str
        Day the row belongs to, formatted as YYYYMMDD.
    rain_descriptions : list of str
        Descriptions that count as bad weather.

    Returns
    -------
    dict
        Record with the keys 'data', 'hour', 'period', 'description', 'rain'.

    Raises
    ------
    ValueError, IndexError
        If the row does not start with a time label of the expected form.
    """
    time_label = text.split('m')[0] + 'm'       # everything up to the first 'm' (am/pm)
    hour12 = int(time_label.split(':')[0])
    period = time_label.split(' ')[1]

    # The description is the text between the first '°F' and the following '.'.
    # Rows without a '°F' keep the text before it (the whole row) as description
    # and are never flagged as rain, as before.
    parts = text.split('°F')
    has_description = len(parts) > 1
    description = parts[1].split('.')[0] if has_description else parts[0]
    is_raining = 1 if has_description and description in rain_descriptions else 0

    return {
        'data': date,
        'hour': _to_24h(hour12, period),
        'period': period,
        'description': description,
        'rain': is_raining,
    }


count = 0
num_docs = len(years) * len(months)
records = []            # one dict per scraped row
failed_dates = []       # days whose request failed and were skipped

for y in years:
    year = str(y)                                             # year to build html url
    for m in months:
        month = str(m).zfill(2)                               # zero-padded month to build html url
        days_in_month = pd.Timestamp(year=y, month=m, day=1).days_in_month
        sleep(randint(0, 1))                                  # pause 0 or 1 second once per month
        for d in days:
            if d > days_in_month:
                continue                                      # e.g. 31 April or 30 February do not exist
            date = year + month + str(d).zfill(2)
            urlPage = URL_TEMPLATE.format(date=date, month=month, year=year)

            try:
                req = requests.get(urlPage, timeout=REQUEST_TIMEOUT)
                req.raise_for_status()                        # surface HTTP errors instead of parsing an error page
            except requests.RequestException as err:
                failed_dates.append(date)
                sys.stderr.write('\nSkipping ' + date + ': ' + str(err) + '\n')
                continue

            soup = BeautifulSoup(req.content, 'lxml')         # parsing html request with BeautifulSoup
            for element in soup.find_all('tr', {'class': 'c1'}):   # extracting lines with relevant data
                records.append(_parse_row(element.text, date, rain))

        # Progress counter, refreshed after every month.
        # (The former `if (count%50)` guard was true for every count that is not
        # a multiple of 50, so it never filtered anything for 36 months.)
        count += 1
        progress = round(count * 100 / num_docs, 1)
        sys.stdout.write('\r' + str(progress) + "%")

# Explicit columns keep the frame usable even if no row was scraped
df = pd.DataFrame(records, columns=COLUMNS)

if failed_dates:
    sys.stderr.write('\n' + str(len(failed_dates)) + ' day(s) could not be downloaded: '
                     + ', '.join(failed_dates) + '\n')

100.0%

In [27]:
# Preview the last five rows of the scraped dataset
df.iloc[-5:]


Unnamed: 0,data,description,hour,period,rain
25414,20171231,Passing clouds,19,pm,0
25415,20171231,Passing clouds,20,pm,0
25416,20171231,Passing clouds,21,pm,0
25417,20171231,Passing clouds,22,pm,0
25418,20171231,Passing clouds,23,pm,0


## Dataframe overview

In [28]:
# List the distinct values of the rain flag to see whether any hour was marked as rainy
df['rain'].unique()

array([0, 1])

In [29]:
# List the distinct hour values present in the dataset
df['hour'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [0]:
# List the distinct weather descriptions that were scraped
df['description'].unique()

array(['Passing clouds', 'Scattered clouds', 'Clear', 'Sunny',
       'Light rain', 'Partly sunny', 'Broken clouds', 'Partly cloudy',
       'Fog', 'Rain', 'Drizzle', 'Scattered showers', 'Rain showers',
       'Sprinkles', 'Dense fog', 'Thunderstorms', 'Thundershowers',
       'Overcast', 'Strong thunderstorms', 'Mild', 'Lots of rain', 'Cool',
       'Mixture of precip', 'Sleet', 'Light fog', 'Heavy rain',
       'Mostly cloudy', 'More clouds than sun', 'Cloudy', 'Snow flurries',
       'Light snow', 'Snow', 'Low clouds'], dtype=object)

## Export to CSV and Copy dataset to Google Drive

In [0]:
# Write the dataframe to 'weather.csv', leaving out the row index
df.to_csv(path_or_buf='weather.csv', index=False)

In [0]:
# Connect to Google Drive: mount it at /gdrive using the google.colab helper module
from google.colab import drive
drive.mount('/gdrive')

In [0]:
# Compress 'weather.csv' into a gzip'd tar archive (tar's verbose listing is
# sent to /dev/null), then move the archive into the mounted 'My Drive' folder
!tar -czvf weather2.tar.gz weather.csv > /dev/null
!mv weather2.tar.gz /gdrive/My\ Drive/