In [3]:
#Uncomment these lines to install libraries for this script

#!pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
#!pip install --upgrade google-cloud-bigquery

In [5]:
import os
import bq_helper
import datetime as datetime
from bq_helper import BigQueryHelper
import pandas as pd
%run country-boundingbox.ipynb

# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file that
# sits next to this notebook (presumably read by the BigQuery client — confirm).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "Data-mining-afa11af25388.json"

# Query helper bound to the public GHCN-D dataset. Usage guide:
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
noaa = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="ghcn_d",
)

In [6]:
#Returns the TAVG readings of every weather station inside the country's bounding box, querying one yearly table per year from startYear up to (but not including) endYear. Inefficient, but complete.
#All country IDs can be found in the country-boundingbox notebook.

def weatherDataByCountry(countryID, startYear, endYear):
    """Return TAVG readings for every station inside a country's bounding box.

    One query is issued per yearly GHCN-D table and the results are stacked
    in year order.

    Parameters:
        countryID: key into ``country_bounding_boxes``; element ``[1]`` of the
            entry is read as ``(lonMin, latMin, lonMax, latMax)``.
        startYear: first year to query (inclusive).
        endYear: year to stop at (exclusive, as with ``range``).

    Returns:
        DataFrame with columns ``id``, ``date``, ``element`` and ``value``,
        with rows containing missing values removed. Each yearly result keeps
        its own row index. An empty DataFrame is returned when no query
        produced a result.

    Raises:
        KeyError: if ``countryID`` is not in ``country_bounding_boxes``.
    """
    coordinates = country_bounding_boxes[countryID][1]
    lonMin = str(coordinates[0])
    latMin = str(coordinates[1])
    lonMax = str(coordinates[2])
    latMax = str(coordinates[3])

    # Only the table suffix changes between years, so build the SQL text once.
    queryTemplate = """SELECT
          stations.id,
          weather.date,
          weather.element,
          weather.value
        FROM
          `bigquery-public-data.ghcn_d.ghcnd_{year}` AS weather INNER JOIN `bigquery-public-data.ghcn_d.ghcnd_stations` AS stations ON 
          weather.id = stations.id
        WHERE
          weather.element = 'TAVG'
          AND stations.latitude >{latMin}
          AND stations.latitude <{latMax}
          AND stations.longitude >{lonMin}
          AND stations.longitude <{lonMax}
        ORDER BY
            weather.date ASC;
                """

    yearlyFrames = []
    for year in range(startYear, endYear):
        query = queryTemplate.format(year=year, latMin=latMin, latMax=latMax,
                                     lonMin=lonMin, lonMax=lonMax)
        response = noaa.query_to_pandas_safe(query, max_gb_scanned=10)
        # NOTE(review): query_to_pandas_safe appears to return None when the
        # scan estimate exceeds max_gb_scanned; such years are skipped.
        if response is not None:
            yearlyFrames.append(response)

    if not yearlyFrames:
        return pd.DataFrame()
    # Concatenate once (DataFrame.append no longer exists in pandas 2.x) and
    # keep the result of dropna(), which returns a new frame.
    return pd.concat(yearlyFrames).dropna()

In [7]:
# Example: TAVG readings for the 'HN' bounding box from the 2013 and 2014
# tables (the end year, 2015, is not queried).
dfWeather= weatherDataByCountry('HN', 2013, 2015)
# .size counts cells (rows x columns), not rows.
print(dfWeather.size)
print(dfWeather.head(10))

13964
            id        date element  value
0  HOM00078708  2013-01-01    TAVG  249.0
1  ES003800510  2013-01-01    TAVG  263.0
2  HO000078720  2013-01-01    TAVG  194.0
3  HOM00078705  2013-01-01    TAVG  267.0
4  ES003800510  2013-01-02    TAVG  262.0
5  HO000078711  2013-01-02    TAVG  259.0
6  HOM00078705  2013-01-02    TAVG  253.0
7  HO000078720  2013-01-02    TAVG  209.0
8  HOM00078708  2013-01-02    TAVG  261.0
9  HOM00078708  2013-01-03    TAVG  252.0


In [8]:
#Finds the weather station closest to the specified coordinates and returns its weather data between the specified years.
#Takes latitude and longitude coordinates in decimal degrees (make sure to include the negative sign for southern latitudes and western longitudes).

def weatherDataByCoordinates(lat, long, startYear, endYear):
    """Return the TAVG series of the station nearest to a coordinate.

    Looks up the nearest station with ``closestStationToLoc``, fetches its
    readings with ``weatherDataForStationID`` and returns them indexed by
    date.

    Parameters:
        lat, long: coordinates of the point of interest.
        startYear, endYear: passed straight through to
            ``weatherDataForStationID``.

    Returns:
        DataFrame indexed by ``date`` (as datetimes) with a single ``value``
        column holding the raw reading divided by 10.
    """
    nearestStation = closestStationToLoc(lat, long)
    readings = weatherDataForStationID(nearestStation, startYear, endYear)
    # Raw values look like tenths of a degree (presumably Celsius — confirm).
    readings['value'] = readings['value'] / 10
    # Parse the dates so the frame can carry a datetime index.
    readings['date'] = pd.to_datetime(readings['date'])
    readings.set_index('date', inplace=True)
    return readings



#Helper functions
def closestStationToLoc(lat, long):
    """Return the id of the station nearest to the given coordinates.

    Stations are ranked by straight-line distance measured in degrees of
    latitude/longitude (not a great-circle distance). The query fetches the
    ten nearest rows; only the first one is used.

    Parameters:
        lat, long: coordinates to search around; they are inserted into the
            SQL text via ``str()``.

    Returns:
        The ``id`` value of the nearest row in ``ghcnd_stations``.
    """
    sql = """SELECT weather.id, POWER(POWER(weather.latitude - {0},2) + POWER(weather.longitude - {1},2), 1/2) AS distance, weather.latitude, weather.longitude
    FROM
      `bigquery-public-data.ghcn_d.ghcnd_stations` AS weather
    ORDER BY
        distance ASC
    LIMIT 10;
            """.format(str(lat), str(long))
    nearestStations = noaa.query_to_pandas_safe(sql, max_gb_scanned=10)
    stationIds = nearestStations['id']
    return stationIds.iloc[0]

def weatherDataForStationID(ID, startYear, endYear):
    """Return the TAVG readings of one station, stacked across yearly tables.

    Parameters:
        ID: station id, matched against ``weather.id`` in each yearly table.
        startYear: first year to query (inclusive).
        endYear: year to stop at (exclusive, as with ``range``).

    Returns:
        DataFrame with columns ``date`` and ``value``, with rows containing
        missing values removed. Each yearly result keeps its own row index.
        An empty DataFrame is returned when no query produced a result.
    """
    # Only the table suffix changes between years, so build the SQL text once.
    queryTemplate = """SELECT
          weather.date,
          weather.value
        FROM
          `bigquery-public-data.ghcn_d.ghcnd_{year}` AS weather
        WHERE
            weather.element = 'TAVG'
            AND weather.id = '{stationID}'
        ORDER BY
            weather.date ASC;
                """

    yearlyFrames = []
    for year in range(startYear, endYear):
        query = queryTemplate.format(year=year, stationID=ID)
        response = noaa.query_to_pandas_safe(query, max_gb_scanned=10)
        # NOTE(review): query_to_pandas_safe appears to return None when the
        # scan estimate exceeds max_gb_scanned; such years are skipped.
        if response is not None:
            yearlyFrames.append(response)

    if not yearlyFrames:
        return pd.DataFrame()
    # Concatenate once (DataFrame.append no longer exists in pandas 2.x) and
    # keep the result of dropna(), which returns a new frame.
    return pd.concat(yearlyFrames).dropna()

In [9]:
# Example: series for the station nearest to (15.2000, -86.2419), read from
# the 2013 through 2017 tables (the end year, 2018, is not queried).
dfWeather = weatherDataByCoordinates(15.2000, -86.2419, 2013, 2018)
# .size counts cells (rows x columns); this frame has a single 'value' column.
print(dfWeather.size)
print(dfWeather.head(10))
print(dfWeather.dtypes)

1816
            value
date             
2013-01-01   26.7
2013-01-02   25.3
2013-01-03   25.1
2013-01-04   23.8
2013-01-05   25.1
2013-01-06   25.1
2013-01-07   24.8
2013-01-08   25.4
2013-01-09   25.8
2013-01-10   25.8
value    float64
dtype: object


In [10]:
#Replaces the 'value' column of a dataframe with its centered moving average over i rows (i days for daily data)
# df - Pandas DataFrame with a 'value' column
# i - Window size: how many rows (days) each average is taken over; must be at least 1
def movingAvg(df,i):
    """Return a copy of ``df`` whose ``value`` column is a centered moving average.

    Parameters:
        df: DataFrame with a numeric ``value`` column. It is not modified.
        i: window size in rows; must be at least 1.

    Returns:
        A new DataFrame with the same index and columns as ``df`` where
        ``value`` holds the rolling mean. Windows are centered and shrink at
        the edges (``min_periods=1``), so no leading/trailing NaNs are
        introduced.

    Raises:
        Exception: if ``i`` is smaller than 1.
    """
    if i < 1:
        # Report the offending window size (the message used to reference an
        # undefined name and failed with NameError instead).
        raise Exception('Invalid Int: Cannot find moving average of {}.'.format(i))
    # Work on a copy so re-running the calling cell does not smooth the
    # caller's data a second time.
    smoothed = df.copy()
    smoothed['value'] = df['value'].rolling(i,center=True,min_periods=1).mean()
    return smoothed

In [11]:
# Example: smooth the station series with a 3-row (3-day) centered moving average.
dfWeather_ma = movingAvg(dfWeather,3)
print(dfWeather_ma.head(10))

                value
date                 
2013-01-01  26.000000
2013-01-02  25.700000
2013-01-03  24.733333
2013-01-04  24.666667
2013-01-05  24.666667
2013-01-06  25.000000
2013-01-07  25.100000
2013-01-08  25.333333
2013-01-09  25.666667
2013-01-10  25.833333


In [29]:
#Downsamples a date-indexed dataframe to yearly ('1Y') or monthly ('1M') means
def downSample(df,sample):
    """Average a datetime-indexed frame per calendar year or calendar month.

    Parameters:
        df: DataFrame with a DatetimeIndex.
        sample: ``'1Y'`` for yearly means or ``'1M'`` for monthly means.

    Returns:
        A new DataFrame of per-period means whose index is a PeriodIndex
        (yearly or monthly). Periods between the first and last observation
        that contain no rows appear with NaN values.

    Raises:
        Exception: if ``sample`` is neither ``'1Y'`` nor ``'1M'``.
    """
    # The period-start aliases ('YS'/'MS') bin by the same calendar year/month
    # as the period-end aliases ('Y'/'M') that pandas 2.2 deprecated for
    # resampling, and they are accepted by older and newer pandas alike.
    if sample == '1Y':
        resampleRule, periodFreq = 'YS', 'Y'
    elif sample == '1M':
        resampleRule, periodFreq = 'MS', 'M'
    else:
        raise Exception('Invalid Sample String: The value was: {}.'.format(sample))
    averaged = df.resample(resampleRule).mean()
    # Label each bin by its period (e.g. 2013-01 or 2013) instead of a timestamp.
    averaged.index = averaged.index.to_period(periodFreq)
    return averaged

In [30]:
#DownSample by Month: mean of the smoothed series per calendar month
dfWeather_mo = downSample(dfWeather_ma,'1M')
print(dfWeather_mo.head(10))

#Downsample by Year: mean of the smoothed series per calendar year
dfWeather_yr = downSample(dfWeather_ma,'1Y')
print(dfWeather_yr)

             value
date              
2013-01  23.876344
2013-02  25.026190
2013-03  23.286022
2013-04  25.724444
2013-05  27.325806
2013-06  27.091111
2013-07  26.905128
2013-08  27.295699
2013-09  27.695402
2013-10  26.980645
          value
date           
2013  25.847168
2014  25.512785
2015  26.626301
2016  27.408974
2017  26.500092
