In [7]:
#Uncomment these lines to install libraries for this script

#!pip install -e git+https://github.com/SohierDane/BigQuery_Helper#egg=bq_helper
#!pip install --upgrade google-cloud-bigquery

In [36]:
import os
import bq_helper
import datetime as datetime
from bq_helper import BigQueryHelper
import pandas as pd
%run country-boundingbox.ipynb

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "Data-mining-afa11af25388.json"
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
noaa = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="ghcn_d")

In [41]:
#Function finds the closest weather station to a specified coordinates and returns weather data between the specified years.
#Function takes in latitude and longitude coordinates (ensure to add negative signs to coordinates if necessary)

def weatherDataByCoordinates(lat, long, startYear, endYear):
    stationID = closestStationToLoc(lat, long)
    df = weatherDataForStationID(stationID, startYear, endYear)
    df['value'] = df['value'].div(10,axis=0) #fix value
    df["date"] = pd.to_datetime(df["date"]) #set Date as index
    df.set_index('date', inplace=True)
    return df

def prcpDataByCoordinates(lat, long, startYear, endYear):
    stationID = closestStationToLoc(lat, long)
    df = weatherDataForStationID(stationID, startYear, endYear)
    df['value'] = df['value'].div(10,axis=0) #fix value
    df["date"] = pd.to_datetime(df["date"]) #set Date as index
    df.set_index('date', inplace=True)
    return df


#Helper functions
def closestStationToLoc(lat, long):
    query = """SELECT weather.id, POWER(POWER(weather.latitude - """+str(lat)+""",2) + POWER(weather.longitude - """+str(long)+""",2), 1/2) AS distance, weather.latitude, weather.longitude
    FROM
      `bigquery-public-data.ghcn_d.ghcnd_stations` AS weather
    ORDER BY
        distance ASC
    LIMIT 10;
            """
    response2 = noaa.query_to_pandas_safe(query, max_gb_scanned=10)
    return response2['id'].iloc[0]

def weatherDataForStationID(ID, startYear, endYear):
    weatherData = pd.DataFrame()
    for i in range(startYear,endYear):
        query = """SELECT
          weather.date,
          weather.value
        FROM
          `bigquery-public-data.ghcn_d.ghcnd_"""+str(i)+"""` AS weather
        WHERE
            weather.element = 'TAVG'
            AND weather.id = '"""+ID+"""'
        ORDER BY
            weather.date ASC;
                """
        response2 = noaa.query_to_pandas_safe(query, max_gb_scanned=10)
        weatherData = weatherData.append(response2)
    weatherData.dropna()
    return weatherData

def prcpDataForStationID(ID, startYear, endYear):
    weatherData = pd.DataFrame()
    for i in range(startYear,endYear):
        query = """SELECT
          weather.date,
          weather.value
        FROM
          `bigquery-public-data.ghcn_d.ghcnd_"""+str(i)+"""` AS weather
        WHERE
            weather.element = 'PRCP'
            AND weather.id = '"""+ID+"""'
        ORDER BY
            weather.date ASC;
                """
        response2 = noaa.query_to_pandas_safe(query, max_gb_scanned=10)
        weatherData = weatherData.append(response2)
    weatherData.dropna()
    return weatherData

In [None]:
dfWeather = weatherDataByCoordinates(17.1899, -88.4976, 1990, 2017)
pdfWeather = prcpDataByCoordinates(17.1899, -88.4976, 1990, 2017)
#15.2000° N, 86.2419° W hondouras
#12.8654° N, 85.2072° W Nicaragua - No data Found
#13.7942° N, 88.8965° W El Salvador
#15.7835° N, 90.2308° W Guatemala
#9.7489° N, 83.7534° W Costa Rica
#8.5380° N, 80.7821° W Panama
#17.1899° N, 88.4976° W Belize
##east and South are negitive
print(dfWeather.size)
print(dfWeather.head(10))
print(dfWeather.dtypes)

print(pdfWeather.size)
print(pdfWeather.head(10))
print(pdfWeather.dtypes)

In [87]:
#Turns a dataframe to the moving average with (i days)
# df - Pandas Dataframe
# i - Value for how many days to take the moving average with respect to
def movingAvg(df,i):
    if(i < 1):
        raise Exception('Invalid Int: Cannot find moving average of {}.'.format(sample))
    df['value'] = df['value'].rolling(i,center=True,min_periods=1).mean()
    return df

In [88]:
dfWeather_ma = movingAvg(dfWeather,3)
print(dfWeather_ma.head(10))

                value
date                 
1990-01-01  22.300000
1990-01-02  22.300000
1990-01-03  22.166667
1990-01-04  22.833333
1990-01-05  23.166667
1990-01-06  23.366667
1990-01-07  23.033333
1990-01-08  22.466667
1990-01-09  22.166667
1990-01-10  21.733333


In [89]:
#function to downsample to year month
def downSample(df,sample):
    if(sample == '1Y'):
        df= df.resample(sample).mean()
        df.index = df.index.to_period('y')
    elif(sample == '1M'):
        df= df.resample(sample).mean()
        df.index = df.index.to_period('m')
    else:
        raise Exception('Invalid Sample String: The value was: {}.'.format(sample))
    return df

def downSamplemin(df,sample):
    if(sample == '1Y'):
        df= df.resample(sample).min()
        df.index = df.index.to_period('y')
    elif(sample == '1M'):
        df= df.resample(sample).min()
        df.index = df.index.to_period('m')
    else:
        raise Exception('Invalid Sample String: The value was: {}.'.format(sample))
    return df

def downSamplemax(df,sample):
    if(sample == '1Y'):
        df= df.resample(sample).max()
        df.index = df.index.to_period('y')
    elif(sample == '1M'):
        df= df.resample(sample).max()
        df.index = df.index.to_period('m')
    else:
        raise Exception('Invalid Sample String: The value was: {}.'.format(sample))
    return df

In [90]:
#DownSample by Month
dfWeather_mo = downSample(dfWeather_ma,'1M')
#print(dfWeather_mo.head(10))

#Downsample by Year
dfWeather_yr = downSample(dfWeather_ma,'1Y')
dfWeather_yr = dfWeather_yr.rename(index=str,columns={"value":"tmpAvg"})
#print(dfWeather_yr)

downSamplemin_yr = downSamplemin(pdfWeather,'1Y')
downSamplemin_yr = downSamplemin_yr.rename(index=str,columns={"value":"prcpMin"})
#print(downSamplemin_yr)

downSamplemax_yr =downSamplemax(pdfWeather,'1Y')
downSamplemax_yr = downSamplemax_yr.rename(index=str,columns={'value':'prcpMax'})
#print(downSamplemax_yr)

merge=pd.merge(downSamplemin_yr,downSamplemax_yr, how='inner', left_index=True, right_index=True)
merge=pd.merge(merge,dfWeather_yr, how='inner', left_index=True, right_index=True)
print(merge)

      prcpMin  prcpMax     tmpAvg
date                             
1990     19.6     29.2  22.794973
1991     19.3     28.6  22.830233
1992     18.2     30.7  22.618491
1993     18.1     25.4  22.070402
1994     17.8     26.5  22.751342
1995     19.6     26.4  22.759016
1996     17.3     29.8  22.801692
1997     19.5     26.2  22.942975
1998     19.6     28.3  23.186447
1999     19.3     26.4  21.940018
2000     19.6     25.2  22.364088
2001     20.2     27.4  22.941941
2002     19.9     27.3  23.382875
2003     20.7     26.3  23.053755
2004     20.3     27.3  22.954762
2005     19.6     25.2  22.343379
2006     18.3     25.4  22.610502
2007     18.7     25.9  22.447215
2008     18.6     24.9  22.022404
2009     20.7     25.3  22.705571
2010     19.2     26.1  22.470502
2011     18.3     24.9  22.303014
2012     19.3     25.1  22.433607
2013     20.0     25.3  22.583700
2014     20.2     25.2  22.788037
2015     21.1     26.1  23.226575
2016     19.9     26.3  22.976393


In [91]:
#OLS stuff
honduras = merge.to_csv("Belize1990-2016.csv",sep=',')