## Comparing Weather and Clusters
### Main columns needed are latitude, longitude, datetime

In [73]:
import pandas as pd
import datetime
import numpy as np

import config

import datetime

from darksky import forecast
from datetime import datetime as dt
from datetime import timedelta
from datetime import date
from sqlalchemy import create_engine

In [2]:
# create sqlalchemy engine
engine = create_engine("mysql+mysqlconnector://{user}:{password}@{host}/{dbname}"
                       .format(user=config.db_user,
                               password=config.db_pass,
                               dbname=config.db_name,
                               host=config.db_host))

In [3]:
filename = "fire_lat_lon_data/cluster_ref_alldoy_May23.csv"

In [4]:
cluster = pd.read_csv(filename)

In [5]:
cluster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318540 entries, 0 to 318539
Data columns (total 8 columns):
Unnamed: 0           318540 non-null int64
year                 318540 non-null int64
cluster_reference    318540 non-null object
cluster_doy          318540 non-null int64
doy_count            318540 non-null int64
centerpoint_doy      318540 non-null float64
longitude            318540 non-null float64
latitude             318540 non-null float64
dtypes: float64(3), int64(4), object(1)
memory usage: 19.4+ MB


In [6]:
cluster.head(25)

Unnamed: 0.1,Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,1,2003,2003_0,97,7,131.0,-80.686,25.4148
2,2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,3,2003,2003_0,99,1,131.0,-80.686,25.4148
4,4,2003,2003_0,100,17,131.0,-80.686,25.4148
5,5,2003,2003_0,101,3,131.0,-80.686,25.4148
6,6,2003,2003_0,102,3,131.0,-80.686,25.4148
7,7,2003,2003_0,105,8,131.0,-80.686,25.4148
8,8,2003,2003_0,107,4,131.0,-80.686,25.4148
9,9,2003,2003_0,130,99,131.0,-80.686,25.4148


In [7]:
cluster['datetime'] = (pd.to_datetime(cluster.year, format='%Y') + 
                       cluster['centerpoint_doy'].apply(np.ceil).apply(
                           lambda x: pd.Timedelta(x, unit='D')) - timedelta(days=1) + timedelta(hours=12))

In [8]:
cluster['datetime_cluster'] = (pd.to_datetime(cluster.year, format='%Y') + 
                       cluster['cluster_doy'].apply(np.ceil).apply(
                           lambda x: pd.Timedelta(x, unit='D')) - timedelta(days=1) + timedelta(hours=12))

In [14]:
query = """
SELECT *
    , YEAR(timestamp) weather_year
    , STR_TO_DATE(CONCAT(YEAR(timestamp),'-01-01'), '%Y-%m-%d') start_year_date
    , DATE(timestamp) weather_date
    , DATEDIFF( DATE(timestamp), STR_TO_DATE(CONCAT(YEAR(timestamp),'-01-01'), '%Y-%m-%d')) doy
FROM weather_loading;
"""

In [15]:
weather_all_sql = pd.read_sql(query, engine)

In [16]:
weather_all_sql.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101092 entries, 0 to 101091
Data columns (total 26 columns):
index                  101092 non-null int64
apparentTemperature    99129 non-null float64
cloudCover             79410 non-null float64
dewPoint               98593 non-null float64
humidity               98560 non-null float64
icon                   76919 non-null object
latitude               101092 non-null float64
longitude              101092 non-null float64
precipAccumulation     472 non-null float64
precipIntensity        93518 non-null float64
precipProbability      93518 non-null float64
precipType             10846 non-null object
pressure               61194 non-null float64
summary                77505 non-null object
temperature            99129 non-null float64
time                   101092 non-null int64
timestamp              101092 non-null object
uvIndex                79410 non-null float64
visibility             81109 non-null float64
windBearing         

In [26]:
combo_df = pd.merge(cluster, weather_all_sql,
                    how='left', left_on=['year','centerpoint_doy','longitude','latitude'],
                    right_on = ['weather_year','doy','longitude','latitude'])

In [27]:
combo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335841 entries, 0 to 335840
Data columns (total 34 columns):
Unnamed: 0             335841 non-null int64
year                   335841 non-null int64
cluster_reference      335841 non-null object
cluster_doy            335841 non-null int64
doy_count              335841 non-null int64
centerpoint_doy        335841 non-null float64
longitude              335841 non-null float64
latitude               335841 non-null float64
datetime               335841 non-null datetime64[ns]
datetime_cluster       335841 non-null datetime64[ns]
index                  276992 non-null float64
apparentTemperature    271235 non-null float64
cloudCover             209565 non-null float64
dewPoint               269674 non-null float64
humidity               269617 non-null float64
icon                   202631 non-null object
precipAccumulation     1392 non-null float64
precipIntensity        255567 non-null float64
precipProbability      255567 non-null fl

In [28]:
cluster_index = ['longitude','latitude','year','cluster_doy']

In [36]:
combo_df[combo_df.weather_year.isnull()].groupby(cluster_index).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude,datetime,datetime_cluster,...,timestamp,uvIndex,visibility,windBearing,windGust,windSpeed,weather_year,start_year_date,weather_date,doy
longitude,latitude,year,cluster_doy,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
-124.4420,43.0322,2010,187,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-124.4230,43.0876,2006,132,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-124.3887,43.2116,2007,225,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-124.3804,40.4730,2007,313,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-124.3797,42.6049,2007,306,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-67.8817,44.6128,2006,113,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-67.8520,46.0380,2006,103,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-67.8104,46.8207,2007,128,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
-67.7009,44.6908,2006,129,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [38]:
combo_df[combo_df['doy'].isnull()]['cluster_reference','latitude','longi'].head()

KeyError: ('cluster_reference', 'latitude', 'longi')

In [52]:
combo_df_all = pd.merge(cluster, weather_all_sql,
                    how='left', left_on=['year','cluster_doy','longitude','latitude'],
                    right_on = ['weather_year','doy','longitude','latitude'])

In [53]:
combo_df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325527 entries, 0 to 325526
Data columns (total 34 columns):
Unnamed: 0             325527 non-null int64
year                   325527 non-null int64
cluster_reference      325527 non-null object
cluster_doy            325527 non-null int64
doy_count              325527 non-null int64
centerpoint_doy        325527 non-null float64
longitude              325527 non-null float64
latitude               325527 non-null float64
datetime               325527 non-null datetime64[ns]
datetime_cluster       325527 non-null datetime64[ns]
index                  97162 non-null float64
apparentTemperature    95299 non-null float64
cloudCover             76480 non-null float64
dewPoint               94776 non-null float64
humidity               94744 non-null float64
icon                   74068 non-null object
precipAccumulation     463 non-null float64
precipIntensity        89901 non-null float64
precipProbability      89901 non-null float64
pre

In [54]:
combo_df_missing_all = combo_df_all[combo_df_all['doy'].isnull()]

### Pulling Missing Cluster Weather Points

In [74]:
RAPIDAPI_KEY  = config.darksky_api1
RAPIDAPI_KEY2  = config.darksky_api2

In [75]:
def weather_lookup(df, key=RAPIDAPI_KEY, days_before=0, days_after=0):
    data = []
    for index, row in df.iterrows():
        ts = row['datetime_cluster'].isoformat() 
        lat = row['latitude']
        lon = row['longitude']
        weather = forecast(key, lat, lon, time=ts)
        w_dict = weather['currently']
        w_dict['timestamp'] = ts
        w_dict['latitude'] = lat
        w_dict['longitude'] = lon
        data.append(w_dict)
    return data

In [76]:
firedata = combo_df_missing_all

In [77]:
firedata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 228365 entries, 0 to 325525
Data columns (total 34 columns):
Unnamed: 0             228365 non-null int64
year                   228365 non-null int64
cluster_reference      228365 non-null object
cluster_doy            228365 non-null int64
doy_count              228365 non-null int64
centerpoint_doy        228365 non-null float64
longitude              228365 non-null float64
latitude               228365 non-null float64
datetime               228365 non-null datetime64[ns]
datetime_cluster       228365 non-null datetime64[ns]
index                  0 non-null float64
apparentTemperature    0 non-null float64
cloudCover             0 non-null float64
dewPoint               0 non-null float64
humidity               0 non-null float64
icon                   0 non-null object
precipAccumulation     0 non-null float64
precipIntensity        0 non-null float64
precipProbability      0 non-null float64
precipType             0 non-null obj

In [78]:
fire_test = firedata[:1]
firedata1 = firedata[:50000]
firedata2 = firedata[50000:100000]
firedata3 = firedata[100000:150000]
firedata4 = firedata[150000:200000]
firedata5 = firedata[200000:250000]

In [79]:
weather_dftest = pd.DataFrame(weather_lookup(fire_test,RAPIDAPI_KEY))

In [81]:
weather_dftest

Unnamed: 0,time,summary,icon,precipIntensity,precipProbability,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,windGust,windBearing,cloudCover,uvIndex,visibility,timestamp,latitude,longitude
0,1049562000,Mostly Cloudy,partly-cloudy-day,0,0,83.5,88.04,70.57,0.65,1017.7,9.73,17.46,126,0.75,6,8.28,2003-04-05T12:00:00,25.4148,-80.686


In [None]:
weather_df1 = pd.DataFrame(weather_lookup(firedata1,RAPIDAPI_KEY))

In [None]:
weather_df2 = pd.DataFrame(weather_lookup(firedata2,RAPIDAPI_KEY))

In [None]:
weather_df3 = pd.DataFrame(weather_lookup(firedata3,RAPIDAPI_KEY))

In [None]:
weather_df4 = pd.DataFrame(weather_lookup(firedata4,RAPIDAPI_KEY))

In [None]:
weather_df5 = pd.DataFrame(weather_lookup(firedata5,RAPIDAPI_KEY))

In [None]:
db_columns = ['apparentTemperature', 'cloudCover', 'dewPoint',
       'humidity', 'icon', 'latitude', 'longitude', 'precipAccumulation',
       'precipIntensity', 'precipProbability', 'precipType', 'pressure',
       'summary', 'temperature', 'time', 'timestamp', 'uvIndex', 'visibility',
       'windBearing', 'windGust', 'windSpeed']

In [None]:
weather_df1 = weather_df1[db_columns]

In [None]:
weather_df1.to_sql('weather_loading', con = engine, if_exists = 'append', chunksize = 100000)

In [None]:
query = """SELECT * FROM weather_loading;
"""

In [None]:
weather_all_sql = pd.read_sql(query, engine)

In [None]:
weather_all_sql.to_csv('weather_150000.csv')

In [None]:
weather_all_sql.count()