## Comparing Weather and Clusters
### Main columns needed are latitude, longitude, datetime

In [25]:
import pandas as pd
import datetime
import numpy as np

import config

import datetime

from darksky import forecast
from datetime import datetime as dt
from datetime import timedelta
from datetime import date
from sqlalchemy import create_engine

In [26]:
# create sqlalchemy engine
engine = create_engine("mysql+mysqlconnector://{user}:{password}@{host}/{dbname}"
                       .format(user=config.db_user,
                               password=config.db_pass,
                               dbname=config.db_name,
                               host=config.db_host))

In [27]:
filename = "fire_lat_lon_data/cluster_ref_alldoy_May23.csv"

In [28]:
cluster = pd.read_csv(filename)

In [29]:
cluster.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318540 entries, 0 to 318539
Data columns (total 8 columns):
Unnamed: 0           318540 non-null int64
year                 318540 non-null int64
cluster_reference    318540 non-null object
cluster_doy          318540 non-null int64
doy_count            318540 non-null int64
centerpoint_doy      318540 non-null float64
longitude            318540 non-null float64
latitude             318540 non-null float64
dtypes: float64(3), int64(4), object(1)
memory usage: 19.4+ MB


In [30]:
cluster.head()

Unnamed: 0.1,Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,1,2003,2003_0,97,7,131.0,-80.686,25.4148
2,2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,3,2003,2003_0,99,1,131.0,-80.686,25.4148
4,4,2003,2003_0,100,17,131.0,-80.686,25.4148


In [31]:
cluster['datetime'] = (pd.to_datetime(cluster.year, format='%Y') + 
                       cluster['centerpoint_doy'].apply(np.ceil).apply(
                           lambda x: pd.Timedelta(x, unit='D')) + timedelta(hours=12))

In [89]:
query = """
SELECT *
    , YEAR(timestamp) weather_year
    , STR_TO_DATE(CONCAT(YEAR(timestamp),'-01-01'), '%Y-%m-%d') start_year_date
    , DATE(timestamp) weather_date
    , DATEDIFF( DATE(timestamp), STR_TO_DATE(CONCAT(YEAR(timestamp),'-01-01'), '%Y-%m-%d')) doy
FROM weather_loading;
"""

In [90]:
weather_all_sql = pd.read_sql(query, engine)

In [91]:
weather_all_sql.head()

Unnamed: 0,index,apparentTemperature,cloudCover,dewPoint,humidity,icon,latitude,longitude,precipAccumulation,precipIntensity,...,timestamp,uvIndex,visibility,windBearing,windGust,windSpeed,weather_year,start_year_date,weather_date,doy
0,0,89.32,0.0,66.21,0.5,,29.9614,-100.3969,,,...,2006-08-16T12:00:00,9.0,9.997,180.0,17.25,9.21,2006,2006-01-01,2006-08-16,227
1,1,82.84,0.0,56.64,0.41,,30.3707,-103.3322,,,...,2006-06-22T12:00:00,10.0,9.997,164.0,11.36,4.69,2006,2006-01-01,2006-06-22,172
2,2,85.95,0.0,42.89,0.22,,30.42,-103.2347,,,...,2006-06-19T12:00:00,10.0,9.997,157.0,13.2,5.37,2006,2006-01-01,2006-06-19,169
3,3,84.48,0.0,28.31,0.13,clear-day,30.6933,-104.3193,,0.0,...,2006-05-23T12:00:00,10.0,9.997,218.0,8.99,2.68,2006,2006-01-01,2006-05-23,142
4,4,82.0,0.75,64.37,0.58,partly-cloudy-day,31.1047,-100.5903,,0.0,...,2006-10-03T12:00:00,4.0,9.997,201.0,23.59,15.05,2006,2006-01-01,2006-10-03,275


In [92]:
combo_df = pd.merge(cluster, weather_all_sql,
                    how='left', left_on=['year','centerpoint_doy','longitude','latitude'],
                    right_on = ['weather_year','doy','longitude','latitude'])

In [93]:
combo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335841 entries, 0 to 335840
Data columns (total 33 columns):
Unnamed: 0             335841 non-null int64
year                   335841 non-null int64
cluster_reference      335841 non-null object
cluster_doy            335841 non-null int64
doy_count              335841 non-null int64
centerpoint_doy        335841 non-null float64
longitude              335841 non-null float64
latitude               335841 non-null float64
datetime               335841 non-null datetime64[ns]
index                  276992 non-null float64
apparentTemperature    271235 non-null float64
cloudCover             209565 non-null float64
dewPoint               269674 non-null float64
humidity               269617 non-null float64
icon                   202631 non-null object
precipAccumulation     1392 non-null float64
precipIntensity        255567 non-null float64
precipProbability      255567 non-null float64
precipType             29211 non-null object
pre

In [94]:
cluster_index = ['longitude','latitude','year','centerpoint_doy']

In [108]:
combo_df[combo_df.weather_year.isnull()].groupby('year').nunique()

Unnamed: 0_level_0,Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude,datetime,index,...,timestamp,uvIndex,visibility,windBearing,windGust,windSpeed,weather_year,start_year_date,weather_date,doy
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003,820,1,225,296,74,135,225,225,135,0,...,0,0,0,0,0,0,0,0,0,0
2004,452,1,188,225,47,120,188,185,120,0,...,0,0,0,0,0,0,0,0,0,0
2005,1362,1,372,345,87,192,369,370,192,0,...,0,0,0,0,0,0,0,0,0,0
2006,33486,1,11120,365,339,364,10898,10637,364,0,...,0,0,0,0,0,0,0,0,0,0
2007,16159,1,4972,365,591,357,4935,4885,357,0,...,0,0,0,0,0,0,0,0,0,0
2008,832,1,253,301,91,173,252,250,173,0,...,0,0,0,0,0,0,0,0,0,0
2009,836,1,298,295,94,170,297,295,170,0,...,0,0,0,0,0,0,0,0,0,0
2010,877,1,286,304,57,173,282,281,173,0,...,0,0,0,0,0,0,0,0,0,0
2011,1261,1,360,294,91,200,358,354,200,0,...,0,0,0,0,0,0,0,0,0,0
2012,766,1,237,298,132,162,237,236,162,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
combo_df[combo_df['doy'].isnull()]['cluster_reference'].nunique()

18984