# <font color="#C71585"> Location of closest coordinate out of a list of coordinates </font>
### <font color="#C71585"> Connecting NASA cluster centeroids to USDA unique fires </font>
##### - https://www.timvink.nl/closest-coordinates/
##### - https://stackoverflow.com/questions/39107896/efficiently-finding-the-closest-coordinate-pair-from-a-set-in-python

In [1]:
# import libraries
import pandas as pd
import datetime
import numpy as np
from geopy.distance import great_circle
import math
from scipy import spatial
import sklearn
import os

### <font color="#C71585"> Loading Datasets and setting variables </font>

In [31]:
USDA_188MfireDf = pd.read_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/Fire_Program_Analysis__Fire_Occurrence_Database_Feature_Layer.csv')

In [23]:
year = 2006

In [24]:
NASAFiresClusters20xx = pd.read_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/NASA-M6-DBSCAN-Clusters-'+str(year)+'.csv')

In [25]:
pd.set_option('display.max_columns', 40)

In [26]:
NASAFiresClusters20xx.shape

(28049, 18)

In [27]:
# add cluster reference
NASAFiresClusters20xx['Cluster_Reference']=range(0, len(NASAFiresClusters20xx))

In [28]:
# add day of year
NASAFiresClusters20xx['DOY'] = pd.DatetimeIndex(NASAFiresClusters20xx['acq_date']).dayofyear

### <font color="#C71585"> Setting up Sample Data for the year </font>

In [32]:
### Creating a smaller DF Unique Fires from USDA 1.88m fire set. 

USDA_188MfireDf = USDA_188MfireDf.drop(['SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID', 
                                    'LOCAL_INCIDENT_ID', 'OWNER_CODE','OWNER_DESCR', 'X', 'Y','FOD_ID', 'FPA_ID', 
                                    'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM'], 1)
USDA_188MfireDf_20xx = USDA_188MfireDf[(USDA_188MfireDf.FIRE_YEAR == year)]
USDA_188MfireDf_20xx.shape

(114004, 28)

### <font color="#C71585"> Encoding Cyclical Feature for finding the closest DOY to the given DOY </font>

In [33]:
# function to encode DOY as a cyclical feature
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [34]:
# encode DOY of TargetDF and Main DF
NASAFiresClusters20xx = encode(NASAFiresClusters20xx, 'DOY', 365)
USDA_188MfireDf_20xx = encode(USDA_188MfireDf_20xx, 'DISCOVERY_DOY', 365)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [35]:
# USDA as the TargetDF
places = []
for index, row in USDA_188MfireDf_20xx.iterrows():
    coordinates = [row['LATITUDE'], row['LONGITUDE'], row['DISCOVERY_DOY_sin'], row['DISCOVERY_DOY_cos']]
    places.append(coordinates)

tree = spatial.KDTree(places)
places[1]

[35.66861111, -94.28055556, 0.017213356155834685, 0.9998518392091162]

In [36]:
# closest 4d coordinate from USDA to the 4d coordinate from NASA
coordinates4D = []
def find_firecluster(lat, lon, DOYsin, DOYcos):
    coordinates4D = [lat, lon, DOYsin, DOYcos]
    closest = tree.query([coordinates4D], p = 4)
    index = closest[1][0]
    return closest

### <font color="#C71585"> Function to locate Closest Coordinates and Pull data features from target DF to main DF </font>

In [37]:
 ### clusterdf will be panda dataframe that has cluster coordinates to cross-ref with the     
def findclustermatch(clusterdf, targetdf):
    clustertotargetdf = pd.DataFrame()
    clustindexnum = 0
    clustindexnum = int(clustindexnum)
    while clustindexnum < len(clusterdf):
        #capture data from target/cluster to then use to find target match in other dataframe:
        clusteryear = clusterdf.iloc[clustindexnum]['year']
        clustermonth = clusterdf.iloc[clustindexnum]['month']
        clusterdoy = clusterdf.iloc[clustindexnum]['DOY']
        clusterlat = clusterdf.iloc[clustindexnum]['latitude']
        clusterlong = clusterdf.iloc[clustindexnum]['longitude']
        clusterdoy_sin = clusterdf.iloc[clustindexnum]['DOY_sin']
        clusterdoy_cos = clusterdf.iloc[clustindexnum]['DOY_cos']
        clusterfrp = clusterdf.iloc[clustindexnum]['frp']
        clusterbrightness = clusterdf.iloc[clustindexnum]['brightness']
        
        # Running cluster find function:
        distance_location = find_firecluster(clusterlat, clusterlong, clusterdoy_sin, clusterdoy_cos)
        targetlocation = distance_location[1]
        targetlocation = int(targetlocation)
        
        #Lines to pull data from the target dataframe, will need to be customized to the target DF. 
        target_lat = targetdf.iloc[targetlocation]['LATITUDE']
        target_long = targetdf.iloc[targetlocation]['LONGITUDE']
        target_firename = targetdf.iloc[targetlocation]['FIRE_NAME']
        target_firesize = targetdf.iloc[targetlocation]['FIRE_SIZE']
        target_fireclass = targetdf.iloc[targetlocation]['FIRE_SIZE_CLASS']
        target_year = targetdf.iloc[targetlocation]['FIRE_YEAR']
        target_discdoy = targetdf.iloc[targetlocation]['DISCOVERY_DOY']
        target_contdoy = targetdf.iloc[targetlocation]['CONT_DOY']
        target_state = targetdf.iloc[targetlocation]['STATE']
        
        #adfsd
        #'cluster_id': [clusterid], 
        cdftemp = pd.DataFrame({'cluster_index':[clustindexnum], 'clusteryear':[clusteryear], 'clusterdoy':[clusterdoy], 'clusterlat': [clusterlat], 'clusterlog': [clusterlong], 'clusterfrp': [clusterfrp], 'clusterbrightness': [clusterbrightness], 'clustermonth': [clustermonth], 
                                'distance': distance_location[0], 'resultrow': distance_location[1], 'targetlat':[target_lat], 'targetlong':[target_long], 'firename':[target_firename],
                                'firesize':[target_firesize], 'fireclass':[target_fireclass],'fire_year':[target_year], 'discovery_doy': [target_discdoy], 'contain_doy': [target_contdoy], 'state': [target_state]})
        
        clustertotargetdf = clustertotargetdf.append(cdftemp, ignore_index = True)
        clustindexnum = clustindexnum + 1
    return clustertotargetdf

In [38]:
cluster_targetmatch = findclustermatch(NASAFiresClusters20xx,USDA_188MfireDf_20xx)
# cluster_targetmatch.sort_values(by='discovery_doy')

In [16]:
# Get the count of instances where the doys are exact matches:
cluster_targetmatch = cluster_targetmatch[(cluster_targetmatch['discovery_doy'] - cluster_targetmatch['clusterdoy']).abs() < 8]

### <font color="#C71585"> Write data back to disk </font>

In [17]:
# if file does not exist write header 
if not os.path.isfile('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/ConnectedNASA-USDA.csv'):
   cluster_targetmatch.to_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/ConnectedNASA-USDA.csv', header='column_names')
else: # else it exists so append without writing the header
   cluster_targetmatch.to_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/ConnectedNASA-USDA.csv', mode='a', header=False)

In [19]:
ToGroup = pd.read_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/ConnectedNASA-USDA.csv')

In [20]:
# Group Data by State
GroupedDf = ToGroup.groupby(['state']).mean()

In [22]:
GroupedDf.to_csv('/Users/nahidmacbook/Documents/DataScience/Data-Wildfire/GroupedConnectedNASA-USDA.csv', encoding='utf-8')