# Determine Date Ranges across Clusters and Assign Range back to Cluster or Unique Fire Table:

In [3]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from geopy.distance import great_circle
import math
from scipy import spatial

## Loading data files: Emissions Data with cluster_reference, Centerpoints for Clusters and Clusterpoints

In [2]:
pd.set_option('display.max_columns', 40)

FiresClusters2010 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/Emissions2010_DBScan_Clusters.csv')

In [None]:
emission_points_0315 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/emissions_03_15_v5.22.csv', encoding='utf-8')
#emission_points_0315 = emission_points_0315.drop(columns=['Unnamed: 0'])

In [None]:
# Row counts per (year, doy). `groupby` is a method, so the keys must be passed
# in a call; subscripting it (`groupby[...]`) raises TypeError.
emission_points_0315.groupby(['year', 'doy']).size()

In [57]:
centerpoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/centerpoints_03_15_v5.17.csv', encoding='utf-8')

In [10]:
clusterpoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_refv5.17.csv', encoding='utf-8')

In [None]:
centerpoints.shape

In [None]:
centerpoints.head()

In [None]:
clusterpoints.head()

# Determining Days within Emissions Cluster:

### Compared the number of clusters that came out of the DBScan with the number of unique cluster_references in the Emissions Data. There is a difference of 136, which means that 136 cluster references went unused when the references were merged onto the Emissions Data. This may be because some lat/longs are very close together, so a reference was applied to the wrong point, but this does no harm.

In [None]:
# Number of distinct cluster_reference values in clusterpoints
# (dropna=False so that NaN, if present, is counted as one value).
x = clusterpoints['cluster_reference'].nunique(dropna=False)
print(x)

In [None]:
emission_points_0315['cluster_reference'].nunique()

### Using Groupby to create a new DF to break out days within a cluster, by year. A few ways were tried:

#### Groupby that goes into a list:

In [61]:
em_doy_list = emission_points_0315.groupby(['year','cluster_reference', 'doy'])['doy'].count()
#em_doy_list = pd.DataFrame(em_doy_list)

NameError: name 'emission_points_0315' is not defined

In [None]:
em_doy_list[0:5]

### Setting Groupby to identify Year / Cluster_Reference / each DOY per cluster / DOY count in the cluster, and placing the result into a pandas DataFrame.

In [None]:
# One row per (year, cluster_reference, doy) with the number of emission points
# on that day ('doy_count'); the day column is renamed to 'cluster_doy'.
em_doy_list2 = (
    emission_points_0315
    .groupby(["year", "cluster_reference", "doy"])
    .size()
    .reset_index(name="doy_count")
    .rename(columns={"doy": "cluster_doy"})
)

In [None]:
print(em_doy_list2.shape)
em_doy_list2[0:10]

In [None]:
em_doy_list2['cluster_doy'] == 0

#### Using Merge to bring over the CenterPoint DOY from the centerpoint data and the related Long/Lat:

In [None]:
# Left-join the centerpoint doy and coordinates onto every per-day cluster row,
# matching on year and cluster_reference (the key names are the same on both sides).
em_doy_list_centerpoints = em_doy_list2.merge(
    centerpoints[['year', 'cluster_reference', 'doy', 'longitude', 'latitude']],
    how='left',
    on=['year', 'cluster_reference'],
)

In [None]:
# Renaming the doy from the centerpoint data. 
em_doy_list_centerpoints_May23 = em_doy_list_centerpoints.rename(columns = {'doy':'centerpoint_doy'})

In [None]:
print(em_doy_list_centerpoints_May23.shape)
em_doy_list_centerpoints_May23[0:10]

em_doy_list_centerpoints_May23.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/cluster_ref_alldoy_May23.csv', encoding='utf-8')

In [None]:
em_doy_list_centerpoints_May23.loc[em_doy_list_centerpoints_May23['cluster_reference'] == '2003_1990']

----

## Code to determine the Min, Max, Unique Count and a ratio to analyze each cluster to see if it needs to be broken apart for better clustering accuracy. 

In [None]:
em_doy_list_centerpoints_May23.head()

In [None]:
# Per-(year, cluster_reference) summaries of the cluster_doy values in each cluster.
# The grouped selection is built once and reused for every aggregate.
_doy_by_cluster = em_doy_list_centerpoints_May23.groupby(
    ["year", "cluster_reference"], as_index=False
)["cluster_doy"]

em_max = _doy_by_cluster.max()
em_min = _doy_by_cluster.min()
# NOTE(review): despite its name, em_median holds the mean cluster_doy -- confirm which was intended.
em_median = _doy_by_cluster.mean()
em_doy = _doy_by_cluster.count()

print(em_max[0:2])
print(em_min[0:2])
print(em_median[0:2])

In [None]:
ccount = em_doy_list_centerpoints_May23.groupby(["year", "cluster_reference"], as_index=False)["cluster_doy"].nunique()
ccount = pd.DataFrame(ccount)

In [None]:
em_doy = em_doy.rename(columns = {'cluster_doy':'total_doy_count'})
em_doy.head()

In [None]:
em_doy['doy_max'] = em_max['cluster_doy']

In [None]:
em_doy['doy_min'] = em_min['cluster_doy']

In [None]:
em_doy['unique_count'] = ccount['cluster_doy']

In [None]:
print(em_doy.shape)
em_doy.head()

In [None]:
# Ratio of the cluster's day span (doy_max - doy_min) to its number of unique DOYs.
# A large value means a large variance between the range of time and the days actually
# present in the cluster. Values above 2.5 may require splitting.
# NOTE(review): the filter applied in the following cell uses > 3 rather than 2.5 -- confirm the intended threshold.
em_doy['range_ucount'] = (em_doy['doy_max'] - em_doy['doy_min']) / em_doy['unique_count']

In [None]:
em_doy.head(10)

In [None]:
em_doy1 = em_doy[(em_doy.range_ucount > 3)]
em_doy1.shape

In [None]:
em_doy1[5000:5010]

----

## Determine Days within NASA M6 Data, by Emissions Cluster Reference

In [4]:
# Load NASA M6 Data where the Cluster Ref from Emissions has been tagged. 
NASA_M6_Cluster = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/NASA-to-ClustserRef/NASA_M6_ClusterRef_0315.csv')

In [5]:
print(NASA_M6_Cluster.shape)
NASA_M6_Cluster.head()

(1655831, 12)


Unnamed: 0.1,Unnamed: 0,source_lat,source_long,source_year,source_doy,distance,resultrow,targetlat,targetlong,target_doy,target_year,target_clusterref
0,0,31.2482,-85.7553,2004,1,31.187321,6628,31.7278,-85.7461,8,2004,2004_3025
1,1,35.4981,-78.1687,2004,1,25.878173,11904,35.851,-78.1843,7,2004,2004_5314
2,2,35.4299,-77.3891,2004,1,51.30373,11805,35.2661,-77.1672,24,2004,2004_5256
3,3,31.7169,-81.8174,2004,1,25.301232,446,32.0002,-81.6341,5,2004,2004_1108
4,4,32.1135,-83.533,2004,1,43.224914,7221,32.0524,-83.2523,18,2004,2004_3253


In [17]:
NASA_M6 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/4 NASA/DL_FIRE_M6_110066/fire_archive_M6_110066.csv')

In [18]:
#NASA M6 is larger because it holds 2003 to 2019 data. 
NASA_M6.shape

(2159468, 15)

In [23]:
# Assign the DOY, Month and Year from the Acq Date.
# The date column is parsed once and reused for all three features.
# `infer_datetime_format` is dropped: it is deprecated since pandas 2.0,
# where format inference is the default behavior.
NASA_M6['datetime'] = pd.to_datetime(NASA_M6['acq_date'])
NASA_M6['doy'] = NASA_M6['datetime'].dt.dayofyear
NASA_M6['month'] = NASA_M6['datetime'].dt.month
NASA_M6['year'] = NASA_M6['datetime'].dt.year

In [24]:
NASA_M6[0:15]

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,datetime,doy,month,year
0,38.8142,-93.5539,300.8,1.0,1.0,2003-01-01,423,Terra,MODIS,33,6.2,267.0,10.4,N,0,2003-01-01,1,1,2003
1,19.3739,-155.113,318.8,3.1,1.7,2003-01-01,914,Terra,MODIS,97,6.2,288.7,98.4,N,2,2003-01-01,1,1,2003
2,19.3723,-155.1197,316.0,3.1,1.7,2003-01-01,914,Terra,MODIS,90,6.2,288.3,85.0,N,2,2003-01-01,1,1,2003
3,19.3589,-155.1107,325.2,3.1,1.7,2003-01-01,914,Terra,MODIS,100,6.2,292.0,131.8,N,2,2003-01-01,1,1,2003
4,19.3573,-155.1174,319.0,3.1,1.7,2003-01-01,914,Terra,MODIS,97,6.2,290.3,97.3,N,2,2003-01-01,1,1,2003
5,19.3292,-155.1029,340.8,3.1,1.7,2003-01-01,914,Terra,MODIS,100,6.2,291.8,240.5,N,2,2003-01-01,1,1,2003
6,19.3324,-155.0697,325.4,3.1,1.7,2003-01-01,914,Terra,MODIS,100,6.2,290.4,127.3,N,2,2003-01-01,1,1,2003
7,19.3439,-155.108,314.3,3.1,1.7,2003-01-01,914,Terra,MODIS,85,6.2,289.7,73.1,N,2,2003-01-01,1,1,2003
8,19.3162,-155.0919,306.3,3.1,1.7,2003-01-01,914,Terra,MODIS,38,6.2,292.1,40.3,N,2,2003-01-01,1,1,2003
9,19.3252,-155.0829,348.1,1.1,1.0,2003-01-01,1156,Aqua,MODIS,100,6.2,296.8,67.4,N,2,2003-01-01,1,1,2003


In [25]:
#Grouping to see the count by year of records. 
NASA_M6.groupby( [ "year"] ).size().to_frame(name = 'count').reset_index()

Unnamed: 0,year,count
0,2003,114471
1,2004,158385
2,2005,171160
3,2006,126737
4,2007,142420
5,2008,119797
6,2009,115174
7,2010,98322
8,2011,123867
9,2012,137295


In [47]:
print(NASA_M6_Cluster.shape)
NASA_M6_Cluster.head(20)

(1655831, 12)


Unnamed: 0.1,Unnamed: 0,source_lat,source_long,source_year,source_doy,distance,resultrow,targetlat,targetlong,target_doy,target_year,target_clusterref
0,0,31.2482,-85.7553,2004,1,31.187321,6628,31.7278,-85.7461,8,2004,2004_3025
1,1,35.4981,-78.1687,2004,1,25.878173,11904,35.851,-78.1843,7,2004,2004_5314
2,2,35.4299,-77.3891,2004,1,51.30373,11805,35.2661,-77.1672,24,2004,2004_5256
3,3,31.7169,-81.8174,2004,1,25.301232,446,32.0002,-81.6341,5,2004,2004_1108
4,4,32.1135,-83.533,2004,1,43.224914,7221,32.0524,-83.2523,18,2004,2004_3253
5,5,32.1266,-83.5316,2004,1,43.214864,7221,32.0524,-83.2523,18,2004,2004_3253
6,6,27.1612,-80.6466,2004,1,26.57798,15203,26.9801,-80.5778,13,2004,2004_713
7,7,26.6293,-80.7891,2004,1,11.147062,14713,26.4788,-80.8618,3,2004,2004_646
8,8,26.6178,-80.7779,2004,1,11.548614,14713,26.4788,-80.8618,3,2004,2004_646
9,9,26.6193,-80.7906,2004,1,10.719999,14713,26.4788,-80.8618,3,2004,2004_646


In [48]:
NASA_M6_Cluster.loc[NASA_M6_Cluster['source_long'] == -81.8174]

Unnamed: 0.1,Unnamed: 0,source_lat,source_long,source_year,source_doy,distance,resultrow,targetlat,targetlong,target_doy,target_year,target_clusterref
3,3,31.7169,-81.8174,2004,1,25.301232,446,32.0002,-81.6341,5,2004,2004_1108
34809,34809,31.8713,-81.8174,2004,119,16.74649,455,32.0625,-81.7843,125,2004,2004_1111
508142,21790,26.898,-81.8174,2011,80,8.099922,12970,26.8679,-81.8701,83,2011,2011_38
522023,35671,32.9037,-81.8174,2011,103,11.901792,22446,32.9545,-81.7289,107,2011,2011_6987
609006,122654,27.7121,-81.8174,2011,354,40.689143,17502,27.1032,-81.6466,364,2011,2011_55
1149702,22918,26.9389,-81.8174,2012,86,3.172286,4034,26.9383,-81.8494,86,2012,2012_21
1437198,26135,29.1494,-81.8174,2013,99,0.579483,9933,29.1478,-81.8233,99,2013,2013_439
1518253,4842,26.9142,-81.8174,2007,39,4.131155,8482,26.8881,-81.8569,39,2007,2007_31


In [49]:
#Compare NASA M6 with cluster references to the initial NASA M6, looks good by year. 
NASA_M6_Cluster.groupby( [ "source_year"] ).size().to_frame(name = 'count').reset_index()

Unnamed: 0,source_year,count
0,2003,114471
1,2004,158385
2,2005,171160
3,2006,126737
4,2007,142420
5,2008,119797
6,2009,115174
7,2010,98322
8,2011,123867
9,2012,137295


In [51]:
#Checking for dupes. 
dupes1 = NASA_M6_Cluster[NASA_M6_Cluster.duplicated(['source_doy', 'source_year', 'source_lat', 'source_long'], keep=False)]
print(dupes1.shape)
dupes1[0:5]

(42, 12)


Unnamed: 0.1,Unnamed: 0,source_lat,source_long,source_year,source_doy,distance,resultrow,targetlat,targetlong,target_doy,target_year,target_clusterref
44096,44096,65.9323,-144.9847,2004,171,2053.774617,3935,48.6682,-122.8444,234,2004,2004_2028
44288,44288,65.9323,-144.9847,2004,171,2053.774617,3935,48.6682,-122.8444,234,2004,2004_2028
65702,65702,64.4546,-145.4268,2004,184,1978.564147,3935,48.6682,-122.8444,234,2004,2004_2028
66694,66694,64.4546,-145.4268,2004,184,1978.564147,3935,48.6682,-122.8444,234,2004,2004_2028
96502,96502,66.5226,-151.7462,2004,220,2334.072337,3935,48.6682,-122.8444,234,2004,2004_2028


dupes2 = NASA_M6_Cluster.drop_duplicates(subset=['source_lat', 'source_long', 'source_year', 'source_doy'], keep='first')
#dupe3 = NASA_M6_Cluster.drop_duplicates(subset=['resultrow'], keep='first')

dupes2.groupby( [ "source_year"] ).size().to_frame(name = 'count').reset_index()

In [54]:
NASA_M6_within150 = NASA_M6_Cluster[(NASA_M6_Cluster.distance < 150)]
NASA_M6_within150.shape

(1366022, 12)

### Creating Groupby List by Year and Cluster Reference to get unique days in cluster. 

In [66]:
# One row per (source_year, target_clusterref, source_doy) with the number of
# NASA_M6_within150 rows on that day ('doy_count').
nasa_doy_list = (
    NASA_M6_within150
    .groupby(["source_year", "target_clusterref", "source_doy"])
    .size()
    .reset_index(name="doy_count")
)

In [71]:
print(nasa_doy_list.shape)
nasa_doy_list.head()

(409326, 4)


Unnamed: 0,source_year,target_clusterref,source_doy,doy_count
0,2003,2003_0,98,6
1,2003,2003_0,100,2
2,2003,2003_0,128,11
3,2003,2003_0,129,1
4,2003,2003_0,130,18


In [68]:
nasa_doy_list_centerpoints = nasa_doy_list.merge(right=centerpoints.loc[:,['year','cluster_reference', 'doy', 'longitude', 'latitude']],
                   how='left',
                   left_on=['source_year','target_clusterref'],
                   right_on=['year','cluster_reference'])

In [72]:
nasa_doy_list_centerpoints = nasa_doy_list_centerpoints.drop(columns=['year','cluster_reference'])
print(nasa_doy_list_centerpoints.shape)
nasa_doy_list_centerpoints.head()

(409326, 7)


Unnamed: 0,source_year,target_clusterref,source_doy,doy_count,doy,longitude,latitude
0,2003,2003_0,98,6,131.0,-80.686,25.4148
1,2003,2003_0,100,2,131.0,-80.686,25.4148
2,2003,2003_0,128,11,131.0,-80.686,25.4148
3,2003,2003_0,129,1,131.0,-80.686,25.4148
4,2003,2003_0,130,18,131.0,-80.686,25.4148


In [81]:
nasa_doy_list_centerpoints.rename(columns={'source_year':'year', 'target_clusterref':'cluster_reference', 'source_doy':'cluster_doy', 'doy':'centerpoint_doy'}, inplace=True)
nasa_doy_list_centerpoints.head()

Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,2003,2003_0,98,6,131.0,-80.686,25.4148
1,2003,2003_0,100,2,131.0,-80.686,25.4148
2,2003,2003_0,128,11,131.0,-80.686,25.4148
3,2003,2003_0,129,1,131.0,-80.686,25.4148
4,2003,2003_0,130,18,131.0,-80.686,25.4148


In [62]:
em_doy_list_centerpoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/cluster_ref_alldoy_May23.csv')

In [78]:
#em_doy_list_centerpoints = em_doy_list_centerpoints.drop(columns=['Unnamed: 0'])
em_doy_list_centerpoints.head(15)

Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,2003,2003_0,97,7,131.0,-80.686,25.4148
2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,2003,2003_0,99,1,131.0,-80.686,25.4148
4,2003,2003_0,100,17,131.0,-80.686,25.4148
5,2003,2003_0,101,3,131.0,-80.686,25.4148
6,2003,2003_0,102,3,131.0,-80.686,25.4148
7,2003,2003_0,105,8,131.0,-80.686,25.4148
8,2003,2003_0,107,4,131.0,-80.686,25.4148
9,2003,2003_0,130,99,131.0,-80.686,25.4148


In [82]:
# Stack the emissions-based and NASA-based per-day cluster rows into one table
# with a fresh 0..n-1 index. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent.
nasa_emissions_doycombined = pd.concat(
    [em_doy_list_centerpoints, nasa_doy_list_centerpoints],
    ignore_index=True,
)

In [89]:
print(nasa_emissions_doycombined.shape)
nasa_emissions_doycombined = nasa_emissions_doycombined.sort_values(['cluster_reference', 'cluster_doy'])
nasa_emissions_doycombined[0:20]

(727866, 7)


Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,2003,2003_0,97,7,131.0,-80.686,25.4148
318540,2003,2003_0,98,6,131.0,-80.686,25.4148
2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,2003,2003_0,99,1,131.0,-80.686,25.4148
318541,2003,2003_0,100,2,131.0,-80.686,25.4148
4,2003,2003_0,100,17,131.0,-80.686,25.4148
5,2003,2003_0,101,3,131.0,-80.686,25.4148
6,2003,2003_0,102,3,131.0,-80.686,25.4148
7,2003,2003_0,105,8,131.0,-80.686,25.4148


In [91]:
nasa_emissions_doycombined.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/nasa_emissions_doycombined_May28.csv')

In [90]:
#Emission_Cluster_doy = Emission_Cluster_doy.drop(columns=['Unnamed: 0'])
Emission_Cluster_doy.head()

Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,2003,2003_0,97,7,131.0,-80.686,25.4148
2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,2003,2003_0,99,1,131.0,-80.686,25.4148
4,2003,2003_0,100,17,131.0,-80.686,25.4148


# Define Function to Pull DOY from Emissions Data:
### Function will go through Cluster Points file from DBScan and for each index row, will pull [Coordinates, ClusterNum and Year], and match these to the Emissions data file. Function will pull the DOY feature and place it into a new dataframe.

In [None]:
def pulldoyfromtargetdf(datapointsforcluster, targetdf_doy):
    """Collect the matching ``doy`` values for every cluster point.

    For each row of ``datapointsforcluster`` the rows of ``targetdf_doy`` with
    the same year, latitude and longitude are selected and their ``doy``
    values are stored alongside the point.

    Parameters
    ----------
    datapointsforcluster : pandas.DataFrame
        Cluster points. Must have the columns ``'0'`` (read as latitude),
        ``'1'`` (read as longitude), ``'ClusterNum'`` and ``'Year'``.
    targetdf_doy : pandas.DataFrame
        Table to search. Must have the columns ``'year'``, ``'latitude'``,
        ``'longitude'`` and ``'doy'``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns
        ``latitude``, ``longitude``, ``ClusterNum``, ``Year`` and ``doy``.
        Each ``doy`` cell holds the pandas Series of ``doy`` values of the
        matching target rows: empty when nothing matches, and more than one
        value when several target rows match.

    Notes
    -----
    Coordinates are compared with exact float equality, so both frames must
    carry identical coordinate values for a row to match.
    """
    # Loop-invariant column lookups on the target table.
    target_year = targetdf_doy['year']
    target_lat = targetdf_doy['latitude']
    target_long = targetdf_doy['longitude']
    target_doy = targetdf_doy['doy']

    # Rows are accumulated in plain lists and the frame is built once at the
    # end: DataFrame.append was removed in pandas 2.0, and appending row by
    # row re-copied the whole frame on every iteration.
    latitudes = []
    longitudes = []
    cluster_nums = []
    years = []
    doys = []

    for position in range(len(datapointsforcluster)):
        # Build the row once instead of once per field.
        point = datapointsforcluster.iloc[position]
        latitude = point['0']
        longitude = point['1']
        year = int(point['Year'])

        matches = (
            (target_year == year)
            & (target_lat == latitude)
            & (target_long == longitude)
        )

        latitudes.append(latitude)
        longitudes.append(longitude)
        cluster_nums.append(point['ClusterNum'])
        years.append(year)
        doys.append(target_doy[matches])

    return pd.DataFrame({
        'latitude': latitudes,
        'longitude': longitudes,
        'ClusterNum': cluster_nums,
        'Year': years,
        'doy': doys,
    })
    