# Determine Date Ranges across Clusters and Assign Range back to Cluster or Unique Fire Table:

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from geopy.distance import great_circle
import math
from scipy import spatial

## Loading data files: Emissions Data with cluster_reference, Centerpoints for Clusters and Clusterpoints

In [3]:
# Show up to 40 columns when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', 40)

# Load the 2010 DBScan cluster export (named after the file it is read from).
FiresClusters2010 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/Emissions2010_DBScan_Clusters.csv')

In [75]:
# Read the emissions points file and discard the saved index column
# ('Unnamed: 0') in one chained expression.
emission_points_0315 = pd.read_csv(
    '/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/emissions_03_15_v522.csv',
    encoding='utf-8',
).drop(columns=['Unnamed: 0'])

In [57]:
# Load the cluster centerpoints table (read as UTF-8).
centerpoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/centerpoints_03_15_v5.17.csv', encoding='utf-8')

In [68]:
# Load the table of individual cluster member points (read as UTF-8).
clusterpoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_refv5.17.csv', encoding='utf-8')

In [58]:
centerpoints.shape

(109299, 25)

In [65]:
centerpoints.head()

Unnamed: 0.1,Unnamed: 0,cluster_ref,cluster_reference,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,0,2006_0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0
1,1,1,2006_1,96.0,2006,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0
2,2,2,2006_2,164.0,2006,157.0,-80.5513,25.6198,6845.0,1.0,1.0,62500.0,226.077462,210.25204,352.592671,14.717643,0.563475,2.144571,0.0,0.0,2.0,1.0,15.0,2.0,0.0
3,3,3,2006_3,718.0,2006,49.0,-81.1249,26.0165,8222.0,3.0,1600.0,62500.0,6220.097576,2017.491592,3377.280924,155.346853,4.539356,24.00815,0.011189,0.081394,4.0,1.0,15.0,2.0,0.0
4,4,4,2006_4,444.0,2006,155.0,-81.6018,26.0708,8217.0,3.0,1140.0,62500.0,7646.952224,3842.452183,6432.264954,295.868818,8.645517,45.725181,0.008857,0.084716,2.0,1.0,15.0,2.0,1.0


In [69]:
clusterpoints.head()

Unnamed: 0.1,Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
0,0,25.1903,-81.0394,0,2011,2011_0
1,1,25.19,-81.037,0,2011,2011_0
2,2,25.1896,-81.0346,0,2011,2011_0
3,3,25.1926,-81.039,0,2011,2011_0
4,4,25.1922,-81.0366,0,2011,2011_0


### Compared the number of clusters that came out of the DBScan with the number of unique cluster_references in the emissions data. There is a difference of 136 (109299 vs. 109163), which means that when the cluster references were merged onto the emissions data, 136 references were not used. This may be because some lat/long points are very close together and the reference was simply applied incorrectly; it does no harm.

In [76]:
# Number of distinct cluster references among the cluster points.
# dropna=False counts a missing value as its own entry, which is what
# len(Series.unique()) reports as well.
x = clusterpoints['cluster_reference'].nunique(dropna=False)
print(x)

109299


In [101]:
emission_points_0315['cluster_reference'].nunique()

109163

### Using Groupby to create a new DF to break out days within a cluster, by year. A few ways were tried:

#### Groupby that returns a Series of counts indexed by year / cluster_reference / doy:

In [181]:
# Count the emission rows for every (year, cluster_reference, doy) combination.
# The result is a Series named 'doy' whose index has those three levels.
em_doy_list = (
    emission_points_0315
    .groupby(['year', 'cluster_reference', 'doy'])['doy']
    .count()
)

In [185]:
em_doy_list[0:5]

year  cluster_reference  doy
2003  2003_0             95      2
                         97      7
                         98      5
                         99      1
                         100    17
Name: doy, dtype: int64

### Setting Groupby to identify Year/Cluster_Reference/each Cluster DOY per Cluster/DOY_Count in cluster and placing it into a pandas DataFrame.

In [193]:
# One row per (year, cluster_reference, doy): 'doy_count' is the number of
# emission rows in that group. The grouped 'doy' column is renamed to
# 'cluster_doy' so it cannot be confused with other 'doy' columns later.
em_doy_list2 = (
    emission_points_0315
    .groupby(["year", "cluster_reference", "doy"])
    .size()
    .to_frame(name="doy_count")
    .reset_index()
    .rename(columns={"doy": "cluster_doy"})
)

In [196]:
print(em_doy_list2.shape)
em_doy_list2[0:10]

(318540, 4)


Unnamed: 0,year,cluster_reference,cluster_doy,doy_count
0,2003,2003_0,95,2
1,2003,2003_0,97,7
2,2003,2003_0,98,5
3,2003,2003_0,99,1
4,2003,2003_0,100,17
5,2003,2003_0,101,3
6,2003,2003_0,102,3
7,2003,2003_0,105,8
8,2003,2003_0,107,4
9,2003,2003_0,130,99


#### Using Merge to bring over the CenterPoint DOY from the centerpoint data and the related Long/Lat:

In [197]:
# Left-join the centerpoint doy and coordinates onto the per-day counts,
# matching rows on year and cluster_reference. Every row of em_doy_list2 is
# kept; clusters without a centerpoint row get missing values.
em_doy_list_centerpoints = pd.merge(
    em_doy_list2,
    centerpoints[['year', 'cluster_reference', 'doy', 'longitude', 'latitude']],
    how='left',
    on=['year', 'cluster_reference'],
)

In [201]:
# The 'doy' column brought in by the merge belongs to the centerpoint, so
# label it as such.
em_doy_list_centerpoints_May23 = em_doy_list_centerpoints.rename(
    {'doy': 'centerpoint_doy'},
    axis='columns',
)

In [204]:
print(em_doy_list_centerpoints_May23.shape)
em_doy_list_centerpoints_May23[0:20]

(318540, 7)


Unnamed: 0,year,cluster_reference,cluster_doy,doy_count,centerpoint_doy,longitude,latitude
0,2003,2003_0,95,2,131.0,-80.686,25.4148
1,2003,2003_0,97,7,131.0,-80.686,25.4148
2,2003,2003_0,98,5,131.0,-80.686,25.4148
3,2003,2003_0,99,1,131.0,-80.686,25.4148
4,2003,2003_0,100,17,131.0,-80.686,25.4148
5,2003,2003_0,101,3,131.0,-80.686,25.4148
6,2003,2003_0,102,3,131.0,-80.686,25.4148
7,2003,2003_0,105,8,131.0,-80.686,25.4148
8,2003,2003_0,107,4,131.0,-80.686,25.4148
9,2003,2003_0,130,99,131.0,-80.686,25.4148


In [205]:
# Export the merged per-day table to CSV. No index argument is passed, so
# to_csv also writes the DataFrame index as an unnamed leading column.
em_doy_list_centerpoints_May23.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/cluster_ref_alldoy_May23.csv', encoding='utf-8')

----

## Code to determine the Min, Max, Unique Count and a ratio to analyze each cluster to see if it needs to be broken apart for better clustering accuracy. 

In [84]:
# Per-(year, cluster_reference) summaries of doy, all taken from one grouped
# column selection instead of re-grouping for each statistic.
doy_by_cluster = emission_points_0315.groupby(
    ["year", "cluster_reference"], as_index=False
)["doy"]
em_max = doy_by_cluster.max()
em_min = doy_by_cluster.min()
# NOTE(review): despite its name, em_median holds the MEAN doy, not the median.
em_median = doy_by_cluster.mean()

# Peek at the first two rows of each summary.
print(em_max[0:2])
print(em_min[0:2])
print(em_median[0:2])
# NOTE(review): cluster_count is not created in any cell shown here —
# presumably it comes from an earlier or out-of-order cell; confirm.
print(cluster_count[0:2])

   year cluster_reference  doy
0  2003            2003_0  131
1  2003            2003_1  228
   year cluster_reference  doy
0  2003            2003_0   95
1  2003            2003_1  206
   year cluster_reference         doy
0  2003            2003_0  121.037037
1  2003            2003_1  211.722222
cluster_reference
2003_0    162
2003_1     36
Name: doy, dtype: int64


In [85]:
# Number of distinct doy values per (year, cluster_reference), wrapped in a
# DataFrame so it can be used column-wise later.
ccount = pd.DataFrame(
    emission_points_0315
    .groupby(["year", "cluster_reference"], as_index=False)["doy"]
    .nunique()
)

In [92]:
# Rename em_doy's 'doy' column to 'total_count'.
# NOTE(review): em_doy is not created in any cell shown here — presumably it
# comes from an earlier or out-of-order cell; confirm before re-running.
em_doy = em_doy.rename(columns = {'doy':'total_count'})

(109163, 1)


Unnamed: 0,doy
0,11
1,18
2,1
3,14
4,1


In [107]:
em_doy.head()

Unnamed: 0,cluster_reference,doy
0,2003_0,162
1,2003_1,36
2,2003_10,3
3,2003_100,9434
4,2003_1000,5


In [108]:
# Add the per-cluster maximum doy. The assignment aligns on index labels, so
# em_doy and em_max must have matching indexes.
em_doy['doy_max'] = em_max['doy']

In [109]:
# Add the per-cluster minimum doy. The assignment aligns on index labels, so
# em_doy and em_min must have matching indexes.
em_doy['doy_min'] = em_min['doy']

In [110]:
# Add the number of distinct doy values per cluster. The assignment aligns on
# index labels, so em_doy and ccount must have matching indexes.
em_doy['unique_count'] = ccount['doy']

In [111]:
print(em_doy.shape)
em_doy.head()

(109163, 5)


Unnamed: 0,cluster_reference,doy,doy_max,doy_min,unique_count
0,2003_0,162,131,95,11
1,2003_1,36,228,206,18
2,2003_10,3,352,352,1
3,2003_100,9434,304,248,14
4,2003_1000,5,107,107,1


In [206]:
# Ratio of the doy range (doy_max - doy_min) to the number of unique DOYs in
# the cluster. A high ratio means a large variance between the time range and
# the number of distinct burn days. Values above 2.5 may require splitting.
em_doy['range_ucount'] = (em_doy['doy_max'] - em_doy['doy_min']) / em_doy['unique_count']

In [135]:
em_doy.head(10)

Unnamed: 0,cluster_reference,total_count,doy_max,doy_min,unique_count,range_ucount
0,2003_0,162,131,95,11,3.272727
1,2003_1,36,228,206,18,1.222222
2,2003_10,3,352,352,1,0.0
3,2003_100,9434,304,248,14,4.0
4,2003_1000,5,107,107,1,0.0
5,2003_1001,14,295,282,4,3.25
6,2003_1002,6,122,117,2,2.5
7,2003_1003,3,349,349,1,0.0
8,2003_1004,2,104,104,1,0.0
9,2003_1005,1,104,104,1,0.0


In [199]:
# Keep only the clusters whose range-to-unique-days ratio exceeds 3.
em_doy1 = em_doy.loc[em_doy['range_ucount'] > 3]
em_doy1.shape

(12175, 6)

In [207]:
em_doy1[5000:5010]

Unnamed: 0,cluster_reference,total_count,doy_max,doy_min,unique_count,range_ucount
44064,2008_1668,4,155,147,2,4.0
44072,2008_1675,6,84,53,3,10.333333
44076,2008_168,194,73,4,3,23.0
44081,2008_1684,8,195,184,2,5.5
44087,2008_169,121,345,6,9,37.666667
44089,2008_1691,8,319,307,3,4.0
44092,2008_1694,61,220,81,3,46.333333
44101,2008_1701,14,269,246,7,3.285714
44111,2008_1710,51,70,28,6,7.0
44118,2008_1717,4,190,181,2,4.5


## Select Year for Run

In [None]:
# Year used to filter the emission points for this run.
pullyear = 2004

In [None]:
# Restrict the emission points to the year selected in pullyear.
emission_points_xx = emission_points_0315.loc[emission_points_0315['year'] == pullyear]
print(emission_points_xx.shape)
emission_points_xx.head()

In [None]:
# Extract the cluster number from each cluster_reference ('2004_12' -> '12').
# The original statement was truncated and called .lstrip directly on a
# Series, which has no such method. str.split is used instead of str.lstrip
# because lstrip removes a *set of characters*, not a prefix, and would also
# eat leading digits of the cluster number (e.g. '2004_204' -> '').
# NOTE(review): the result is a Series of strings; cast with .astype(int) if
# numeric cluster numbers are needed.
cluster_number = emission_points_xx['cluster_reference'].str.split('_').str[-1]

In [None]:
# Show the last three rows of Clusterpoints2010.
# NOTE(review): Clusterpoints2010 is not created in any cell shown here —
# presumably it was loaded in an earlier or out-of-order cell; confirm.
Clusterpoints2010.tail(3)

----

# Define Function to Pull DOY from Emissions Data:
### Function will go through Cluster Points file from DBScan and for each index row, will pull [Coordinates, ClusterNum and Year], and match these to the Emissions data file. Function will pull the DOY feature and place it into a new dataframe.

In [None]:
def pulldoyfromtargetdf(datapointsforcluster, targetdf_doy):
    """Look up the day-of-year (doy) values for every cluster point.

    For each row of ``datapointsforcluster`` the function reads the cluster
    number, coordinates and year, finds the rows of ``targetdf_doy`` with the
    same year, latitude and longitude, and records their ``doy`` values.

    Parameters
    ----------
    datapointsforcluster : pandas.DataFrame
        Cluster points. Must contain the columns ``'ClusterNum'``, ``'0'``
        (compared against latitude), ``'1'`` (compared against longitude)
        and ``'Year'``.
    targetdf_doy : pandas.DataFrame
        Table to search. Must contain the columns ``'year'``, ``'latitude'``,
        ``'longitude'`` and ``'doy'``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index and
        the columns ``latitude``, ``longitude``, ``ClusterNum``, ``Year`` and
        ``doy``. Each ``doy`` cell holds the Series of matching ``doy``
        values (empty when nothing matched). An empty input yields an empty
        DataFrame.

    Raises
    ------
    KeyError
        If a required column is missing from either input.
    ValueError
        If a ``'Year'`` value cannot be converted to ``int``.
    """
    # Per-row frames are collected and concatenated once at the end.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied the accumulated frame on every iteration.
    row_frames = []

    for position in range(len(datapointsforcluster)):
        # Fetch the row once instead of once per field.
        point = datapointsforcluster.iloc[position]
        cluster_num = point['ClusterNum']
        lat = point['0']
        lon = point['1']
        year = int(point['Year'])

        # Coordinates are compared with exact equality, so both tables must
        # carry identical floating-point values for a point to match.
        matches = (
            (targetdf_doy['year'] == year)
            & (targetdf_doy['latitude'] == lat)
            & (targetdf_doy['longitude'] == lon)
        )
        matched_doy = targetdf_doy.loc[matches, 'doy']

        # NOTE(review): the whole matching Series is stored in a single cell
        # (kept for backward compatibility with existing callers).
        row_frames.append(pd.DataFrame({
            'latitude': [lat],
            'longitude': [lon],
            'ClusterNum': [cluster_num],
            'Year': [year],
            'doy': [matched_doy],
        }))

    if not row_frames:
        # pd.concat raises on an empty list; mirror the original's empty result.
        return pd.DataFrame()
    return pd.concat(row_frames, ignore_index=True)
    