# Notebook for Assigning Cluster Reference ID and Reviewing for Duplicates

In [5]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import math
import seaborn as sns
from scipy import spatial
import matplotlib.pyplot as plt
import datetime

## One-Time - Combining Cluster Point CSVs:

#Combine all csv's in a directory, into a new file.
import os 
import glob 
import pandas as pd 

os.chdir('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/EmissionClusterPoints')

extension = 'csv' 
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

#combine all files in the list 
clusterpoints_03_15_v2 = pd.concat([pd.read_csv(f) for f in all_filenames ])

###export to csv 
clusterpoints_03_15_v2.to_csv( "clusterpoints_03_15_v2.csv", index=False, encoding='utf-8-sig')


## Load related DataFrames: Fire Cluster Center Points, Cluster Points and 1.88m Fire Records:

In [6]:
# Expanding number of columns:
pd.set_option('display.max_columns', 40)

In [None]:
# Loading 1.88m fire record table:
usdafiredb_onemil = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/1_188m_USDA Fire Database/Fire_Program_Analysis__Fire_Occurrence_Database_Feature_Layer.csv')

In [7]:
FireCenterPoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/centerpoints_03_15_v2.csv')

In [8]:
emdata_0315 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/9 Missoula Emisions Data RDS-2017-0039/Emissions_Year/emissions_2003to2015_cleanv1.csv')

In [38]:
ClusterPoints = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_v2.csv', encoding='utf-8')

## Emissions Data

In [None]:
emdata_0315.head(2)

In [None]:
emdata_20xx = emdata_0315[emdata_0315.year == 2004]
emdata_20xx.head(2)

## Fire Cluster Points Data:

In [10]:
print(ClusterPoints.shape)
ClusterPoints[200000:200005]

(5960572, 5)


Unnamed: 0.1,Unnamed: 0,0,1,ClusterNum,Year
200000,200000,31.7406,-109.2987,361,2011
200001,200001,31.7409,-109.2961,361,2011
200002,200002,31.7412,-109.2935,361,2011
200003,200003,31.7415,-109.2909,361,2011
200004,200004,31.7418,-109.2882,361,2011


## Creating New Fire Cluster Point Dataframe:
#### Dropping unused columns
#### Renaming column headers
#### Assigning new Cluster Reference to cluster points

In [39]:
ClusterPoints = ClusterPoints.drop(columns=['Unnamed: 0'])
ClusterPoints[0:1]

Unnamed: 0,0,1,ClusterNum,Year
0,25.1903,-81.0394,0,2011


In [40]:
ClusterPoints = ClusterPoints.rename(columns={'0': 'latitude', '1': 'longitude', 'Year': 'year'})
ClusterPoints[0:5]

Unnamed: 0,latitude,longitude,ClusterNum,year
0,25.1903,-81.0394,0,2011
1,25.19,-81.037,0,2011
2,25.1896,-81.0346,0,2011
3,25.1926,-81.039,0,2011
4,25.1922,-81.0366,0,2011


In [41]:
ClusterPoints['cluster_reference'] = ClusterPoints['year'].astype(str) + "_" + ClusterPoints['ClusterNum'].astype(str)
ClusterPoints[0:10]

Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
0,25.1903,-81.0394,0,2011,2011_0
1,25.19,-81.037,0,2011,2011_0
2,25.1896,-81.0346,0,2011,2011_0
3,25.1926,-81.039,0,2011,2011_0
4,25.1922,-81.0366,0,2011,2011_0
5,25.1918,-81.0342,0,2011,2011_0
6,25.1995,-81.0701,0,2011,2011_0
7,25.1991,-81.0677,0,2011,2011_0
8,25.1952,-81.0411,0,2011,2011_0
9,25.1948,-81.0386,0,2011,2011_0


In [42]:
ClusterPoints[1000:1002]
print(ClusterPoints.shape)

(5960572, 5)


In [45]:
# Determining unique cluster reference points.  Using Unique will include blanks. 
f = len(ClusterPoints['cluster_reference'].unique())
print(f)

109321


In [None]:
# Writing new dataset of cluster points with cluster reference ID to CSV:
ClusterPoints.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_ref1.csv', encoding='utf-8')

## Fire Center Points Data:
#### Renaming columns, adding cluster_reference id, moving column. 

In [15]:
print(FireCenterPoints.shape)
FireCenterPoints[0:2]

(109321, 23)


Unnamed: 0.1,Unnamed: 0,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,22.0,2006.0,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0
1,1,96.0,2006.0,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0


In [16]:
FireCenterPoints = FireCenterPoints.rename(columns={'Unnamed: 0': 'cluster_ref'})

In [17]:
FireCenterPoints = FireCenterPoints.astype({"year": int}) 

In [18]:
FireCenterPoints.head(1)

Unnamed: 0,cluster_ref,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0


In [19]:
FireCenterPoints['cluster_reference'] = FireCenterPoints['year'].astype(str) + "_" + FireCenterPoints['cluster_ref'].astype(str)

In [20]:
FireCenterPoints[0:5]

Unnamed: 0,cluster_ref,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag,cluster_reference
0,0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0,2006_0
1,1,96.0,2006,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0,2006_1
2,2,164.0,2006,157.0,-80.5513,25.6198,6845.0,1.0,1.0,62500.0,226.077462,210.25204,352.592671,14.717643,0.563475,2.144571,0.0,0.0,2.0,1.0,15.0,2.0,0.0,2006_2
3,3,718.0,2006,49.0,-81.1249,26.0165,8222.0,3.0,1600.0,62500.0,6220.097576,2017.491592,3377.280924,155.346853,4.539356,24.00815,0.011189,0.081394,4.0,1.0,15.0,2.0,0.0,2006_3
4,4,444.0,2006,155.0,-81.6018,26.0708,8217.0,3.0,1140.0,62500.0,7646.952224,3842.452183,6432.264954,295.868818,8.645517,45.725181,0.008857,0.084716,2.0,1.0,15.0,2.0,1.0,2006_4


In [21]:
first_col = FireCenterPoints.pop('cluster_reference')
FireCenterPoints.insert(1, 'cluster_reference', first_col)
FireCenterPoints[0:2]

Unnamed: 0,cluster_ref,cluster_reference,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,2006_0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0
1,1,2006_1,96.0,2006,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0


-----

## Duplicates
### Center Points: identify duplicates in Centerpoint data using DOY, Year, and alternating Lat and Long to then merge cluster coordinates into a single cluster based on distance on the same day of review. 
#### Based on the manual review, create short list of cluster references to merge. 

In [None]:
centerpoint_dupes = FireCenterPoints[FireCenterPoints.duplicated(['doy', 'year', 'longitude'],keep=False)]
centerpoint_dupes_lat = FireCenterPoints[FireCenterPoints.duplicated(['doy', 'year', 'latitude'],keep=False)]

In [None]:
print(centerpoint_dupes_lat.shape)
centerpoint_dupes_lat[50:59]

In [None]:
### Research into single duplicate in the Fire CenterPoint Data:
day = 75
lg = -82.9603

FireCenterPoints.loc[(FireCenterPoints['year'] == 2013) & (FireCenterPoints['doy'] == day) &(FireCenterPoints['longitude'] == lg),['cluster_reference', 'year', 'doy', 'longitude', 'latitude']]

In [None]:
# Seeing how many cluster points are related to the cluster reference, in determining which to merge. 
c1 = '2010_7596'
c2 = '2010_7597'

a = ClusterPoints.loc[(ClusterPoints['cluster_reference'] == c1)]
b = ClusterPoints.loc[(ClusterPoints['cluster_reference'] == c2)]
print(len(a), len(b))
print(b[0:10])
print(a[0:10])

## Duplicate Centerpoints / Merging list:
- 2006_1874 same as 2006_1880 = merged 2006_1874 into 2006_1880.
         2006_3171 lat is off by 2 from 2006_3568
         2006_5430 not equal to '2006_10010'
- 2006_11021 merged into 2006_11012
- 2004_5765 merged into 2004_5772
- 2005_7122 merged into 2005_228
- 2003_3553 mergred into 2003_3572
- 2003_5931 merged into 2003_722
- 2008_2209 merged into 2008_2201
        '2008_4900' not eq to '2008_6046'
- 2008_7540 merged into 2008_7603
- 2009_1551 merged into 2009_1554
- 2009_1952 merged into 2009_1957
- 2009_2127 merged into 2009_2125
        '2009_5224' note eq to '2009_7086'
- 2013_4179 merged into 2013_4365

- 2007_4737 merged into 2007_4730
- 2007_8121 merged into 2007_8120
- 2015_3984 merged into 2015_3968
- 2011_9524 merged into 2011_9537
- 2011_9726 merged into 2011_9722
- 2009_2255 merged into 2009_2256
- 2009_4723 merged into 2009_4724
- 2010_4123 merged into 2010_4124
- 2010_6123 merged into 2010_6133
- 2010_7596 merged into 2010_7597

109321 less 22 equals 109299.

In [54]:
print(ClusterPoints.shape)
print(len(ClusterPoints['cluster_reference'].unique()))
ClusterPoints.head(1)

(5960572, 5)
109318


Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
0,25.1903,-81.0394,0,2011,2011_0


In [53]:
# Method to replace the first reference with the second. 
ClusterPoints['cluster_reference'].mask(ClusterPoints['cluster_reference'] == '2004_5765', '2004_5772', inplace=True)

In [49]:
print(ClusterPoints.shape)
ClusterPoints.loc[(ClusterPoints['cluster_reference'] == '2006_1874')]

(5960572, 5)


Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference


In [None]:
ClusterPoints.reset_index(inplace = True, drop = True) 

In [None]:
ClusterPoints.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_refv5.csv', encoding='utf-8')

### Remove centerpoint rows for duplicate cluster references as noted above. 

In [None]:
FireCenterPoints.shape

In [None]:
adrop = '2010_7596'

In [None]:
FireCenterPoints.drop(FireCenterPoints[FireCenterPoints['cluster_reference'] == adrop].index, inplace = True) 

In [None]:
FireCenterPoints.shape

In [None]:
FireCenterPoints.reset_index(inplace = True, drop = True) 

In [None]:
FireCenterPoints.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/centerpoints_03_15_v5.csv', encoding='utf-8')

## Emissions Data for Database:

In [None]:
print(emdata_0315.shape)
emdata_0315.head(2)

In [None]:
print(emdata_0315.shape)
emdata_0315 = emdata_0315.drop(columns=['Unnamed: 0'])
emdata_0315.head(2)

----

# Assigning Cluster References to the Emissions Dataset:

### Taking the Emissions Dataset (emdata_0315) and by lat/long/year, bringing over the cluster references from the ClusterPoints df. 
## Test 1
### Prepare Emissions and DeDuplicate

In [None]:
emdatatest = emdata_0315[(emdata_0315.year == 2008) & (emdata_0315.doy < 184)]
ClusterPointstest = ClusterPoints[(ClusterPoints.year == 2008)]
print(emdatatest.shape)
print (ClusterPointstest.shape)
print(emdatatest.head(3), ClusterPointstest.head(3))

In [None]:
emdatatest.reset_index(inplace = True, drop = True) 

In [None]:
emdatatest.tail()

In [None]:
dupes1 = emdatatest[emdatatest.duplicated(['doy', 'year', 'longitude', 'latitude'])]
dupes2 = emdatatest.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')

In [None]:
print(dupes2.shape)
dupes2[0:1]

### Test Merge:  
#### When doing the merge, sometimes duplicates will occur, therefore you need to de-duplicate the list. 

In [None]:
emdatatestv2 = dupes2.merge(right=ClusterPointstest.loc[:,['latitude', 'longitude','year', 'cluster_reference']],
                   how='left',
                   left_on=['latitude','longitude','year'],
                   right_on=['latitude','longitude','year'])

emdatatestv2 = pd.merge(emdatatest, 
                   ClusterPointstest[['latitude', 'longitude', 'year', 'cluster_reference']],
                     on =['latitude', 'longitude', 'year'],
                    how ='left')

In [None]:
print(emdatatestv2.shape)
emdatatestv2[0:2]

In [None]:
cluster_dupes = emdatatestv2[emdatatestv2.duplicated(['doy', 'year', 'longitude', 'latitude'],)]

In [None]:
print(cluster_dupes.shape)
cluster_dupes[0:20]

In [None]:
dupes3 = emdatatestv2.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')

In [None]:
dupes3.shape

--------

## Final - Merge of Cluster Reference to Emissions Data:
### Taking the Emissions Dataset (emdata_0315) and by lat/long/year, bringing over the cluster references from the ClusterPoints df. 

In [55]:
ClusterPoints2 = pd.read_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/clusterpoints_03_15_refv5.17.csv', encoding='utf-8')

In [56]:
print(emdata_0315.shape)
print(ClusterPoints.shape)

(5960572, 19)
(5960572, 5)


In [60]:
ClusterPoints2.tail(5)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
5960567,5960567,48.8939,-121.1569,9262,2008,2008_9262
5960568,5960568,48.8945,-121.1537,9262,2008,2008_9262
5960569,5960569,48.8951,-121.1504,9262,2008,2008_9262
5960570,5960570,48.8967,-121.1545,9262,2008,2008_9262
5960571,5960571,48.9108,-122.0411,9263,2008,2008_9263


In [59]:
clustercount = len(ClusterPoints2['cluster_reference'].unique())
print(clustercount)

109299


In [61]:
ClusterPoints2_nodupe = ClusterPoints2.drop_duplicates(subset=['year', 'longitude', 'latitude'], keep='first')

In [67]:
clustercount2 = len(ClusterPoints2_nodupe['cluster_reference'].unique())
print(clustercount2)
print(ClusterPoints2_nodupe['cluster_reference'].nunique())

109299
109299


In [25]:
emdata_0315_nodup = emdata_0315.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')

In [26]:
print(emdata_0315_nodup.shape)
emdata_0315_nodup[0:2]

(5936438, 19)


Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2


In [28]:
emdata_0315_ref5 = emdata_0315_nodup.merge(right=ClusterPoints.loc[:,['latitude', 'longitude','year','cluster_reference']],
                   how='left',
                   left_on=['latitude','longitude','year'],
                   right_on=['latitude','longitude','year'])

In [29]:
print(emdata_0315_ref5.shape)
emdata_0315_ref5[0:3]

(5990537, 20)


Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0
2,2,2008,359,-81.038,25.1981,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0


In [30]:
emclustercount = len(emdata_0315_ref5['cluster_reference'].unique())
print(emclustercount)

109164


In [31]:
emdata_0315_ref5_clean = emdata_0315_ref5.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')
print(emdata_0315_ref5_clean.shape)

(5936438, 20)


In [36]:
emclustercount2 = len(emdata_0315_ref5_clean['cluster_reference'].unique())
print(emclustercount2)

109164


In [63]:
emdata_0315_ref5_clean.head()

Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0
2,2,2008,359,-81.038,25.1981,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
3,4,2008,359,-81.0594,25.2035,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
4,5,2008,359,-81.057,25.2032,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0


thirdcol = emdata_0315_ref5_clean.pop('cluster_reference')
emdata_0315_ref5_clean.insert(2, 'cluster_reference', thirdcol)
emdata_0315_ref5_clean[0:20]

In [64]:
emdata_0315_ref5_clean.tail()

Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
5936433,7254529,2003,181,-119.3634,48.9603,1,1,62500.0,115.560666,107.471419,180.22957,7.522999,0.288023,1.096208,0.0,0.0,2,77,2,2003_428
5936434,7254530,2003,182,-119.3602,48.9609,2,2,0.0,359.23563,323.312067,542.194336,22.631845,0.866476,3.297783,0.0,0.0,2,77,1,2003_428
5936435,7254531,2003,181,-119.3642,48.9625,2,2,62500.0,324.937314,292.443583,490.427888,20.471051,0.783749,2.982925,0.0,0.0,2,77,2,2003_428
5936436,7254532,2003,182,-119.361,48.9631,2,2,62500.0,130.356018,117.320416,196.746338,8.212429,0.314419,1.196668,0.0,0.0,2,77,3,2003_428
5936437,7254533,2003,219,-121.9887,48.5002,3,1200,62500.0,6424.431099,4040.568775,6279.043877,537.395647,30.223454,92.124968,0.236728,0.198929,3,77,3,2003_6230


In [65]:
emdata_0315_ref5_clean.reset_index(inplace = True, drop = True) 

In [66]:
emdata_0315_ref5_clean.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/emissions_03_15_v522.csv', encoding='utf-8')

---
