# Assign Cluster Reference label from DBScan to Emissions and NASA M6 records.
Includes a review and removal of duplicate records. 

In [6]:
import pandas as pd
import numpy as np
import math
import datetime

from sqlalchemy import create_engine
import config
import config1

In [7]:
# Expanding number of columns:
pd.set_option('display.max_columns', 40)

### One-Time - Combining Cluster Point CSVs into single dataset of underlying emissions records with cluster references:

### Load related DataFrames: Fire Center Points from Emissions, Cluster Points, Emissions from DB. 

In [8]:
config.db_name  # Checking name of DB. 

'db27bvsdruzh45'

In [19]:
# Create sqlalchemy engine to access database:
engine = create_engine("mysql+mysqlconnector://{user}:{password}@{host}/{dbname}"
                       .format(user=config.db_user,
                               password=config.db_pass,
                               dbname=config.db_main,
                               host=config.db_host))

In [20]:
### Skip this block if reading from local disk:

#Create Query Statments
queryWeather = """
SELECT *
FROM Weather;
"""
queryEmissions = """
SELECT *
FROM Emissions_Data;
"""
queryNASA = """
SELECT *
FROM NASA;
"""

In [15]:
emdata_0315 = pd.read_sql(queryEmissions, engine)  # Bring in Emissions records (w/o Null Values) from database. 

In [7]:
fire_centerpoints = pd.read_csv('../data/centerpoints_03_15_v2.csv')

In [None]:
ClusterPoints = pd.read_csv('../data/clusterpoints_03_15_v2.csv', encoding='utf-8')

### Emissions Data

In [16]:
print(emdata_0315.shape)
emdata_0315.head(2)

(5960572, 20)


Unnamed: 0.1,index,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV
0,0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1
1,1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2


In [None]:
emdata_20xx = emdata_0315[emdata_0315.year == 2004]
emdata_20xx.head(2)

### Fire Cluster Points data from DBScan.
Detailed records of all underlying emissions records for each created cluster reference from DBScan. 

In [10]:
print(ClusterPoints.shape)
ClusterPoints[200000:200005]

(5960572, 5)


Unnamed: 0.1,Unnamed: 0,0,1,ClusterNum,Year
200000,200000,31.7406,-109.2987,361,2011
200001,200001,31.7409,-109.2961,361,2011
200002,200002,31.7412,-109.2935,361,2011
200003,200003,31.7415,-109.2909,361,2011
200004,200004,31.7418,-109.2882,361,2011


### Creating New Fire Cluster Point Dataframe:
1. Dropping unused columns
2. Renaming column headers
3. Assigning new Cluster Reference to cluster points

In [39]:
ClusterPoints = ClusterPoints.drop(columns=['Unnamed: 0'])  # Dropping column. 
ClusterPoints[0:1]

Unnamed: 0,0,1,ClusterNum,Year
0,25.1903,-81.0394,0,2011


In [40]:
ClusterPoints = ClusterPoints.rename(columns={'0': 'latitude', '1': 'longitude', 'Year': 'year'})  # Rename 0,1 to Lat/Long. 
ClusterPoints[0:5]

Unnamed: 0,latitude,longitude,ClusterNum,year
0,25.1903,-81.0394,0,2011
1,25.19,-81.037,0,2011
2,25.1896,-81.0346,0,2011
3,25.1926,-81.039,0,2011
4,25.1922,-81.0366,0,2011


In [41]:
### Create new Cluster Reference label from original cluster number from DBscan and Year. 
ClusterPoints['cluster_reference'] = ClusterPoints['year'].astype(str) + "_" + ClusterPoints['ClusterNum'].astype(str)
ClusterPoints[0:10]

Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
0,25.1903,-81.0394,0,2011,2011_0
1,25.19,-81.037,0,2011,2011_0
2,25.1896,-81.0346,0,2011,2011_0
3,25.1926,-81.039,0,2011,2011_0
4,25.1922,-81.0366,0,2011,2011_0
5,25.1918,-81.0342,0,2011,2011_0
6,25.1995,-81.0701,0,2011,2011_0
7,25.1991,-81.0677,0,2011,2011_0
8,25.1952,-81.0411,0,2011,2011_0
9,25.1948,-81.0386,0,2011,2011_0


In [42]:
# Checking shape. 
print(ClusterPoints.shape)

(5960572, 5)


In [45]:
# Determining unique cluster reference points.  Using .Unique() will include blanks. 
f = len(ClusterPoints['cluster_reference'].unique())
print(f)

109321


In [None]:
# Writing new dataset of emissions cluster points with cluster reference ID to CSV:
ClusterPoints.to_csv('../data/Emissions Cluster Data/clusterpoints_03_15_ref1.csv', encoding='utf-8')

## Fire Centerpoints from DBScan:
1. Renaming columns, 
2. Adding cluster_reference id, 
3. Moving column. 

In [15]:
print(fire_centerpoints.shape) # Checking shape, and rows match to the detailed cluster points dataframe. 
fire_centerpoints[0:2]

(109321, 23)


Unnamed: 0.1,Unnamed: 0,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,22.0,2006.0,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0
1,1,96.0,2006.0,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0


In [16]:
fire_centerpoints = fire_centerpoints.rename(columns={'Unnamed: 0': 'cluster_ref'})

In [17]:
fire_centerpoints = fire_centerpoints.astype({"year": int}) 

In [18]:
fire_centerpoints.head(1)

Unnamed: 0,cluster_ref,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0


In [19]:
### Create new Cluster Reference label from original cluster number from DBscan and Year. 
fire_centerpoints['cluster_reference'] = fire_centerpoints['year'].astype(str) + "_" + fire_centerpoints['cluster_ref'].astype(str)

In [20]:
fire_centerpoints[0:5]

Unnamed: 0,cluster_ref,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag,cluster_reference
0,0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0,2006_0
1,1,96.0,2006,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0,2006_1
2,2,164.0,2006,157.0,-80.5513,25.6198,6845.0,1.0,1.0,62500.0,226.077462,210.25204,352.592671,14.717643,0.563475,2.144571,0.0,0.0,2.0,1.0,15.0,2.0,0.0,2006_2
3,3,718.0,2006,49.0,-81.1249,26.0165,8222.0,3.0,1600.0,62500.0,6220.097576,2017.491592,3377.280924,155.346853,4.539356,24.00815,0.011189,0.081394,4.0,1.0,15.0,2.0,0.0,2006_3
4,4,444.0,2006,155.0,-81.6018,26.0708,8217.0,3.0,1140.0,62500.0,7646.952224,3842.452183,6432.264954,295.868818,8.645517,45.725181,0.008857,0.084716,2.0,1.0,15.0,2.0,1.0,2006_4


In [21]:
# Move cluster reference label to front of columns. 
first_col = fire_centerpoints.pop('cluster_reference')
fire_centerpoints.insert(1, 'cluster_reference', first_col)
fire_centerpoints[0:2]

Unnamed: 0,cluster_ref,cluster_reference,id,year,doy,longitude,latitude,grid10k,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burn_source,burnday_source,BSEV,BSEV_flag
0,0,2006_0,22.0,2006,208.0,-80.6759,25.4076,5461.0,1.0,1.0,62500.0,394.430634,366.82049,615.157961,25.677434,0.983079,3.741569,0.0,0.0,2.0,1.0,15.0,2.0,0.0
1,1,2006_1,96.0,2006,157.0,-80.5824,25.613,6845.0,1.0,1.0,62500.0,299.942136,278.946186,467.792755,19.526233,0.747576,2.845251,0.0,0.0,2.0,1.0,81.0,2.0,0.0


-----
---

# Manual Review of Centerpoint data for clusters that appear to be duplicates and can be merged.
1. Center Points: identify duplicates in Centerpoint data using DOY, Year, and alternating Lat and Long to then merge cluster coordinates into a single cluster based on distance on the same day of review. 
2. Based on the manual review, create short list of cluster references to merge. 

In [None]:
centerpoint_dupes = fire_centerpoints[fire_centerpoints.duplicated(['doy', 'year', 'longitude'],keep=False)]
centerpoint_dupes_lat = fire_centerpoints[fire_centerpoints.duplicated(['doy', 'year', 'latitude'],keep=False)]

In [None]:
print(centerpoint_dupes_lat.shape)
centerpoint_dupes_lat[50:59]

In [None]:
### Research into single duplicate in the Fire CenterPoint Data:
day = 75
lg = -82.9603

fire_centerpoints.loc[(FireCenterPoints['year'] == 2013) & (FireCenterPoints['doy'] == day) &(FireCenterPoints['longitude'] == lg),['cluster_reference', 'year', 'doy', 'longitude', 'latitude']]

In [None]:
# Seeing how many cluster points are related to the cluster reference, in determining which to merge. 
c1 = '2010_7596'
c2 = '2010_7597'

a = ClusterPoints.loc[(ClusterPoints['cluster_reference'] == c1)]
b = ClusterPoints.loc[(ClusterPoints['cluster_reference'] == c2)]
print(len(a), len(b))
print(b[0:10])
print(a[0:10])

## Duplicate Centerpoints / Merging list based on manual review:
- 2006_1874 same as 2006_1880 = merged 2006_1874 into 2006_1880.
         2006_3171 lat is off by 2 from 2006_3568
         2006_5430 not equal to '2006_10010'
- 2006_11021 merged into 2006_11012
- 2004_5765 merged into 2004_5772
- 2005_7122 merged into 2005_228
- 2003_3553 mergred into 2003_3572
- 2003_5931 merged into 2003_722
- 2008_2209 merged into 2008_2201
        '2008_4900' not eq to '2008_6046'
- 2008_7540 merged into 2008_7603
- 2009_1551 merged into 2009_1554
- 2009_1952 merged into 2009_1957
- 2009_2127 merged into 2009_2125
        '2009_5224' note eq to '2009_7086'
- 2013_4179 merged into 2013_4365

- 2007_4737 merged into 2007_4730
- 2007_8121 merged into 2007_8120
- 2015_3984 merged into 2015_3968
- 2011_9524 merged into 2011_9537
- 2011_9726 merged into 2011_9722
- 2009_2255 merged into 2009_2256
- 2009_4723 merged into 2009_4724
- 2010_4123 merged into 2010_4124
- 2010_6123 merged into 2010_6133
- 2010_7596 merged into 2010_7597

109321 less 22 equals 109299.

In [None]:
# Review of the Cluster Points dataset to perform changes in cluster references that were made in the centerpoint review above. 
print(ClusterPoints.shape)
print(len(ClusterPoints['cluster_reference'].unique()))
ClusterPoints.head(1)

In [53]:
# Method to replace the first reference with the second in the cluster points data. 
ClusterPoints['cluster_reference'].mask(ClusterPoints['cluster_reference'] == '2004_5765', '2004_5772', inplace=True)

In [49]:
print(ClusterPoints.shape) # Check to confirm you removed the cluster reference that was moved into the new reference label. 
ClusterPoints.loc[(ClusterPoints['cluster_reference'] == '2006_1874')]

(5960572, 5)


Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference


In [None]:
ClusterPoints.reset_index(inplace = True, drop = True) 

## Save updated Cluster Points data into new Dataframe/CSV for further use. 

In [None]:
# Save new cluster points file into CSV. 
ClusterPoints.to_csv('../data/clusterpoints_03_15_refv5.17.csv', encoding='utf-8')

### Remove centerpoint rows for duplicate cluster references as noted above. 

In [None]:
fire_centerpoints.shape

In [None]:
adrop = '2010_7596'  # Cycle through centerpoint cluster references that need to be removed. 

In [None]:
fire_centerpoints.drop(FireCenterPoints[FireCenterPoints['cluster_reference'] == adrop].index, inplace = True) # Cycle through centerpoint cluster references that need to be removed. 

In [None]:
fire_centerpoints.shape

In [None]:
fire_centerpoints.reset_index(inplace = True, drop = True) 

In [None]:
fire_centerpoints.to_csv('../data/Emissions Cluster Data/centerpoints_03_15_v5.csv', encoding='utf-8') # Save new centerpoints data to csv. 

----
----

# Assigning Cluster References to the Emissions Dataset:

In [27]:
# Bringing back up Emissions Data and removing first column. 
print(emdata_0315.shape)
emdata_0315 = emdata_0315.drop(columns=['Unnamed: 0'])
emdata_0315.head(2)

(5960572, 20)


Unnamed: 0,index,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2


### Merge of Cluster Reference to Emissions Data:
Taking the Emissions Dataset (emdata_0315) and by lat/long/year, bringing over the cluster references from the ClusterPoints df. 

In [55]:
ClusterPoints2 = pd.read_csv('../data/clusterpoints_03_15_refv5.17.csv', encoding='utf-8')  # Taking new cluster points file created above, to then assign reference over to Emissions records. 

In [56]:
# Checking dataframes.
print(emdata_0315.shape)
print(ClusterPoints.shape)

(5960572, 19)
(5960572, 5)


In [60]:
ClusterPoints2.tail(5)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,ClusterNum,year,cluster_reference
5960567,5960567,48.8939,-121.1569,9262,2008,2008_9262
5960568,5960568,48.8945,-121.1537,9262,2008,2008_9262
5960569,5960569,48.8951,-121.1504,9262,2008,2008_9262
5960570,5960570,48.8967,-121.1545,9262,2008,2008_9262
5960571,5960571,48.9108,-122.0411,9263,2008,2008_9263


In [59]:
# New number of centerpoints per labels in cluster point df due to merging close centerpoints. 
clustercount = len(ClusterPoints2['cluster_reference'].unique())
print(clustercount)

109299


In [61]:
ClusterPoints2_nodupe = ClusterPoints2.drop_duplicates(subset=['year', 'longitude', 'latitude'], keep='first')  # Taking out duplicates from cluster points records. 

In [67]:
clustercount2 = len(ClusterPoints2_nodupe['cluster_reference'].unique())
print(clustercount2)
print(ClusterPoints2_nodupe['cluster_reference'].nunique())

109299
109299


### Remove duplicate records from Emissions dataset. 

In [25]:
emdata_0315_nodup = emdata_0315.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')

In [26]:
print(emdata_0315_nodup.shape)
emdata_0315_nodup[0:2]

(5936438, 19)


Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2


In [28]:
# Merge over cluster reference from underlying cluster points records to the original Emissions dataset. 
emdata_0315_ref5 = emdata_0315_nodup.merge(right=ClusterPoints.loc[:,['latitude', 'longitude','year','cluster_reference']],
                   how='left',
                   left_on=['latitude','longitude','year'],
                   right_on=['latitude','longitude','year'])

In [29]:
print(emdata_0315_ref5.shape)
emdata_0315_ref5[0:3]

(5990537, 20)


Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0
2,2,2008,359,-81.038,25.1981,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0


In [31]:
emdata_0315_ref5_clean = emdata_0315_ref5.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')  # Remove duplicates after merge. 
print(emdata_0315_ref5_clean.shape)

(5936438, 20)


In [36]:
# Review of cluster references assigned to Emissions records. Noted that 135 centerpoint references did not port over, however, all records did receive a label. 
emclustercount2 = len(emdata_0315_ref5_clean['cluster_reference'].unique())
print(emclustercount2)

109164


In [63]:
emdata_0315_ref5_clean.head()

Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
0,0,2008,359,-81.0384,25.1958,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
1,1,2008,359,-81.0404,25.1984,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0
2,2,2008,359,-81.038,25.1981,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
3,4,2008,359,-81.0594,25.2035,3,1600,0.0,6220.097576,1999.75772,3347.594423,153.981344,4.499455,23.797117,0.023231,0.082115,3,81,1,2008_0
4,5,2008,359,-81.057,25.2032,3,1600,62500.0,6220.097576,2041.37434,3417.260644,157.185824,4.593092,24.292355,0.022757,0.080441,3,81,2,2008_0


thirdcol = emdata_0315_ref5_clean.pop('cluster_reference')
emdata_0315_ref5_clean.insert(2, 'cluster_reference', thirdcol)
emdata_0315_ref5_clean[0:20]

In [64]:
emdata_0315_ref5_clean.tail()

Unnamed: 0.1,Unnamed: 0,year,doy,longitude,latitude,covertype,fuelcode,area_burned,prefire_fuel,consumed_fuel,ECO2,ECO,ECH4,EPM2.5,cwd_frac,duff_frac,fuel_moisture_class,burnday_source,BSEV,cluster_reference
5936433,7254529,2003,181,-119.3634,48.9603,1,1,62500.0,115.560666,107.471419,180.22957,7.522999,0.288023,1.096208,0.0,0.0,2,77,2,2003_428
5936434,7254530,2003,182,-119.3602,48.9609,2,2,0.0,359.23563,323.312067,542.194336,22.631845,0.866476,3.297783,0.0,0.0,2,77,1,2003_428
5936435,7254531,2003,181,-119.3642,48.9625,2,2,62500.0,324.937314,292.443583,490.427888,20.471051,0.783749,2.982925,0.0,0.0,2,77,2,2003_428
5936436,7254532,2003,182,-119.361,48.9631,2,2,62500.0,130.356018,117.320416,196.746338,8.212429,0.314419,1.196668,0.0,0.0,2,77,3,2003_428
5936437,7254533,2003,219,-121.9887,48.5002,3,1200,62500.0,6424.431099,4040.568775,6279.043877,537.395647,30.223454,92.124968,0.236728,0.198929,3,77,3,2003_6230


In [65]:
emdata_0315_ref5_clean.reset_index(inplace = True, drop = True) 

In [None]:
# Save emissions records to CSV for future model build. 
emdata_0315_ref5_clean.to_csv('/Users/AlfHaugen/Python/Wildfire_Data/FireExports/Emissions Cluster Data/emissions_03_15_v522.csv', encoding='utf-8')

---


# Assign Cluster Reference to NASA M6 Data:

In [24]:
NASA_M6 = pd.read_sql(queryNASA, engine)  # Bring in NASA records from database. 

In [5]:
# Import file from K-D Tree 'Connect_Datasets_Clusters notebook' that has the cluster reference assigned to the lat/long and year for the NASA M6 records. 
NASA_M6_Cluster = pd.read_csv('../data/NASA_M6_ClusterRef_0315.csv')

In [3]:
# Assign the DOY, Month and Year from the Acq Date:

#NASA_M6['doy'] = pd.DatetimeIndex(NASA_M6['acq_date']).day
NASA_M6['datetime'] = pd.to_datetime(NASA_M6['acq_date'], infer_datetime_format=True)
NASA_M6['doy'] = NASA_M6['datetime'].dt.dayofyear
NASA_M6['month'] = pd.DatetimeIndex(NASA_M6['acq_date']).month
NASA_M6['year'] = pd.DatetimeIndex(NASA_M6['acq_date']).year

In [25]:
print(NASA_M6.shape)
NASA_M6.head()

(2159441, 12)


Unnamed: 0,Key,cluster_ref,lat,lon,brightness,acq_date,acq_time,bright_t31,frp,daynight,year,doy
0,1,2003_1980,38.8142,-93.5539,300.8,2003-01-01,00:04:23,267.0,10.4,N,2003,1
1,2,unknown,19.3739,-155.113,318.8,2003-01-01,00:09:14,288.7,98.4,N,2003,1
2,3,unknown,19.3723,-155.1197,316.0,2003-01-01,00:09:14,288.3,85.0,N,2003,1
3,4,unknown,19.3589,-155.1107,325.2,2003-01-01,00:09:14,292.0,131.8,N,2003,1
4,5,unknown,19.3573,-155.1174,319.0,2003-01-01,00:09:14,290.3,97.3,N,2003,1


In [7]:
# Select only those NASA M6 records where the record was within 150 euclidean distance to the cluster that was assigned to it (or approx. 2.6km).
# This will reduce the number of NASA M6 records to be deployed in the model. 

NASA_M6_Cluster_under150 = NASA_M6_Cluster[NASA_M6_Cluster['distance'] < 150]
NASA_M6_Cluster_under150.shape

(1366022, 12)

In [9]:
NASA_M6_Cluster_under150 = NASA_M6_Cluster_under150.rename(columns={'source_lat': 'latitude', 'source_long': 'longitude', 'source_year': 'year','source_doy':'doy'}) # rename columns.
NASA_M6_Cluster_under150.head(20)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,year,doy,distance,resultrow,targetlat,targetlong,target_doy,target_year,target_clusterref
0,0,31.2482,-85.7553,2004,1,31.187321,6628,31.7278,-85.7461,8,2004,2004_3025
1,1,35.4981,-78.1687,2004,1,25.878173,11904,35.851,-78.1843,7,2004,2004_5314
2,2,35.4299,-77.3891,2004,1,51.30373,11805,35.2661,-77.1672,24,2004,2004_5256
3,3,31.7169,-81.8174,2004,1,25.301232,446,32.0002,-81.6341,5,2004,2004_1108
4,4,32.1135,-83.533,2004,1,43.224914,7221,32.0524,-83.2523,18,2004,2004_3253


In [10]:
NASA_M6_v2 = NASA_M6.merge(right=NASA_M6_Cluster_under150.loc[:,['latitude', 'longitude','year','doy', 'target_clusterref']],
                   how='left',
                   left_on=['latitude','longitude','year', 'doy'],
                   right_on=['latitude','longitude','year', 'doy'])

In [11]:
print(NASA_M6_v2.shape)
NASA_M6_v2.head()

(2159502, 20)


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,datetime,doy,month,year,target_clusterref
0,38.8142,-93.5539,300.8,1.0,1.0,2003-01-01,423,Terra,MODIS,33,6.2,267.0,10.4,N,0,2003-01-01,1,1,2003,2003_1980
1,19.3739,-155.113,318.8,3.1,1.7,2003-01-01,914,Terra,MODIS,97,6.2,288.7,98.4,N,2,2003-01-01,1,1,2003,
2,19.3723,-155.1197,316.0,3.1,1.7,2003-01-01,914,Terra,MODIS,90,6.2,288.3,85.0,N,2,2003-01-01,1,1,2003,
3,19.3589,-155.1107,325.2,3.1,1.7,2003-01-01,914,Terra,MODIS,100,6.2,292.0,131.8,N,2,2003-01-01,1,1,2003,
4,19.3573,-155.1174,319.0,3.1,1.7,2003-01-01,914,Terra,MODIS,97,6.2,290.3,97.3,N,2,2003-01-01,1,1,2003,


In [13]:
NASA_M6_v2.loc[NASA_M6_v2['target_clusterref'] == '2004_3071']  # Searching for a single cluster to double check assignment. 

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,datetime,doy,month,year,target_clusterref
118735,32.3560,-97.8140,309.5,1.1,1.1,2004-02-02,1732,Terra,MODIS,60,6.2,282.8,13.9,D,0,2004-02-02,33,2,2004,2004_3071
119373,32.0950,-97.4140,306.9,1.4,1.2,2004-02-07,1923,Aqua,MODIS,64,6.2,287.8,13.4,D,0,2004-02-07,38,2,2004,2004_3071
120047,32.6840,-98.3922,312.9,1.5,1.2,2004-02-16,1744,Terra,MODIS,73,6.2,287.4,22.7,D,0,2004-02-16,47,2,2004,2004_3071
120300,32.6518,-98.5524,310.2,1.3,1.1,2004-02-17,2000,Aqua,MODIS,61,6.2,293.5,10.5,D,0,2004-02-17,48,2,2004,2004_3071
120303,32.5699,-97.5225,323.7,1.5,1.2,2004-02-17,2000,Aqua,MODIS,81,6.2,296.4,31.8,D,0,2004-02-17,48,2,2004,2004_3071
120305,32.5673,-97.5165,330.3,1.5,1.2,2004-02-17,2000,Aqua,MODIS,86,6.2,296.0,43.8,D,0,2004-02-17,48,2,2004,2004_3071
120306,32.5715,-97.5069,313.6,1.5,1.2,2004-02-17,2000,Aqua,MODIS,69,6.2,294.8,15.6,D,0,2004-02-17,48,2,2004,2004_3071
120516,32.6377,-98.3569,304.2,3.1,1.7,2004-02-18,1905,Aqua,MODIS,37,6.2,292.4,20.6,D,0,2004-02-18,49,2,2004,2004_3071
120517,32.6532,-98.3551,306.0,3.1,1.7,2004-02-18,1905,Aqua,MODIS,54,6.2,292.5,28.2,D,0,2004-02-18,49,2,2004,2004_3071
121361,32.8146,-98.5304,303.1,2.9,1.6,2004-02-21,1803,Terra,MODIS,46,6.2,290.3,19.3,D,0,2004-02-21,52,2,2004,2004_3071


In [16]:
# Review of any duplicates. 
dupes5 = NASA_M6_v2[NASA_M6_v2.duplicated(['doy', 'year', 'longitude', 'latitude'],keep = False)]

In [17]:
dupes5[0:4] # Review of any duplicates. 

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,datetime,doy,month,year,target_clusterref
67555,44.5066,-121.7679,324.6,1.4,1.2,2003-08-24,951,Aqua,MODIS,10,6.2,285.5,33.1,N,0,2003-08-24,236,8,2003,2003_328
67556,44.5066,-121.7679,324.6,1.4,1.2,2003-08-24,951,Aqua,MODIS,10,6.2,285.5,33.1,N,0,2003-08-24,236,8,2003,2003_328
67673,44.5066,-121.7679,340.8,1.1,1.0,2003-08-24,2058,Aqua,MODIS,39,6.2,299.2,39.1,D,0,2003-08-24,236,8,2003,2003_328
67674,44.5066,-121.7679,340.8,1.1,1.0,2003-08-24,2058,Aqua,MODIS,39,6.2,299.2,39.1,D,0,2003-08-24,236,8,2003,2003_328


In [18]:
# Remove duplicates after merge between NASA M6 records and the records with the cluster reference assigned that were <150 euclidean distance. 
NASA_M6_v2 = NASA_M6_v2.drop_duplicates(subset=['doy', 'year', 'longitude', 'latitude'], keep='first')
print(NASA_M6_v2.shape)

(2159441, 20)


In [19]:
# Save new NASA M6 dataset with complete cluster reference assignment. 
NASA_M6_v2.to_csv('../data/NASA_M6_FullData_ClusterRef_May30.csv')