## Data Wrangling

Data wrangling is performed to gather and clean NASA data from various sources before using the cleaned data to predict wildfire with a machine learning model. To train the machine learning model, the following data is collected.

- fire location (label)
- temperature
- solar insolation
- rainfall

### Fire location

Source: https://firms.modaps.eosdis.nasa.gov/active_fire/#firms-txt <br>
Check [here](https://earthdata.nasa.gov/earth-observation-data/near-real-time/firms/v1-vnp14imgt#ed-viirs-375m-attributes) for attribute fields for VIIRS <br>

VIIRS is chosen over MODIS because resolution is higher. For more information, check [here](https://earthdata.nasa.gov/faq/firms-faq)




In [None]:
!pip install netcdf4

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
import requests
import matplotlib.pyplot as plt
from skimage import io
import cv2
import netCDF4 as nc
import pickle

# Mount google drive
drive.mount('/content/drive/')

In [5]:
# Define file path of fire location dataset file path
fpath_dataset = '/content/drive/Shared drives/NASA Space App/Code/Dataset'
fpath_fire_dataset = os.path.join(fpath_dataset, 'Fire', 'Aus_01_12_19_to_31_01_20.csv')

# Import fire dataset
df = pd.read_csv(fpath_fire_dataset)
df.head()

Unnamed: 0,latitude,longitude,bright_ti4,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_ti5,frp,daynight,type
0,-33.263584,150.323334,335.52,0.42,0.61,2019-12-01,258,N,VIIRS,n,1,298.2,5.97,D,0
1,-33.275116,150.324799,333.53,0.42,0.61,2019-12-01,258,N,VIIRS,n,1,297.07,5.1,D,0
2,-33.272636,150.3703,331.28,0.42,0.61,2019-12-01,258,N,VIIRS,n,1,297.03,1.22,D,0
3,-33.262867,150.546707,331.68,0.41,0.61,2019-12-01,258,N,VIIRS,n,1,301.0,3.47,D,0
4,-33.27985,150.344421,331.86,0.42,0.61,2019-12-01,258,N,VIIRS,n,1,297.38,6.89,D,0


In [6]:
# Drop night data
df = df[df.daynight == 'D'].reset_index(drop=True)

# Make sure all the night data is removed
print(df.daynight.unique())

# Save the data frame
fpath_dataset = '/content/drive/Shared drives/NASA Space App/Code/Dataset'
outpath = os.path.join(fpath_dataset, 'Fire', '[DAY] Aus_01_12_19_to_31_01_20.csv')

df.to_csv(outpath, index=False)

['D']


In [7]:
# Extract useful columns
cols = ['latitude', 'longitude', 'acq_date', 'acq_time', 'type']
df = df[cols]

df.head()

Unnamed: 0,latitude,longitude,acq_date,acq_time,type
0,-33.263584,150.323334,2019-12-01,258,0
1,-33.275116,150.324799,2019-12-01,258,0
2,-33.272636,150.3703,2019-12-01,258,0
3,-33.262867,150.546707,2019-12-01,258,0
4,-33.27985,150.344421,2019-12-01,258,0


In [8]:
# Classify whether each row corresponds to fire or not
df['fire'] = df['type'].apply(lambda x: 1 if x == 0 else 0)
df.head()

Unnamed: 0,latitude,longitude,acq_date,acq_time,type,fire
0,-33.263584,150.323334,2019-12-01,258,0,1
1,-33.275116,150.324799,2019-12-01,258,0,1
2,-33.272636,150.3703,2019-12-01,258,0,1
3,-33.262867,150.546707,2019-12-01,258,0,1
4,-33.27985,150.344421,2019-12-01,258,0,1


In [9]:
# Remove duplicated data for the same latitude and longitude in a single day

# Get unique dates
unique_acq_dates = df.acq_date.unique()

for acq_date in unique_acq_dates:
  df_acq_date = df[df.acq_date == acq_date]

  # Mark the first occurence as True and the rest as False
  bool_duplicates = df_acq_date.duplicated(subset=['latitude', 'longitude'])

  # Remove duplicates
  if sum(bool_duplicates) > 0:
    print('Duplicates on {}'.format(acq_date))

    # # Print duplicated rows
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    #   print(df_acq_date[bool_duplicates])

    df.drop(df_acq_date[bool_duplicates].index.values, inplace=True)

Duplicates on 2020-01-13


In [10]:
# Get more non-fire spots to tackle class imbalance problems

# Get current coordinates
current_coors_list = list(zip(df.latitude, df.longitude))
current_coors_list = [(round(x,1), round(y,1)) for x, y in current_coors_list]

# Create new coordinates
new_lat = np.linspace(-33, -21, 50)
new_long = np.linspace(135, 148, 50)
new_coors_list = []

for x in new_lat:
  for y in new_long:
    new_coors_list.append((round(x,1), round(y,1)))

# Determine new coordinates that are not among current coordinates and use 
# them as non-fire spots
non_fire_coors_list = list(set(new_coors_list) - set(current_coors_list ))

# Store in data frame
new_data = []

# Get unique dates
unique_acq_dates = df.acq_date.unique()

for lat, long in non_fire_coors_list:
    for acq_date in unique_acq_dates:
      new_data.append({'latitude': lat, 
                       'longitude': long,
                       'acq_date': acq_date,
                       'fire': 0})
      
df_non_fire = pd.DataFrame(new_data)

In [11]:
df = pd.concat([df, df_non_fire])
print(df)

         latitude   longitude    acq_date  acq_time  type  fire
0      -33.263584  150.323334  2019-12-01     258.0   0.0     1
1      -33.275116  150.324799  2019-12-01     258.0   0.0     1
2      -33.272636  150.370300  2019-12-01     258.0   0.0     1
3      -33.262867  150.546707  2019-12-01     258.0   0.0     1
4      -33.279850  150.344421  2019-12-01     258.0   0.0     1
...           ...         ...         ...       ...   ...   ...
152577 -30.100000  135.500000  2020-01-27       NaN   NaN     0
152578 -30.100000  135.500000  2020-01-28       NaN   NaN     0
152579 -30.100000  135.500000  2020-01-29       NaN   NaN     0
152580 -30.100000  135.500000  2020-01-30       NaN   NaN     0
152581 -30.100000  135.500000  2020-01-31       NaN   NaN     0

[433060 rows x 6 columns]


### Land surface temperature

Source: https://neo.sci.gsfc.nasa.gov/archive/csv/MOD_LSTD_D/ <br>

In [None]:
# Get land surface temperature for each row
temperature = []

# Get unique dates
unique_acq_dates = df.acq_date.unique()

# Define column and index labels for renaming later
column_labels = np.around(np.arange(0.1,180.1,0.1), 1)
column_labels = np.concatenate((-np.flip(column_labels), column_labels))
index_labels = np.around(np.arange(0.1,90.1,0.1), 1)
index_labels = np.concatenate((np.flip(index_labels), -index_labels))

for acq_date in unique_acq_dates:
  df_acq_date = df[df.acq_date == acq_date]

  # Define url for csv file containing land surface temperature
  url = 'https://neo.sci.gsfc.nasa.gov/archive/csv/MOD_LSTD_D/MOD_LSTD_D_{}.CSV.gz'.format(acq_date)

  # Retrive csv file using url
  response = requests.get(url)

  filename = url.split("/")[-1]
  with open(filename, "wb") as f:
      r = requests.get(url)
      f.write(r.content)

  df_temperature = pd.read_csv(filename, compression='gzip', header=None)

  # Rename index and column of the dataframe for the ease of accessing cell value
  df_temperature.columns = column_labels
  df_temperature.index = index_labels

  for lat, long in list(zip(df_acq_date.latitude, df_acq_date.longitude)):
    temperature.append(df_temperature.loc[round(lat, 1), round(long, 1)])

  print('All the land temperature for {} have been obtained'.format(acq_date))
    
df['land_temperature'] = temperature

In [13]:
# Remove rows with extremely high land temperature (i.e. 99999)
df = df[df.land_temperature != 99999].reset_index(drop=True)
df.describe()

Unnamed: 0,latitude,longitude,acq_time,type,fire,land_temperature
count,247224.0,247224.0,162180.0,162180.0,247224.0,247224.0
mean,-28.041478,141.335695,416.197009,0.028499,0.649593,39.990269
std,6.786669,9.056229,89.23574,0.288177,0.477098,6.217882
min,-42.82745,113.447006,228.0,0.0,0.0,6.417323
25%,-32.953738,136.9,344.0,0.0,0.0,36.45669
50%,-29.3,143.5,413.0,0.0,1.0,42.795277
75%,-23.4,148.775643,450.0,0.0,1.0,45.0
max,-9.114081,153.360321,707.0,3.0,1.0,45.0


### Solar Insolation

Source: https://neo.sci.gsfc.nasa.gov/archive/rgb/CERES_INSOL_D<br>

In [None]:
# Get net_radiation for each row
solar_insolation = []

# Get unique dates
unique_acq_dates = df.acq_date.unique()

# Define column and index labels for renaming later
column_labels = np.around(np.arange(0.25,180.25,0.25), 2)
column_labels = np.concatenate((-np.flip(column_labels), column_labels))
index_labels = np.around(np.arange(0.25,90.25,0.25), 2)
index_labels = np.concatenate((np.flip(index_labels), -index_labels))

url_old = ''

for acq_date in unique_acq_dates:
  df_acq_date = df[df.acq_date == acq_date]

  # Define url for PNG representing solar insolation
  url = 'https://neo.sci.gsfc.nasa.gov/archive/rgb/CERES_INSOL_D/CERES_INSOL_D_{}.PNG'.format(acq_date)
  
  # If url (or data) doesn't exist, then use the previous url (url_old). 
  # Otherwise, update url_old with current url for future use
  request = requests.get(url)
  if request.status_code == 200:
    url_old = url
  else:
    url = url_old
  
  # Retrive PNG using url
  img = io.imread(url)

  # Remove alpha channel of PNG image
  if len(img.shape) > 2 and img.shape[2] == 4:
      # Convert the image from RGBA2RGB
      img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)

  # Compress 3D into 2D by computing the euclidean distance for each pixel
  img_norm = np.linalg.norm(img, axis=2)

  # Convert numpy array to data frame
  df_solar_insolation = pd.DataFrame(data=img_norm)

  # Rename index and column of the dataframe for the ease of accessing cell value
  df_solar_insolation.columns = column_labels
  df_solar_insolation.index = index_labels

  for lat, long in list(zip(df_acq_date.latitude, df_acq_date.longitude)):
    # Round latitude and longitude to nearest 0.25
    lat_new = round(lat*4)/4
    long_new = round(long*4)/4

    solar_insolation.append(df_solar_insolation.loc[lat_new, long_new])

  print('All the solar insolation for {} have been obtained'.format(acq_date))
    
df['solar_insolation'] = solar_insolation

### Rainfall

Source: https://gpm1.gesdisc.eosdis.nasa.gov/data/GPM_L3/GPM_3IMERGDF.06/<br>
Check [here](https://docserver.gesdisc.eosdis.nasa.gov/public/project/GPM/IMERG_doc.06.pdf) for attribute fields for rainfall dataset

Check [here](https://disc.gsfc.nasa.gov/information/howto?title=How%20to%20Download%20Data%20Files%20from%20HTTPS%20Service%20with%20wget) for downloading .nc4 file using url and [here](https://towardsdatascience.com/read-netcdf-data-with-python-901f7ff61648) for reading .nc4 file

In [None]:
# Get urls for 2019_12 files in the parent directory
url = "https://gpm1.gesdisc.eosdis.nasa.gov/data/GPM_L3/GPM_3IMERGDF.06/2019/12"
files_2019_12 = !wget -q -nH -nd "https://gpm1.gesdisc.eosdis.nasa.gov/data/GPM_L3/GPM_3IMERGDF.06/2019/12" -O - | grep 3B-DAY.MS.MRG.3IMERG | awk -F'\"' '{print $4}'
files_2019_12 = [file for file in files_2019_12  if file.endswith('.nc4')]
files_2019_12 = [os.path.join(url, file) for file in files_2019_12]

# Save the urls in a text file for future download using wget
with open('2019_12_file.txt', 'w') as f:
    for file in files_2019_12:
        f.write("{}\n".format(file))

# Download files for nasa website
!wget --load-cookies C:\.urs_cookies --save-cookies C:\.urs_cookies --auth-no-challenge=on --keep-session-cookies --ask-user --ask-password --content-disposition -i 2019_12_file.txt

In [None]:
# Get urls for 2020_01 files in the parent directory
url = "https://gpm1.gesdisc.eosdis.nasa.gov/data/GPM_L3/GPM_3IMERGDF.06/2020/01"
files_2020_01 = !wget -q -nH -nd "https://gpm1.gesdisc.eosdis.nasa.gov/data/GPM_L3/GPM_3IMERGDF.06/2020/01" -O - | grep 3B-DAY.MS.MRG.3IMERG | awk -F'\"' '{print $4}'
files_2020_01 = [file for file in files_2020_01  if file.endswith('.nc4')]
files_2020_01 = [os.path.join(url, file) for file in files_2020_01]

# Save the urls in a text file for future download using wget
with open('2020_01_file.txt', 'w') as f:
    for file in files_2020_01:
        f.write("{}\n".format(file))

# Download files for nasa website
!wget --load-cookies C:\.urs_cookies --save-cookies C:\.urs_cookies --auth-no-challenge=on --keep-session-cookies --ask-user --ask-password --content-disposition -i 2020_01_file.txt

In [None]:
# Get rainfall for each row
rainfall = []

# Get unique dates
unique_acq_dates = df.acq_date.unique()

# Define column and index labels for renaming later
column_labels = np.around(np.arange(0.1,180.1,0.1), 1)
column_labels = np.concatenate((-np.flip(column_labels), column_labels))
index_labels = np.around(np.arange(0.1,90.1,0.1), 1)
index_labels = np.concatenate((np.flip(index_labels), -index_labels))

for acq_date in unique_acq_dates:
  df_acq_date = df[df.acq_date == acq_date]

  # Define name of the file containing rainfall
  filename = '3B-DAY.MS.MRG.3IMERG.{}-S000000-E235959.V06.nc4'.format(acq_date.replace('-', ''))

  # Retrive file
  ds = nc.Dataset(filename)

  # Retrive rainfall array
  arr_rainfall = np.rot90(ds['precipitationCal'][:].squeeze())
  
  # Replace missing value in rainfall array with NaN
  unmasked_arr_rainfall = arr_rainfall.data
  unmasked_arr_rainfall[unmasked_arr_rainfall == -9999.9] = np.nan

  # Convert numpy array to data frame
  df_rainfall = pd.DataFrame(data=arr_rainfall)

  # Rename index and column of the dataframe for the ease of accessing cell value
  df_rainfall.columns = column_labels
  df_rainfall.index = index_labels

  for lat, long in list(zip(df_acq_date.latitude, df_acq_date.longitude)):
    rainfall.append(df_rainfall.loc[round(lat, 1), round(long, 1)])

  print('All the rainfall for {} have been obtained'.format(acq_date))
    
df['rainfall'] = rainfall

In [18]:
df.describe()

Unnamed: 0,latitude,longitude,acq_time,type,fire,land_temperature,solar_insolation,rainfall
count,247224.0,247224.0,162180.0,162180.0,247224.0,247224.0,247224.0,247224.0
mean,-28.041478,141.335695,416.197009,0.028499,0.649593,39.990269,315.361273,0.815578
std,6.786669,9.056229,89.23574,0.288177,0.477098,6.217882,18.517741,3.521952
min,-42.82745,113.447006,228.0,0.0,0.0,6.417323,99.040396,0.0
25%,-32.953738,136.9,344.0,0.0,0.0,36.45669,307.798635,0.0
50%,-29.3,143.5,413.0,0.0,1.0,42.795277,319.336813,0.0
75%,-23.4,148.775643,450.0,0.0,1.0,45.0,329.449541,0.079013
max,-9.114081,153.360321,707.0,3.0,1.0,45.0,348.364751,138.224762


### Save cleaned dataset

In [19]:
# Save the data frame with non-fire rows (tackle class imbalance problem)
fpath_dataset = '/content/drive/Shared drives/NASA Space App/Code/Dataset'
outpath = os.path.join(fpath_dataset, 'Fire', 'master_dataset_with_non_fire.csv')

df.to_csv(outpath, index=False)

In [20]:
# Save the data frame with majority of fire rows
fpath_dataset = '/content/drive/Shared drives/NASA Space App/Code/Dataset'
outpath = os.path.join(fpath_dataset, 'Fire', 'master_dataset.csv')

df_fire = df[df.acq_time.notna()].reset_index(drop=True)
df_fire.to_csv(outpath, index=False)