<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/main/Feature_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data extraction from TF records

## installs, imports, pre-sets

In [12]:
from google.colab import drive
import zipfile
import tensorflow as tf
import pandas as pd
import numpy as np
import skimage
from scipy.ndimage import distance_transform_edt


## Loading TF records from google drive

In [3]:
# let's mount the drive
drive.mount('/content/drive')

# let's look into the zip file stored in the google drive
wild_fire_file_path = '/content/drive/MyDrive/next_day_wildfire.zip'
wildfire_zip =  zipfile.ZipFile(wild_fire_file_path, 'r')
file_names = wildfire_zip.namelist()

print('number of TF records:', len(file_names))
print('file names of tf records within the zip:')
print(file_names)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
number of TF records: 19
file names of tf records within the zip:
['next_day_wildfire_spread_eval_00.tfrecord', 'next_day_wildfire_spread_eval_01.tfrecord', 'next_day_wildfire_spread_test_00.tfrecord', 'next_day_wildfire_spread_test_01.tfrecord', 'next_day_wildfire_spread_train_00.tfrecord', 'next_day_wildfire_spread_train_01.tfrecord', 'next_day_wildfire_spread_train_02.tfrecord', 'next_day_wildfire_spread_train_03.tfrecord', 'next_day_wildfire_spread_train_04.tfrecord', 'next_day_wildfire_spread_train_05.tfrecord', 'next_day_wildfire_spread_train_06.tfrecord', 'next_day_wildfire_spread_train_07.tfrecord', 'next_day_wildfire_spread_train_08.tfrecord', 'next_day_wildfire_spread_train_09.tfrecord', 'next_day_wildfire_spread_train_10.tfrecord', 'next_day_wildfire_spread_train_11.tfrecord', 'next_day_wildfire_spread_train_12.tfrecord', 'next_day_wildfire_spread_

In [4]:
# unzipping one file at a time
def one_file_unzip(tf_record_file_name, zipfile_variable):
  extracted_record_path = zipfile_variable.extract(tf_record_file_name)
  raw_dataset = tf.data.TFRecordDataset(extracted_record_path)
  return raw_dataset

# yielding out one record at a time
def extract_one_row(tf_record_dataset):
  for i, raw_record in enumerate(tf_record_dataset.take(raw_dataset.cardinality().numpy())):
    one_record_dict = {}
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())

    for key, feature in example.features.feature.items():

      kind = feature.WhichOneof('kind')
      one_record_dict[key] = np.array(getattr(feature, kind).value).reshape(64,64)
#remove this 'break' below
    break
  yield one_record_dict

## let's create features from all images

### feature description given by dataset maker

In [5]:
# data variables

INPUT_FEATURES = ['elevation', 'th', 'vs',  'tmmn', 'tmmx', 'sph',
                  'pr', 'pdsi', 'NDVI', 'population', 'erc', 'PrevFireMask']

OUTPUT_FEATURES = ['FireMask']


# underlying feature value ranges:
# (min_clip, max_clip, mean, standard deviation)

feature_description_dict = {
    # Elevation in m: between 0.1 percentile and 99.9 percentile
    'elevation': (0.0, 3141.0, 657.3003, 649.0147),

    # Palmer Drought Severity Index: between 0.1 percentile and 99.9 percentile
    'pdsi': (-6.12974870967865, 7.876040384292651, -0.0052714925, 2.6823447),

    #Vegetation index times 10,000: between -1 and 1
    'NDVI': (-9821.0, 9996.0, 5157.625, 2466.6677),

    # Precipitation in mm: between 0.0 and 99.9 percentile
    'pr': (0.0, 44.53038024902344, 1.7398051, 4.482833),

    # Specific humidity: between 0 and 1
    'sph': (0., 1., 0.0071658953, 0.0042835088),

    # Wind direction in degrees clockwise from north: between 0 and 360.
    'th': (0., 360.0, 190.32976, 72.59854),

    #Min temp: between 253.15 kelvin and 99.9 percentile
    'tmmn': (253.15, 298.94891357421875, 281.08768, 8.982386),

    #Max temp: between 253.15 kelvin and 99.9 percentile
    'tmmx': (253.15, 315.09228515625, 295.17383, 9.815496),

    # Wind speed in m/s: between 0. and 99.9 percentile
    'vs': (0.0, 10.024310074806237, 3.8500874, 1.4109988),

    # NFDRS fire danger index energy release component BTU's per square foot.
    # 0., 99.9 percentile
    'erc': (0.0, 106.24891662597656, 37.326267, 20.846027),

    # Population density: between 0 and 99.9 percentile
    'population': (0., 2534.06298828125, 25.531384, 154.72331),

    # We don't want to normalize the FireMasks.
    # 1 indicates fire, 0 no fire, -1 unlabeled data
    'PrevFireMask': (-1., 1., 0., 1.),
    'FireMask': (-1., 1., 0., 1.)
}


### Min Max scaling

In [10]:
# lets define the min max scaling function
def min_max_scaling(array,min_val,max_val):
    scaled_array = np.clip((array - min_val) / (max_val - min_val), 0, 1)
    return scaled_array

# let's apply guassian smoothing
def gaussian_smoothing(image_array,sigma_val):
  smooth_array = skimage.filters.gaussian(image_array, sigma=1)
  return smooth_array

#lets get the rate of change and mean,
def local_pixel_features(image_array,radius_val):
  footprint = skimage.morphology.disk(radius_val)
  gradient_array = skimage.filters.rank.gradient(image_array, footprint)
  mean_array = skimage.filters.rank.mean(image_array, footprint)
  return gradient_array,mean_array

#use altitude edge to identify whether pixel is at a similar altitude as any pixel that has fire
def fire_pixel_shared_altitude(row_dict, normalized_array, previous_day_fire = 'PrevFireMask'):
  edges_array = skimage.filters.canny(normalized_array)
  inverted_edges_array = np.logical_not(edges_array).astype(int)
  edge_label_array = skimage.measure.label(inverted_edges_array)

  previous_fire = row_dict[previous_day_fire]
  fire_edge_labels = (edge_label_array*previous_fire)

  unique_regions_with_fire = np.unique(fire_edge_labels.flatten())
  non_zero_unique_regions = unique_regions_with_fire[unique_regions_with_fire != 0]

  fire_at_same_altitude = np.isin(edge_label_array, non_zero_unique_regions).astype(int)
  return fire_at_same_altitude

def distance_to_fire(record_dict,feature):
  fire_mask_array = record_dict[feature]

# let's apply it on all features except firemasks
def build_features(record_dict,min_max_dict,sigma_val,radius_val):
  feature_list = record_dict.keys()
  gradient_dict = {}
  for a_feature in feature_list:
    if a_feature not in ['PrevFireMask','FireMask']:
     #min max scaling
     feature_min = min_max_dict[a_feature][0]
     feature_max = min_max_dict[a_feature][1]
     scaled_array = min_max_scaling(record_dict[a_feature],feature_min,feature_max)
     #guassian smoothing
     smoothen_array = gaussian_smoothing(scaled_array,sigma_val)

     #local pixel values: gradient values(rate of change), local mean val.
     gradient_array,mean_array = local_pixel_features(smoothen_array,radius_val)

     #lets label pixels if they are at the same elevation (to account for cliffs/mountains/chasms) as the fire
     # here we aren't using smoothened array
     if a_feature == 'elevation':
      fire_at_altitude_array = fire_pixel_shared_altitude(record_dict, scaled_array)

    if a_feature == 'PrevFireMask':



  return gradient_dict

In [11]:
raw_dataset = one_file_unzip('next_day_wildfire_spread_eval_00.tfrecord', wildfire_zip)
output = extract_one_row(raw_dataset)
for a_output in output:
  print(build_features(a_output,feature_description_dict,1,3))

(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
(64, 64)
{'vs': array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=uint8), 'NDVI': array([[ 9,  9,  9, ..., 15, 17, 16],
       [11, 10,  9, ..., 18, 18, 12],
       [16, 11, 11, ..., 19, 18, 14],
       ...,
       [ 9, 10, 11, ..., 29, 26, 20],
       [ 9,  9, 10, ..., 20, 20, 20],
       [ 9, 10, 11, ..., 20, 16, 15]], dtype=uint8), 'tmmn': array([[1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'PrevFireMask': array([[1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 

  gradient_array = skimage.filters.rank.gradient(image_array, footprint)


{}