<a href="https://colab.research.google.com/github/MapleWolfe/Milestone_2/blob/Jai/TFrecord_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data extraction from TF records

## installs, imports, pre-sets

In [None]:
# Clone the rapidsai-csp-utils helper repo and run its Colab pip-install script.
# NOTE(review): presumably this is what provides the `cudf` / `cupy` packages imported below — confirm.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [8]:
from google.colab import drive
import zipfile
import tensorflow as tf
import pandas as pd
import numpy as np
import skimage
from scipy.ndimage import distance_transform_edt
import warnings
warnings.filterwarnings('ignore')
import cudf
import cupy as cp

## Loading TF records from Google Drive

In [9]:
# Mount Google Drive so the zipped dataset is reachable from the runtime.
drive.mount('/content/drive')

# Open the archive stored on the drive and list the TFRecord members it holds.
# The handle stays open because a later cell extracts members from it.
wild_fire_file_path = '/content/drive/MyDrive/next_day_wildfire.zip'
wildfire_zip = zipfile.ZipFile(wild_fire_file_path, mode='r')
tf_record_file_names = wildfire_zip.namelist()

print('number of TF records:', len(tf_record_file_names))
print('file names of tf records within the zip:', tf_record_file_names, sep='\n')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
number of TF records: 19
file names of tf records within the zip:
['next_day_wildfire_spread_eval_00.tfrecord', 'next_day_wildfire_spread_eval_01.tfrecord', 'next_day_wildfire_spread_test_00.tfrecord', 'next_day_wildfire_spread_test_01.tfrecord', 'next_day_wildfire_spread_train_00.tfrecord', 'next_day_wildfire_spread_train_01.tfrecord', 'next_day_wildfire_spread_train_02.tfrecord', 'next_day_wildfire_spread_train_03.tfrecord', 'next_day_wildfire_spread_train_04.tfrecord', 'next_day_wildfire_spread_train_05.tfrecord', 'next_day_wildfire_spread_train_06.tfrecord', 'next_day_wildfire_spread_train_07.tfrecord', 'next_day_wildfire_spread_train_08.tfrecord', 'next_day_wildfire_spread_train_09.tfrecord', 'next_day_wildfire_spread_train_10.tfrecord', 'next_day_wildfire_spread_train_11.tfrecord', 'next_day_wildfire_spread_train_12.tfrecord', 'next_day_wildfire_spread_

In [15]:
# unzipping one file at a time
def one_file_unzip(tf_record_file_name, zipfile_variable):
  """Extract a single archive member and open it as a TFRecord dataset.

  Args:
    tf_record_file_name: name of the member inside the zip archive.
    zipfile_variable: object exposing ``extract(name)`` that returns the
      path of the extracted file (an open ``zipfile.ZipFile`` in this notebook).

  Returns:
    A ``tf.data.TFRecordDataset`` built from the extracted file's path.
  """
  # No target directory is passed to extract(), so the archive object decides
  # where the file lands; only the returned path is used from here on.
  return tf.data.TFRecordDataset(zipfile_variable.extract(tf_record_file_name))

# yielding out one record at a time
def extract_one_row(tf_record_dataset, image_shape=(64, 64)):
  """Yield each record of a TFRecord dataset as a dict of 2-D arrays.

  Args:
    tf_record_dataset: iterable of serialized ``tf.train.Example`` records
      whose elements expose ``.numpy()`` (e.g. a ``tf.data.TFRecordDataset``).
    image_shape: shape every feature's flat value list is reshaped to.
      Defaults to ``(64, 64)``, the shape this notebook has always used.

  Yields:
    dict mapping feature name -> ``np.ndarray`` of shape ``image_shape``.

  Raises:
    ValueError: if a feature's value count does not match ``image_shape``
      (raised by ``reshape``).
  """
  # Iterate the dataset that was passed in. The earlier version sized a
  # take() call from the module-level `raw_dataset`, so the generator only
  # worked when that global happened to exist and match the argument.
  for raw_record in tf_record_dataset:
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())

    one_record_dict = {}
    for key, feature in example.features.feature.items():
      # 'kind' is the oneof holding the populated value list; read its values
      # without caring which concrete list type it is.
      kind = feature.WhichOneof('kind')
      one_record_dict[key] = np.array(getattr(feature, kind).value).reshape(image_shape)
    yield one_record_dict

## let's create features from all images

### Feature descriptions provided by the dataset authors

In [11]:
# data variables

# Feature names, split into model inputs and the prediction target.
INPUT_FEATURES = [
    'elevation', 'th', 'vs', 'tmmn', 'tmmx', 'sph',
    'pr', 'pdsi', 'NDVI', 'population', 'erc', 'PrevFireMask',
]

OUTPUT_FEATURES = ['FireMask']


# Underlying value ranges for every feature, stored as
#   (min_clip, max_clip, mean, standard deviation)
# "pXX" below is shorthand for "XX percentile".
feature_description_dict = {
    # Elevation in m: clipped to p0.1 .. p99.9
    'elevation': (0.0, 3141.0, 657.3003, 649.0147),

    # Palmer Drought Severity Index: clipped to p0.1 .. p99.9
    'pdsi': (-6.12974870967865, 7.876040384292651, -0.0052714925, 2.6823447),

    # Vegetation index times 10,000 (the index itself lies between -1 and 1)
    'NDVI': (-9821.0, 9996.0, 5157.625, 2466.6677),

    # Precipitation in mm: 0.0 .. p99.9
    'pr': (0.0, 44.53038024902344, 1.7398051, 4.482833),

    # Specific humidity: 0 .. 1
    'sph': (0.0, 1.0, 0.0071658953, 0.0042835088),

    # Wind direction in degrees clockwise from north: 0 .. 360
    'th': (0.0, 360.0, 190.32976, 72.59854),

    # Min temperature in kelvin: 253.15 .. p99.9
    'tmmn': (253.15, 298.94891357421875, 281.08768, 8.982386),

    # Max temperature in kelvin: 253.15 .. p99.9
    'tmmx': (253.15, 315.09228515625, 295.17383, 9.815496),

    # Wind speed in m/s: 0 .. p99.9
    'vs': (0.0, 10.024310074806237, 3.8500874, 1.4109988),

    # NFDRS fire danger index, energy release component
    # (BTU per square foot): 0 .. p99.9
    'erc': (0.0, 106.24891662597656, 37.326267, 20.846027),

    # Population density: 0 .. p99.9
    'population': (0.0, 2534.06298828125, 25.531384, 154.72331),

    # Fire masks are not meant to be normalized:
    # 1 = fire, 0 = no fire, -1 = unlabeled data
    'PrevFireMask': (-1.0, 1.0, 0.0, 1.0),
    'FireMask': (-1.0, 1.0, 0.0, 1.0),
}


### Feature generation

In [16]:
# min-max scaling helper
def min_max_scaling(array,min_val,max_val):
    """Linearly map ``[min_val, max_val]`` onto ``[0, 1]`` and clamp the result.

    Args:
        array: numeric array (or scalar) to rescale.
        min_val: value that maps to 0.
        max_val: value that maps to 1.

    Returns:
        The rescaled values; anything that falls outside ``[0, 1]`` after
        rescaling is clipped to the nearest bound.
    """
    value_range = max_val - min_val
    normalized = (array - min_val) / value_range
    return np.clip(normalized, 0, 1)

# let's apply gaussian smoothing
def gaussian_smoothing(image_array,sigma_val):
  """Smooth an image with a Gaussian filter.

  Args:
    image_array: image to smooth.
    sigma_val: standard deviation of the Gaussian kernel, forwarded to
      ``skimage.filters.gaussian`` as ``sigma``.

  Returns:
    The smoothed image as returned by ``skimage.filters.gaussian``.
  """
  # Forward the caller's sigma. It was previously hard-coded to 1, so the
  # sigma_val argument had no effect.
  smooth_array = skimage.filters.gaussian(image_array, sigma=sigma_val)
  return smooth_array

# local rate of change and local mean around every pixel
def local_pixel_features(image_array,radius_val):
  """Compute a local-gradient map and a local-mean map for an image.

  Both maps come from skimage's rank filters evaluated over a disk-shaped
  neighbourhood of radius ``radius_val``.

  NOTE(review): the table displayed at the end of the notebook shows integer
  gradient/mean values (e.g. a mean of 145 next to a scaled value of ~0.57),
  which suggests the float input is converted to 8-bit before filtering.
  Confirm against the skimage rank-filter documentation.

  Args:
    image_array: 2-D image to analyse.
    radius_val: radius of the disk footprint.

  Returns:
    Tuple ``(gradient_array, mean_array)``.
  """
  neighbourhood = skimage.morphology.disk(radius_val)
  rank_filters = skimage.filters.rank
  local_gradient = rank_filters.gradient(image_array, neighbourhood)
  local_mean = rank_filters.mean(image_array, neighbourhood)
  return local_gradient, local_mean

# flag pixels lying in the same edge-bounded elevation region as a fire pixel
def fire_pixel_shared_altitude(row_dict, normalized_array, previous_day_fire = 'PrevFireMask'):
  """Mark pixels that share an edge-bounded region with a previous-day fire pixel.

  Canny edges are detected on ``normalized_array``; the non-edge pixels are
  split into connected regions, and every region containing at least one
  pixel with a non-zero label-times-mask product is flagged.

  Args:
    row_dict: record dict that holds the fire mask under ``previous_day_fire``.
    normalized_array: 2-D array the edges are detected on (the scaled
      elevation in this notebook).
    previous_day_fire: key of the fire mask inside ``row_dict``.

  Returns:
    Integer array shaped like ``normalized_array``: 1 where the pixel's
    region label is among the flagged labels, 0 elsewhere.
  """
  # Non-edge pixels become foreground (1) so labelling groups them into regions.
  non_edge_mask = np.logical_not(skimage.feature.canny(normalized_array)).astype(int)
  region_labels = skimage.measure.label(non_edge_mask)

  # Multiplying by the mask zeroes out labels wherever the mask value is 0;
  # the zero entry is then discarded from the set of candidate labels.
  labels_weighted_by_fire = region_labels * row_dict[previous_day_fire]
  candidate_labels = np.unique(labels_weighted_by_fire.flatten())
  burning_region_labels = candidate_labels[candidate_labels != 0]

  return np.isin(region_labels, burning_region_labels).astype(int)

def distance_to_fire(row_dict,feature):
  """Euclidean distance from every pixel to the nearest fire pixel.

  Args:
    row_dict: record dict holding the fire mask array under ``feature``.
    feature: key of the fire mask inside ``row_dict``.

  Returns:
    Array shaped like the mask; 0 on fire pixels, otherwise the Euclidean
    distance (in pixels) to the closest fire pixel.
  """
  # -1 marks missing values (the satellite could not get a clear image).
  # Clipping to [0, 1] treats them as "no fire" for now; those pixels are
  # not meant to be accounted for in the model.
  # The transform measures distance to the nearest zero, so the mask is
  # inverted to make fire pixels the zeros.
  no_fire_mask = 1 - row_dict[feature].clip(0,1)
  return distance_transform_edt(no_fire_mask)


In [17]:
# turn one record (dict of images) into a dict of flattened per-pixel features
def build_features(record_dict,min_max_dict,sigma_val,radius_val):
  """Derive per-pixel feature columns from a single record.

  For every non-mask feature the image is min-max scaled, Gaussian smoothed,
  and summarised by a local gradient and local mean. Elevation additionally
  yields a "fire at similar altitude" flag. ``PrevFireMask`` is passed
  through together with a distance-to-fire map, and ``FireMask`` is passed
  through unchanged.

  Args:
    record_dict: feature name -> 2-D array for one record.
    min_max_dict: feature name -> sequence whose first two entries are the
      min and max used for scaling.
    sigma_val: sigma handed to ``gaussian_smoothing``.
    radius_val: neighbourhood radius handed to ``local_pixel_features``.

  Returns:
    dict mapping column name -> flattened 1-D array.
  """
  output_feature_dict = {}

  for feature_name, raw_array in record_dict.items():
    # Fire masks are never scaled or smoothed; handle them first and move on.
    if feature_name == 'PrevFireMask':
      # Euclidean distance of every pixel from the previous-day fire
      fire_distance = distance_to_fire(record_dict, feature_name)
      output_feature_dict['PrevFireMask'] = raw_array.flatten()
      output_feature_dict['distance_from_fire'] = fire_distance.flatten()
      continue

    if feature_name == 'FireMask':
      output_feature_dict['FireMask'] = raw_array.flatten()
      continue

    # min-max scaling with the clip bounds of this feature
    clip_min = min_max_dict[feature_name][0]
    clip_max = min_max_dict[feature_name][1]
    scaled = min_max_scaling(raw_array, clip_min, clip_max)

    # gaussian smoothing, then local gradient (rate of change) and local mean
    smoothed = gaussian_smoothing(scaled, sigma_val)
    local_gradient, local_mean = local_pixel_features(smoothed, radius_val)

    column_prefix = feature_name + '_'
    output_feature_dict[column_prefix + 'scaled_smoothened_values'] = smoothed.flatten()
    output_feature_dict[column_prefix + 'local_gradient'] = local_gradient.flatten()
    output_feature_dict[column_prefix + 'local_mean'] = local_mean.flatten()

    if feature_name == 'elevation':
      # Label pixels at the same elevation level as the fire (to account for
      # cliffs / mountains / chasms). The un-smoothed scaled array is used here.
      same_altitude = fire_pixel_shared_altitude(record_dict, scaled)
      output_feature_dict['fire_at_similar_altitude'] = same_altitude.flatten()

  return output_feature_dict



In [None]:
%%time
image_count = 0
image_id = 0
data_frame_dict = {}
single_record_list = []

for a_tf_record in tf_record_file_names:
  raw_dataset = one_file_unzip(a_tf_record, wildfire_zip)
  row_extraction_generator = extract_one_row(raw_dataset)

  for a_row in row_extraction_generator:
    all_features_dict_array = build_features(a_row,feature_description_dict,sigma_val=1,radius_val=3)
    image_id += 1
    image_count +=1
    image_number_array = np.full(4096, image_id)
    all_features_dict_array['image_id'] = image_number_array
    if image_count == 1:
      all_features_dataframe = cudf.DataFrame.from_dict(all_features_dict_array)
    else:
      single_row_df = cudf.DataFrame.from_dict(all_features_dict_array)
      all_features_dataframe = all_features_dataframe.append(single_row_df, ignore_index=True)

    if image_count > 200:
      single_record_list.append(all_features_dataframe)
      image_count = 0

  if image_count % 200 != 0:
      single_record_list.append(all_features_dataframe)
      image_count = 0

  big_df = cudf.concat(single_record_list, ignore_index=True)
  data_frame_dict[a_tf_record] = big_df.to_pandas()
  image_count = 0
  print('completed: ', a_tf_record)

In [23]:
# Display the per-pixel feature table built for the eval_00 TFRecord file.
data_frame_dict['next_day_wildfire_spread_eval_00.tfrecord']

Unnamed: 0,tmmn_scaled_smoothened_values,tmmn_local_gradient,tmmn_local_mean,NDVI_scaled_smoothened_values,NDVI_local_gradient,NDVI_local_mean,FireMask,population_scaled_smoothened_values,population_local_gradient,population_local_mean,...,sph_local_mean,th_scaled_smoothened_values,th_local_gradient,th_local_mean,PrevFireMask,distance_from_fire,erc_scaled_smoothened_values,erc_local_gradient,erc_local_mean,image_id
0,0.572232,1,145,0.772659,9,193,0.0,0.000520,0,0,...,2,0.403058,2,103,0.0,58.309519,0.133307,0,34,1
1,0.572295,1,145,0.773073,9,193,0.0,0.000553,0,0,...,2,0.403026,2,103,0.0,58.215118,0.133334,0,34,1
2,0.572384,1,145,0.767925,9,193,0.0,0.000640,1,0,...,2,0.402963,2,103,0.0,58.137767,0.133375,0,34,1
3,0.572481,1,145,0.757142,8,192,0.0,0.001004,1,0,...,2,0.402880,2,103,0.0,58.077534,0.133415,0,34,1
4,0.572581,1,145,0.745681,9,192,0.0,0.001870,1,0,...,2,0.402791,2,103,0.0,58.034473,0.133447,0,34,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4095995,0.567221,4,143,0.658794,17,165,0.0,0.000274,0,0,...,1,0.630729,0,161,0.0,38.209946,0.942560,1,240,1000
4095996,0.565923,3,143,0.665685,17,167,0.0,0.000379,0,0,...,1,0.631192,1,161,0.0,38.897301,0.942009,1,240,1000
4095997,0.564628,3,143,0.672484,15,168,0.0,0.000413,0,0,...,1,0.631776,1,161,0.0,39.597980,0.941533,1,240,1000
4095998,0.563398,4,143,0.682407,13,169,0.0,0.000418,0,0,...,1,0.632424,1,161,0.0,40.311289,0.941140,0,240,1000
