# RTS Dataset Formatting

# TODO

-  Make the script find and download the current version of the RTS dataset and metadata description rather than providing the filepath manually
- remove filepaths and metadata columns that I set
-  think through environment sharing (rts_dataset.rproj, .env, renv)
-  finalize file organization/filepaths so that they don't differ between R and Python

# Set-Up

In [59]:
import os
import uuid
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib
import warnings

In [60]:
pd.set_option("display.max_columns", 100)

Workaround for horizontal scrollbars not working in Firefox:

In [61]:
from IPython.display import display, HTML
display(HTML("<style>.jp-OutputArea-output {display:flex}</style>"))

## User-Defined Input

Before starting, copy your new shapefile into the input_data directory. Make sure that your shapefile uses EPSG:3413 (WGS 84 / NSIDC Sea Ice Polar Stereographic North) as the coordinate system, otherwise this script will not work.

Provide the file name to the data:

In [62]:
new_data_file = 'rts_dataset_test_polygons_new.shp'  # set this
new_data_filepath = os.path.join(
  '../input_data',
  new_data_file)

Provide the names of any metadata fields in your new file that are not already in the official RTS Data Set (please check the list to ensure that the field has not been included previously) that you would like to be included in the compiled data set:

In [63]:
# Use the format `AbbreviatedName: FullName`, where the FullName should be a human-readable name and the AbbreviatedName should be the ESRI shapefile driver abbreviated version of the the FullName.
# Example:
# new_fields = {'CstmCl1': 'CustomColumn1'}
new_fields = {'CstmCl1' : 'CustomColumn1'}

Have you already created RTS centroid columns, or would you like them to be created within this script? Provide either TRUE, if the columns do not exist yet, or FALSE, if you have already created them:

In [64]:
# Example: 
# calculate_centroid = False
calculate_centroid = False

Would you like your formatted new data to be output in its own file (in which case you will email the file of new features to us to merge with the compiled data set) or appended the compiled dataset (in which case you will commit your updated file to your forked github repository and create a pull request to add the file to the official github repository). Your decision here should mostly be based on your comfort with github. If you have no idea what the second half of that sentence means, please opt for the separate file and email it to us.

In [65]:
# Example
# separate_file = True
separate_file = False

# Functions

## add_empty_columns

In [66]:
def add_empty_columns(df, column_names):
  
  for name in column_names:
    if name not in df.columns:
      df[name] = pd.NA
      
  return df

## check_intersection_info

In [67]:
def check_intersection_info(df):

  duplicated_uuids = df['UUID'].duplicated()  
  duplicated_uuids = df.loc[duplicated_uuids, 'UUID']
  
  df['int_info_complete'] = (
    (df['Intersections'].isnull()) & (df['SelfIntersectionIndices'].str.len() == 0) |
    (~df['Intersections'].isnull()) & (df['RepeatRTS'].notnull() | df['MergedRTS'].notnull() | df['StabilizedRTS'].notnull() | df['AccidentalOverlap'].notnull()) | 
    (df['SelfIntersectionIndices'].str.len() > 0) & (df['UUID'].isin(duplicated_uuids))
  )
  
  if not df['int_info_complete'].all():
    print(df[~df['int_info_complete']])
    raise Exception('Incomplete intersection information provided. See printed rows.')
  
  print('Intersection information is complete.')

## get_earliest_uuid

Return `UUID` from feature with earliest `BaseMapDate` for features in `new_data` that overlap eachother.

In [68]:
def get_earliest_uuid(df_subset, df):

    uuids = [df_subset['UUID']] + [x for x in df_subset['SelfIntersectionIndices'].split(',')]
     
    df = df[df.UUID.isin(uuids)]
    
    earliest_df = df[df.BaseMapDate == df.BaseMapDate.min()]
    
    return earliest_df.UUID.iloc[0]

## run_formatting_checks

In [69]:
def check_lat(lat):
    correct_type = type(lat[0]) == np.float64
    missing_values = pd.isna(lat).values.any()
    reasonable_values = np.all(lat.between(-90, 90))

    if not correct_type:
        raise ValueError('The CentroidLat column is not numeric. Ensure that latitude is reported as decimal degress in WGS 84.')
    elif missing_values:
        raise ValueError('The CentroidLat column is missing values.')
    elif not reasonable_values:
        raise ValueError('Unexpected values found in the CentroidLat column. Ensure that CentroidLat is listed as decimal degress in WGS 84.')

def check_lon(lon):
    correct_type = type(lon[0]) == np.float64
    missing_values = pd.isna(lon).values.any()
    reasonable_values = np.all(lon.between(-180, 180))

    if not correct_type:
        raise ValueError('The CentroidLon column is not numeric. Ensure that longitude is reported as decimal degress in WGS 84.')
    elif missing_values:
        raise ValueError('The CentroidLon column is missing values.')
    elif not reasonable_values:
        raise ValueError('Unexpected values found in the CentroidLon column. Ensure that longitude is listed as decimal degress in WGS 84.')

def check_region(region):
    correct_type = type(region[0]) == str
    missing_values = (region == '').values.any()

    if not correct_type:
        raise ValueError('The RegionName column is not a string.')
    elif missing_values:
        raise ValueError('The RegionName column is missing values.')

def check_creator(creator):
    correct_type = type(creator[0]) == str
    missing_values = (creator == '').values.any()

    if not correct_type:
        raise ValueError('The CreatorLab column is not a string.')
    elif missing_values:
        raise ValueError('The CreatorLab column is missing values.')

def check_contribution_date(contribution_date):
    correct_type = type(contribution_date[0]) == pd._libs.tslibs.timestamps.Timestamp
    missing_values = (contribution_date == '').values.any()

    if not correct_type:
        raise ValueError('The ContributionDate column is not a string.')
    elif missing_values:
        raise ValueError('The ContributionDate column is missing values.')
  
def check_basemap_date(basemap_date):
    correct_type = pd.Series([
        type(pd.to_datetime(row)) == pd.core.indexes.datetimes.DatetimeIndex 
         for row in basemap_date.str.split(',')
    ]).values.all()
    missing_values = ((basemap_date.str.split(',', expand = True)).iloc[:, 0] == '').any()

    if not correct_type:
        raise ValueError('The BaseMapDate column does not contain dates (or they are improperly formatted).')
    elif missing_values:
        raise ValueError('The BaseMapDate column is missing values.')
  
def check_source(source):
    correct_type = type(source[0]) == str
    missing_values = (source == '').values.any()

    if not correct_type:
        raise ValueError('The BaseMapSource column is not a string.')
    elif missing_values:
        raise ValueError('The BaseMapSource column is missing values.')

def check_resolution(resolution):
    correct_type = type(resolution[0]) == np.float64
    missing_values = pd.isna(resolution).values.any()

    if not correct_type:
        raise ValueError('The BaseMapResolution column is not a numeric.')
    elif missing_values:
        raise ValueError('The BaseMapResolution column is missing values.')

def check_train_class(train_class):
    correct_type = type(train_class[0]) == str
    missing_values = (train_class == '').values.any()

    if not correct_type:
        raise ValueError('The TrainClass column is not a string.')
    elif missing_values:
        raise ValueError('The TrainClass column is missing values.')

def run_formatting_checks(df):
    check_lat(df.CentroidLat)
    check_lon(df.CentroidLon)
    check_region(df.RegionName)
    check_creator(df.CreatorLab)
    check_contribution_date(df.ContributionDate)
    check_basemap_date(df.BaseMapDate)
    check_source(df.BaseMapSource)
    check_resolution(df.BaseMapResolution)
    check_train_class(df.TrainClass)
    
    print('Formatting looks good!')


# Import Metadata Description File

In [70]:
### update this to download the current version automatically ##################
col_metadata = pd.read_csv('../input_data/metadata_description.csv')
################################################################################

required_fields = (
    pd.Series(
        col_metadata[col_metadata.Required == True].FullColumnName.values, 
        index = col_metadata[col_metadata.Required == True].AbbreviatedColumnName
    )
    .to_dict()
)
optional_fields = (
    pd.Series(
        col_metadata[col_metadata.Required == False].FullColumnName.values, 
        index = col_metadata[col_metadata.Required == False].AbbreviatedColumnName
    )
    .to_dict()
)

all_fields = dict(required_fields,
                  **optional_fields,
                  **new_fields
                 )

# Import Official and New RTS Data Files

In [71]:
### update this to find the dataset online and download it #####################
rts_file = 'rts_dataset_test_polygons_current.shp'
rts_data_filepath = os.path.join(
  '../input_data',
  rts_file
  )

rts_data = (
    gpd.read_file(rts_data_filepath)
    .rename(columns = dict(required_fields, **optional_fields))
    .filter(items = list(required_fields.values()) + list(optional_fields.values()) + ['geometry'])
)

rts_data.ContributionDate = pd.to_datetime(rts_data.ContributionDate)
    
for field in list(required_fields.values()): # Check if all required columns are present
    if field not in rts_data.columns:
        raise ValueError('{field} is missing. Has the RTS data set been modified since download?'.format(field = repr(field)))

# for field in list(optional_fields.values()): # Check if all optional columns are present
#     if field not in rts_data.columns:
#         raise ValueError('{field} is missing. Has the RTS data set been modified since download?'.format(field = repr(field)))

rts_data
################################################################################


Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,UUID,StabilizedRTS,MergedRTS,Area,geometry
0,70.016684,68.33918,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,e857c4d6-13cf-5aa9-bcf6-637823b17e86,,,7581.395967,"POLYGON ((2007198.307 865988.469, 2007189.916 ..."
1,70.01622,68.339167,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,2cc8036c-218d-5bd0-8ff8-f0cb01ffc261,,,3621.349764,"POLYGON ((2007253.161 866032.001, 2007235.776 ..."
2,70.01648,68.332416,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,,,1339.292585,"POLYGON ((2007310.378 865857.070, 2007340.692 ..."
3,70.015499,68.3295,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,,,3482.02968,"POLYGON ((2007453.557 865845.775, 2007456.723 ..."
4,70.014513,68.332959,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99,,,134.941981,"POLYGON ((2007514.965 865926.094, 2007508.132 ..."
5,70.014369,68.334934,Yamal-Gydan,Rodenhizer,2023-09-01,"2022-05-01,2022-09-30",WorldView-2,4.0,Positive,66560cd2-46cb-5f48-b51f-001335b5bb48,,,411.580601,"POLYGON ((2007496.803 865994.362, 2007488.866 ..."


In [72]:
if calculate_centroid:
    new_data = gpd.read_file(new_data_filepath)

    new_data["CntrdLt"] = new_data.centroid.y
    new_data["CntrdLn"] = new_data.centroid.x

    new_data = (
        new_data    
        .rename(columns = dict(
            {key:required_fields[key] for key in [key for key in list(required_fields.keys()) if key != 'UUID']},
            **optional_fields,
            **new_fields
            )
                )
        .filter(items = list(required_fields.values()) + list(optional_fields.values()) + list(new_fields.values()) + ['geometry'])
        )

    new_data.ContributionDate = pd.to_datetime(new_data.ContributionDate)
    
    for field in [item for item in list(required_fields.values()) if item not in ['UUID']]: # Check if all required columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Ensure that all required fields (except UUID) are present prior to running this script'
                             .format(field = repr(field)))

    for field in [item for item in list(new_fields.values()) if item not in ['UUID']]: # Check if all new columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Did you specify the name of the new metadata field correctly?'.format(field = repr(field)))

else:
    
    new_data = (
        gpd.read_file(new_data_filepath)
        .rename(columns = dict(
            {key:required_fields[key] for key in [key for key in list(required_fields.keys()) if key != 'UUID']},
            **optional_fields,
            **new_fields
            )
                )
        .filter(items = list(required_fields.values()) + list(optional_fields.values()) + list(new_fields.values()) + ['geometry'])
        )
    
    new_data.ContributionDate = pd.to_datetime(new_data.ContributionDate)
    
    for field in [item for item in list(required_fields.values()) if item not in ['UUID']]: # Check if all required columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Ensure that all required fields (except UUID) are present prior to running this script'
                             .format(field = repr(field)))

    for field in [item for item in list(new_fields.values()) if item not in ['UUID']]: # Check if all new columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Did you specify the name of the new metadata field correctly?'
                             .format(field = repr(field)))

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry
0,70.016553,68.339261,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ..."
1,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."
2,70.016517,68.33235,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ..."
3,70.015305,68.331148,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ..."
4,70.01457,68.333416,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ..."
5,70.014478,68.334955,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ..."
6,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2022-05-01,2022-9-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."


# Check Metadata Format of New Data

In [73]:
run_formatting_checks(new_data)

Formatting looks good!


# Generate UUIDs

Set seed for UUID generation (R) by concatenating all required metadata columns (except UUID) into a single string

In [74]:
new_data.CentroidLat = np.round(new_data.CentroidLat, 13)
new_data.CentroidLon = np.round(new_data.CentroidLon, 13)
new_data.ContributionDate = new_data.ContributionDate.dt.strftime('%Y-%m-%d')
c = new_data.BaseMapResolution == new_data.BaseMapResolution.astype(int)
new_data.loc[c,'BaseMapResolutionStr'] = new_data.BaseMapResolution.astype(int).astype(str)
new_data.loc[~c,'BaseMapResolutionStr'] = new_data.BaseMapResolution.astype(str)

In [75]:
new_data['seed'] = (
    new_data[[
        'CentroidLat', 
        'CentroidLon', 
        'RegionName', 
        'CreatorLab', 
        'BaseMapDate', 
        'BaseMapSource', 
        'BaseMapResolutionStr', 
        'TrainClass'
    ]].apply(
        lambda row: ''.join(row.values.astype(str)),
        axis = 1
    )
)
new_data.seed

0    70.016552920425468.3392611042346Yamal-GydanRod...
1    70.015426344950268.3407146769001Yamal-GydanRod...
2    70.016516997782768.3323502773845Yamal-GydanRod...
3    70.015305015354768.3311476725512Yamal-GydanRod...
4    70.014569984934568.3334155050299Yamal-GydanRod...
5    70.01447819631568.3349546118791Yamal-GydanRode...
6    70.015426344950268.3407146769001Yamal-GydanRod...
Name: seed, dtype: object

Generate UUIDs

In [76]:
new_data['UUID'] = [str(uuid.uuid5(uuid.NAMESPACE_DNS, name = seed)) for seed in new_data.seed]
new_data.UUID

0    2d21346f-66e0-50af-b4b3-5f38cd00a4ac
1    1bba7b2d-90aa-5e53-9d57-022a049ccd1c
2    9b405462-a712-5581-bfe5-04ddcfe38926
3    15a960b9-2050-528e-847f-2ea21a11da4f
4    ee636bfb-8586-5449-932f-8ffa1c91a748
5    8f3d9522-cfa2-580c-b632-3b4c00894da0
6    b9b9b724-1d5e-5dce-a997-798895d4836a
Name: UUID, dtype: object

# Check for Intersections with RTS Data Set

Find intersecting RTS polygons from the official RTS data set and retrieve their UUIDs. Create an empty column for the UUIDs of polygons that have been repeated that will be manually populated.

In [79]:
def get_intersecting_uuids(polygon, df):
    intersections = [','.join(gpd.overlay(polygon, df, how='intersection').UUID_2)]
    return intersections

def get_touching_uuids(polygon, df):
    adjacent_polys = [','.join([uuid for rts, uuid in zip(df.geometry, df.UUID) if polygon.geometry.touches(rts).reset_index()[0][0]])]
    return adjacent_polys

def remove_adjacent_polys(intersections, adjacent_polys):
    intersections = [item.split(',') for item in intersections]
    adjacent_polys = [item.split(',') for item in adjacent_polys]
    fixed_intersections = []
    for idx in range(0, len(intersections)):
        fixed_intersection = [[intersection for intersection in intersections[idx] if intersection not in adjacent_polys[idx]]]
        fixed_intersections = fixed_intersections + fixed_intersection
    fixed_intersections = [','.join(item) for item in fixed_intersections]
    return fixed_intersections

intersections = []
for idx in range(0,new_data.shape[0]):
    new_intersections = get_intersecting_uuids(new_data.iloc[[idx]], rts_data)
    intersections = intersections + new_intersections
    
new_data['Intersections'] = intersections

adjacent_polys = []
for idx in range(0,new_data.shape[0]):
    new_adjacent_polys = get_touching_uuids(new_data.iloc[[idx]], rts_data)
    adjacent_polys = adjacent_polys + new_adjacent_polys
    
new_data['AdjacentPolys'] = adjacent_polys

new_data.Intersections = remove_adjacent_polys(new_data.Intersections, new_data.AdjacentPolys)
new_data.drop('AdjacentPolys', axis=1)

overlapping_data = new_data.copy()
overlapping_data = overlapping_data[overlapping_data.Intersections.str.len() > 0]
overlapping_data

if overlapping_data.shape[0] > 0:
    if 'RepeatRTS' not in list(overlapping_data.columns.values):
        overlapping_data['RepeatRTS'] = ['']*overlapping_data.shape[0]
    if 'MergedRTS' not in list(overlapping_data.columns.values):
        overlapping_data['MergedRTS'] = ['']*overlapping_data.shape[0]
    if 'StabilizedRTS' not in list(overlapping_data.columns.values):
        overlapping_data['StabilizedRTS'] = ['']*overlapping_data.shape[0]

    overlapping_data['AccidentalOverlap'] = ['']*overlapping_data.shape[0]

    overlapping_data.to_file(
        os.path.join(
            '../python_output',
            new_data_file.split('.')[0] + "_overlapping_polygons.shp"
        )
    )

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry,BaseMapResolutionStr,seed,UUID,Intersections,AdjacentPolys
0,70.016553,68.339261,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ...",4,70.016552920425468.3392611042346Yamal-GydanRod...,2d21346f-66e0-50af-b4b3-5f38cd00a4ac,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",
2,70.016517,68.33235,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ...",4,70.016516997782768.3323502773845Yamal-GydanRod...,9b405462-a712-5581-bfe5-04ddcfe38926,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,
3,70.015305,68.331148,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ...",4,70.015305015354768.3311476725512Yamal-GydanRod...,15a960b9-2050-528e-847f-2ea21a11da4f,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,
4,70.01457,68.333416,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ...",4,70.014569984934568.3334155050299Yamal-GydanRod...,ee636bfb-8586-5449-932f-8ffa1c91a748,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99,
5,70.014478,68.334955,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ...",4,70.01447819631568.3349546118791Yamal-GydanRode...,8f3d9522-cfa2-580c-b632-3b4c00894da0,66560cd2-46cb-5f48-b51f-001335b5bb48,


At this point, you will need to manually check all polygons with intersections against the polygons in the official RTS data set in your preferred GIS software and save the output to

In [20]:
os.path.join('../python_output', new_data_file.split('.')[0] + "_overlapping_polygons_edited.shp")

'../python_output/rts_dataset_test_polygons_new_overlapping_polygons_edited.shp'

When possible/necessary, try to find imagery that matches the date of the intersecting polygons - this may require contacting the lab that did the original delineation.

Your job is to inspect each of the polygons listed in the 'Intersections' column compared to the new RTS feature and manually copy and paste the UUIDs from the 'Intersections' column into the 'RepeatRTS', 'StabilizedRTS', 'MergedRTS', or 'AccidentalOverlap' based on the relationship between the two polygons.

- Paste the UUID into the RepeatRTS column when the new RTS feature is the same RTS feature as the RTS feature in the 'Intersections' column, but was delineated at a different point in time, by a different lab at the same point in time, or from different imagery at the same point in time. The RTS feature is the same when it was the result of the same RTS initiation event.

- Paste the UUID into the StabilizedRTS column when the RTS feature in the 'Intersections' column is a stabilized RTS scar as of the date of the imagery used in the new RTS delineations.

- Paste the UUID into the MergedRTS column when multiple RTS features in the 'Intersections' column merged to form the new RTS feature.

- Paste the UUID into the AccidentalOverlap column when inaccuracies in delineation of separate RTS features lead to overlap (e.g. features that are very close to each other and the polygons barely touch). 

When this is done, each of the UUIDs in the Intersections column should have been copied into one (and only one) of the 'RepeatRTS', 'StabilizedRTS', 'MergedRTS', or 'AccidentalOverlap' columns.


# Load Manually Edited File and Join to New Data

Add the 'RepeatRTS', 'StabilizedRTS', and 'MergedRTS' columns that you just edited back into `new_data`.

In [21]:
overlapping_data.seed[0]

'70.016552920425468.3392611042346Yamal-GydanRodenhizer2023-05-01,2023-09-30WorldView-24Positive'

In [22]:
edited_file = os.path.join('../python_output', new_data_file.split('.')[0] + "_overlapping_polygons_edited.shp")

if os.path.exists(edited_file):
    overlapping_data = (
        gpd.read_file(edited_file)
        .rename(columns = {'UUID': 'UUID',
                           'Intrsct': 'Intersections', 
                           'ReptRTS': 'RepeatRTS', 
                           'MrgdRTS': 'MergedRTS',
                           'StblRTS': 'StabilizedRTS',
                           'AccdntO': 'AccidentalOverlap'
                           })
        .filter(items = ['UUID', 'Intersections', 'RepeatRTS', 'MergedRTS', 'StabilizedRTS', 'AccidentalOverlap'])
        )

    new_data = pd.merge(new_data, 
                        overlapping_data, 
                        how = 'outer',
                        on = ['UUID', 'Intersections'])

    new_data.loc[~new_data.RepeatRTS.isnull(), 'UUID'] = new_data.RepeatRTS[~new_data.RepeatRTS.isnull()]

else:
    new_data['RepeatRTS'] = ['']*new_data.shape[0]
    new_data['MergedRTS'] = ['']*new_data.shape[0]
    new_data['StabilizedRTS'] = ['']*new_data.shape[0]
    new_data['AccidentalOverlap'] = ['']*new_data.shape[0]
    
    warnings.warn("No manually edited file has been imported. This is okay if there were no overlapping polygons, but is a problem otherwise.")

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry,BaseMapResolutionStr,seed,UUID,Intersections,RepeatRTS,MergedRTS,StabilizedRTS,AccidentalOverlap
0,70.016553,68.339261,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ...",4,70.016552920425468.3392611042346Yamal-GydanRod...,2d21346f-66e0-50af-b4b3-5f38cd00a4ac,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",,
1,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ...",4,70.015426344950268.3407146769001Yamal-GydanRod...,1bba7b2d-90aa-5e53-9d57-022a049ccd1c,,,,,
2,70.016517,68.33235,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ...",4,70.016516997782768.3323502773845Yamal-GydanRod...,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,,,
3,70.015305,68.331148,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ...",4,70.015305015354768.3311476725512Yamal-GydanRod...,15a960b9-2050-528e-847f-2ea21a11da4f,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,,,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,
4,70.01457,68.333416,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ...",4,70.014569984934568.3334155050299Yamal-GydanRod...,ee636bfb-8586-5449-932f-8ffa1c91a748,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99,,,,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99
5,70.014478,68.334955,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ...",4,70.01447819631568.3349546118791Yamal-GydanRode...,66560cd2-46cb-5f48-b51f-001335b5bb48,66560cd2-46cb-5f48-b51f-001335b5bb48,66560cd2-46cb-5f48-b51f-001335b5bb48,,,
6,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2022-05-01,2022-9-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ...",4,70.015426344950268.3407146769001Yamal-GydanRod...,b9b9b724-1d5e-5dce-a997-798895d4836a,,,,,


# Check for Intersections within New RTS Data Set

Intersections within the new data set are assumed to be repeat delineations of the same RTS feature. If this is not true (e.g. if you have delineated an old RTS scar and an active RTS feature on top of it), this code will not assign UUIDs properly. In this case, please get in touch with us to determine how to proceed.

In [23]:
intersections = []
for idx in range(0,new_data.shape[0]):
    new_intersections = get_intersecting_uuids(new_data.iloc[[idx]], new_data.drop([idx]))
    intersections = intersections + new_intersections
    
new_data['SelfIntersectionIndices'] = intersections

adjacent_polys = []
for idx in range(0,new_data.shape[0]):
    new_adjacent_polys = get_touching_uuids(new_data.iloc[[idx]], new_data.drop(idx))
    adjacent_polys = adjacent_polys + new_adjacent_polys
    
new_data['AdjacentPolys'] = adjacent_polys

new_data.Intersections = remove_adjacent_polys(new_data.Intersections, new_data.AdjacentPolys)
new_data.drop('AdjacentPolys', axis=1)


new_data.loc[new_data.SelfIntersectionIndices.str.len() > 0, 'UUID'] = (
    new_data[new_data.SelfIntersectionIndices.str.len() > 0]
    .apply(get_earliest_uuid, df = new_data, axis = 1)
)

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry,BaseMapResolutionStr,seed,UUID,Intersections,RepeatRTS,MergedRTS,StabilizedRTS,AccidentalOverlap,SelfIntersectionIndices
0,70.016553,68.339261,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ...",4,70.016552920425468.3392611042346Yamal-GydanRod...,2d21346f-66e0-50af-b4b3-5f38cd00a4ac,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",,,
1,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ...",4,70.015426344950268.3407146769001Yamal-GydanRod...,b9b9b724-1d5e-5dce-a997-798895d4836a,,,,,,b9b9b724-1d5e-5dce-a997-798895d4836a
2,70.016517,68.33235,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ...",4,70.016516997782768.3323502773845Yamal-GydanRod...,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,,,,
3,70.015305,68.331148,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ...",4,70.015305015354768.3311476725512Yamal-GydanRod...,15a960b9-2050-528e-847f-2ea21a11da4f,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,,,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,,
4,70.01457,68.333416,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ...",4,70.014569984934568.3334155050299Yamal-GydanRod...,ee636bfb-8586-5449-932f-8ffa1c91a748,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99,,,,f87bcdb3-8f9d-5414-a0b3-71daf6b59d99,
5,70.014478,68.334955,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ...",4,70.01447819631568.3349546118791Yamal-GydanRode...,66560cd2-46cb-5f48-b51f-001335b5bb48,66560cd2-46cb-5f48-b51f-001335b5bb48,66560cd2-46cb-5f48-b51f-001335b5bb48,,,,
6,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2022-05-01,2022-9-30",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ...",4,70.015426344950268.3407146769001Yamal-GydanRod...,b9b9b724-1d5e-5dce-a997-798895d4836a,,,,,,1bba7b2d-90aa-5e53-9d57-022a049ccd1c


# Check Completeness of Intersection Information

In [24]:
check_intersection_info(new_data)

Intersection information is complete.


# Final Column Selection

In [25]:
new_data = add_empty_columns(
    new_data, 
    [col for col in list(optional_fields.values()) if col not in ['StabilizedRTS', 'MergedRTS']]
)

new_data = new_data[list(all_fields.values()) + ['geometry']]

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,UUID,StabilizedRTS,MergedRTS,Area,CustomColumn1,geometry
0,70.016553,68.339261,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,2d21346f-66e0-50af-b4b3-5f38cd00a4ac,,"e857c4d6-13cf-5aa9-bcf6-637823b17e86,2cc8036c-...",,,"POLYGON ((2007199.012 865984.608, 2007188.217 ..."
1,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,b9b9b724-1d5e-5dce-a997-798895d4836a,,,,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."
2,70.016517,68.33235,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,182a35b7-98c7-5775-b90e-6f0b0ca2d0ba,,,,,"POLYGON ((2007340.152 865838.514, 2007326.959 ..."
3,70.015305,68.331148,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,15a960b9-2050-528e-847f-2ea21a11da4f,3f9b82c6-2232-53dc-a4cd-d97538e0ecb9,,,,"POLYGON ((2007453.557 865845.775, 2007456.416 ..."
4,70.01457,68.333416,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,ee636bfb-8586-5449-932f-8ffa1c91a748,,,,,"POLYGON ((2007492.587 865949.766, 2007492.342 ..."
5,70.014478,68.334955,Yamal-Gydan,Rodenhizer,2023-09-28,"2023-05-01,2023-09-30",WorldView-2,4.0,Positive,66560cd2-46cb-5f48-b51f-001335b5bb48,,,,,"POLYGON ((2007479.747 865990.850, 2007472.484 ..."
6,70.015426,68.340715,Yamal-Gydan,Rodenhizer,2023-09-28,"2022-05-01,2022-9-30",WorldView-2,4.0,Positive,b9b9b724-1d5e-5dce-a997-798895d4836a,,,,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."


In [26]:
print(rts_data.columns)
print(new_data.columns)

Index(['CentroidLat', 'CentroidLon', 'RegionName', 'CreatorLab',
       'ContributionDate', 'BaseMapDate', 'BaseMapSource', 'BaseMapResolution',
       'TrainClass', 'UUID', 'StabilizedRTS', 'MergedRTS', 'Area', 'geometry'],
      dtype='object')
Index(['CentroidLat', 'CentroidLon', 'RegionName', 'CreatorLab',
       'ContributionDate', 'BaseMapDate', 'BaseMapSource', 'BaseMapResolution',
       'TrainClass', 'UUID', 'StabilizedRTS', 'MergedRTS', 'Area',
       'CustomColumn1', 'geometry'],
      dtype='object')


In [27]:
if separate_file:
    new_data.to_file(os.path.join('../python_output', new_data_file.split('.')[0] + "_formatted.shp"))
else:
    rts_data.ContributionDate = rts_data.ContributionDate.dt.strftime('%d-%m-%Y')
    rts_data = add_empty_columns(
        rts_data, 
        [col for col in list(new_fields.values())]
        )
    rts_data = rts_data[list(all_fields.values()) + ['geometry']]
    updated_data = pd.concat([rts_data, new_data])
    updated_data.to_file(os.path.join('../python_output', rts_file))

  updated_data = pd.concat([rts_data, new_data])
  updated_data.to_file(os.path.join('../python_output', rts_file))


Now you are ready to submit

In [28]:
if separate_file:
    os.path.join('../python_output', new_data_file.split('.')[0] + "_formatted.shp")
else:
    os.path.join('../python_output', rts_file)