# RTS Dataset Formatting

# TODO

-  Make the script find and download the current version of the RTS dataset and metadata description rather than providing the filepath manually
- remove filepaths and metadata columns that I set
-  think through environment sharing (rts_dataset.rproj, .env, renv)
-  finalize file organization/filepaths so that they don't differ between R and Python

# Set-Up

In [14]:
import os
import uuid
import numpy as np
import pandas as pd
import geopandas as gpd

## User-Defined Input

Before starting, copy your new file into the input_data directory.

Provide the file name to the data:

In [154]:
new_data_file = 'rts_dataset_test_polygons_new.shp'  # set this
new_data_filepath = os.path.join(
  '../input_data',
  new_data_file)

Provide the names of any metadata fields in your new file that are not already in the official RTS Data Set (please check the list to ensure that the field has not been included previously):

In [155]:
# Use the format `AbbreviatedName: FullName`, where the FullName should be a human-readable name and the AbbreviatedName should be the ESRI shapefile driver abbreviated version of the the FullName.
# Example:
# new_fields = {'CstmCl1': 'CustomColumn1'}
new_fields = {'CstmCl1' : 'CustomColumn1'}

Have you already created RTS centroid columns, or would you like them to be created within this script? Provide either TRUE, if the columns do not exist yet, or FALSE, if you have already created them:

In [156]:
# Example: 
# calculate_centroid = False
calculate_centroid = False

# Functions

## run_formatting_checks

In [157]:
def check_lat(lat):
    correct_type = type(lat[0]) == np.float64
    missing_values = pd.isna(lat).values.any()
    reasonable_values = np.all(lat.between(-180, 180))

    if not correct_type:
        raise ValueError('The CentroidLat column is not numeric. Ensure that latitude is reported as decimal degress in WGS 84.')
    elif missing_values:
        raise ValueError('The CentroidLat column is missing values.')
    elif not reasonable_values:
        raise ValueError('Unexpected values found in the CentroidLat column. Ensure that CentroidLat is listed as decimal degress in WGS 84.')

def check_lon(lon):
    correct_type = type(lon[0]) == np.float64
    missing_values = pd.isna(lon).values.any()
    reasonable_values = np.all(lon.between(-180, 180))

    if not correct_type:
        raise ValueError('The CentroidLon column is not numeric. Ensure that longitude is reported as decimal degress in WGS 84.')
    elif missing_values:
        raise ValueError('The CentroidLon column is missing values.')
    elif not reasonable_values:
        raise ValueError('Unexpected values found in the CentroidLon column. Ensure that longitude is listed as decimal degress in WGS 84.')

def check_region(region):
    correct_type = type(region[0]) == str
    missing_values = (region == '').values.any()

    if not correct_type:
        raise ValueError('The RegionName column is not a string.')
    elif missing_values:
        raise ValueError('The RegionName column is missing values.')

def check_creator(creator):
    correct_type = type(creator[0]) == str
    missing_values = (creator == '').values.any()

    if not correct_type:
        raise ValueError('The CreatorLab column is not a string.')
    elif missing_values:
        raise ValueError('The CreatorLab column is missing values.')

def check_contribution_date(contribution_date):
    correct_type = type(contribution_date[0]) == pd._libs.tslibs.timestamps.Timestamp
    missing_values = (contribution_date == '').values.any()

    if not correct_type:
        raise ValueError('The ContributionDate column is not a string.')
    elif missing_values:
        raise ValueError('The ContributionDate column is missing values.')
  
def check_basemap_date(basemap_date):
    correct_type = pd.Series([
        type(pd.to_datetime(row, dayfirst = True)) == pd.core.indexes.datetimes.DatetimeIndex 
         for row in basemap_date.str.split(',')
    ]).values.all()
    missing_values = ((basemap_date.str.split(',', expand = True)).iloc[:, 0] == '').any()

    if not correct_type:
        raise ValueError('The BaseMapDate column does not contain dates (or they are improperly formatted).')
    elif missing_values:
        raise ValueError('The BaseMapDate column is missing values.')
  
def check_source(source):
    correct_type = type(source[0]) == str
    missing_values = (source == '').values.any()

    if not correct_type:
        raise ValueError('The BaseMapSource column is not a string.')
    elif missing_values:
        raise ValueError('The BaseMapSource column is missing values.')

def check_resolution(resolution):
    correct_type = type(resolution[0]) == np.float64
    missing_values = pd.isna(resolution).values.any()

    if not correct_type:
        raise ValueError('The BaseMapResolution column is not a numeric.')
    elif missing_values:
        raise ValueError('The BaseMapResolution column is missing values.')

def check_train_class(train_class):
    correct_type = type(train_class[0]) == str
    missing_values = (train_class == '').values.any()

    if not correct_type:
        raise ValueError('The TrainClass column is not a string.')
    elif missing_values:
        raise ValueError('The TrainClass column is missing values.')

def run_formatting_checks(df):
    check_lat(df.CentroidLat)
    check_lon(df.CentroidLon)
    check_region(df.RegionName)
    check_creator(df.CreatorLab)
    check_contribution_date(df.ContributionDate)
    check_basemap_date(df.BaseMapDate)
    check_source(df.BaseMapSource)
    check_resolution(df.BaseMapResolution)
    check_train_class(df.TrainClass)
    
    print('Formatting looks good!')


# Import Metadata Description File

In [158]:
### update this to download the current version automatically ##################
col_metadata = pd.read_csv('../input_data/metadata_description.csv')
################################################################################

required_fields = (
    pd.Series(
        col_metadata[col_metadata.Required == True].FullColumnName.values, 
        index = col_metadata[col_metadata.Required == True].AbbreviatedColumnName
    )
    .to_dict()
)
optional_fields = (
    pd.Series(
        col_metadata[col_metadata.Required == False].FullColumnName.values, 
        index = col_metadata[col_metadata.Required == False].AbbreviatedColumnName
    )
    .to_dict()
)

# Import Official and New RTS Data Files

In [159]:
### update this to find the dataset online and download it #####################
rts_data_filepath = os.path.join(
  '../input_data',
  'rts_dataset_test_polygons_current.shp'
  )

rts_data = (
    gpd.read_file(rts_data_filepath)
    .rename(columns = dict(required_fields, **optional_fields))
    .filter(items = list(required_fields.values()) + list(optional_fields.values()) + ['geometry'])
)

rts_data.ContributionDate = pd.to_datetime(rts_data.ContributionDate)
    
for field in list(required_fields.values()): # Check if all required columns are present
    if field not in rts_data.columns:
        raise ValueError('{field} is missing. Has the RTS data set been modified since download?'.format(field = repr(field)))

# for field in list(optional_fields.values()): # Check if all optional columns are present
#     if field not in rts_data.columns:
#         raise ValueError('{field} is missing. Has the RTS data set been modified since download?'.format(field = repr(field)))

rts_data
################################################################################


Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,UUID,StabilizedRTS,MergedRTS,geometry
0,68.33918,70.016684,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,a6cf10e7-6515-5ab7-b7bb-66606b16ef10,,,"POLYGON ((2007198.307 865988.469, 2007189.916 ..."
1,68.339167,70.01622,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,dd137c86-6db6-57da-bb46-961c89845a97,,,"POLYGON ((2007253.161 866032.001, 2007235.776 ..."
2,68.332416,70.01648,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,83b6a6ee-b498-5307-ada5-c4558e63a611,,,"POLYGON ((2007310.378 865857.070, 2007340.692 ..."
3,68.3295,70.015499,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,6c8a822f-e027-561d-915f-f3cdfc462594,,,"POLYGON ((2007453.557 865845.775, 2007456.723 ..."
4,68.332959,70.014513,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,ec6c9c21-1404-552d-8f09-6016f68ed15b,,,"POLYGON ((2007514.965 865926.094, 2007508.132 ..."
5,68.334934,70.014369,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,6484485f-b005-58ca-addb-f79829f5a929,,,"POLYGON ((2007496.803 865994.362, 2007488.866 ..."


In [160]:
if calculate_centroid:
    new_data = gpd.read_file(new_data_filepath)

    new_data["CntrdLt"] = new_data.centroid.x
    new_data["CntrdLn"] = new_data.centroid.y

    new_data = (
        new_data    
        .rename(columns = dict(
            {key:required_fields[key] for key in [key for key in list(required_fields.keys()) if key != 'UUID']},
            **optional_fields,
            **new_fields
            )
                )
        .filter(items = list(required_fields.values()) + list(optional_fields.values()) + list(new_fields.values()) + ['geometry'])
        )

    new_data.ContributionDate = pd.to_datetime(new_data.ContributionDate)
    
    for field in [item for item in list(required_fields.values()) if item not in ['UUID']]: # Check if all required columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Ensure that all required fields (except UUID) are present prior to running this script'
                             .format(field = repr(field)))

    for field in [item for item in list(new_fields.values()) if item not in ['UUID']]: # Check if all new columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Did you specify the name of the new metadata field correctly?'.format(field = repr(field)))

else:
    
    new_data = (
        gpd.read_file(new_data_filepath)
        .rename(columns = dict(
            {key:required_fields[key] for key in [key for key in list(required_fields.keys()) if key != 'UUID']},
            **optional_fields,
            **new_fields
            )
                )
        .filter(items = list(required_fields.values()) + list(optional_fields.values()) + list(new_fields.values()) + ['geometry'])
        )
    
    new_data.ContributionDate = pd.to_datetime(new_data.ContributionDate)
    
    for field in [item for item in list(required_fields.values()) if item not in ['UUID']]: # Check if all required columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Ensure that all required fields (except UUID) are present prior to running this script'
                             .format(field = repr(field)))

    for field in [item for item in list(new_fields.values()) if item not in ['UUID']]: # Check if all new columns are present
        if field not in new_data.columns:
            raise ValueError('{field} is missing. Did you specify the name of the new metadata field correctly?'
                             .format(field = repr(field)))

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry
0,68.339261,70.016553,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ..."
1,68.340715,70.015426,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."
2,68.33235,70.016517,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ..."
3,68.331148,70.015305,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ..."
4,68.333416,70.01457,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ..."
5,68.334955,70.014478,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ..."
6,68.340715,70.015426,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2022,30-9-2022",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."


# Check Metadata Format of New Data

In [161]:
run_formatting_checks(new_data)

Formatting looks good!


# Generate UUIDs

Set seed for UUID generation (R) by concatenating all required metadata columns (except UUID) into a single string

In [165]:
new_data.CentroidLat = np.round(new_data.CentroidLat, 13)
new_data.CentroidLon = np.round(new_data.CentroidLon, 13)
new_data.ContributionDate = new_data.ContributionDate.dt.strftime('%d-%m-%Y')
c = new_data.BaseMapResolution == new_data.BaseMapResolution.astype(int)
new_data.loc[c,'BaseMapResolutionStr'] = new_data.BaseMapResolution.astype(int).astype(str)
new_data.loc[~c,'BaseMapResolutionStr'] = new_data.BaseMapResolution.astype(str)

AttributeError: Can only use .dt accessor with datetimelike values

In [166]:
new_data['seed'] = (
    new_data[[
        'CentroidLat', 
        'CentroidLon', 
        'RegionName', 
        'CreatorLab', 
        'ContributionDate', 
        'BaseMapDate', 
        'BaseMapSource', 
        'BaseMapResolutionStr', 
        'TrainClass'
    ]].apply(
        lambda row: ''.join(row.values.astype(str)),
        axis = 1
    )
)
new_data.seed

0    68.339261104234670.0165529204254Yamal-GydanRod...
1    68.340714676900170.0154263449502Yamal-GydanRod...
2    68.332350277384570.0165169977827Yamal-GydanRod...
3    68.331147672551270.0153050153547Yamal-GydanRod...
4    68.333415505029970.0145699849345Yamal-GydanRod...
5    68.334954611879170.014478196315Yamal-GydanRode...
6    68.340714676900170.0154263449502Yamal-GydanRod...
Name: seed, dtype: object

Generate UUIDs

In [167]:
new_data['UUID'] = [str(uuid.uuid5(uuid.NAMESPACE_DNS, name = seed)) for seed in new_data.seed]
new_data.UUID

0    7aa12b12-bc66-5cb5-aaa4-79ca55c4515a
1    65ad3115-083e-5d5f-a98d-43a49ac220b9
2    4dc81d34-3a76-5bc9-bd05-2c8d59fe15a1
3    077d50c3-550d-5dcc-a5d0-cc82d2ebbae3
4    5c170b46-ded0-582d-be79-293160b0554d
5    5e62e8b5-2489-5b04-9344-601ed863cb55
6    4bf95199-c169-54b1-85c8-f20cc32ac29e
Name: UUID, dtype: object

'68.339261104234670.0165529204254Yamal-GydanRodenhizer28-09-202301-05-2023,30-09-2023WorldView-24Positive'