# RTS Dataset Formatting

# TODO

- Make the script find and download the current version of the RTS dataset and metadata description, rather than requiring the filepath to be provided manually.
- Remove the filepaths and metadata columns that I set.
- Think through environment sharing (rts_dataset.rproj, .env, renv).
- Finalize file organization/filepaths so that they don't differ between R and Python.

# Set-Up

In [1]:
import os
import uuid
import pandas as pd
import geopandas as gpd

## User-Defined Input

Before starting, copy your new file into the input_data directory.

Provide the name of the new data file:

In [19]:
# Name of the new data file; the file itself is expected in ../input_data.
new_data_file = 'rts_dataset_test_polygons_new.shp'  # set this
new_data_filepath = os.path.join('../input_data', new_data_file)

Provide the names of any metadata fields in your new file that are not already in the official RTS Data Set (please check the list to ensure that the field has not been included previously):

In [3]:
# Map each new metadata field as `AbbreviatedName: FullName`. The FullName
# should be a human-readable name and the AbbreviatedName should be the ESRI
# shapefile driver abbreviated version of the FullName.
# Example:
# new_fields = {'CstmCl1': 'CustomColumn1'}
new_fields = {
    'CstmCl1': 'CustomColumn1',
}

Have you already created RTS centroid columns, or would you like them to be created within this script? Set `calculate_centroid` to `True` if the columns do not exist yet and should be calculated by this script, or to `False` if you have already created them:

In [58]:
# Set to True to have the centroid columns calculated when the new data file
# is read further below, or to False if the new file already contains them.
# Example:
# calculate_centroid = False
calculate_centroid = False

# Functions

In [None]:
def filter_all(df, columns):
    """Placeholder for a not-yet-written filtering helper.

    The original definition had no body, which made the whole cell fail to
    parse. The intended behaviour is not specified anywhere in this notebook,
    so the function fails loudly instead of silently returning ``None``.

    Args:
        df: Table to filter (unused until the function is implemented).
        columns: Columns to filter on (unused until the function is
            implemented).

    Raises:
        NotImplementedError: Always.
    """
    # TODO: implement once the intended filtering rule is decided.
    raise NotImplementedError('filter_all has not been implemented yet')


def ChangeColumnNames(DataFrameFileLocation):
    """Read a CSV file and return it with its columns renamed.

    Each column found in the lookup returned by ``DictKeyValuesFromText()``
    is renamed to the corresponding value. For every other column the user is
    prompted interactively for the name to use.

    ``DataFrame.rename`` returns a new object rather than modifying the frame
    in place, so the renames are collected first and applied once; the
    renamed frame is what gets returned.

    NOTE(review): ``DictKeyValuesFromText`` is not defined in this cell; it
    is assumed to return a dict-like mapping of current column name to new
    column name -- confirm.

    Args:
        DataFrameFileLocation: Path (or buffer) passed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The file contents with renamed columns.
    """
    name_lookup = DictKeyValuesFromText()
    df = pd.read_csv(DataFrameFileLocation)

    renames = {}
    for column in df.columns:
        if column not in name_lookup.keys():
            # Unknown column: ask the user what it should be called.
            renames[column] = input("The column " + column + " is not in the list, give a name:")
        else:
            renames[column] = name_lookup[column]

    return df.rename(columns=renames)

# Import Metadata Description File

In [8]:
### update this to download the current version automatically ##################
col_metadata = pd.read_csv('../input_data/metadata_description.csv')
################################################################################


def _abbreviation_to_full_name(rows):
    """Return {AbbreviatedColumnName: FullColumnName} for the given metadata rows."""
    return pd.Series(
        rows.FullColumnName.values,
        index = rows.AbbreviatedColumnName
    ).to_dict()


# Split the metadata description on its `Required` flag and build one
# abbreviated-name -> full-name lookup for each group.
is_required = col_metadata.Required.eq(True)
is_optional = col_metadata.Required.eq(False)

required_fields = _abbreviation_to_full_name(col_metadata[is_required])
optional_fields = _abbreviation_to_full_name(col_metadata[is_optional])

# Import Official and New RTS Data Files

In [17]:
### update this to find the dataset online and download it #####################
rts_data_filepath = os.path.join('../input_data', 'rts_dataset_test_polygons_current.shp')

# Read the official data set, expand the abbreviated shapefile column names to
# their full names, and keep only the documented columns plus the geometry.
official_raw = gpd.read_file(rts_data_filepath)
official_renames = dict(required_fields, **optional_fields)
official_columns = list(required_fields.values()) + list(optional_fields.values())

rts_data = official_raw.rename(columns = official_renames)
rts_data = rts_data.filter(items = official_columns + ['geometry'])

# Every required and every optional column must be present in the official
# data set; required columns are checked first, then optional ones.
for field in official_columns:
    if field in rts_data.columns:
        continue
    raise ValueError('{field} is missing. Has the RTS data set been modified since download?'.format(field = repr(field)))

rts_data
################################################################################


Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,UUID,StabilizedRTS,MergedRTS
0,68.33918,70.016684,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,a6cf10e7-6515-5ab7-b7bb-66606b16ef10,,
1,68.339167,70.01622,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,dd137c86-6db6-57da-bb46-961c89845a97,,
2,68.332416,70.01648,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,83b6a6ee-b498-5307-ada5-c4558e63a611,,
3,68.3295,70.015499,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,6c8a822f-e027-561d-915f-f3cdfc462594,,
4,68.332959,70.014513,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,ec6c9c21-1404-552d-8f09-6016f68ed15b,,
5,68.334934,70.014369,Yamal-Gydan,Rodenhizer,2023-09-01,"01-05-2022,30-09-2022",WorldView-2,4.0,Positive,6484485f-b005-58ca-addb-f79829f5a929,,


In [59]:
# Read the new RTS polygons supplied by the user.
new_data = gpd.read_file(new_data_filepath)

if calculate_centroid:
    # Compute centroids in the file's own CRS, then reproject the points to
    # geographic coordinates so the stored values are degrees. The polygon
    # coordinates in the test file are projected (values in the millions),
    # whereas the existing centroid columns hold degrees.
    # NOTE(review): EPSG:4326 (WGS 84) is assumed to be the CRS used for the
    # official centroid columns -- confirm. `to_crs` raises if the input file
    # has no CRS defined.
    centroids = new_data.centroid.to_crs(epsg = 4326)

    # Latitude is the y coordinate and longitude is the x coordinate.
    new_data["CntrdLt"] = centroids.y
    new_data["CntrdLn"] = centroids.x

# The rename/filter/validation steps are the same whether or not the
# centroids were calculated above, so they are done once here.
# UUID is left out of the rename and of the required-column check: the error
# message below states that new files are not expected to provide it.
required_renames = {
    abbreviated: full_name
    for abbreviated, full_name in required_fields.items()
    if abbreviated != 'UUID'
}

new_data = (
    new_data
    .rename(columns = dict(
        required_renames,
        **optional_fields,
        **new_fields
        )
            )
    .filter(items = list(required_fields.values()) + list(optional_fields.values()) + list(new_fields.values()) + ['geometry'])
    )

for field in [item for item in required_fields.values() if item != 'UUID']: # Check if all required columns are present
    if field not in new_data.columns:
        raise ValueError('{field} is missing. Ensure that all required fields (except UUID) are present prior to running this script'
                         .format(field = repr(field)))

for field in [item for item in new_fields.values() if item != 'UUID']: # Check if all new columns are present
    if field not in new_data.columns:
        raise ValueError('{field} is missing. Did you specify the name of the new metadata field correctly?'
                         .format(field = repr(field)))

new_data

Unnamed: 0,CentroidLat,CentroidLon,RegionName,CreatorLab,ContributionDate,BaseMapDate,BaseMapSource,BaseMapResolution,TrainClass,CustomColumn1,geometry
0,68.339261,70.016553,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007199.012 865984.608, 2007188.217 ..."
1,68.340715,70.015426,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."
2,68.33235,70.016517,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007340.152 865838.514, 2007326.959 ..."
3,68.331148,70.015305,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007453.557 865845.775, 2007456.416 ..."
4,68.333416,70.01457,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007492.587 865949.766, 2007492.342 ..."
5,68.334955,70.014478,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2023,30-09-2023",WorldView-2,4.0,Positive,,"POLYGON ((2007479.747 865990.850, 2007472.484 ..."
6,68.340715,70.015426,Yamal-Gydan,Rodenhizer,2023-09-28,"01-05-2022,30-9-2022",WorldView-2,4.0,Positive,,"POLYGON ((2007289.337 866129.959, 2007283.521 ..."


In [60]:
# Display the optional-field lookup (abbreviated name -> full name) for inspection.
optional_fields

{'Area': 'Area'}