# Ground Data Preprocessing Notebook

This notebook standardizes raw ground data (geometry + attributes)
into a harmonized format required by the automation pipeline.

## Expected Inputs
- Geometry file: .shp, .geojson, or .parquet 
- Attribute file: .xlsx or .csv (optional)

## Important Variables

- Crop Type
- Sow Date 
- Cropping calendar information
- Provider Name

## Output
- Standardized parquet file ready for ingestion




In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import xarray as xr
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
import sys
from datetime import datetime
import utils
import scripts.crop_type
import scripts.geom_processing
import scripts.observation_processing
import scripts.automation
import ipywidgets as widgets
from IPython.display import display
import geopandas as gpd
import pandas as pd
import utils

from utils import crop_variations, season_codes, variation_codes, group_codes #, crop_codes, database_schema
from scripts.crop_type import map_variations, code_variations, parse_crop_type, add_crop_code
from scripts.geom_processing import check_invalid_geoms, check_repeating_geoms, check_repeating_geoms_s2cell
from scripts.observation_processing import to_datetime,to_numeric,assign_date,yield_process

#sys.path.append('..')

In [2]:
# ======================
# USER INPUT SECTION
# ======================
# Everything a user needs to change for a new dataset lives in this cell.

COUNTRY = "Ukraine"
PROVIDER_NAME = "ExampleProvider"
SEASON_YEAR = 2023
# Column used to join the geometry file with the attribute table.
JOIN_KEY = "Field"

# Geometry file (required).
INPUT_GEOMETRY = "data/2023_kernel.parquet"
# Attribute table (optional): set to None when the geometry file already
# carries all attributes.
INPUT_ATTRIBUTES = "data/2023_kernel_table.xlsx"

# Paths handed to the reader. Built from whichever inputs are actually set, so
# dropping the attribute table only requires INPUT_ATTRIBUTES = None above.
TEST_PATH = [path for path in (INPUT_GEOMETRY, INPUT_ATTRIBUTES) if path]

# Human-readable provenance string, later stored in the Src_File column.
PATH_NAME = " + ".join(TEST_PATH)


In [3]:
# Read the input file(s); when an attribute table is supplied it is merged with
# the geometry using JOIN_KEY on both sides.
ground_data_all_years = scripts.automation.read_ground_data(
    file_paths=TEST_PATH,
    join_key=(JOIN_KEY, JOIN_KEY),
)

# The pipeline works on one season at a time: keep only SEASON_YEAR rows.
is_target_season = ground_data_all_years['Year'] == SEASON_YEAR
dataset = ground_data_all_years[is_target_season]
dataset.head()


--- DEBUGGING EXCEL LOAD: 2023_kernel_table.xlsx ---
Columns detected: ['Unnamed: 0', 'Year', 'Crop', 'Region', 'Yield', 'Yield_clean', 'Harvest_area', 'Field_area', 'Planned_area', 'Sowing', 'Harvesting', 'N', 'P', 'K', 'Field']
Row count: 4195
First 2 rows:
   Unnamed: 0  Year   Crop               Region Yield Yield_clean  \
0           0  2022  Herbs  Кременчуцький район   NaN         NaN   
1           1  2022  Herbs      Оржицький район   NaN         NaN   

   Harvest_area  Field_area  Planned_area Sowing Harvesting          N  \
0         120.9       122.6         122.6    NaT        NaT  11181.120   
1         187.6        93.9          93.9    NaT        NaT  11222.442   

        P      K  Field  
0    0.00    0.0      0  
1  479.64  831.0      1  
Sanitizing keys: Casting 'Field' and 'Field'...
Merging on Field == Field


Unnamed: 0.1,geometry,Field,Unnamed: 0,Year,Crop,Region,Yield,Yield_clean,Harvest_area,Field_area,Planned_area,Sowing,Harvesting,N,P,K
1,"POLYGON Z ((33.20118 49.19568 -100000, 33.2011...",0,547,2023,Herbs,Кременчуцький район,,,,122.6,120.9,NaT,NaT,11297.6,0.0,0.0
3,"POLYGON Z ((32.61933 49.99669 -100000, 32.6191...",1,548,2023,Herbs,Оржицький район,,,,93.9,93.9,NaT,NaT,,,
5,"POLYGON Z ((32.17066 50.28364 -100000, 32.1706...",10,555,2023,Herbs,Пирятинський район,,,,84.09,84.09,2023-04-17,NaT,7406.0,0.0,0.0
7,"MULTIPOLYGON Z (((32.9605 50.31382 -100000, 32...",1000,1132,2023,Soy,Чорнухинський район,,,,160.31,160.77,2023-05-14,NaT,5554.8,1028.8,257.2
8,"POLYGON Z ((32.96351 50.30416 -100000, 32.9631...",1001,1133,2023,Soy,Чорнухинський район,,,,34.03,34.03,2023-05-12,NaT,1161.98,217.6,54.4


## 1. Geometric Operations

In [4]:
# Reproject to EPSG:4326.
dataset_4326 = dataset.to_crs(4326)

# Validity check. The helper's result is indexed as:
#   [0] -> the invalid / null geometries (reported below)
#   [1] -> the frame handed on to the duplicate check
dataset_check_valid_geoms = scripts.geom_processing.check_invalid_geoms(gdf=dataset_4326)
invalid_geoms = dataset_check_valid_geoms[0]
if not invalid_geoms.empty:
    print("The invalid geoms are:", invalid_geoms)
else:
    print("no invalid geoms")

# Duplicate check. check_repeating_geoms_s2cell returns the duplicate
# geometries ([0]) together with the whole dataset with a Geom_id assigned ([1]).
dataset_4326_check_duplicate_geoms = scripts.geom_processing.check_repeating_geoms_s2cell(
    gdf=dataset_check_valid_geoms[1],
    col_name="Geom_id",
    granularity=25,
)
dataset_4326_duplicate_geoms = dataset_4326_check_duplicate_geoms[0]
dataset_with_geom_ids = dataset_4326_check_duplicate_geoms[1]
print("Duplicate geoms: ", len(dataset_4326_duplicate_geoms))
print("Dataset with Geom_ids: ", len(dataset_with_geom_ids))

# Duplicates are reported but NOT removed: first check whether they are genuine
# duplicates from the same year. To drop them, replace the assignment below with:
#dataset_4326_non_duplicate_geoms = dataset_with_geom_ids.drop(dataset_4326_duplicate_geoms.index)
dataset_4326_non_duplicate_geoms = dataset_with_geom_ids

# Remove 3D geoms (strip the Z coordinate from every geometry).
geometry_2d = dataset_4326_non_duplicate_geoms["geometry"].apply(scripts.geom_processing.geometry_remove_z)
dataset_4326_non_duplicate_geoms["geometry"] = geometry_2d
dataset_4326_non_duplicate_geoms.columns

no invalid geoms



  centroids = gdf.geometry.centroid


No duplicated geoms in this dataset
Geom ids assigned to the geodataframe


Duplicate geoms:  0
Dataset with Geom_ids:  2134


Index(['geometry', 'Field', 'Unnamed: 0', 'Year', 'Crop', 'Region', 'Yield',
       'Yield_clean', 'Harvest_area', 'Field_area', 'Planned_area', 'Sowing',
       'Harvesting', 'N', 'P', 'K', 'lat', 'lon', 'Geom_id'],
      dtype='str')

## 2. Crop Type Operations

In [5]:
# ====== CROP TYPE OPERATIONS ======
# Name of the raw crop-type column in the provider data.
ct_column = 'Crop'

# Drop the rows that have no crop type.
rows_with_crop = scripts.crop_type.remove_null_ct(
    dataset=dataset_4326_non_duplicate_geoms,
    ct_column_name=ct_column,
)

# Wrapper that assigns the standardized crop, group and season to each row.
data_crop = scripts.automation.crop_type_preprocess(dataset=rows_with_crop, crop_column=ct_column)

# The raw crop column is superseded by data_crop, so remove it from the main frame.
# NOTE: this rebinds the cell's own input without ct_column; re-run from
# section 1 rather than re-running this cell on its own.
dataset_4326_non_duplicate_geoms = rows_with_crop.drop(columns=[ct_column])
data_crop.head()

Number of null crop rows removed from the dataset:  0
The crop types in this dataset:  Crop
Sunflower    607
Soybean      419
Maize        363
Wheat        336
Rapeseed     269
Fallow       114
Grassland     20
Barley         6
Name: count, dtype: int64
The number of variations in this dataset:  Series([], Name: count, dtype: int64)
The number of group in this dataset:  Crop_Group
Oilseeds          876
Cereals           705
Legumes/Pulses    419
Non Crop          134
Name: count, dtype: int64


Unnamed: 0,Crop_Group,Crop_Season,Crop_Variation,Crop
0,Non Crop,,,Grassland
1,Non Crop,,,Grassland
2,Non Crop,,,Grassland
3,Legumes/Pulses,,,Soybean
4,Legumes/Pulses,,,Soybean


## 3. Yield Operations

In [10]:
# ====== YIELD OPERATIONS ======
# Name of the yield column in the provider data.
yield_col = "Yield"
# Unit the provider reports yield in; the database standard is ton/ha.
current_yield_unit = "ton/ha"

# Step 1: coerce the yield column to numeric.
yield_numeric = scripts.observation_processing.to_numeric(series=dataset_4326_non_duplicate_geoms[yield_col])
dataset_4326_non_duplicate_geoms[yield_col] = yield_numeric

# Step 2: convert the yield values to ton/ha.
yield_standard = scripts.observation_processing.yield_process(
    column=dataset_4326_non_duplicate_geoms[yield_col],
    current_unit=current_yield_unit,
)
dataset_4326_non_duplicate_geoms[yield_col] = yield_standard

The yield units have been successfully changed from ton/ha to ton/ha 



## 4. Datetime Operations

In [11]:
# Datetime standardization: map each raw date column to its standardized
# column name and the date format to parse it with.
cols_to_convert = {
    "Sowing": dict(new_col="Sow_Date", format="%Y-%m-%d"),
    # Enable to standardize the harvest date as well:
    #"Harvesting": dict(new_col="Harvest_Date", format="%Y-%m-%d"),
}
dataset_4326_non_duplicate_geoms = scripts.observation_processing.convert_multiple_datetime(
    df=dataset_4326_non_duplicate_geoms,
    dict_mapping=cols_to_convert,
    func=scripts.observation_processing.to_datetime,
)
# Optional check:
#dataset_4326_non_duplicate_geoms["Sow_Date"].unique()

## 5. Allocation of required columns

In [12]:
# Tag every row with its provider and the source file(s) it was read from.
dataset_4326_non_duplicate_geoms["Provider"] = PROVIDER_NAME
dataset_4326_non_duplicate_geoms["Src_File"] = PATH_NAME

# pd.concat(axis=1) aligns rows on the index and outer-joins labels that do not
# match, so differing indexes would silently yield NaN-padded, misaligned rows.
# Fail loudly instead of exporting crop labels attached to the wrong fields.
if not data_crop.index.equals(dataset_4326_non_duplicate_geoms.index):
    raise ValueError(
        "data_crop and dataset_4326_non_duplicate_geoms have different indexes; "
        "re-run the notebook from the top or reset both indexes before joining."
    )

# Join the crop-type columns with the remaining attributes and geometry.
data_refined_ct = pd.concat([data_crop, dataset_4326_non_duplicate_geoms], axis=1)

# Assign the sowing information columns (derived from Sow_Date by the helper).
data_refined_sy = scripts.automation.assign_sow_info(dataset=data_refined_ct)
# Optional check:
#data_refined_sy['Sow_Day'].value_counts()

Columns 'Sow_Year', 'Sow_Month' and 'Sow_Date' assigned from 'Sow_Date'


## 6. Allocation of Est_Sow_Year

In [13]:
# Estimate the sowing year for each row. The country comes from the COUNTRY
# constant in the user-input section (it was previously hard-coded here, so
# changing the configuration had no effect on this step).
data_refined = scripts.automation.assign_est_sow_year(
    country=COUNTRY,
    dataset=data_refined_sy,
    season_year=SEASON_YEAR,
)
data_refined['Est_Sow_Year'].value_counts()

Column 'Est_Sow_Year' successfully assigned


Est_Sow_Year
2023    129
Name: count, dtype: Int64

## 7. Allocation of Season id 

In [14]:
# Give every row its season id, then list the column dtypes as a final sanity
# check before the columns are renamed to the target schema.
data_final_refined = scripts.observation_processing.assign_season_id(
    gdf=data_refined,
)
data_final_refined.dtypes

Crop_Group                   str
Crop_Season                  str
Crop_Variation           float64
Crop                         str
geometry                geometry
Field                        str
Unnamed: 0                 int64
Year                       int64
Region                       str
Yield                    float64
Yield_clean               object
Harvest_area             float64
Field_area               float64
Planned_area             float64
Sowing            datetime64[us]
Harvesting        datetime64[us]
N                        float64
P                        float64
K                        float64
lat                      float64
lon                      float64
Geom_id                      str
Sow_Date          datetime64[us]
Provider                     str
Src_File                     str
Sow_Year                   Int64
Sow_Month                  Int64
Sow_Day                    Int64
Est_Sow_Year               Int64
Season_id                    str
dtype: obj

## 8. Standardization of column names

In [None]:
# Build and show the column-mapping widget for the target schema.
# NOTE(review): the mapping read from this widget in the next cell presumably
# depends on selections made interactively here, so it is not reproducible
# from code alone — confirm.
target_schema = utils.target_schema_grouped_temp
final_cols = scripts.automation.display_final_cols(
    df=data_final_refined,
    schema_dict=target_schema,
)
final_cols.show()

VBox(children=(Accordion(children=(VBox(children=(HBox(children=(HTML(value="<b>Geom_id *</b> <span style='col…

In [13]:
# Read the column mapping back from the widget object as a dict of
# {current column name: standardized column name}; it drives the rename and
# column selection in the export step.
rename_dict = final_cols.get_mapping()
rename_dict

{'Geom_id': 'Geom_id',
 'Season_id': 'Season_id',
 'Sow_Year': 'Sow_Year',
 'Est_Sow_Year': 'Est_Sow_Year',
 'Field': 'Src_id',
 'Src_File': 'Src_File',
 'Provider': 'Provider',
 'Crop': 'Crop',
 'geometry': 'geometry',
 'Crop_Season': 'Crop_Season',
 'Crop_Group': 'Crop_Group',
 'Crop_Variation': 'Crop_Variation',
 'Sow_Date': 'Sow_Date',
 'Sow_Day': 'Sow_Day',
 'Yield': 'Yield'}

## Final Output 

Export standardized dataset

In [19]:
# Standardized column names, in the order given by the mapping.
standard_columns = list(rename_dict.values())

# Rename to the target schema and keep only the mapped columns.
data_renamed = data_final_refined.rename(columns=rename_dict)
data_selected = data_renamed[standard_columns]

# Wrap the result as a GeoDataFrame with 'geometry' as the active geometry column.
data_standard = gpd.GeoDataFrame(data_selected, geometry='geometry')
data_standard.columns

Index(['Geom_id', 'Season_id', 'Sow_Year', 'Est_Sow_Year', 'Src_id',
       'Src_File', 'Provider', 'Crop', 'geometry', 'Crop_Season', 'Crop_Group',
       'Crop_Variation', 'Sow_Date', 'Sow_Day', 'Yield'],
      dtype='str')

In [20]:
# Write the standardized dataset to disk.
# NOTE(review): the notebook intro describes the output as a standardized
# parquet file, but this writes a GeoJSON "check" file — confirm which is intended.
output_path = "data/check_file.geojson"
data_standard.to_file(output_path)