In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import xarray as xr
from shapely.geometry import Polygon
import matplotlib.pyplot as plt
import sys
from datetime import datetime
import utils
import scripts.crop_type
import scripts.geom_processing
import scripts.observation_processing
import scripts.automation
import ipywidgets as widgets
from IPython.display import display
import geopandas as gpd
import pandas as pd
import utils

from utils import crop_variations, season_codes, variation_codes, group_codes #, crop_codes, database_schema
from scripts.crop_type import map_variations, code_variations, parse_crop_type, add_crop_code
from scripts.geom_processing import check_invalid_geoms, check_repeating_geoms, check_repeating_geoms_s2cell
from scripts.observation_processing import to_datetime,to_numeric,assign_date,yield_process

#sys.path.append('..')


In [10]:
# Generalizability check for the harmonization workflow on "simple" datasets
# (shapefile/geojson/parquet, optionally paired with an xlsx/csv table).
# Candidates are datasets such as Zelena/IMC/Kernel, where standardized outputs
# should be obtainable without much manual intervention.

test_path_shp = "data/2021_zelena_dolyna.shp"
# The reader is given a list of paths, so wrap the single shapefile path in one.
test_path_1 = [test_path_shp]
#test_path_par = "data/2023_kernel.parquet"
#test_path_xlsx = "data/2023_kernel_table.xlsx"


In [11]:
# Alternative (disabled) input: a geometry file plus a separate attribute table.
#test_path_par = "../../data/raw/UKRAINE/KERNEL/2023_kernel/2023_kernel.parquet"
#test_path_xlsx = "../../data/raw/UKRAINE/KERNEL/2023_kernel/2023_kernel_table.xlsx"

#test_path_2 = list([test_path_par,test_path_xlsx])

# Read the ground data from the listed file(s); when a geospatial and a tabular
# file are both given, the reader merges them (if required).
dataset =scripts.automation.read_ground_data(file_paths=test_path_1)
# NOTE: only the last expression of a cell is rendered, so the two previews
# below are evaluated but not displayed; the cell output is the row count.
dataset.head()
dataset.columns
len(dataset)
# The workflow operates on one year of data at a time (disabled filter below).
#dataset = dataset[dataset['Year'] == 2023]
#dataset.head()

545

In [12]:
# ====== GEOMETRIC OPERATIONS ======

# Reproject to EPSG:4326
dataset_4326 = dataset.to_crs(4326)

# Validity check. The result is indexed below as
#   [0] -> rows with invalid or null geometries, [1] -> frame passed on to the next step.
dataset_check_valid_geoms = scripts.geom_processing.check_invalid_geoms(gdf=dataset_4326)
invalid_geoms = dataset_check_valid_geoms[0]
if not invalid_geoms.empty:
    print("The invalid geoms are:", invalid_geoms)
else:
    print("no invalid geoms")

# Duplicate check based on S2 cells. The result is indexed below as
#   [0] -> duplicate geometries, [1] -> dataset carrying the "Geom_id" column.
dataset_4326_check_duplicate_geoms = scripts.geom_processing.check_repeating_geoms_s2cell(
    gdf=dataset_check_valid_geoms[1],
    col_name="Geom_id",
    granularity=25,
)
duplicate_geoms = dataset_4326_check_duplicate_geoms[0]
# NOTE(review): despite the name, no duplicate filtering is visible here - this is
# simply element [1] of the check result; confirm the helper drops duplicates.
dataset_4326_non_duplicate_geoms = dataset_4326_check_duplicate_geoms[1]
print("Duplicate geoms: ", len(duplicate_geoms))
print("Dataset with Geom_ids: ", len(dataset_4326_non_duplicate_geoms))

# Pass every geometry through geometry_remove_z (presumably drops the Z coordinate
# of 3D geometries - confirm against scripts.geom_processing).
dataset_4326_non_duplicate_geoms["geometry"] = dataset_4326_non_duplicate_geoms["geometry"].apply(
    scripts.geom_processing.geometry_remove_z
)
dataset_4326_non_duplicate_geoms.columns

no invalid geoms
Geom ids assigned to the geodataframe


Duplicate geoms:  0
Dataset with Geom_ids:  545



  centroids = gdf.geometry.centroid


Index(['id', 'folder', 'group', 'f_name', 'is_active', 'ndvi', 'ndvi_date',
       'adm_name', 'subad_name', 'locality', 'p_estimate', 'ext_id',
       'shrd_field', 'company', 'mapp_area', 'legl_area', 'till_area',
       'created_at', 'p_crop', 'p_crop_s', 'p_variety', 'crop', 'crop_s',
       'product', 'harvested', 'variety', 'till_type', 'sow_date', 'harv_date',
       'irrigation', 'n_crop', 'n_crop_s', 'n_variety', 'n_till_typ',
       'n_sow_date', 'n_product', 'desc', 'gdd', 'precip', 'precip_acc',
       'soil_temp', 'sm_70mm', 'sm_280mm', 'sm_1000mm', 'perimeter', 'parcels',
       'dis_risk', 'report', 'scout_repo', 'soil_test', 'center', 'age_of_sug',
       'curr_year_', 'next_year_', 'productivi', 'grain_clas', 'grain_humi',
       'grain_garb', 'history_it', 'moisture_z', 'agronomist', 'yield_map',
       'field_addi', 'field_soil', 'expected_y', 'legal_enti', 'field_id',
       'geometry', 'lat', 'lon', 'Geom_id'],
      dtype='object')

In [None]:
# ====== CROP TYPE OPERATIONS ======
ct_column = 'crop'  # source column holding the raw crop label

# Drop the rows whose crop label is null before any crop-type processing.
dataset_4326_non_duplicate_geoms = scripts.crop_type.remove_null_ct(
    dataset=dataset_4326_non_duplicate_geoms,
    ct_column_name=ct_column,
)

# Wrapper that assigns crop, group and season (and variation) to each row.
data_crop = scripts.automation.crop_type_preprocess(
    dataset=dataset_4326_non_duplicate_geoms,
    crop_column=ct_column,
)

# The raw crop column is dropped from the main frame once data_crop is built.
# NOTE: this makes the cell non-idempotent - re-running it without re-running the
# geometric-operations cell fails because ct_column no longer exists.
dataset_4326_non_duplicate_geoms = dataset_4326_non_duplicate_geoms.drop(ct_column, axis=1)
data_crop.head()
len(dataset_4326_non_duplicate_geoms)

Number of null crop rows removed from the dataset:  21
The crop types in this dataset:  Crop
Maize         195
Wheat         122
Sunflower      93
Sugar_Beet     83
Grassland      22
Fallow          9
Name: count, dtype: int64
The number of variations in this dataset:  Crop_Variation
Silage    22
Name: count, dtype: int64
The number of group in this dataset:  Crop_Group
Cereals              317
Oilseeds              93
Fruits/Vegetables     83
Non Crop              31
Name: count, dtype: int64


524

In [14]:
# ====== YIELD OPERATIONS ======

yield_col = "product"            # name of the yield column in this dataset
curr_yield_unit = "centner/ha"   # unit the provider reports yield in

obs_processing = scripts.observation_processing

# Step 1: coerce the yield column to numeric values.
dataset_4326_non_duplicate_geoms[yield_col] = obs_processing.to_numeric(
    series=dataset_4326_non_duplicate_geoms[yield_col]
)

# Step 2: convert from the provider unit to ton/ha (db standard).
# NOTE: re-running this cell applies the unit conversion a second time.
dataset_4326_non_duplicate_geoms[yield_col] = obs_processing.yield_process(
    column=dataset_4326_non_duplicate_geoms[yield_col],
    current_unit=curr_yield_unit,
)
len(dataset_4326_non_duplicate_geoms.index)

The yield units have been successfully changed from centner/ha to ton/ha 



524

In [15]:
# Inspect the distinct raw harvest dates before the datetime columns are standardized.
dataset_4326_non_duplicate_geoms['harv_date'].unique()

<DatetimeArray>
['2021-08-01 00:00:00', '2021-08-03 00:00:00', '2021-08-09 00:00:00',
 '2021-07-25 00:00:00', '2021-08-12 00:00:00', '2021-07-27 00:00:00',
 '2021-07-30 00:00:00', '2021-07-28 00:00:00', '2021-07-26 00:00:00',
 '2021-08-08 00:00:00',
 ...
 '2021-08-31 00:00:00', '2021-09-08 00:00:00', '2021-09-02 00:00:00',
 '2021-07-14 00:00:00', '2021-10-17 00:00:00', '2021-07-15 00:00:00',
 '2021-08-25 00:00:00', '2021-08-22 00:00:00', '2021-08-30 00:00:00',
 '2021-07-11 00:00:00']
Length: 126, dtype: datetime64[ms]

In [16]:
# Datetime column standardization.
# Maps each source date column to its standardized name; both share one date format.
cols_to_convert = {
    source_col: {"new_col": standard_col, "format": "%Y-%m-%d"}
    for source_col, standard_col in (
        ("sow_date", "Sow_Date"),
        ("harv_date", "Harvest_Date"),
    )
}
dataset_4326_non_duplicate_geoms = scripts.observation_processing.convert_multiple_datetime(
    df=dataset_4326_non_duplicate_geoms,
    dict_mapping=cols_to_convert,
    func=scripts.observation_processing.to_datetime,
)
#dataset_4326_non_duplicate_geoms["Sow_Date"].unique()

In [17]:
# Provenance columns: data provider and path of the original source file.
provider_name = "Zelena_Dolyna"
path_name = test_path_shp
#test_path_par + " + " + test_path_xlsx #Assign path to original file

dataset_4326_non_duplicate_geoms["Provider"] = provider_name
dataset_4326_non_duplicate_geoms["Src_File"] = path_name

# Put the crop-type columns next to the remaining attributes (joined on the row index).
data_refined_ct = pd.concat([data_crop, dataset_4326_non_duplicate_geoms], axis=1)

# Split Sow_Date into Sow_Year / Sow_Month / Sow_Day; nullable Int64 keeps missing dates as <NA>.
sow_date_parts = data_refined_ct["Sow_Date"].dt
for date_part in ("year", "month", "day"):
    data_refined_ct[f"Sow_{date_part.capitalize()}"] = getattr(sow_date_parts, date_part).astype("Int64")
#data_refined_ct["Sow_Day"].unique()

In [18]:
#FOR UKRAINE
# Estimate the sowing year for rows that have no recorded sowing date.
# est_year_data is the dataset (harvest) year. Winter crops are sown in the autumn
# of the previous calendar year (the rows with known dates in this dataset show
# winter wheat sown in Sep-Nov 2020 and harvested in 2021), while all other crops
# are sown in the dataset year itself.
est_year_data = 2021
est_year_data_winter = est_year_data - 1

# Only rows with a missing Sow_Year receive an estimate; the rest keep Est_Sow_Year empty.
mask_null_sow_year = data_refined_ct["Sow_Year"].isnull()
is_winter = data_refined_ct["Crop_Season"] == "Winter"

# Winter crops -> year before the dataset year.
# (Previously the two values were swapped, giving winter crops the dataset year.)
data_refined_ct.loc[mask_null_sow_year & is_winter, "Est_Sow_Year"] = est_year_data_winter
# Every other season (including rows with no season) -> dataset year.
data_refined_ct.loc[mask_null_sow_year & ~is_winter, "Est_Sow_Year"] = est_year_data
data_refined_ct.head()

Unnamed: 0,Crop_Group,Crop_Season,Crop_Variation,Crop,id,folder,group,f_name,is_active,ndvi,...,lon,Geom_id,Sow_Date,Harvest_Date,Provider,Src_File,Sow_Year,Sow_Month,Sow_Day,Est_Sow_Year
21,Cereals,Winter,,Wheat,4,Тульчин,"АФ ""Крищинці""",60014290 Агробаза,Yes,0.165,...,28.908754,40cd42009ff6e4,2020-10-25,2021-08-01,Zelena_Dolyna,data/2021_zelena_dolyna.shp,2020,10,25,
22,Cereals,Winter,,Wheat,5,Тульчин,"АФ ""Крищинці""",60014250 За стадіоном,Yes,0.185,...,28.922421,40cd69e3ba2dfc,2020-10-27,2021-08-03,Zelena_Dolyna,data/2021_zelena_dolyna.shp,2020,10,27,
23,Cereals,Winter,,Wheat,12,Тульчин,"АФ ""Крищинці""",60014190 За черешневою,Yes,0.153,...,28.941137,40cd42810230dc,2020-10-31,2021-08-09,Zelena_Dolyna,data/2021_zelena_dolyna.shp,2020,10,31,
24,Cereals,Winter,,Wheat,14,Томашпіль,"АФ ""Стіна""",20014010 За коритною 3,Yes,0.194,...,28.359179,40cd262cc6ca5c,2020-09-20,2021-07-25,Zelena_Dolyna,data/2021_zelena_dolyna.shp,2020,9,20,
25,Cereals,Winter,,Wheat,15,Томашпіль,"АФ ""Стіна""",20014210 Над лісником,Yes,0.356,...,28.400062,40cd276ec3ca0c,2020-11-26,2021-08-12,Zelena_Dolyna,data/2021_zelena_dolyna.shp,2020,11,26,


In [19]:
# Assign a season id to all the rows.
# Runs after the Sow_* / Est_Sow_Year columns have been added to data_refined_ct
# (presumably the helper reads them - confirm against assign_season_id).
data_final_refined = scripts.observation_processing.assign_season_id(data_refined_ct)
# Column listing used to pick the source columns for the final schema mapping.
data_final_refined.columns

Index(['Crop_Group', 'Crop_Season', 'Crop_Variation', 'Crop', 'id', 'folder',
       'group', 'f_name', 'is_active', 'ndvi', 'ndvi_date', 'adm_name',
       'subad_name', 'locality', 'p_estimate', 'ext_id', 'shrd_field',
       'company', 'mapp_area', 'legl_area', 'till_area', 'created_at',
       'p_crop', 'p_crop_s', 'p_variety', 'crop_s', 'product', 'harvested',
       'variety', 'till_type', 'sow_date', 'harv_date', 'irrigation', 'n_crop',
       'n_crop_s', 'n_variety', 'n_till_typ', 'n_sow_date', 'n_product',
       'desc', 'gdd', 'precip', 'precip_acc', 'soil_temp', 'sm_70mm',
       'sm_280mm', 'sm_1000mm', 'perimeter', 'parcels', 'dis_risk', 'report',
       'scout_repo', 'soil_test', 'center', 'age_of_sug', 'curr_year_',
       'next_year_', 'productivi', 'grain_clas', 'grain_humi', 'grain_garb',
       'history_it', 'moisture_z', 'agronomist', 'yield_map', 'field_addi',
       'field_soil', 'expected_y', 'legal_enti', 'field_id', 'geometry', 'lat',
       'lon', 'Geom_id', '

In [29]:
# Assign the final standardized column names.
# Builds a mapper object from the frame's columns and the grouped target schema;
# show() renders it as an interactive widget.
# NOTE: the mapping read back from this object depends on the widget selections,
# so the result is not reproducible from code alone.
final_cols = scripts.automation.display_final_cols(df = data_final_refined, schema_dict = utils.target_schema_grouped)
final_cols.show()

VBox(children=(Accordion(children=(VBox(children=(HBox(children=(Label(value='Crop:', layout=Layout(width='200…

In [30]:
# Read the mapping back from the widget: keys are source columns, values are the
# standardized names. Reflects whatever is selected in the widget at run time.
rename_dict = final_cols.get_mapping()
rename_dict

{'Crop': 'Crop',
 'variety': 'Variety',
 'Season_id': 'Season_id',
 'Geom_id': 'Geom_id',
 'Provider': 'Provider',
 'id': 'Src_id',
 'Src_File': 'Src_File',
 'Crop_Group': 'Crop_Group',
 'Crop_Season': 'Crop_Season',
 'Crop_Variation': 'Crop_Variation',
 'geometry': 'geometry',
 'Sow_Date': 'Sow_Date',
 'Est_Sow_Year': 'Est_Sow_Year',
 'Sow_Day': 'Sow_Day',
 'Sow_Month': 'Sow_Month',
 'Sow_Year': 'Sow_Year',
 'Harvest_Date': 'Harvest_Date'}

In [31]:
# Rename the mapped columns to their standardized names, then keep only those
# columns, in the order in which they appear in the mapping.
standard_columns = list(rename_dict.values())
data_standard = data_final_refined.rename(columns=rename_dict)[standard_columns]
data_standard.columns

Index(['Crop', 'Variety', 'Season_id', 'Geom_id', 'Provider', 'Src_id',
       'Src_File', 'Crop_Group', 'Crop_Season', 'Crop_Variation', 'geometry',
       'Sow_Date', 'Est_Sow_Year', 'Sow_Day', 'Sow_Month', 'Sow_Year',
       'Harvest_Date'],
      dtype='object')

In [34]:
# Re-wrap as a GeoDataFrame with an explicit geometry column so the result can be
# written with to_file (the earlier concat/rename/selection steps may have
# returned a plain DataFrame - TODO confirm).
data_standard = gpd.GeoDataFrame(data_standard, geometry='geometry')

# Pass the driver explicitly so the output format does not depend on
# file-extension inference.
data_standard.to_file("data/zel_2021.geojson", driver="GeoJSON")

# Preview goes last: only the final expression of a cell is rendered, so the
# original leading head() call was evaluated and silently discarded.
data_standard.head()

In [None]:
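#removing yield_clean
# NOTE(review): everything below in this cell is commented-out code for an inline
# column-mapping UI (dropdown per schema field plus a "Generate Mapping" button).
# It looks like an earlier version of scripts.automation.display_final_cols - confirm
# before deleting. It also uses pprint, which is not imported in this notebook.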
#def create_simple_mapper(user_df, schema_grouped):
#    """
#    Creates a UI using the simplified category dictionary.
#    """
#    user_cols = ['(None)'] + sorted(list(user_df.columns))
#    mapping_widgets = {}
#    
#    accordion = widgets.Accordion(children=[])
#    titles = []
#    children = []
#    
#    # Iterate directly through your simplified dictionary
#    for category, fields in schema_grouped.items():
#        titles.append(category)
#        rows = []
#        
#        for field in fields:
#            # Auto-Match Logic
#            default_val = '(None)'
#            for col in user_df.columns:
#                if col.lower() == field.lower():
#                    default_val = col
#                    break
#            
#            # UI Layout
#            lbl = widgets.Label(value=f"{field}:", layout=widgets.Layout(width='200px'))
#            dd = widgets.Dropdown(options=user_cols, value=default_val, layout=widgets.Layout(width='300px'))
#            
#            mapping_widgets[field] = dd
#            rows.append(widgets.HBox([lbl, dd], layout=widgets.Layout(margin='2px')))
#        
#        # Add tab content (Vertical stack)
#        children.append(widgets.VBox(rows, layout=widgets.Layout(padding='10px')))
#
#    accordion.children = tuple(children)
#    for i, title in enumerate(titles):
#        accordion.set_title(i, title)
#        
#    display(accordion)
#    return mapping_widgets
#
## --- RUN THE UI ---
## Pass the dataframe and the NEW generic dictionary
#widgets_dict = create_simple_mapper(data_final_refined, utils.target_schema_grouped)
#
## --- BUTTON LOGIC WITH "UNASSIGNED" CHECK ---
#btn = widgets.Button(description="Generate Mapping & Check Missing", layout=widgets.Layout(width='300px'))
#output = widgets.Output()
#
#def on_click(b):
#    with output:
#        output.clear_output()
#        final_mapping = {}
#        assigned_source_cols = set()
#        
#        # 1. Build the mapping dict
#        for field, widget in widgets_dict.items():
#            if widget.value != '(None)':
#                final_mapping[widget.value] = field
#                assigned_source_cols.add(widget.value)
#        
#        # 2. Find columns in the Input File that were NOT assigned
#        all_source_cols = set(data_final_refined.columns)
#        unmapped_cols = all_source_cols - assigned_source_cols
#        
#        # 3. Print Results
#        print(f"Successfully mapped {len(final_mapping)} columns.\n")
#        
#        print("--- MAPPING DICTIONARY ---")
#        pp = pprint.PrettyPrinter(indent=4)
#        pp.pprint(final_mapping)
#        
#        print("\n" + "="*40)
#        print(f"UNMAPPED COLUMNS ({len(unmapped_cols)})")
#        print("The following columns from your file were NOT assigned to any category:")
#        print("="*40)
#        
#        if unmapped_cols:
#            # Sort them so they are easy to read
#            for col in sorted(list(unmapped_cols)):
#                print(f" • {col}")
#        else:
#            print("Great! All columns from your file have been assigned.")
#
#btn.on_click(on_click)
#display(widgets.VBox([widgets.Label(""), btn, output]))