# Extract

In [6]:
# Allow Jupyter to import Python modules from parent folders
import sys
sys.path.append("../..")

from utils.functions import DB_MinioClient
from utils.constants import Constants
import pandas as pd
import io
import json

# Source object and bucket for the raw export
FILE_NAME = "pistacho.json"
BUCKET_FROM_NAME = Constants.STORAGE_LANDING_ZONE.value

# NOTE(review): connect() is used below as a minio-py client (get_object,
# bucket_exists, make_bucket, put_object) -- confirm in utils.functions.
minio_client = DB_MinioClient().connect()

# The object is returned as a streaming HTTP response that keeps a pooled
# connection open until it is closed and released, so clean it up even when
# reading or decoding fails.
response = minio_client.get_object(BUCKET_FROM_NAME, FILE_NAME)
try:
    data = response.read().decode("utf-8")
finally:
    response.close()
    response.release_conn()

# Parse the UTF-8 text into Python objects
data_json = json.loads(data)

# Clean

## Enclosures

In [7]:
# Flatten the nested "Exploitation Parcel" records into one column per field
df_enclosures = pd.json_normalize(data_json["Exploitation Parcel"])

# Code columns in which a blank value ("" or " ") is stored as 0
possible_zero_columns = [
    "CAP_Code.Municipality.Municipality_Code",
    "CAP_Code.Province.Province_Code",
    "CAP_Code.Agregate",
    "CAP_Code.Zone",
    "CAP_Code.Polygon",
    "CAP_Code.Parcel",
    "CAP_Code.Enclosure",
]
for column in possible_zero_columns:
    # Only object-dtype columns are touched; any other dtype is left as-is
    holds_objects = df_enclosures[column].dtype == "object"
    if not holds_objects:
        continue
    df_enclosures[column] = df_enclosures[column].replace(["", " "], 0)

# Source column name -> target column name
column_renames = {
    "CAP_Code.Province.Province_Code": "parcelProvinceId",
    "CAP_Code.Municipality.Municipality_Code": "parcelMunicipalityId",
    "CAP_Code.Polygon": "parcelPolygonId",
    "CAP_Code.Parcel": "parcelId",
    "CAP_Code.Enclosure": "parcelEnclosureId",
    "CAP_Code.Municipality.Municipality_Name": "parcelGeographicSpot",
    "CAP_Code.Agregate": "parcelAggregatedId",
    "CAP_Code.Zone": "parcelZoneId",
    "Use": "parcelUse",
    "Official_Area": "areaSIGPAC",
    "Used_Area": "area",
    "Crop.Species": "cropName",
    "Crop.Variety": "parcelVarietyId",
    "Rainfed_Irrigated": "irrigationKind",
    "Protected_Area?": "protectedArea",
}

# New columns appended with a constant value (in this order)
placeholder_columns = {
    "cropId": None,
    "tenureRegimeId": None,
    "plantationYear": None,
    "numberOfTrees": None,
    "plantationDensity": None,
    "ATRIA_ADV_ASV": None,
    "parcelVulnerableArea": None,
    "specificZones": None,
    "slope": 0,
    "UHC": None,
    "UHCDescription": None,
    "ZepaZone": None,
    "SIEZone": None,
}

# Values treated as missing and replaced by None
null_markers = {pd.NA: None, "NP": None, "NaN": None, "": None, "NULL": None}

# Rename, drop the source parcel id, append the placeholder columns and
# finally normalise the missing-value markers
df_enclosures = (
    df_enclosures
    .rename(columns=column_renames)
    .drop(columns=["Parcel_Id"])
    .assign(**placeholder_columns)
    .replace(null_markers)
)
df_enclosures

Unnamed: 0,parcelUse,areaSIGPAC,area,irrigationKind,protectedArea,parcelProvinceId,CAP_Code.Province.Province_Name,parcelMunicipalityId,parcelGeographicSpot,parcelAggregatedId,...,numberOfTrees,plantationDensity,ATRIA_ADV_ASV,parcelVulnerableArea,specificZones,slope,UHC,UHCDescription,ZepaZone,SIEZone
0,FS,13.63,13.63,Irrigated,N,47,,96,MORALEJA DE LAS PANADERAS,0,...,,,,,,0,,,,
1,TA,0.73,0.73,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
2,FS,2.14,2.14,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
3,VI,1.46,1.46,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
4,FS,3.4,3.4,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
5,FS,1.12,1.12,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
6,FS,4.35,4.35,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
7,FS,0.81,0.81,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
8,FS,5.34,5.34,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,
9,FS,2.17,2.17,Irrigated,N,47,,124,POZAL DE GALLINAS,0,...,,,,,,0,,,,


## Treatments

In [8]:
# NOTE(review): this cell is disabled. As written it cannot simply be
# re-enabled: it reads df_enclosures["Parcel_Id"], which the Enclosures cell
# drops, and an "enclosureId" column that the Enclosures cell never creates.
# df_treatments = pd.json_normalize(data_json["Phytosanitary_Action"])

# # Map Parcel_Id from df_enclosures to df_treatments
# df_treatments["enclosureId"] = df_treatments["Parcel_Id"].apply(
#     lambda x: df_enclosures.loc[df_enclosures["Parcel_Id"] == x, "enclosureId"].values[0])
# df_treatments

# Transform

In [9]:
# Serialise the cleaned enclosures in memory: with no target path,
# to_parquet returns the parquet file content as bytes
parquet_enclosures = df_enclosures.to_parquet(path=None)

# Load

In [10]:
BUCKET_TO_NAME = Constants.STORAGE_TRUSTED_ZONE.value

# Make sure the destination bucket is there before uploading
bucket_is_missing = not minio_client.bucket_exists(BUCKET_TO_NAME)
if bucket_is_missing:
    minio_client.make_bucket(BUCKET_TO_NAME)

# Metadata stored alongside the uploaded object
object_metadata = {
    "type": Constants.METADATA_PARCELS_AND_TREATMENTS_PARCELS.value,
    "source": "PISTACYL",
    "year": 2021,
    "state": "processed",
}

# Upload the parquet bytes; the payload size is the length of the bytes object
minio_client.put_object(
    BUCKET_TO_NAME,
    "ERP/PISTACYL/2021/pistachio_enclosures_2021.parquet",
    io.BytesIO(parquet_enclosures),
    length=len(parquet_enclosures),
    content_type="application/octet-stream",
    metadata=object_metadata,
)

<minio.helpers.ObjectWriteResult at 0x7f1c840b5b50>