# Extract

In [1]:
# Allow Jupyter to import Python modules from other folders
import sys
sys.path.append("../..")

from utils.functions import DB_MinioClient
from utils.constants import Constants
import pandas as pd
import io

FILE_NAME = "cultivos_identificadores.xlsx"
BUCKET_FROM_NAME = Constants.STORAGE_LANDING_ZONE.value

# Connect to MinIO
minio_client = DB_MinioClient().connect()
# Download the raw Excel file from the landing-zone bucket.
# The MinIO response holds a pooled HTTP connection, so it is always closed
# and released once the payload has been read (even if the read fails).
response = minio_client.get_object(BUCKET_FROM_NAME, FILE_NAME)
try:
    data = response.read()
finally:
    response.close()
    response.release_conn()
# Parse the "productos" sheet; empty strings are treated as missing values
df = pd.read_excel(io.BytesIO(data), engine="openpyxl",
                   sheet_name="productos", na_values=[''])


# Validate

In [2]:
import pandera as pa

# Expected layout of the extracted sheet: both columns are mandatory
# (nulls are rejected), an integer code plus a text product name.
expected_columns = {
    "CÓDIGO": pa.Column(pa.Int, nullable=False),
    "PRODUCTO SIEX": pa.Column(pa.String, nullable=False),
}
schema = pa.DataFrameSchema(columns=expected_columns)

# Check the frame against the schema; being the last expression of the cell,
# the value returned by validate() is rendered as the cell output.
schema.validate(df)

Unnamed: 0,CÓDIGO,PRODUCTO SIEX
0,1,ABACA ALIAS MANILA
1,2,ABEDUL
2,3,ABETO
3,4,ABETO DE DOUGLAS
4,5,ACACIA
...,...,...
440,945,ZANAHORIA
441,946,ZAPOTE
442,947,ZARZAMORA
443,948,ZULLA


# Clean

In [3]:
# Rename the columns positionally to the target names; the assignment fails
# when the frame does not have exactly two columns.
try:
    df.columns = ["id", "name"]
except Exception as e:
    # Abort the flow with a readable message. The error text is interpolated
    # (instead of being passed as a second argument, which would render the
    # message as a tuple) and the original exception is chained as the cause.
    raise ValueError(f"Error changing column names: {e}") from e
# Modify data
# Map the placeholder markers for "no value" (NULL, NP, NaN, empty string,
# pd.NA) to None
df = df.replace(
    {pd.NA: None, "NP": None, "NaN": None, "": None, "NULL": None})
# Normalize product names to uppercase
df["name"] = df["name"].str.upper()
# Store the identifier as a string rather than a number
df["id"] = df["id"].astype(str)

In [4]:
# Serialize the cleaned frame to Parquet in memory: with no target path,
# pandas returns the file content as bytes instead of writing to disk
parquet_buffer = df.to_parquet(path=None)

# Load

In [5]:
# Connect to MinIO
minio_client = DB_MinioClient().connect()
# Build the target object name by swapping only the file extension; a plain
# substring replace would also rewrite "xlsx" appearing elsewhere in the name
file_name = FILE_NAME.rsplit(".", 1)[0] + ".parquet"
# Wrap the Parquet bytes in a file-like object, as required for the upload
processed_data = io.BytesIO(parquet_buffer)
# Save data to the trusted-zone bucket; the write result is the cell output
minio_client.put_object(
    Constants.STORAGE_TRUSTED_ZONE.value,
    f"ERP/unknown/{file_name}",
    processed_data,
    length=processed_data.getbuffer().nbytes,
    content_type="application/octet-stream",
    metadata={
        "source": "unknown",
        "type": "trusted"
    }
)



<minio.helpers.ObjectWriteResult at 0x7f92940fd250>