In [7]:
import pandas as pd
import great_expectations as gx
from sqlalchemy import create_engine
import pandas as pd
import great_expectations as gx

# 1) Cargamos el dataset obtenido con el scraping

In [8]:
# Load the scraped used-cars listing from the staging area.
df_scrapeado = pd.read_csv(
    filepath_or_buffer='../data/staging/used_cars_scrapping.csv',
)

# 2) Validamos los datos con Great Expectations (GE)

In [9]:

# Work on a copy so the frame loaded from the CSV stays untouched.
df_prueba = df_scrapeado.copy()

# Coerce the numeric columns that are present; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
columnas_numericas = [nombre for nombre in ('price_clean', 'year') if nombre in df_prueba.columns]
for nombre in columnas_numericas:
    df_prueba[nombre] = pd.to_numeric(df_prueba[nombre], errors='coerce')

# Quick profile: dimensions, column names, dtypes and percentage of nulls.
porcentaje_nulos = df_prueba.isna().mean().mul(100).round(2)

print('Shape:', df_prueba.shape)
print('Columnas:', df_prueba.columns.tolist())
print('\nTipos:')
print(df_prueba.dtypes)
print('\nNulos (%):')
print(porcentaje_nulos)


# Validate `df_prueba` with Great Expectations: register the DataFrame as a
# whole-dataframe batch, declare the expectation suite, and run the validation.
context = gx.get_context()

data_source = context.data_sources.add_pandas(name="cars_data")
data_asset = data_source.add_dataframe_asset(name="cars_assets")

batch_definition_name = "cars_batch"
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

# The DataFrame itself is supplied at batch-retrieval time.
batch_parameters = {"dataframe": df_prueba}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

expectation_suite_name = "cars_suite"
suite = gx.ExpectationSuite(name=expectation_suite_name)

# --- Basic expectations -----------------------------------------------------
# The key column must not contain nulls.
suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="listing_id"))

# price_clean must not contain nulls ...
suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="price_clean"))
# ... and every price must be greater than or equal to 0.
suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="price_clean", min_value=0))

# currency must not contain nulls.
suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="currency"))

# The dataset must have between 1 and 300 rows.
suite.add_expectation(gx.expectations.ExpectTableRowCountToBeBetween(min_value=1, max_value=300))

# model must not contain nulls.
suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="model"))

# Every year must fall between 1980 and 2030.
suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="year", min_value=1980, max_value=2030))

# Every listing_id must be unique.
# The expectation has to be added to the suite: merely instantiating it (as the
# previous version of this cell did) means it is never evaluated.
suite.add_expectation(gx.expectations.ExpectColumnValuesToBeUnique(column="listing_id"))

# Register the expectation suite in the context.
context.suites.add(suite)

# Validate the batch against the suite and show the full result.
validation_results = batch.validate(suite)
print(validation_results)

# Optional: show the rows whose price is negative.
bad_prices = df_prueba.index[df_prueba['price_clean'] < 0].tolist()
print('Filas con precio negativo:', len(bad_prices))
if bad_prices:
    # The identifier column is 'listing_id'; the dataset has no 'key' column, so
    # selecting 'key' would raise KeyError exactly when there are rows to show.
    print(df_prueba.loc[bad_prices, ['listing_id','model','year','price_clean','currency']].head(10))

Shape: (300, 15)
Columnas: ['listing_id', 'brand', 'model', 'year', 'mileage_km', 'price_usd', 'city', 'state', 'fuel', 'transmission', 'seller_type', 'listing_date', 'country', 'price_clean', 'currency']

Tipos:
listing_id        int64
brand            object
model            object
year              int64
mileage_km        int64
price_usd       float64
city             object
state            object
fuel             object
transmission     object
seller_type      object
listing_date     object
country          object
price_clean     float64
currency         object
dtype: object

Nulos (%):
listing_id      0.0
brand           0.0
model           0.0
year            0.0
mileage_km      0.0
price_usd       0.0
city            0.0
state           0.0
fuel            0.0
transmission    0.0
seller_type     0.0
listing_date    0.0
country         0.0
price_clean     0.0
currency        0.0
dtype: float64


Calculating Metrics: 100%|██████████| 36/36 [00:00<00:00, 361.19it/s]


{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "batch_id": "cars_data-cars_assets",
          "column": "listing_id"
        },
        "meta": {},
        "id": "3e401298-1472-4aa0-b591-ab7c1f5d02f7",
        "severity": "critical"
      },
      "result": {
        "element_count": 300,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "batch_id": "cars_data-cars_assets",
          "column": "price_clean

# 3) Guardamos el dataset y lo subimos a PostgreSQL

In [4]:
# Persist the dataset to the curated area; "utf-8-sig" writes a UTF-8 BOM.
df_prueba.to_csv(
    path_or_buf="../data/curated/used_cars_final.csv",
    index=False,
    encoding="utf-8-sig",
)


In [None]:
import os
from urllib.parse import quote_plus

# Connection settings for the PostgreSQL warehouse.
# Each value can be overridden through an environment variable so that real
# credentials never need to be written into the notebook; the fallbacks are the
# values this cell previously hard-coded.
usuario = os.environ.get("DW_DB_USER", "etluser")
password = os.environ.get("DW_DB_PASSWORD", "etlpass")
host = os.environ.get("DW_DB_HOST", "localhost")
puerto = os.environ.get("DW_DB_PORT", "5432")
base_datos = os.environ.get("DW_DB_NAME", "dw")

# User and password are URL-quoted so characters such as '@', ':' or '/' in
# them cannot break the connection URL.
engine = create_engine(
    f"postgresql://{quote_plus(usuario)}:{quote_plus(password)}@{host}:{puerto}/{base_datos}"
)

# Upload the dataset; if_exists="replace" drops and recreates the table when it
# already exists.
df_prueba.to_sql("used_cars_final", engine, if_exists="replace", index=False)

print("Dataset subido correctamente a PostgreSQL en la tabla 'used_cars_final'")

Dataset subido correctamente a PostgreSQL en la tabla 'used_cars_final'
