<a href="https://colab.research.google.com/github/KiaroRB/data-analysis-projects/blob/main/silver_data_ingestion_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Start (or reuse) the Spark session that every later cell relies on.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("bronze_ingestion")
    .getOrCreate()
)

In [6]:
!pip install -q gdown

In [7]:
import gdown

# Google Drive file id of the dataset downloaded below.
file_id = "1_0SQv1AFNNspJDRv1FgbWeVUBUqC5_Qb"
# Saves the file as online_retail.csv in the working directory; quiet=False keeps the progress bar.
gdown.download(id=file_id, output="online_retail.csv", quiet=False)

# Shareable link for the same file id:
# https://drive.google.com/file/d/1_0SQv1AFNNspJDRv1FgbWeVUBUqC5_Qb/view?usp=sharing

Downloading...
From: https://drive.google.com/uc?id=1_0SQv1AFNNspJDRv1FgbWeVUBUqC5_Qb
To: /content/online_retail.csv
100%|██████████| 45.6M/45.6M [00:00<00:00, 102MB/s]


'online_retail.csv'

In [8]:
# Load the downloaded CSV: ';'-delimited, first row is the header, and column
# types are inferred by Spark.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ";")
    .csv("online_retail.csv")
)

In [32]:
# Bronze layer: persist the DataFrame exactly as read, in Parquet format.
# "overwrite" replaces any output left by a previous run.
(
    df.write
    .mode("overwrite")
    .format("parquet")
    .save("bronze_online_retail_raw")
)

In [13]:
# Quick profile of the loaded data: size, a five-row sample and the schema.
row_count = df.count()
column_count = len(df.columns)

print("Total de registros:", row_count)
print("\nTotal de columnas:", column_count)

print("\n REGISTRO DEL DATASET")
sample_rows = df.limit(5)
sample_rows.show(truncate=False)

print("\n Verificación de datos")
df.printSchema()

Total de registros: 541909

Total de columnas: 8

 REGISTRO DEL DATASET
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate   |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+--------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |1/12/2010 8:26|2,55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |1/12/2010 8:26|3,39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |1/12/2010 8:26|2,75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |1/12/2010 8:26|3,39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |1/12/2010 8:26|3,39     |17850     |

In [14]:
# Deliverable 2 — Silver: cleaning + derived columns.
# Print the schema of the loaded DataFrame before any cleaning is applied.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [15]:
# Row count before cleaning; the Quantity rule reports its removals against it.
total_bronze = df.count()
print("Registros antes de limpiar:", total_bronze)

Registros antes de limpiar: 541909


In [16]:
# Rule 1: keep only rows with Quantity > 0.
from pyspark.sql.functions import col

df_quantity_ok = df.filter(col("Quantity") > 0)

df_quantity_ok.select("Quantity").show(5)

# Count once and reuse the number: every .count() is a separate Spark action.
quantity_ok_count = df_quantity_ok.count()

print("Registros luego de Quantity > 0:", quantity_ok_count)
print(
    "Eliminados por Quantity <= 0:",
    total_bronze - quantity_ok_count
)

+--------+
|Quantity|
+--------+
|       6|
|       6|
|       8|
|       6|
|       6|
+--------+
only showing top 5 rows
Registros luego de Quantity > 0: 531285
Eliminados por Quantity <= 0: 10624


In [25]:
# Fix the UnitPrice data type (string -> double).
from pyspark.sql.functions import regexp_replace

# The source writes prices with a decimal comma (e.g. "2,55"); replace it with
# a dot so the value can be cast to double.
df_price_cast = df_quantity_ok.withColumn(
    "UnitPrice",
    regexp_replace(col("UnitPrice"), ",", ".").cast("double")
)

df_price_cast.printSchema()

# Rule 2: UnitPrice > 0.
# Rows whose UnitPrice is null do not satisfy the comparison, so they are
# dropped here too and are included in the "eliminated" figure below.
df_price_ok = df_price_cast.filter(col("UnitPrice") > 0)

# Run each count a single time and reuse the results; every .count() is a
# separate Spark action.
count_before_price_rule = df_quantity_ok.count()
count_after_price_rule = df_price_ok.count()

print("Registros antes (Quantity OK):", count_before_price_rule)
print("Registros después (UnitPrice > 0):", count_after_price_rule)
print(
    "Eliminados por UnitPrice <= 0:",
    count_before_price_rule - count_after_price_rule
)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)

Registros antes (Quantity OK): 531285
Registros después (UnitPrice > 0): 530104
Eliminados por UnitPrice <= 0: 1181


In [27]:
# Rule 3: CustomerID IS NOT NULL.
df_customer_ok = df_price_ok.filter(col("CustomerID").isNotNull())

# Run each count a single time and reuse the results; every .count() is a
# separate Spark action.
count_before_customer_rule = df_price_ok.count()
count_after_customer_rule = df_customer_ok.count()

print("Registros antes Price:", count_before_customer_rule)
print("Registros después CustomerID NOT NULL:", count_after_customer_rule)
print(
    "Eliminados por CustomerID NULL:",
    count_before_customer_rule - count_after_customer_rule
)

Registros antes Price: 530104
Registros después CustomerID NOT NULL: 397884
Eliminados por CustomerID NULL: 132220


In [28]:
# Clean InvoiceDate: turn the text column into a timestamp.
from pyspark.sql.functions import expr

# The pattern is day-first ('d/M/yyyy H:mm'), matching values such as
# "1/12/2010 8:26". try_to_timestamp yields NULL for values that cannot be
# parsed instead of raising an error.
parsed_invoice_date = expr("try_to_timestamp(InvoiceDate, 'd/M/yyyy H:mm')")

df_silver = df_customer_ok.withColumn("InvoiceDate", parsed_invoice_date)

In [35]:
#Columnas derivadas
from pyspark.sql.functions import year, month

df_silver = (
    df_silver
    .withColumn("SaleAmount", col("Quantity") * col("UnitPrice"))
    .withColumn("Year", year(col("InvoiceDate")))
    .withColumn("Month", month(col("InvoiceDate")))
)

df_silver.select(
    "Quantity", "UnitPrice", "SaleAmount", "Year", "Month"
).show(5)

+--------+---------+------------------+----+-----+
|Quantity|UnitPrice|        SaleAmount|Year|Month|
+--------+---------+------------------+----+-----+
|       6|     2.55|15.299999999999999|2010|   12|
|       6|     3.39|             20.34|2010|   12|
|       8|     2.75|              22.0|2010|   12|
|       6|     3.39|             20.34|2010|   12|
|       6|     3.39|             20.34|2010|   12|
+--------+---------+------------------+----+-----+
only showing top 5 rows


In [31]:
# Silver layer: persist the cleaned DataFrame in Parquet format.
# "overwrite" replaces any output left by a previous run.
(
    df_silver.write
    .mode("overwrite")
    .format("parquet")
    .save("silver_online_retail_clean")
)

In [33]:
# List the working directory to check that the bronze and silver outputs exist.
!ls

bronze_online_retail_raw  sample_data
online_retail.csv	  silver_online_retail_clean


In [34]:
# List the files Spark wrote inside the silver output directory.
!ls silver_online_retail_clean

part-00000-002431e9-bd97-4686-997e-7e30882298e9-c000.snappy.parquet  _SUCCESS
part-00001-002431e9-bd97-4686-997e-7e30882298e9-c000.snappy.parquet
