# 1.-Inicializando SparkSession

In [1]:
import os
import warnings

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [2]:
# Get (or reuse) a local Spark application named "Engine" running on all
# available cores, with the PostgreSQL JDBC driver jar registered through
# `spark.jars`, and work in a new session derived from it.
spark = (
    SparkSession.builder
    .config("spark.jars", "/home/jovyan/drivers/postgresql-42.2.18.jar")
    .master("local[*]")
    .appName("Engine")
    .getOrCreate()
    .newSession()
)
# From here on, suppress every Python warning in this kernel.
warnings.filterwarnings("ignore")

22/06/25 04:43:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# 2.-Spark Data Sources

## CSV

In [3]:
# Load the maintenance CSV: the first line provides the column names and the
# column types are inferred from the data.
df_csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/home/jovyan/data/TBL_MAINTENANCE.csv")
)

In [4]:
# Preview the first four rows of the CSV-backed DataFrame.
df_csv.show(n=4)

+--------+-----------+--------------+-------+----------+-----------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|
+--------+-----------+--------------+-------+----------+-----------------+
|   15021|          0|         31109|      3|        82|              0.0|
|   23733|          0|         34809|      1|       200|           2299.0|
|   39363|          0|         34737|      6|        71|              0.0|
|    3112|          0|         31811|      1|       200|           2199.0|
+--------+-----------+--------------+-------+----------+-----------------+
only showing top 4 rows



## Base De Datos

In [5]:
# Read the `tbl_maintenance` table from PostgreSQL over JDBC.
# The user and password are looked up in the environment so they do not have
# to be stored in the notebook; the fallbacks are the values this cell
# originally hard-coded, so a run without those variables behaves as before.
df_db = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/postgres")
    .option("dbtable", "tbl_maintenance")
    .option("user", os.environ.get("POSTGRES_USER", "airflow"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "airflow"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [6]:
# Preview the first four rows of the JDBC-backed DataFrame.
df_db.show(n=4)

+--------+-----------+--------------+-------+----------+-----------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|
+--------+-----------+--------------+-------+----------+-----------------+
|   15021|          0|         31109|      3|        82|              0.0|
|   23733|          0|         34809|      1|       200|           2299.0|
|   39363|          0|         34737|      6|        71|              0.0|
|    3112|          0|         31811|      1|       200|           2199.0|
+--------+-----------+--------------+-------+----------+-----------------+
only showing top 4 rows



# 3.-Inspección De Data

In [7]:
# List each column name paired with its Spark SQL type name.
df_db.dtypes

[('stock_id', 'int'),
 ('estimate_id', 'int'),
 ('maintenance_id', 'int'),
 ('type_id', 'int'),
 ('package_id', 'int'),
 ('price_maintenance', 'double')]

In [8]:
# Print the schema as a tree: column name, type and nullability.
df_db.printSchema()

root
 |-- stock_id: integer (nullable = true)
 |-- estimate_id: integer (nullable = true)
 |-- maintenance_id: integer (nullable = true)
 |-- type_id: integer (nullable = true)
 |-- package_id: integer (nullable = true)
 |-- price_maintenance: double (nullable = true)



In [9]:
# Number of rows in df_db.
df_db.count()

7944

In [10]:
# Summary statistics (count, mean, stddev, ...) for every column, printed
# vertically: one record per statistic, one line per column.
df_db.describe().show(vertical=True)

-RECORD 0-------------------------------
 summary           | count              
 stock_id          | 7944               
 estimate_id       | 7944               
 maintenance_id    | 7944               
 type_id           | 7944               
 package_id        | 7944               
 price_maintenance | 7944               
-RECORD 1-------------------------------
 summary           | mean               
 stock_id          | 34278.433660624374 
 estimate_id       | 2086421.2721550856 
 maintenance_id    | 34852.09151560927  
 type_id           | 4.913897280966768  
 package_id        | 146.1897029204431  
 price_maintenance | 715.5368806646545  
-RECORD 2-------------------------------
 summary           | stddev             
 stock_id          | 21838.325367366007 
 estimate_id       | 3330541.3818029696 
 maintenance_id    | 2863.3480228740473 
 type_id           | 1.9631858471755368 
 package_id        | 59.921845861454386 
 price_maintenance | 763.9643133297569  
-RECORD 3-------

In [11]:
# Print the physical plan Spark uses to produce df_db.
df_db.explain()

== Physical Plan ==
*(1) Scan JDBCRelation(tbl_maintenance) [numPartitions=1] [stock_id#59,estimate_id#60,maintenance_id#61,type_id#62,package_id#63,price_maintenance#64] PushedFilters: [], ReadSchema: struct<stock_id:int,estimate_id:int,maintenance_id:int,type_id:int,package_id:int,price_maintenan...




# 4.-Duplicate Values

In [12]:
# Rebind df_db to the `cat_type_package` table, read from PostgreSQL over JDBC.
# The user and password are looked up in the environment so they do not have
# to be stored in the notebook; the fallbacks are the values this cell
# originally hard-coded, so a run without those variables behaves as before.
df_db = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/postgres")
    .option("dbtable", "cat_type_package")
    .option("user", os.environ.get("POSTGRES_USER", "airflow"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "airflow"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [13]:
# Display the first rows of df_db (show() prints up to 20 rows by default).
df_db.show()

+----------+-----------------+-------------------+
|package_id|     package_name|               date|
+----------+-----------------+-------------------+
|        65|Warranty Included|2019-09-04 17:00:00|
|        70|Warranty Included|2019-09-05 17:00:00|
|        71|Warranty Included|2019-09-06 17:00:00|
|        76|Warranty Included|2019-09-07 17:00:00|
|        78|Warranty Included|2019-09-08 17:00:00|
|        81|Warranty Included|2019-09-09 17:00:00|
|        88|Warranty Included|2019-09-10 17:00:00|
|        92|Warranty Included|2019-09-11 17:00:00|
|        94|Warranty Included|2019-09-12 17:00:00|
|        96|Warranty Included|2019-09-13 17:00:00|
|        98|Warranty Included|2019-09-14 17:00:00|
|       106|Warranty Included|2019-09-15 17:00:00|
|        71|  Warranty Upsell|2019-09-04 17:00:00|
|        76|  Warranty Upsell|2019-09-05 17:00:00|
|        78|  Warranty Upsell|2019-09-06 17:00:00|
|        81|  Warranty Upsell|2019-09-07 17:00:00|
|        82|  Warranty Upsell|2

In [14]:
# List every distinct package name once.
df_db.select("package_name").distinct().show()

+-----------------+
|     package_name|
+-----------------+
|  Warranty Upsell|
|  Paying Customer|
|Warranty Included|
+-----------------+



# 5.-Add, Update y Remove Columns

In [15]:
# Rebind df_db to the `tbl_maintenance` table, read from PostgreSQL over JDBC.
# The user and password are looked up in the environment so they do not have
# to be stored in the notebook; the fallbacks are the values this cell
# originally hard-coded, so a run without those variables behaves as before.
df_db = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/postgres")
    .option("dbtable", "tbl_maintenance")
    .option("user", os.environ.get("POSTGRES_USER", "airflow"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "airflow"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [16]:
# Add a constant column: every row gets value_set = 1.
df_db = df_db.withColumn(colName="value_set", col=F.lit(1))

In [17]:
# Check the new column on the first three rows.
df_db.show(n=3)

+--------+-----------+--------------+-------+----------+-----------------+---------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|value_set|
+--------+-----------+--------------+-------+----------+-----------------+---------+
|   15021|          0|         31109|      3|        82|              0.0|        1|
|   23733|          0|         34809|      1|       200|           2299.0|        1|
|   39363|          0|         34737|      6|        71|              0.0|        1|
+--------+-----------+--------------+-------+----------+-----------------+---------+
only showing top 3 rows



In [18]:
# Rename the column value_set to value_count; the data is unchanged.
df_db = df_db.withColumnRenamed(existing="value_set", new="value_count")

In [19]:
# Check the renamed column on the first three rows.
df_db.show(n=3)

+--------+-----------+--------------+-------+----------+-----------------+-----------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|value_count|
+--------+-----------+--------------+-------+----------+-----------------+-----------+
|   15021|          0|         31109|      3|        82|              0.0|          1|
|   23733|          0|         34809|      1|       200|           2299.0|          1|
|   39363|          0|         34737|      6|        71|              0.0|          1|
+--------+-----------+--------------+-------+----------+-----------------+-----------+
only showing top 3 rows



In [20]:
# Remove the value_count column from df_db.
df_db = df_db.drop("value_count")

# 6.-Queries 

## Select

In [21]:
# Show which columns are available before selecting from them.
df_db.printSchema()

root
 |-- stock_id: integer (nullable = true)
 |-- estimate_id: integer (nullable = true)
 |-- maintenance_id: integer (nullable = true)
 |-- type_id: integer (nullable = true)
 |-- package_id: integer (nullable = true)
 |-- price_maintenance: double (nullable = true)



In [22]:
# Project two columns and preview the first four rows.
df_db.select(["maintenance_id", "price_maintenance"]).show(n=4)

+--------------+-----------------+
|maintenance_id|price_maintenance|
+--------------+-----------------+
|         31109|              0.0|
|         34809|           2299.0|
|         34737|              0.0|
|         31811|           2199.0|
+--------------+-----------------+
only showing top 4 rows



## When

In [23]:
# Label each row by its price: a price equal to 0 becomes "Con Descuento",
# every other row becomes "Sin Descuento :(".
df_db = df_db.withColumn(
    "price_maintenance_name",
    F.when(F.col("price_maintenance") == 0, "Con Descuento").otherwise("Sin Descuento :("),
)

## Like

In [24]:
# SQL LIKE on the label column; the pattern contains no wildcard characters.
df_db.filter(F.col("price_maintenance_name").like("Con Descuento")).show(n=4)

+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|price_maintenance_name|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|   15021|          0|         31109|      3|        82|              0.0|         Con Descuento|
|   39363|          0|         34737|      6|        71|              0.0|         Con Descuento|
|   20276|          0|         31734|      6|        71|              0.0|         Con Descuento|
|   38030|          0|         33803|      6|        78|              0.0|         Con Descuento|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
only showing top 4 rows



## Startswith - Endswith

In [25]:
# Keep rows whose label begins with "Con".
df_db.filter(F.col("price_maintenance_name").startswith("Con")).show(n=4)

+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|price_maintenance_name|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|   15021|          0|         31109|      3|        82|              0.0|         Con Descuento|
|   39363|          0|         34737|      6|        71|              0.0|         Con Descuento|
|   20276|          0|         31734|      6|        71|              0.0|         Con Descuento|
|   38030|          0|         33803|      6|        78|              0.0|         Con Descuento|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
only showing top 4 rows



In [26]:
# Keep rows whose label ends with "(".
df_db.filter(F.col("price_maintenance_name").endswith("(")).show(n=4)

+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|price_maintenance_name|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|   23733|          0|         34809|      1|       200|           2299.0|      Sin Descuento :(|
|    3112|          0|         31811|      1|       200|           2199.0|      Sin Descuento :(|
|   20061|          0|         32147|      6|       200|           1250.0|      Sin Descuento :(|
|  100197|    8848023|         38654|      2|       200|            550.0|      Sin Descuento :(|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
only showing top 4 rows



## Substring (Tarea Moral)

## Between (Tarea Moral)

# 7.-Group By

In [27]:
# Total maintenance price per type_id (at most ten groups are printed).
df_db.groupBy("type_id").sum("price_maintenance").show(n=10)

+-------+----------------------+
|type_id|sum(price_maintenance)|
+-------+----------------------+
|      1|             2242039.0|
|      6|             3203176.0|
|      3|              117550.0|
|      4|                8196.0|
|      2|    113263.98000000019|
+-------+----------------------+



# 8.-Filter

In [28]:
# Rows whose maintenance price equals 0.0, first twelve only.
df_db.where(F.col("price_maintenance") == 0.0).show(n=12)

+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|stock_id|estimate_id|maintenance_id|type_id|package_id|price_maintenance|price_maintenance_name|
+--------+-----------+--------------+-------+----------+-----------------+----------------------+
|   15021|          0|         31109|      3|        82|              0.0|         Con Descuento|
|   39363|          0|         34737|      6|        71|              0.0|         Con Descuento|
|   20276|          0|         31734|      6|        71|              0.0|         Con Descuento|
|   38030|          0|         33803|      6|        78|              0.0|         Con Descuento|
|  103849|   10170732|         38532|      6|        78|              0.0|         Con Descuento|
|   24550|          0|         33328|      6|        78|              0.0|         Con Descuento|
|   21991|          0|         31998|      6|        96|              0.0|         Con Descuento|
|   35743|          

# 9.-Sort

In [29]:
# Rebind df_db to the `cat_type_package` table, read from PostgreSQL over JDBC.
# The user and password are looked up in the environment so they do not have
# to be stored in the notebook; the fallbacks are the values this cell
# originally hard-coded, so a run without those variables behaves as before.
df_db = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/postgres")
    .option("dbtable", "cat_type_package")
    .option("user", os.environ.get("POSTGRES_USER", "airflow"))
    .option("password", os.environ.get("POSTGRES_PASSWORD", "airflow"))
    .option("driver", "org.postgresql.Driver")
    .load()
)

In [30]:
# Display the rows ordered by package_id, ascending (the default direction).
df_db.sort("package_id").show()

+----------+-----------------+-------------------+
|package_id|     package_name|               date|
+----------+-----------------+-------------------+
|        65|Warranty Included|2019-09-04 17:00:00|
|        70|Warranty Included|2019-09-05 17:00:00|
|        71|Warranty Included|2019-09-06 17:00:00|
|        71|  Warranty Upsell|2019-09-04 17:00:00|
|        76|Warranty Included|2019-09-07 17:00:00|
|        76|  Warranty Upsell|2019-09-05 17:00:00|
|        78|Warranty Included|2019-09-08 17:00:00|
|        78|  Warranty Upsell|2019-09-06 17:00:00|
|        81|Warranty Included|2019-09-09 17:00:00|
|        81|  Warranty Upsell|2019-09-07 17:00:00|
|        82|  Warranty Upsell|2019-09-08 17:00:00|
|        88|Warranty Included|2019-09-10 17:00:00|
|        92|Warranty Included|2019-09-11 17:00:00|
|        94|Warranty Included|2019-09-12 17:00:00|
|        96|Warranty Included|2019-09-13 17:00:00|
|        96|  Warranty Upsell|2019-09-09 17:00:00|
|        98|  Warranty Upsell|2