# Imports y librerías

In [0]:
from pyspark.sql import functions as F
# NOTE: the wildcard import below shadows Python builtins such as max/min;
# it is kept for existing cells, but new code should go through the F alias.
from pyspark.sql.functions import *

# Conociendo los datos

## Lecturas de datos

In [0]:
file_location = "dbfs:/FileStore/tables"
file_type = "csv"


def read_csv_table(table_name):
    """Load `<file_location>/<table_name>.csv` into a Spark DataFrame.

    The first line of the file is treated as the header and the column
    types are inferred by Spark (`inferSchema`).

    Args:
        table_name: File name without the `.csv` extension.

    Returns:
        The DataFrame read from the file.
    """
    return (
        spark.read.format(file_type)
        .option("header", True)
        .option("inferSchema", True)
        .load(f"{file_location}/{table_name}.csv")
    )


# Datasets
df_pizzas = read_csv_table("pizzas")
df_pizza_types = read_csv_table("pizza_types")
# NOTE(review): with schema inference `orders.time` comes back as a timestamp;
# the displayed values carry a 2023-01-27 date part, so presumably only the
# time of day is meaningful — confirm before using it in date arithmetic.
df_orders = read_csv_table("orders")
df_order_details = read_csv_table("order_details")

## Esquemas

In [0]:
# Print the column names and types of the pizzas DataFrame.
df_pizzas.printSchema()

root
 |-- pizza_id: string (nullable = true)
 |-- pizza_type_id: string (nullable = true)
 |-- size: string (nullable = true)
 |-- price: double (nullable = true)



In [0]:
# Print the column names and types of the pizza_types DataFrame.
df_pizza_types.printSchema()

root
 |-- pizza_type_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- ingredients: string (nullable = true)



In [0]:
# Print the column names and types of the orders DataFrame.
df_orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- time: timestamp (nullable = true)



In [0]:
# Print the column names and types of the order_details DataFrame.
df_order_details.printSchema()

root
 |-- order_details_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- pizza_id: string (nullable = true)
 |-- quantity: integer (nullable = true)



## Displays de tablas y cantidad de registros por tablas

### Dataframe pizzas

In [0]:
# Render the pizzas DataFrame as an interactive table.
display(df_pizzas)

pizza_id,pizza_type_id,size,price
bbq_ckn_s,bbq_ckn,S,12.75
bbq_ckn_m,bbq_ckn,M,16.75
bbq_ckn_l,bbq_ckn,L,20.75
cali_ckn_s,cali_ckn,S,12.75
cali_ckn_m,cali_ckn,M,16.75
cali_ckn_l,cali_ckn,L,20.75
ckn_alfredo_s,ckn_alfredo,S,12.75
ckn_alfredo_m,ckn_alfredo,M,16.75
ckn_alfredo_l,ckn_alfredo,L,20.75
ckn_pesto_s,ckn_pesto,S,12.75


In [0]:
# Number of rows in the pizzas DataFrame.
df_pizzas.count()

Out[7]: 96

### Dataframe pizza_types

In [0]:
# Render the pizza_types DataFrame as an interactive table.
display(df_pizza_types)

pizza_type_id,name,category,ingredients
bbq_ckn,The Barbecue Chicken Pizza,Chicken,"Barbecued Chicken, Red Peppers, Green Peppers, Tomatoes, Red Onions, Barbecue Sauce"
cali_ckn,The California Chicken Pizza,Chicken,"Chicken, Artichoke, Spinach, Garlic, Jalapeno Peppers, Fontina Cheese, Gouda Cheese"
ckn_alfredo,The Chicken Alfredo Pizza,Chicken,"Chicken, Red Onions, Red Peppers, Mushrooms, Asiago Cheese, Alfredo Sauce"
ckn_pesto,The Chicken Pesto Pizza,Chicken,"Chicken, Tomatoes, Red Peppers, Spinach, Garlic, Pesto Sauce"
southw_ckn,The Southwest Chicken Pizza,Chicken,"Chicken, Tomatoes, Red Peppers, Red Onions, Jalapeno Peppers, Corn, Cilantro, Chipotle Sauce"
thai_ckn,The Thai Chicken Pizza,Chicken,"Chicken, Pineapple, Tomatoes, Red Peppers, Thai Sweet Chilli Sauce"
big_meat,The Big Meat Pizza,Classic,"Bacon, Pepperoni, Italian Sausage, Chorizo Sausage"
classic_dlx,The Classic Deluxe Pizza,Classic,"Pepperoni, Mushrooms, Red Onions, Red Peppers, Bacon"
hawaiian,The Hawaiian Pizza,Classic,"Sliced Ham, Pineapple, Mozzarella Cheese"
ital_cpcllo,The Italian Capocollo Pizza,Classic,"Capocollo, Red Peppers, Tomatoes, Goat Cheese, Garlic, Oregano"


In [0]:
# Number of rows in the pizza_types DataFrame.
df_pizza_types.count()

Out[9]: 32

### Dataframe orders

In [0]:
# Render the orders DataFrame as an interactive table.
display(df_orders)

order_id,date,time
1,2015-01-01,2023-01-27T11:38:36.000+0000
2,2015-01-01,2023-01-27T11:57:40.000+0000
3,2015-01-01,2023-01-27T12:12:28.000+0000
4,2015-01-01,2023-01-27T12:16:31.000+0000
5,2015-01-01,2023-01-27T12:21:30.000+0000
6,2015-01-01,2023-01-27T12:29:36.000+0000
7,2015-01-01,2023-01-27T12:50:37.000+0000
8,2015-01-01,2023-01-27T12:51:37.000+0000
9,2015-01-01,2023-01-27T12:52:01.000+0000
10,2015-01-01,2023-01-27T13:00:15.000+0000


In [0]:
# Number of rows in the orders DataFrame.
df_orders.count()

Out[11]: 21350

### Dataframe order_details

In [0]:
# Render the order_details DataFrame as an interactive table.
display(df_order_details)

order_details_id,order_id,pizza_id,quantity
1,1,hawaiian_m,1
2,2,classic_dlx_m,1
3,2,five_cheese_l,1
4,2,ital_supr_l,1
5,2,mexicana_m,1
6,2,thai_ckn_l,1
7,3,ital_supr_m,1
8,3,prsc_argla_l,1
9,4,ital_supr_m,1
10,5,ital_supr_m,1


In [0]:
# Number of rows in the order_details DataFrame.
df_order_details.count()

Out[13]: 48620

## Eliminación de duplicados

In [0]:
# Keep a single copy of every fully identical row.
df_pizzas = df_pizzas.distinct()

In [0]:
# Row count after removing duplicates (compare with the count taken before).
df_pizzas.count()

Out[15]: 96

In [0]:
#----------------------------------------------------------------------------

In [0]:
# Keep a single copy of every fully identical row.
df_pizza_types = df_pizza_types.distinct()

In [0]:
# Row count after removing duplicates (compare with the count taken before).
df_pizza_types.count()

Out[18]: 32

In [0]:
#------------------------------------------------------

In [0]:
# Keep a single copy of every fully identical row.
df_orders = df_orders.distinct()

In [0]:
# Row count after removing duplicates (compare with the count taken before).
df_orders.count()

Out[21]: 21350

In [0]:
#---------------------------------------------------------------------------

In [0]:
# Keep a single copy of every fully identical row.
df_order_details = df_order_details.distinct()

In [0]:
# Row count after removing duplicates (compare with the count taken before).
df_order_details.count()

Out[24]: 48620

## Valores Máximos y Mínimos

In [0]:
# Largest `quantity` value in order_details.
# F.max is Spark's column aggregate; calling it through the alias avoids
# relying on the wildcard import shadowing Python's builtin max.
df_order_details.select(F.max('quantity')).show()

+-------------+
|max(quantity)|
+-------------+
|            4|
+-------------+



In [0]:
# Smallest `quantity` value in order_details.
# F.min is Spark's column aggregate; calling it through the alias avoids
# relying on the wildcard import shadowing Python's builtin min.
df_order_details.select(F.min('quantity')).show()

+-------------+
|min(quantity)|
+-------------+
|            1|
+-------------+



# Prueba de imagen
**TODO:** acá probar insertar una imagen.

In [0]:
# The query below reads from a table named `orders`, which no earlier cell in
# this notebook creates. Register the orders DataFrame under that name so the
# cell does not depend on state left over from another session.
# NOTE(review): if `orders` is meant to be a pre-existing metastore table,
# this temp view will shadow it for the session — confirm and drop this line.
df_orders.createOrReplaceTempView("orders")

# Show only the time-of-day part (HH:mm:ss) of the `time` timestamp column.
display(spark.sql("SELECT (date_format(orders.time,'HH:mm:ss'))  FROM orders"))

"date_format(time, HH:mm:ss)"
11:38:36
11:57:40
12:12:28
12:16:31
12:21:30
12:29:36
12:50:37
12:51:37
12:52:01
13:00:15


# Conclusiones

Tenemos una tabla transaccional con las órdenes del negocio (para el análisis de aquí en adelante, una tabla de hechos o *facts*).