## CREAR DATAFRAME DESDE CERO

In [0]:
# Sample rows, one tuple per person: (firstname, lastname, id, gender, salary).
# The padded last names and the None values are kept exactly as-is: later
# cells in this notebook use them to demonstrate trim() and dropna().
datos = [
    (None, 'Smith   ', '36636', 'M', 3500),
    ('Michael', '   Rose', '40288', 'M', 4750),
    ('Robert', 'Williams', '42114', 'M', None),
    ('Maria', '    Jones    ', '39192', 'F', 4000),
]


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for the hand-written rows. Each add() call takes
# (column name, data type, nullable); 'lastname' and 'id' are the only
# columns declared as non-nullable.
esquema = (
    StructType()
    .add('firstname', StringType(), True)
    .add('lastname', StringType(), False)
    .add('id', StringType(), False)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)


In [0]:
# Build the DataFrame from the in-memory rows, applying the explicit schema.
df = spark.createDataFrame(data=datos, schema=esquema)

# Print the schema to confirm the declared column types were applied.
df.printSchema()


In [0]:
# Print the rows of the hand-built DataFrame.
df.show()

## CREAR DATAFRAME DESDE ARCHIVO (I)

In [0]:
# DBFS location of the sales CSV file that the next cell loads.
file = 'dbfs:/FileStore/shared_uploads/edurf.cld@gmail.com/sales-20.csv'

In [0]:
# Load the sales CSV: the first line is treated as the header and the column
# types are inferred from the data rather than declared up front.
sales_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", True)
    .load(file)
)

# Print the inferred schema.
sales_df.printSchema()


In [0]:
# Print the loaded sales rows without truncating long cell values.
sales_df.show(truncate=False)

## CREAR DATAFRAME DESDE ARCHIVO (II)

In [0]:
# StructType and StructField are imported here as well so this cell no longer
# depends on an earlier cell having been executed first.
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DateType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema applied when reading the persons JSON file (instead of
# letting Spark infer it). Every field is declared nullable; 'fav_movies' is
# an array of strings.
persons_schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('fav_movies', ArrayType(StringType()), True),
    StructField('salary', FloatType(), True),
    StructField('image_url', StringType(), True),
    StructField('date_of_birth', DateType(), True),
    StructField('active', BooleanType(), True)
])


In [0]:
# Rebinds `file` (previously the CSV path) to the DBFS location of the
# persons JSON file that the next cell loads.
file = 'dbfs:/FileStore/shared_uploads/edurf.cld@gmail.com/persons-15.json'

In [0]:
# Read the persons JSON file with the explicit schema. The 'multiline' option
# is enabled - presumably because the file is one multi-line JSON document
# rather than one record per line; confirm against the data file.
persons_df = (
    spark.read
    .format('json')
    .option('multiline', True)
    .schema(persons_schema)
    .load(file)
)

# Print the loaded rows.
persons_df.show()


In [0]:
# Render the DataFrame through display(), which is not defined in this file -
# presumably the notebook environment's built-in rich table viewer.
display(persons_df)

### CONSULTA DE DATOS DF

In [0]:
# Project five columns by name and print the first 10 rows untruncated.
(
    sales_df
    .select('Order_ID', 'Item_Type', 'Units_Sold', 'Unit_Price', 'Country')
    .show(10, truncate=False)
)


In [0]:
# Same kind of projection, but referring to the column as an attribute of the
# DataFrame instead of by its string name.
(
    sales_df
    .select(sales_df.Order_ID)
    .show(10, truncate=False)
)

In [0]:
from pyspark.sql.functions import col, expr

# Select two columns via col() plus a derived TOTAL_PRICE column computed
# with a SQL expression (units sold times unit price).
sales_df.select(
    col('Order_ID'),
    col('Item_Type'),
    expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
).show(10)


In [0]:
# Keep only the rows for Spain within the Europe region (both conditions in a
# single filter), then show 5 rows including the derived TOTAL_PRICE.
(
    sales_df
    .filter((col('Region') == 'Europe') & (col('Country') == 'Spain'))
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
    )
    .show(5)
)


In [0]:
# Same Europe/Spain selection, expressed as two chained where() calls instead
# of one combined condition.
(
    sales_df
    .where(col('Region') == 'Europe')
    .where(col('Country') == 'Spain')
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        expr("Units_Sold * Unit_Price as TOTAL_PRICE"),
    )
    .show(5)
)


In [0]:
# Sort by units sold (highest first), breaking ties by country name in
# ascending order, and print the first 20 rows untruncated.
(
    sales_df
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        col('Units_Sold'),
    )
    .orderBy(col('Units_Sold').desc(), col('Country').asc())
    .show(20, truncate=False)
)


In [0]:
# How many different regions appear in the data...
print(
    sales_df
    .select('Region')
    .distinct()
    .count()
)

# ...and which ones they are.
(
    sales_df
    .select('Region')
    .distinct()
    .show(truncate=False)
)


In [0]:
# Apply the same ordering, cap the result at 20 rows with limit(), and count
# them. The count is the cell's final expression, so the notebook echoes it.
(
    sales_df
    .select(
        col('Order_ID'),
        col('Country'),
        col('Item_Type'),
        col('Units_Sold'),
    )
    .orderBy(col('Units_Sold').desc(), col('Country').asc())
    .limit(20)
    .count()
)


## EJERCICIO
### Devolver los campos producto, unidades vendidas y fechas de pedido y de envío de las ventas de la Zona Logística de Asia, ordenadas por país. Solo nos interesan las 10 primeras.

## MODIFICAR DATOS DATAFRAME

In [0]:
from pyspark.sql.functions import lit

# Fixed value: add a 'Sent' column holding the literal False on every row.
# The result is only displayed; sales_df is not reassigned.
(
    sales_df
    .withColumn("Sent", lit(False))
    .show(5)
)

# Computed field: add 'Total_Price' as units sold times unit price.
(
    sales_df
    .withColumn("Total_Price", expr("Units_Sold *  Unit_Price"))
    .show(5)
)


In [0]:
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
sales_df.printSchema()

# Show the data with 'Region' renamed to 'Logist_Area'. The renamed DataFrame
# is only displayed; sales_df is not reassigned.
sales_df.withColumnRenamed('Region','Logist_Area').show()

In [0]:
# Keep the DataFrame that carries the computed 'Total_Price' column
# (units sold times unit price) under its own name for the next cells.
resumen_df = sales_df.withColumn(
    "Total_Price",
    expr("Units_Sold *  Unit_Price"),
)

# Preview the first 5 rows.
resumen_df.show(5)


In [0]:
# Remove two columns from the summary DataFrame.
resumen_df2 = resumen_df.drop(
    'Unit_Price',
    'Region',
)

# Print the schema to confirm both columns are gone.
resumen_df2.printSchema()


In [0]:
# Rows before the clean-up, for comparison.
df.show()

# Drop the rows whose salary is null.
not_null_df = df.dropna(subset=['salary'])

# Rows after the clean-up.
not_null_df.show()


In [0]:
# Replace 'Order_ID' with itself cast to string and print the resulting
# schema. The result is not assigned, so sales_df keeps its original type.
(
    sales_df
    .withColumn('Order_ID', col('Order_ID').cast('string'))
    .printSchema()
)


In [0]:
from pyspark.sql.functions import ltrim, rtrim, trim

# Rows before the fix, for comparison.
df.show()

# Overwrite 'lastname' with its trimmed value.
corregido2 = df.withColumn(
    'lastname',
    trim(col('lastname')),
)

# Rows after the fix.
corregido2.show()


## EXTRA: ESCRIBIR (GUARDAR) DATAFRAME RESULTADO, FORMATO PARQUET (DISTRIBUIDO)

In [0]:
# Destination on DBFS for the Parquet output.
path = 'dbfs:/FileStore/shared_uploads/edurf.cld@gmail.com/sales'

# Write sales_df as Parquet with snappy compression; "overwrite" mode
# replaces whatever already exists at the destination.
(
    sales_df.write
    .format("parquet")
    .mode("overwrite")
    .option("compression", "snappy")
    .save(path)
)


In [0]:
%fs ls 'dbfs:/FileStore/shared_uploads/edurf.cld@gmail.com/sales'
