# Spark SQL Valores Nulos

In [14]:
from pyspark.sql import SparkSession

In [15]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('nulos')
    .getOrCreate()
)

In [16]:
# Load the CSV: the first line provides the column names and Spark infers the column types.
df = spark.read.csv(
    'Null.csv',
    header=True,
    inferSchema=True,
)

In [17]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Nombre: string (nullable = true)
 |-- Ventas: integer (nullable = true)
 |-- Clientes: integer (nullable = true)



In [18]:
# Display the rows, including the ones that contain null values.
df.show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|  null|       3|
|emp2|  null|  null|    null|
|emp3|  null|   345|    null|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [19]:
# thresh = minimum number of non-null values a row must have in order to be kept.
# The DataFrame has 4 columns, so thresh=4 drops every row that contains any null.
df.na.drop(thresh=4).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [23]:
# how='any': drop a row as soon as it contains at least one null value.
df.na.drop(how='any').show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [25]:
# how='all': drop a row only when every one of its values is null.
# No row is removed here because Id is never null.
df.na.drop(how='all').show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|  null|       3|
|emp2|  null|  null|    null|
|emp3|  null|   345|    null|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [27]:
# Only the Clientes column is checked for nulls; nulls in other columns are ignored.
df.na.drop(subset=['Clientes']).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|  null|       3|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [29]:
# Only the Nombre column is checked for nulls; nulls in other columns are ignored.
df.na.drop(subset=['Nombre']).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|  null|       3|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



# Rellenar = fill()

In [31]:
# Replace nulls with 0. A numeric fill value only affects the numeric columns
# (Ventas, Clientes); the string column Nombre keeps its nulls.
df.na.fill(0).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|     0|       3|
|emp2|  null|     0|       0|
|emp3|  null|   345|       0|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



In [32]:
# Replace nulls with 0 in the Ventas column only; other columns are left untouched.
df.na.fill(value=0, subset=['Ventas']).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|     0|       3|
|emp2|  null|     0|    null|
|emp3|  null|   345|    null|
|emp4| Cindy|   456|       4|
+----+------+------+--------+



# Imputar la media

In [33]:
from pyspark.sql.functions import mean

In [34]:
# Build the aggregate expression first, then evaluate it.
# collect() brings the result to the driver as a list holding a single Row.
mean_ventas_col = mean(df['Ventas'])
media = df.select(mean_ventas_col).collect()
media

[Row(avg(Ventas)=400.5)]

In [38]:
# Take the first Row of the collected result, then its first field,
# to get the mean as a plain Python number.
media[0][0]

400.5

In [39]:
# Impute the nulls in Ventas with the column mean.
# NOTE: Ventas is an integer column, so the 400.5 mean shows up as 400 in the result.
# Cast Ventas to double before filling if the fractional part must be preserved.
df.na.fill(value=media[0][0], subset=['Ventas']).show()

+----+------+------+--------+
|  Id|Nombre|Ventas|Clientes|
+----+------+------+--------+
|emp1|  John|   400|       3|
|emp2|  null|   400|    null|
|emp3|  null|   345|    null|
|emp4| Cindy|   456|       4|
+----+------+------+--------+

