In [1]:
from pyspark.sql import SparkSession

In [2]:
# Reuse the active Spark session if one exists, otherwise start one named "Missing".
spark = (
    SparkSession.builder
    .appName("Missing")
    .getOrCreate()
)

In [3]:
# Load the sample CSV: the first line supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Python-and-Spark-for-Big-Data-master/Spark_DataFrames/ContainsNull.csv')
)

In [7]:
# Ask for up to five rows as a list of Row objects; the file only holds four,
# so the whole dataset comes back.
df.head(5)

[Row(Id='emp1', Name='John', Sales=None),
 Row(Id='emp2', Name=None, Sales=None),
 Row(Id='emp3', Name=None, Sales=345.0),
 Row(Id='emp4', Name='Cindy', Sales=456.0)]

In [6]:
# Print the raw data. df.na.drop(how="all").show() would print the same rows,
# because no row in this dataset is null in every column (Id is always set).
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# With no arguments drop() behaves like drop(how="any"):
# a row is removed as soon as it contains a single null.
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# thresh=2 keeps only rows with at least two non-null values;
# emp2 has just one (its Id), so it is the only row dropped.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [11]:
# Only look at the Sales column when deciding which rows to drop;
# nulls in Name are ignored.
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [13]:
# Check the inferred column types: the fill() calls that follow only touch
# columns whose type matches the fill value.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [14]:
# A numeric fill value is applied to numeric columns only,
# so just Sales (double) gets filled; the nulls in Name stay.
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [15]:
# A string fill value is applied to string columns only. Id is a string column
# too but has no nulls, so only Name visibly changes; Sales stays null.
df.na.fill("Fill").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| Fill| null|
|emp3| Fill|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [16]:
# Under the programmer's control: subset restricts the fill to the named columns.
df.na.fill("No Name", subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [21]:
from pyspark.sql.functions import mean

# Aggregate Sales to its average (nulls are skipped) and collect the
# one-row result as a list of Row objects.
sales_mean_df = df.agg(mean('Sales'))
mean_data = sales_mean_df.collect()
mean_data

[Row(avg(Sales)=400.5)]

In [22]:
# The collected result is a list holding one Row with one field;
# index twice to pull out the bare number.
mean_val = mean_data[0][0]
mean_val

400.5

In [23]:
# Replace the missing Sales values with the column average computed above.
df.na.fill(mean_val, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

