In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start one named 'miss'
session_builder = SparkSession.builder.appName('miss')
spark = session_builder.getOrCreate()

In [2]:
# Load the CSV: the first row supplies the column names and Spark infers each column's type
csv_reader = spark.read
df = csv_reader.csv('ContainsNull.csv', header=True, inferSchema=True)
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [3]:
# With no arguments, na.drop() removes every row that holds at least one null
rows_without_nulls = df.na.drop()
rows_without_nulls.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [4]:
# thresh is a keyword argument of na.drop(): with thresh=2 a row is kept
# only if it has at least two non-null values
at_least_two_values = df.na.drop(thresh=2)
at_least_two_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [5]:
# how='all' drops a row only when every one of its columns is null
not_entirely_null = df.na.drop(how='all')
not_entirely_null.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# subset limits the null check to the listed columns; here only Sales is inspected
sales_present = df.na.drop(subset=['Sales'])
sales_present.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# Replace missing names with a placeholder string; subset restricts the fill to the Name column
names_filled = df.na.fill('No Data', subset=['Name'])
names_filled.show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Data| null|
|emp3|No Data|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [10]:
# Average the Sales column with the mean() aggregate; the null Sales entries
# do not contribute (the result below is the average of the two present values)
from pyspark.sql.functions import mean
sales_mean_df = df.select(mean(df['Sales']))
# collect() brings the single-row result back to the driver as a list of Row objects
mean_val = sales_mean_df.collect()
mean_val

[Row(avg(Sales)=400.5)]

In [14]:
# Unwrap the scalar: take the only Row in the collected list, then its only field
first_row = mean_val[0]
mean_sales = first_row[0]
mean_sales

400.5

In [16]:
# Replace each missing Sales value with the column mean computed earlier
sales_filled = df.na.fill(mean_sales, subset=['Sales'])
sales_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

