Import the SparkSession class

In [1]:
from pyspark.sql import SparkSession

Start the Spark Session

In [2]:
# Reuse the active Spark session if there is one, otherwise start a new one.
# The application name labels this notebook's job; it covers missing *data*
# handling (the previous 'Missing Date' label was a typo).
spark = SparkSession.builder.appName('Missing Data').getOrCreate()

Load and display the data

In [3]:
# CSV location, relative to the notebook's working directory.
filePath = 'ContainsNull.csv'

# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to infer each column's type from the data
# instead of reading everything as strings.
df = spark.read.csv(
    filePath,
    header=True,
    inferSchema=True,
)

# Print the loaded rows so the nulls are visible before any cleaning.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



Dropping rows with missing values

In [4]:
# With no arguments, drop() removes every row that contains a null in any column.
rows_without_nulls = df.na.drop()
rows_without_nulls.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



Using a threshold to control which rows are dropped (`thresh=n` keeps only rows with at least n non-null values)

In [5]:
# thresh=2 keeps rows holding at least two non-null values, so only the row
# with a single populated column (emp2) is removed.
rows_min_two_values = df.na.drop(thresh=2)
rows_min_two_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [37]:
# thresh=3 requires three non-null values; with three columns in total, only
# fully populated rows survive.
rows_min_three_values = df.na.drop(thresh=3)
rows_min_three_values.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



Using subset to make explicit which columns are checked for nulls (rows with a null in those columns are dropped)

In [38]:
# Only the Sales column is checked: rows with a null Sales are dropped, while
# nulls in other columns (e.g. Name) do not cause a drop.
rows_with_sales = df.na.drop(subset=['Sales'])
rows_with_sales.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# Only the Name column is checked: rows with a null Name are dropped, and a
# null Sales value on its own is tolerated.
rows_with_name = df.na.drop(subset=['Name'])
rows_with_name.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp4|Cindy|456.0|
+----+-----+-----+



Using the how argument to control the drop

In [39]:
# how='any' drops a row as soon as any one of its columns is null.
rows_dropped_if_any_null = df.na.drop(how='any')
rows_dropped_if_any_null.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [13]:
# how='all' drops a row only when every column is null. Every row here has an
# Id, so nothing is removed.
rows_dropped_if_all_null = df.na.drop(how='all')
rows_dropped_if_all_null.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



Filling in Missing values

In [14]:
# A string fill value replaces nulls in string columns only; the numeric Sales
# column keeps its nulls.
names_filled = df.na.fill('New Values')
names_filled.show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|New Values| null|
|emp3|New Values|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [15]:
# fillna() called directly on the DataFrame behaves like na.fill(): the string
# value fills the null names and leaves the numeric column alone.
names_filled_via_fillna = df.fillna('test new')
names_filled_via_fillna.show()

+----+--------+-----+
|  Id|    Name|Sales|
+----+--------+-----+
|emp1|    John| null|
|emp2|test new| null|
|emp3|test new|345.0|
|emp4|   Cindy|456.0|
+----+--------+-----+



In [16]:
# A numeric fill value replaces nulls in numeric columns only; Name keeps its
# nulls.
sales_zero_filled = df.fillna(0)
sales_zero_filled.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# subset restricts the fill to the listed columns, making it explicit that
# only Name is being filled.
name_column_filled = df.fillna('New Values', subset=['Name'])
name_column_filled.show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|New Values| null|
|emp3|New Values|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



Fill missing values with the mean of the available values

In [32]:
# Aggregate over the whole DataFrame to get the mean of Sales. collect() brings
# the result back to the driver as a list of rows, which the following cells
# index as [0][0] to reach the scalar mean.
Sales_mean = df.agg({'Sales': 'mean'}).collect()

In [35]:
# First row, first column of the collected aggregate: the mean of the non-null
# Sales values.
Sales_mean[0][0]

400.5

In [36]:
# Impute missing Sales with the column mean computed above (first row, first
# column of the collected aggregate). subset is passed as a list, matching the
# other cells in this notebook; only the Sales column is filled, so null names
# are left as they are.
df.fillna(Sales_mean[0][0], subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



THE END