# Missing Data

The three options for handling missing data:
- Keep the missing data points as just nulls.
- Drop the missing data points (which removes the entire row).
- Fill in the missing data with some other value.

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session named "miss", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("miss")
    .getOrCreate()
)

In [4]:
# Load the CSV: the first line supplies the column names, and the column
# types are inferred from the data.
df = spark.read.csv(
    "ContainsNull.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Display the loaded rows; missing entries are rendered as null.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# how="any" (the default) drops every row that contains at least one null.
df.na.drop(how="any").show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [9]:
# thresh is the minimum number of NON-null values a row needs to be kept:
# with thresh=2, rows that have fewer than 2 non-null values are dropped.
(
    df.na
    .drop(thresh=2)
    .show()
)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [10]:
# how="all" removes a row only when every one of its columns is null.
# Every row here still has an Id, so nothing is dropped.
(
    df.na
    .drop(how="all")
    .show()
)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [12]:
# subset is an optional list of columns to consider for the null check.
# Here a row is dropped only when its Sales value is missing.
df.na.drop(
    subset=["Sales"],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [14]:
# Print each column's name, data type and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [15]:
# A string fill value is applied only to string columns: the null Name
# entries are replaced while the numeric Sales column keeps its nulls.
(
    df.na
    .fill("FILL VALUE")
    .show()
)

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [16]:
# A numeric fill value is applied only to numeric columns: the null Sales
# entries become 0.0 while the string Name column keeps its nulls.
df.na.fill(value=0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# Name the target column explicitly with subset. This is the preferred
# convention: it makes clear exactly which column is being filled.
df.na.fill(
    "No Name",
    subset=["Name"],
).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [18]:
# Next, fill the missing values with the column's average.
from pyspark.sql.functions import mean

In [19]:
# collect() returns the aggregate as a list of Row objects, so the value can
# be used from Python instead of only being displayed.
mean_val = (
    df.select(mean(df["Sales"]))
    .collect()
)

In [20]:
# Show the collected result (a list of Row objects).
mean_val

[Row(avg(Sales)=400.5)]

In [22]:
# The first Row holds a single field, the average itself; unpack it.
(mean_sales,) = mean_val[0]

In [23]:
# Replace the missing Sales entries with the previously computed mean_sales.
df.na.fill(mean_sales, subset=["Sales"]).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [25]:
# The same mean fill written as a single expression, with no intermediate
# variables: compute the Sales average, pull the scalar out of the collected
# Row, and use it as the fill value.
# subset is passed as a list, matching the other drop/fill calls in this notebook.
df.na.fill(
    df.select(mean(df["Sales"])).collect()[0][0],
    subset=["Sales"],
).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

