In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook, named "Missing_Values".
spark = (
    SparkSession.builder
    .appName("Missing_Values")
    .getOrCreate()
)

24/03/26 09:32:04 WARN Utils: Your hostname, anthony-X570-AORUS-MASTER resolves to a loopback address: 127.0.1.1; using 192.168.1.15 instead (on interface wlp4s0)
24/03/26 09:32:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/26 09:32:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
# Load the CSV: the first line is the header and column types are inferred from the data.
df_pyspark = spark.read.csv(
    "missing_values_data.csv",
    header=True,
    inferSchema=True,
)

In [9]:
# Empty CSV fields should be read as null. If a field shows up as a blank string
# instead, the CSV file contains whitespace between the commas.
# CORRECT: Emma,,0,150000
# INCORRECT (space between commas): Emma, ,0,150000
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Amelia|  24|         1| 30000|
| Benjamin|  55|         9| 25000|
|Charlotte|  82|         3| 20000|
|    David|  87|      NULL| 35000|
|     Emma|NULL|         0| 15000|
| Jennifer|  67|         5| 25000|
|Charlotte|  82|         3| 20000|
|    David|NULL|      NULL| 35000|
|     Emma|  43|         0| 15000|
|    Chase|NULL|         1| 10000|
|     Adam|  34|         1| 10000|
+---------+----+----------+------+



In [10]:
# Drop the 'Name' column. The result is only displayed, not assigned, so
# df_pyspark itself keeps the column (later cells still show it).
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  24|         1| 30000|
|  55|         9| 25000|
|  82|         3| 20000|
|  87|      NULL| 35000|
|NULL|         0| 15000|
|  67|         5| 25000|
|  82|         3| 20000|
|NULL|      NULL| 35000|
|  43|         0| 15000|
|NULL|         1| 10000|
|  34|         1| 10000|
+----+----------+------+



In [11]:
# Drop every row that contains at least one null value
# (na.drop() with no arguments; only fully populated rows remain).
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Amelia| 24|         1| 30000|
| Benjamin| 55|         9| 25000|
|Charlotte| 82|         3| 20000|
| Jennifer| 67|         5| 25000|
|Charlotte| 82|         3| 20000|
|     Emma| 43|         0| 15000|
|     Adam| 34|         1| 10000|
+---------+---+----------+------+



In [13]:
# The `how` parameter selects when a row is dropped.
# how="all": drop a row only if every one of its values is null.
df_pyspark.na.drop(how="all").show()
# how="any": drop a row if at least one of its values is null.
df_pyspark.na.drop(how="any").show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Amelia|  24|         1| 30000|
| Benjamin|  55|         9| 25000|
|Charlotte|  82|         3| 20000|
|    David|  87|      NULL| 35000|
|     Emma|NULL|         0| 15000|
| Jennifer|  67|         5| 25000|
|Charlotte|  82|         3| 20000|
|    David|NULL|      NULL| 35000|
|     Emma|  43|         0| 15000|
|    Chase|NULL|         1| 10000|
|     Adam|  34|         1| 10000|
+---------+----+----------+------+

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Amelia| 24|         1| 30000|
| Benjamin| 55|         9| 25000|
|Charlotte| 82|         3| 20000|
| Jennifer| 67|         5| 25000|
|Charlotte| 82|         3| 20000|
|     Emma| 43|         0| 15000|
|     Adam| 34|         1| 10000|
+---------+---+----------+------+



In [28]:
# Threshold
# Keep only rows that have at least `thresh` non-null values; rows with fewer
# are dropped. `thresh` overrides `how`, so `how` is not passed here.
# The DataFrame has 4 columns, so thresh=4 keeps only fully populated rows.
df_pyspark.na.drop(thresh=4).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Amelia| 24|         1| 30000|
| Benjamin| 55|         9| 25000|
|Charlotte| 82|         3| 20000|
| Jennifer| 67|         5| 25000|
|Charlotte| 82|         3| 20000|
|     Emma| 43|         0| 15000|
|     Adam| 34|         1| 10000|
+---------+---+----------+------+



In [29]:
# Subset
# Drop any rows that have a null value in the specified subset of columns;
# nulls in the other columns (e.g. Age) do not cause a row to be dropped.
df_pyspark.na.drop(how="any", subset=['Experience']).show()


+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|   Amelia|  24|         1| 30000|
| Benjamin|  55|         9| 25000|
|Charlotte|  82|         3| 20000|
|     Emma|NULL|         0| 15000|
| Jennifer|  67|         5| 25000|
|Charlotte|  82|         3| 20000|
|     Emma|  43|         0| 15000|
|    Chase|NULL|         1| 10000|
|     Adam|  34|         1| 10000|
+---------+----+----------+------+



In [37]:
# Fill missing values
# Replace nulls with -1 and display the result. The fill value should match the
# dtype of the columns in the schema (double check that against printSchema()).
df_pyspark.na.fill(-1).show()
# List the column names of the original (unfilled) DataFrame.
print(df_pyspark.columns)

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|   Amelia| 24|         1| 30000|
| Benjamin| 55|         9| 25000|
|Charlotte| 82|         3| 20000|
|    David| 87|        -1| 35000|
|     Emma| -1|         0| 15000|
| Jennifer| 67|         5| 25000|
|Charlotte| 82|         3| 20000|
|    David| -1|        -1| 35000|
|     Emma| 43|         0| 15000|
|    Chase| -1|         1| 10000|
|     Adam| 34|         1| 10000|
+---------+---+----------+------+

['Name', 'Age', 'Experience', 'Salary']


In [38]:
from pyspark.ml.feature import Imputer

# Mean imputation: for every column after the first one (Name), compute the
# column mean and use it to fill that column's nulls. The filled values go into
# new columns named "{input_column}_imputed"; the input columns are left as is.
imputer = Imputer(
    strategy="mean",
    inputCols=df_pyspark.columns[1:],
    outputCols=[f"{name}_imputed" for name in df_pyspark.columns[1:]],
)

In [39]:
# fit() learns the fill values from df_pyspark; transform() appends the
# *_imputed columns; show() prints the resulting DataFrame.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|   Amelia|  24|         1| 30000|         24|                 1|         30000|
| Benjamin|  55|         9| 25000|         55|                 9|         25000|
|Charlotte|  82|         3| 20000|         82|                 3|         20000|
|    David|  87|      NULL| 35000|         87|                 2|         35000|
|     Emma|NULL|         0| 15000|         59|                 0|         15000|
| Jennifer|  67|         5| 25000|         67|                 5|         25000|
|Charlotte|  82|         3| 20000|         82|                 3|         20000|
|    David|NULL|      NULL| 35000|         59|                 2|         35000|
|     Emma|  43|         0| 15000|         43|                 0|         15000|
|    Chase|NULL|         1| 