## PySpark: Handling Missing Values

- Dropping columns
- Dropping rows
- Parameters of the drop functionality (`how`, `thresh`, `subset`)
- Handling missing values by imputing the mean, median, or mode

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [3]:
# Read the CSV using its first line as the header and letting Spark infer the
# column types; the cell displays the resulting DataFrame schema.
spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [4]:
# Read the CSV and print its rows to see where the missing (null) values are.
spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [5]:
# Keep the parsed CSV in a variable so that the following cells can reuse it.
df_pyspark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Print the loaded rows; the null entries are the missing values handled below.
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [7]:
# Drop a column: drop() returns a new DataFrame without 'Name'

(
    df_pyspark
    .drop('Name')
    .show()
)

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [8]:
# drop() did not modify df_pyspark: the 'Name' column is still present.
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [9]:
# Drop rows with missing values

# Called without arguments, na.drop() removes every row that has a Null/NaN in any column
(
    df_pyspark.na
    .drop()
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Limon| 31|        10| 30000|
| Dalya| 30|         8| 25000|
|  Viko| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Mahesh| 21|         1| 15000|
|Harsha| 23|         2| 18000|
+------+---+----------+------+



In [10]:
# how == "all"

# how="all" drops only the rows in which every column value is Null.
# No row of this dataset is entirely Null, so all 9 rows are kept.
df_pyspark.na.drop(how="all").show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [11]:
# how="any": a row is dropped as soon as one of its columns is Null

(
    df_pyspark.na
    .drop(how="any")
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Limon| 31|        10| 30000|
| Dalya| 30|         8| 25000|
|  Viko| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Mahesh| 21|         1| 15000|
|Harsha| 23|         2| 18000|
+------+---+----------+------+



In [12]:
# Threshold

# thresh=2 keeps only the rows that have at least 2 non-Null values; every other row is dropped.
# `how` is left out on purpose: when `thresh` is given it overrides `how`,
# so passing how="any" alongside it had no effect on the result.
df_pyspark.na.drop(thresh=2).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
+-------+----+----------+------+



In [13]:
# thresh=1: a row is kept if it has at least 1 non-Null value, so nothing is dropped here.
# `how` is omitted because `thresh` overrides it; how="any" had no effect.
df_pyspark.na.drop(thresh=1).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [14]:
# thresh=3: rows with fewer than 3 non-Null values are dropped.
# `how` is omitted because `thresh` overrides it; how="any" had no effect.
df_pyspark.na.drop(thresh=3).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Limon| 31|        10| 30000|
| Dalya| 30|         8| 25000|
|  Viko| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Mahesh| 21|         1| 15000|
|Harsha| 23|         2| 18000|
|  null| 34|        10| 38000|
+------+---+----------+------+



In [15]:
# Subset

# Only 'Experience' is checked: every row whose 'Experience' is Null is dropped
(
    df_pyspark.na
    .drop(how="any", subset=['Experience'])
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Limon| 31|        10| 30000|
| Dalya| 30|         8| 25000|
|  Viko| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Mahesh| 21|         1| 15000|
|Harsha| 23|         2| 18000|
|  null| 34|        10| 38000|
+------+---+----------+------+



In [16]:
# Only 'Age' is checked: every row whose 'Age' is Null is dropped
(
    df_pyspark.na
    .drop(how="any", subset=['Age'])
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Limon| 31|        10| 30000|
| Dalya| 30|         8| 25000|
|  Viko| 29|         4| 20000|
|  Paul| 24|         3| 20000|
|Mahesh| 21|         1| 15000|
|Harsha| 23|         2| 18000|
|  null| 34|        10| 38000|
|  null| 36|      null|  null|
+------+---+----------+------+



In [22]:
# Filling the missing values

# A string fill value is applied to string columns only, so just 'Name' is filled;
# the Nulls in the integer columns (Age, Experience, Salary) are left untouched.
df_pyspark.na.fill('Missing Values').show()
# Variant restricted to a subset of columns. Both listed columns are integer
# columns, so a string fill value would leave them unchanged:
# df_pyspark.na.fill('Missing Values', ['Experience', 'Age']).show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Limon|  31|        10| 30000|
|         Dalya|  30|         8| 25000|
|          Viko|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Mahesh|  21|         1| 15000|
|        Harsha|  23|         2| 18000|
|       Beyonce|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



In [23]:
# na.fill() returned a new DataFrame; df_pyspark itself still contains its Nulls.
df_pyspark.show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
|  Limon|  31|        10| 30000|
|  Dalya|  30|         8| 25000|
|   Viko|  29|         4| 20000|
|   Paul|  24|         3| 20000|
| Mahesh|  21|         1| 15000|
| Harsha|  23|         2| 18000|
|Beyonce|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [24]:
from pyspark.ml.feature import Imputer

# Estimator that replaces Nulls with the column mean; every input column gets
# a matching "<column>_imputed" output column.
imputer = Imputer(
    strategy="mean",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=list(map("{}_imputed".format, ['Age', 'Experience', 'Salary'])),
)

In [25]:
# Fit the imputer on the data, append the *_imputed columns, and display the result.
# The imputed values are shown as integers, like the columns they come from.

(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Limon|  31|        10| 30000|         31|                10|         30000|
|  Dalya|  30|         8| 25000|         30|                 8|         25000|
|   Viko|  29|         4| 20000|         29|                 4|         20000|
|   Paul|  24|         3| 20000|         24|                 3|         20000|
| Mahesh|  21|         1| 15000|         21|                 1|         15000|
| Harsha|  23|         2| 18000|         23|                 2|         18000|
|Beyonce|null|      null| 40000|         28|                 5|         40000|
|   null|  34|        10| 38000|         34|                10|         38000|
|   null|  36|      null|  null|         36|                 5|         25750|
+-------+----+----------+------+-----------+------------------+--------------+

In [26]:
from pyspark.ml.feature import Imputer

# Estimator that replaces Nulls with the column median; every input column gets
# a matching "<column>_imputed" output column.
imputer = Imputer(
    strategy="median",
    inputCols=['Age', 'Experience', 'Salary'],
    outputCols=list(map("{}_imputed".format, ['Age', 'Experience', 'Salary'])),
)

In [27]:
# Fit the imputer on the data, append the *_imputed columns, and display the result.

(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+-------+----+----------+------+-----------+------------------+--------------+
|   Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  Limon|  31|        10| 30000|         31|                10|         30000|
|  Dalya|  30|         8| 25000|         30|                 8|         25000|
|   Viko|  29|         4| 20000|         29|                 4|         20000|
|   Paul|  24|         3| 20000|         24|                 3|         20000|
| Mahesh|  21|         1| 15000|         21|                 1|         15000|
| Harsha|  23|         2| 18000|         23|                 2|         18000|
|Beyonce|null|      null| 40000|         29|                 4|         40000|
|   null|  34|        10| 38000|         34|                10|         38000|
|   null|  36|      null|  null|         36|                 4|         20000|
+-------+----+----------+------+-----------+------------------+--------------+