# PySpark: Handling Missing Values

- **Dropping Columns**
- **Dropping Rows**
- **Parameters of the Drop Functions (`how`, `thresh`, `subset`)**
- **Handling Missing values by Mean, Median and Mode**

In [1]:
from pyspark.sql import SparkSession
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Handling Missing Data/Values')
    .getOrCreate()
)

In [2]:
# Echo the SparkSession object as the cell result.
spark

In [3]:

import warnings

# Filter all warnings
warnings.filterwarnings('ignore')


In [4]:
# Load the sample CSV with a header row and let Spark infer the column types.
# The path is a raw string: in a normal literal the backslashes in this Windows
# path form invalid escape sequences ("\A", "\s"), which newer Python versions
# report as a SyntaxWarning. The resulting path value is unchanged.
df_pyspark = spark.read.csv(r"D:\APACHE SPARK\sample2.csv", header=True, inferSchema=True)

In [5]:
# Check the column types that inferSchema=True picked for the loaded file.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [6]:
# Show the loaded rows; missing entries are displayed as NULL.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|Yashwanth|  31|        10| 30000|
|    Kumar|  30|         8| 25000|
|     Ravi|  29|         4| 20000|
|     Teja|  24|         3| 20000|
|     Siva|  21|         1| 15000|
|    Vivek|  23|         2| 18000|
|    Sagar|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [7]:
# Drop a column: the result without 'Name' is only displayed here, not assigned,
# so df_pyspark itself still contains the Name column afterwards.
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|NULL|      NULL| 40000|
|  34|        10| 38000|
|  36|      NULL|  NULL|
+----+----------+------+



In [8]:
# Confirm that df_pyspark is unchanged: the Name column is still present.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|Yashwanth|  31|        10| 30000|
|    Kumar|  30|         8| 25000|
|     Ravi|  29|         4| 20000|
|     Teja|  24|         3| 20000|
|     Siva|  21|         1| 15000|
|    Vivek|  23|         2| 18000|
|    Sagar|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [9]:
# na.drop() with no arguments: only rows without any NULL value remain.
df_pyspark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Yashwanth| 31|        10| 30000|
|    Kumar| 30|         8| 25000|
|     Ravi| 29|         4| 20000|
|     Teja| 24|         3| 20000|
|     Siva| 21|         1| 15000|
|    Vivek| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
# how="any": drop a row if at least one of its values is NULL.
# The result is the same as the argument-free na.drop() call.
df_pyspark.na.drop(how="any").show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Yashwanth| 31|        10| 30000|
|    Kumar| 30|         8| 25000|
|     Ravi| 29|         4| 20000|
|     Teja| 24|         3| 20000|
|     Siva| 21|         1| 15000|
|    Vivek| 23|         2| 18000|
+---------+---+----------+------+



In [11]:
# Threshold: keep only rows that have at least `thresh` non-null values.
# `how` is omitted on purpose: when `thresh` is given it overrides `how`,
# so passing how="any" alongside it had no effect on the result.
df_pyspark.na.drop(thresh=3).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Yashwanth| 31|        10| 30000|
|    Kumar| 30|         8| 25000|
|     Ravi| 29|         4| 20000|
|     Teja| 24|         3| 20000|
|     Siva| 21|         1| 15000|
|    Vivek| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [12]:
# Subset: only the listed columns are checked for NULLs. With subset=['Age'],
# rows whose Age is NULL are dropped; NULLs in other columns do not matter.
df_pyspark.na.drop(how="any",subset=['Age']).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Yashwanth| 31|        10| 30000|
|    Kumar| 30|         8| 25000|
|     Ravi| 29|         4| 20000|
|     Teja| 24|         3| 20000|
|     Siva| 21|         1| 15000|
|    Vivek| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



In [13]:

# Replace NULLs in the Experience column with 0.
# df_pyspark is rebound to the filled frame, so every later cell sees the zeros.
df_pyspark = df_pyspark.na.fill(value=0, subset=["Experience"])
df_pyspark.show()


+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|Yashwanth|  31|        10| 30000|
|    Kumar|  30|         8| 25000|
|     Ravi|  29|         4| 20000|
|     Teja|  24|         3| 20000|
|     Siva|  21|         1| 15000|
|    Vivek|  23|         2| 18000|
|    Sagar|NULL|         0| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|         0|  NULL|
+---------+----+----------+------+



In [14]:
# Replace NULLs in the Age column with 0 (display only; df_pyspark is not reassigned).
df_pyspark.na.fill(value=0, subset=['Age']).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|Yashwanth| 31|        10| 30000|
|    Kumar| 30|         8| 25000|
|     Ravi| 29|         4| 20000|
|     Teja| 24|         3| 20000|
|     Siva| 21|         1| 15000|
|    Vivek| 23|         2| 18000|
|    Sagar|  0|         0| 40000|
|     NULL| 34|        10| 38000|
|     NULL| 36|         0|  NULL|
+---------+---+----------+------+



In [21]:
# Replace NULLs in the Name column with the placeholder "Unknown".
# The result is assigned back to df_pyspark: the cells that follow display
# df_pyspark with "Unknown" names, which only reproduces on a fresh
# Restart & Run All if this fill is actually stored rather than just shown.
df_pyspark = df_pyspark.na.fill("Unknown", ["Name"])
df_pyspark.show()


+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|Yashwanth|  31|        10| 30000|
|    Kumar|  30|         8| 25000|
|     Ravi|  29|         4| 20000|
|     Teja|  24|         3| 20000|
|     Siva|  21|         1| 15000|
|    Vivek|  23|         2| 18000|
|    Sagar|NULL|         0| 40000|
|  Unknown|  34|        10| 38000|
|  Unknown|  36|         0|  NULL|
+---------+----+----------+------+



In [17]:
# Show df_pyspark as it stands after the fill cells.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|Yashwanth|  31|        10| 30000|
|    Kumar|  30|         8| 25000|
|     Ravi|  29|         4| 20000|
|     Teja|  24|         3| 20000|
|     Siva|  21|         1| 15000|
|    Vivek|  23|         2| 18000|
|    Sagar|NULL|         0| 40000|
|  Unknown|  34|        10| 38000|
|  Unknown|  36|         0|  NULL|
+---------+----+----------+------+



In [18]:
# Print the schema of df_pyspark again after the fill cells.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = false)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [19]:
from pyspark.ml.feature import Imputer

# Median imputer for the numeric columns. Each input column gets a matching
# "<column>_imputed" output column, derived from the configured input list.
imputer = Imputer(strategy="median", inputCols=['Age', 'Experience', 'Salary'])
imputer.setOutputCols([f"{column}_imputed" for column in imputer.getInputCols()])

In [20]:
# Fit the imputer on df_pyspark, then display the frame with the
# *_imputed columns appended next to the originals.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|Yashwanth|  31|        10| 30000|         31|                10|         30000|
|    Kumar|  30|         8| 25000|         30|                 8|         25000|
|     Ravi|  29|         4| 20000|         29|                 4|         20000|
|     Teja|  24|         3| 20000|         24|                 3|         20000|
|     Siva|  21|         1| 15000|         21|                 1|         15000|
|    Vivek|  23|         2| 18000|         23|                 2|         18000|
|    Sagar|NULL|         0| 40000|         29|                 0|         40000|
|  Unknown|  34|        10| 38000|         34|                10|         38000|
|  Unknown|  36|         0|  NULL|         36|                 0|         20000|
+---------+----+----------+-