### PySpark Handling Missing Values
 - Dropping Cols
 - Dropping Rows
 - Various Parameters of the Dropping Functions (how, thresh, subset)
 - Handling Missing Values by Mean, Median and Mode  

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('PracDF')
    .getOrCreate()
)

In [3]:
# Load the CSV, treating the first line as column names and letting Spark
# infer each column's type from the data.
df_pyspark = spark.read.csv('3_DF.csv', header=True, inferSchema=True)

In [4]:
# Bare expression: the notebook echoes the DataFrame repr (column names and
# inferred types) without printing any rows.
df_pyspark

DataFrame[Name: string, CGPA: double, Experience: int, Salary: int]

In [5]:
# Print the rows; the null entries seen here are what the following cells drop or fill.
df_pyspark.show()


+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
|     null| 4.0|      null|  null|
+---------+----+----------+------+



In [6]:
# With no arguments, drop() keeps only the rows that have no null in any column.
df_pyspark.na.drop().show()


+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
+---------+----+----------+------+



In [7]:
# how="all": a row is dropped only when every column in it is null.
# No row in this data is entirely null, so all seven rows remain.
df_pyspark.na.drop(how="all").show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
|     null| 4.0|      null|  null|
+---------+----+----------+------+



In [8]:
# how="any": a row is dropped as soon as one of its columns is null
# (same result as the no-argument call above).
df_pyspark.na.drop(how="any").show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
+---------+----+----------+------+



In [9]:
## threshold
# thresh=N keeps a row only if it has at least N non-null values; rows with
# fewer non-null values are dropped. When thresh is given it takes precedence
# over `how`, so `how` is omitted here to avoid implying it has any effect.
df_pyspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
+---------+----+----------+------+



In [10]:
# subset: only the listed columns are checked for nulls.
# Here a row is dropped when Experience is null, whatever the other columns hold.
df_pyspark.na.drop(how="any",subset=['Experience']).show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
+---------+----+----------+------+



In [11]:
# Same idea for Name: the two rows with a null Name are removed, while nulls
# in the other columns are kept.
df_pyspark.na.drop(how="any",subset=['Name']).show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|        X|null|         3|  null|
+---------+----+----------+------+



In [12]:
## Filling the missing value
# A string replacement is applied to string columns only: Name is filled,
# while the nulls in the numeric columns are left as they are.
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name|CGPA|Experience|Salary|
+--------------+----+----------+------+
|       Koustav|9.03|         2| 50000|
|       Bishyan|9.16|         1| 55000|
|     Dwaipayan|8.73|         3| 45000|
|        Mainak|8.98|         4| 51000|
|Missing Values|4.56|        10| 62000|
|             X|null|         3|  null|
|Missing Values| 4.0|      null|  null|
+--------------+----+----------+------+



In [13]:
## Filling the missing value (restricted to a subset of columns)
# NOTE: Experience and Salary are integer columns, so the string replacement
# does not apply to them and the output is unchanged from the input.
# Pass a numeric value (e.g. 0) instead to actually fill these columns.
df_pyspark.na.fill('Missing Values',['Experience','Salary']).show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
|     null| 4.0|      null|  null|
+---------+----+----------+------+



In [14]:
# The original DataFrame still contains its nulls: the na.drop()/na.fill()
# calls above returned new DataFrames and were never assigned back.
df_pyspark.show()

+---------+----+----------+------+
|     Name|CGPA|Experience|Salary|
+---------+----+----------+------+
|  Koustav|9.03|         2| 50000|
|  Bishyan|9.16|         1| 55000|
|Dwaipayan|8.73|         3| 45000|
|   Mainak|8.98|         4| 51000|
|     null|4.56|        10| 62000|
|        X|null|         3|  null|
|     null| 4.0|      null|  null|
+---------+----+----------+------+



In [15]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute. The list is named once so the input and output
# column lists cannot drift apart; each input gets a sibling "<col>_imputed".
cols_to_impute = ['CGPA', 'Experience', 'Salary']

# strategy="mean" replaces each null with the mean of that column's
# non-null values.
imputer = Imputer(
    strategy="mean",
    inputCols=cols_to_impute,
    outputCols=[f"{col}_imputed" for col in cols_to_impute],
)

In [16]:
# Add the imputed columns to the DataFrame: fit() learns the fill value for
# each input column, transform() appends the *_imputed columns and leaves the
# original columns untouched.
# Note: the Experience mean (23/6 ≈ 3.83) is shown as 3 — the imputed value
# appears to be cast back to the column's integer type.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+------------+------------------+--------------+
|     Name|CGPA|Experience|Salary|CGPA_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+------------+------------------+--------------+
|  Koustav|9.03|         2| 50000|        9.03|                 2|         50000|
|  Bishyan|9.16|         1| 55000|        9.16|                 1|         55000|
|Dwaipayan|8.73|         3| 45000|        8.73|                 3|         45000|
|   Mainak|8.98|         4| 51000|        8.98|                 4|         51000|
|     null|4.56|        10| 62000|        4.56|                10|         62000|
|        X|null|         3|  null|        7.41|                 3|         52600|
|     null| 4.0|      null|  null|         4.0|                 3|         52600|
+---------+----+----------+------+------------+------------------+--------------+

