### Pyspark Handling Missing Values
* Dropping Columns
* Dropping Rows
* Various Parameter in Dropping functionalities
* Handling Missing values by Mean

In [None]:
import pyspark

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [28]:
df_pyspark = spark.read.csv('spark_tutorial.csv', header=True, inferSchema=True)
spark.read.csv('spark_tutorial.csv', header=True, inferSchema=True).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Harry|  27|         2| 25000|
| Jerry|  27|         4| 40000|
| param|  65|        40|200000|
| Laksh|  54|        30| 30000|
| Babji|  33|        10| 50000|
|  Anni|  32|        10| 55000|
|vishwa|null|      null|105000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [5]:
### dropping the columns:
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  27|         2| 25000|
|  27|         4| 40000|
|  65|        40|200000|
|  54|        30| 30000|
|  33|        10| 50000|
|  32|        10| 55000|
|null|      null|105000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [7]:
#na.drop() will remove all the null values in any of the row.

df_pyspark.na.drop().show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Harry| 27|         2| 25000|
|Jerry| 27|         4| 40000|
|param| 65|        40|200000|
|Laksh| 54|        30| 30000|
|Babji| 33|        10| 50000|
| Anni| 32|        10| 55000|
+-----+---+----------+------+



In [11]:
### how = 'all' will remove the row only if all the col in that row carries null value

df_pyspark.na.drop(how="all").show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Harry|  27|         2| 25000|
| Jerry|  27|         4| 40000|
| param|  65|        40|200000|
| Laksh|  54|        30| 30000|
| Babji|  33|        10| 50000|
|  Anni|  32|        10| 55000|
|vishwa|null|      null|105000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+

None


In [13]:
### how = 'any' will remove the row if any of the col in that row carries null value

df_pyspark.na.drop(how="any").show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Harry| 27|         2| 25000|
|Jerry| 27|         4| 40000|
|param| 65|        40|200000|
|Laksh| 54|        30| 30000|
|Babji| 33|        10| 50000|
| Anni| 32|        10| 55000|
+-----+---+----------+------+



In [15]:
## Threshold value means, atleast mentioned non-null values should present, else will be removed.
# thresh = 1 means atleast 1 non null values should be there to skip.
df_pyspark.na.drop(how="any", thresh = 2).show()
df_pyspark.na.drop(how="any", thresh = 1).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Harry|  27|         2| 25000|
| Jerry|  27|         4| 40000|
| param|  65|        40|200000|
| Laksh|  54|        30| 30000|
| Babji|  33|        10| 50000|
|  Anni|  32|        10| 55000|
|vishwa|null|      null|105000|
|  null|  34|        10| 38000|
+------+----+----------+------+

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Harry|  27|         2| 25000|
| Jerry|  27|         4| 40000|
| param|  65|        40|200000|
| Laksh|  54|        30| 30000|
| Babji|  33|        10| 50000|
|  Anni|  32|        10| 55000|
|vishwa|null|      null|105000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [16]:
## Subset
df_pyspark.na.drop(how = "any", subset = ['Experience']).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Harry| 27|         2| 25000|
|Jerry| 27|         4| 40000|
|param| 65|        40|200000|
|Laksh| 54|        30| 30000|
|Babji| 33|        10| 50000|
| Anni| 32|        10| 55000|
| null| 34|        10| 38000|
+-----+---+----------+------+



In [43]:
### Filling the Missing Value
df_pyspark.na.fill('Missing values', ['Experience','Salary']).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Harry|  27|         2| 25000|
| Jerry|  27|         4| 40000|
| param|  65|        40|200000|
| Laksh|  54|        30| 30000|
| Babji|  33|        10| 50000|
|  Anni|  32|        10| 55000|
|vishwa|null|      null|105000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [49]:
## Taking mean and paste it on the null value

from pyspark.ml.feature import Imputer

# mean
imputer = Imputer(
    inputCols=['Age','Experience','Salary'],
    outputCols=[f"{c}_imputed" for c in ['Age','Experience','Salary']]).setStrategy("mean")

# median
imputer = Imputer(
    inputCols=['Age','Experience','Salary'],
    outputCols=[f"{c}_imputed" for c in ['Age','Experience','Salary']]).setStrategy("median")


In [50]:
imputer.fit(df_pyspark).transform(df_pyspark).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| Harry|  27|         2| 25000|         27|                 2|         25000|
| Jerry|  27|         4| 40000|         27|                 4|         40000|
| param|  65|        40|200000|         65|                40|        200000|
| Laksh|  54|        30| 30000|         54|                30|         30000|
| Babji|  33|        10| 50000|         33|                10|         50000|
|  Anni|  32|        10| 55000|         32|                10|         55000|
|vishwa|null|      null|105000|         33|                10|        105000|
|  null|  34|        10| 38000|         34|                10|         38000|
|  null|  36|      null|  null|         36|                10|         40000|
+------+----+----------+------+-----------+------------------+--