### PySpark: Handling Missing Values
- Dropping Columns
- Dropping Rows
- Various Parameters of the Dropping Functions (`how`, `thresh`, `subset`)
- Handling Missing Values by Mean, Median and Mode

In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('HandlingMissingValues')
    .getOrCreate()
)

In [8]:
# Load the sample CSV: take column names from the header row and let Spark infer column types.
df_pyspark = spark.read.csv(
    'testHandling.csv',
    header=True,
    inferSchema=True,
)

In [9]:
# Print the loaded DataFrame so the null cells are visible before any handling is applied.
df_pyspark.show()

+--------+----+----------+------+
|    name| age|experience|salary|
+--------+----+----------+------+
|    Luiz|  21|         1|  1500|
| Augusto|  48|        24|  2500|
|Peterson|  23|         5|  1900|
|   Brian|  11|      null|  null|
|    null|  35|         5|  2600|
|    null|  52|         0|  null|
|    null|null|      null|  null|
+--------+----+----------+------+



In [10]:
## Drop a column
## drop() returns a new DataFrame without the 'name' column; df_pyspark itself is not modified.
df_pyspark.drop('name').show()

+----+----------+------+
| age|experience|salary|
+----+----------+------+
|  21|         1|  1500|
|  48|        24|  2500|
|  23|         5|  1900|
|  11|      null|  null|
|  35|         5|  2600|
|  52|         0|  null|
|null|      null|  null|
+----+----------+------+



In [12]:
## Drop rows
## how="any" (the default) removes every row that contains at least one null value.
df_pyspark.na.drop(how="any").show()

+--------+---+----------+------+
|    name|age|experience|salary|
+--------+---+----------+------+
|    Luiz| 21|         1|  1500|
| Augusto| 48|        24|  2500|
|Peterson| 23|         5|  1900|
|   Brian| 11|         0|     0|
+--------+---+----------+------+



In [17]:
## how
## how="any" (default): drop a row if at least one of its values is null.
## how="all": drop a row only if every one of its values is null.
df_pyspark.na.drop(how='all').show()

+--------+---+----------+------+
|    name|age|experience|salary|
+--------+---+----------+------+
|    Luiz| 21|         1|  1500|
| Augusto| 48|        24|  2500|
|Peterson| 23|         5|  1900|
|   Brian| 11|         0|     0|
|    null| 35|         5|  2600|
|    null| 52|         0|  null|
+--------+---+----------+------+



In [19]:
## thresh
## thresh=n drops every row that has fewer than n non-null values.
## When thresh is given it overrides `how`, so `how` is not passed here.
## In the example below a row is kept only if at least 2 of its values are non-null.
df_pyspark.na.drop(thresh=2).show()

+--------+---+----------+------+
|    name|age|experience|salary|
+--------+---+----------+------+
|    Luiz| 21|         1|  1500|
| Augusto| 48|        24|  2500|
|Peterson| 23|         5|  1900|
|   Brian| 11|         0|     0|
|    null| 35|         5|  2600|
|    null| 52|         0|  null|
+--------+---+----------+------+



In [30]:
## subset
## subset restricts the null check to the listed columns.
## With how="any" a row is dropped if any subset column is null;
## with how="all" it is dropped only if every subset column is null.
## For a single-column subset such as ['age'] both modes drop exactly the rows where that column is null.
df_pyspark.na.drop(how="all", subset=['age']).show()

+--------+---+----------+------+
|    name|age|experience|salary|
+--------+---+----------+------+
|    Luiz| 21|         1|  1500|
| Augusto| 48|        24|  2500|
|Peterson| 23|         5|  1900|
|   Brian| 11|         0|     0|
|    null| 35|         5|  2600|
|    null| 52|         0|  null|
+--------+---+----------+------+



In [32]:
## Filling the Missing Values
## A string fill value is applied only to string columns; non-string columns listed in
## `subset` (such as the numeric `age`) are silently ignored, so only `name` is listed here.
## To fill columns of different types in one call, pass a dict instead,
## e.g. df_pyspark.na.fill({'name': 'Missing Values', 'age': 0}).
df_pyspark.na.fill('Missing Values', subset=['name']).show()

+--------------+----+----------+------+
|          name| age|experience|salary|
+--------------+----+----------+------+
|          Luiz|  21|         1|  1500|
|       Augusto|  48|        24|  2500|
|      Peterson|  23|         5|  1900|
|         Brian|  11|         0|     0|
|Missing Values|  35|         5|  2600|
|Missing Values|  52|         0|  null|
|Missing Values|null|      null|  null|
+--------------+----+----------+------+



In [33]:
# Display the source DataFrame again: na.fill returned a new DataFrame, so the nulls are still present here.
df_pyspark.show()

+--------+----+----------+------+
|    name| age|experience|salary|
+--------+----+----------+------+
|    Luiz|  21|         1|  1500|
| Augusto|  48|        24|  2500|
|Peterson|  23|         5|  1900|
|   Brian|  11|         0|     0|
|    null|  35|         5|  2600|
|    null|  52|         0|  null|
|    null|null|      null|  null|
+--------+----+----------+------+



In [40]:
from pyspark.ml.feature import Imputer

# Configure an Imputer that replaces nulls in each numeric column with that column's median.
# Each input column gets a matching "<name>_imputed" output column; the originals are left as they are.
# `strategy` may also be 'mean' (the default) or 'mode' -- see the Imputer documentation.
imputer = Imputer(
    strategy='median',
    inputCols=['age', 'experience', 'salary'],
    outputCols=['age_imputed', 'experience_imputed', 'salary_imputed'],
)

In [41]:
## Fit the imputer on the data, then append the *_imputed columns next to the original ones
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+--------+----+----------+------+-----------+------------------+--------------+
|    name| age|experience|salary|age_imputed|experience_imputed|salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|    Luiz|  21|         1|  1500|         21|                 1|          1500|
| Augusto|  48|        24|  2500|         48|                24|          2500|
|Peterson|  23|         5|  1900|         23|                 5|          1900|
|   Brian|  11|         0|     0|         11|                 0|             0|
|    null|  35|         5|  2600|         35|                 5|          2600|
|    null|  52|         0|  null|         52|                 0|          1900|
|    null|null|      null|  null|         23|                 1|          1900|
+--------+----+----------+------+-----------+------------------+--------------+

