# Part 3: Learning How to Account for Missing Data
## The following will be covered:
* Dropping Columns
* Dropping Rows
* Parameters of the Drop Functions (`how`, `thresh`, `subset`)
* Handling Missing values by Mean, Median, and Mode

In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [21]:
# Load the sample CSV. header=True takes the column names from the first line;
# inferSchema=True lets Spark infer each column's type instead of reading
# every column as a string.
df_pyspark = spark.read.csv(
    'pysparktest.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bob|null|         4| 12345|
|Jeremy|  40|         8| 69000|
|Joseph|  23|         4|  null|
|  Mary|  24|      null|100000|
|  null|null|      null|  null|
|  null|  34|         2| 40000|
|  null| 121|      null| 90000|
+------+----+----------+------+



In [9]:
# Drop a column: drop() returns a new DataFrame without 'Name';
# df_pyspark itself is not modified.
(
    df_pyspark
    .drop('Name')
    .show()
)

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  23|         5| 80000|
|  27|         7| 65000|
|null|         4| 12345|
|  40|         8| 69000|
|  23|         4|  null|
|  24|      null|100000|
+----+----------+------+



In [14]:
# With no arguments, na.drop() removes every row that contains at least one
# null value, in any column.
(
    df_pyspark
    .na.drop()
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Jayden| 23|         5| 80000|
| Shawn| 27|         7| 65000|
|Jeremy| 40|         8| 69000|
+------+---+----------+------+



In [15]:
# how='any' (the default): drop a row if ANY of its values is null.
# Without .show(), the cell displays only the resulting DataFrame's schema.
(
    df_pyspark
    .na.drop(how='any')
)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [16]:
# how='all': drop a row only if ALL of its values are null.
# Without .show(), the cell displays only the resulting DataFrame's schema.
(
    df_pyspark
    .na.drop(how='all')
)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [24]:
# thresh=N keeps only the rows that have at least N non-null values.
# `how` is ignored whenever `thresh` is supplied, so it is not passed here.
# With thresh=2, only the all-null row is removed (it has 0 non-null values).
df_pyspark.na.drop(thresh=2).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bob|null|         4| 12345|
|Jeremy|  40|         8| 69000|
|Joseph|  23|         4|  null|
|  Mary|  24|      null|100000|
|  null|  34|         2| 40000|
|  null| 121|      null| 90000|
+------+----+----------+------+



In [26]:
# subset: only the listed columns are checked for nulls, so rows with a
# missing 'Experience' are removed while nulls in other columns are kept.
df_pyspark.na.drop(how='any', subset=['Experience']).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bob|null|         4| 12345|
|Jeremy|  40|         8| 69000|
|Joseph|  23|         4|  null|
|  null|  34|         2| 40000|
+------+----+----------+------+



In [50]:
# Fill missing values. A replacement is applied only to columns whose type
# matches the type of the fill value.

# A string value fills string columns only ('Name' here).
df_pyspark.na.fill('Missing').show()

# An integer value fills the integer columns; 'Name' keeps its nulls.
df_pyspark.na.fill(0).show()

# subset restricts the fill to the named columns.
df_pyspark.na.fill(0, subset=['Experience', 'Salary']).show()

+-------+----+----------+------+
|   Name| Age|Experience|Salary|
+-------+----+----------+------+
| Jayden|  23|         5| 80000|
|  Shawn|  27|         7| 65000|
|    Bob|null|         4| 12345|
| Jeremy|  40|         8| 69000|
| Joseph|  23|         4|  null|
|   Mary|  24|      null|100000|
|Missing|null|      null|  null|
|Missing|  34|         2| 40000|
|Missing| 121|      null| 90000|
+-------+----+----------+------+

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Jayden| 23|         5| 80000|
| Shawn| 27|         7| 65000|
|   Bob|  0|         4| 12345|
|Jeremy| 40|         8| 69000|
|Joseph| 23|         4|     0|
|  Mary| 24|         0|100000|
|  null|  0|         0|     0|
|  null| 34|         2| 40000|
|  null|121|         0| 90000|
+------+---+----------+------+

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|Jayden|  23|         5| 80000|
| Shawn|  27|         7| 65000|
|   Bo

In [55]:
# Use an Imputer to replace nulls with a per-column statistic such as the
# mean or the median.
from pyspark.ml.feature import Imputer

# Declare the columns once and derive the output names from them, so the
# casing of each '<column>_imputed' name always matches its input column.
impute_cols = ['Age', 'Experience', 'Salary']

imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
).setStrategy("mean")


In [56]:
# fit() computes the fill value for each input column; transform() then
# appends the *_imputed columns with nulls replaced by that value.
(
    imputer
    .fit(df_pyspark)
    .transform(df_pyspark)
    .show()
)

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
|Jayden|  23|         5| 80000|         23|                 5|         80000|
| Shawn|  27|         7| 65000|         27|                 7|         65000|
|   Bob|null|         4| 12345|         41|                 4|         12345|
|Jeremy|  40|         8| 69000|         40|                 8|         69000|
|Joseph|  23|         4|  null|         23|                 4|         65192|
|  Mary|  24|      null|100000|         24|                 5|        100000|
|  null|null|      null|  null|         41|                 5|         65192|
|  null|  34|         2| 40000|         34|                 2|         40000|
|  null| 121|      null| 90000|        121|                 5|         90000|
+------+----+----------+------+-----------+------------------+--