## PySpark: Handling Missing Values
1. Dropping Columns
2. Dropping Rows
3. Parameters of the Drop Functionality
4. Handling Missing Values with Mean, Median and Mode

# 1. Dropping Columns


In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [2]:
# Load the CSV, using its first line as the column names and letting Spark
# infer each column's type. The options are real booleans rather than the
# strings 'True'.
df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)

In [3]:
# Evaluating the bare name displays the DataFrame's column names and types.
df_pyspark

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [4]:
# Print the loaded rows as a text table, including the rows that hold NULLs.
df_pyspark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| muhammed|  25|         2|  5000|
|    Ahmed|  32|         6|  7000|
|Abdulaziz|  53|        25| 40000|
|   Sherif|  44|        14| 30000|
|  hussien|  46|        12| 25000|
|      Ali|  29|         6| 11000|
|     Zein|  38|         8| 12000|
|     Adam|NULL|      NULL|  NULL|
|     NULL|  36|         6|  NULL|
+---------+----+----------+------+



In [5]:
# With no arguments, every row that contains at least one null is removed.
rows_without_nulls = df_pyspark.na.drop()
rows_without_nulls.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| muhammed| 25|         2|  5000|
|    Ahmed| 32|         6|  7000|
|Abdulaziz| 53|        25| 40000|
|   Sherif| 44|        14| 30000|
|  hussien| 46|        12| 25000|
|      Ali| 29|         6| 11000|
|     Zein| 38|         8| 12000|
+---------+---+----------+------+



# drop() Parameters
----------
### 1. how : str, optional
    'any' or 'all'.
    If 'any', drop a row if it contains any nulls.
    If 'all', drop a row only if all its values are null.
### 2. thresh : int, optional
    Default None.
    If specified, drop rows that have fewer than `thresh` non-null values.
    This overrides the `how` parameter.
### 3. subset : str, tuple or list, optional
    Optional list of column names to consider when checking for nulls.

Returns
-------
:class:`DataFrame`
    A new DataFrame with the rows selected by `how`, `thresh` and `subset` excluded.

In [6]:
# how='any': a row is removed as soon as one of its values is null.
rows_dropped_on_any_null = df_pyspark.na.drop(how='any')
rows_dropped_on_any_null.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| muhammed| 25|         2|  5000|
|    Ahmed| 32|         6|  7000|
|Abdulaziz| 53|        25| 40000|
|   Sherif| 44|        14| 30000|
|  hussien| 46|        12| 25000|
|      Ali| 29|         6| 11000|
|     Zein| 38|         8| 12000|
+---------+---+----------+------+



In [7]:
# how='all': a row is removed only when every one of its values is null.
rows_dropped_on_all_null = df_pyspark.na.drop(how='all')
rows_dropped_on_all_null.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| muhammed|  25|         2|  5000|
|    Ahmed|  32|         6|  7000|
|Abdulaziz|  53|        25| 40000|
|   Sherif|  44|        14| 30000|
|  hussien|  46|        12| 25000|
|      Ali|  29|         6| 11000|
|     Zein|  38|         8| 12000|
|     Adam|NULL|      NULL|  NULL|
|     NULL|  36|         6|  NULL|
+---------+----+----------+------+



In [8]:
# `thresh` overrides `how`, so only `thresh` is passed here: rows with fewer
# than 1 non-null value are dropped, i.e. every row that has at least one
# non-null value is kept.
df_pyspark.na.drop(thresh=1).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| muhammed|  25|         2|  5000|
|    Ahmed|  32|         6|  7000|
|Abdulaziz|  53|        25| 40000|
|   Sherif|  44|        14| 30000|
|  hussien|  46|        12| 25000|
|      Ali|  29|         6| 11000|
|     Zein|  38|         8| 12000|
|     Adam|NULL|      NULL|  NULL|
|     NULL|  36|         6|  NULL|
+---------+----+----------+------+



In [9]:
# Only the Salary column is checked for nulls. The name is spelled exactly as
# in the schema so the call does not depend on case-insensitive column
# resolution.
df_pyspark.na.drop(how='any', subset=['Salary']).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
| muhammed| 25|         2|  5000|
|    Ahmed| 32|         6|  7000|
|Abdulaziz| 53|        25| 40000|
|   Sherif| 44|        14| 30000|
|  hussien| 46|        12| 25000|
|      Ali| 29|         6| 11000|
|     Zein| 38|         8| 12000|
+---------+---+----------+------+



## Filling Missing Values

In [10]:
# Two na.fill alternatives, both left disabled, so df_pyspark still contains
# its NULLs when it is shown here. Per the PySpark docs, a string fill value
# applies only to string columns and a numeric one only to numeric columns.
# df_pyspark = df_pyspark.na.fill('Missing Value')
# df_pyspark = df_pyspark.na.fill(0)
df_pyspark.show()


+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
| muhammed|  25|         2|  5000|
|    Ahmed|  32|         6|  7000|
|Abdulaziz|  53|        25| 40000|
|   Sherif|  44|        14| 30000|
|  hussien|  46|        12| 25000|
|      Ali|  29|         6| 11000|
|     Zein|  38|         8| 12000|
|     Adam|NULL|      NULL|  NULL|
|     NULL|  36|         6|  NULL|
+---------+----+----------+------+



In [11]:
from pyspark.ml.feature import Imputer 

In [12]:
# Columns whose nulls are replaced by the column mean; each result is written
# to a new "<column>_imputed" column, leaving the original column untouched.
numeric_cols = ['Age', 'Experience', 'Salary']
imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=[f"{name}_imputed" for name in numeric_cols],
    strategy="mean",
)

In [14]:
# Fit the imputer on the data, then append the *_imputed columns and print
# the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
| muhammed|  25|         2|  5000|         25|                 2|          5000|
|    Ahmed|  32|         6|  7000|         32|                 6|          7000|
|Abdulaziz|  53|        25| 40000|         53|                25|         40000|
|   Sherif|  44|        14| 30000|         44|                14|         30000|
|  hussien|  46|        12| 25000|         46|                12|         25000|
|      Ali|  29|         6| 11000|         29|                 6|         11000|
|     Zein|  38|         8| 12000|         38|                 8|         12000|
|     Adam|NULL|      NULL|  NULL|         37|                 9|         18571|
|     NULL|  36|         6|  NULL|         36|                 6|         18571|
+---------+----+----------+-