## PySpark: Handling Missing Values

- Dropping Columns
- Dropping Rows
- Various parameters of the drop functions (`how`, `thresh`, `subset`)
- Handling missing values by mean, median, and mode


In [1]:
from pyspark.sql import SparkSession


In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
ps = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook displays the session object's representation.
ps

In [29]:
# Load spark.csv, treating the first line as the header and letting Spark infer
# each column's type from the data.
df = ps.read.csv('spark.csv', header=True, inferSchema=True)
df.show()

+-----+-------+----+------+
|Sr.No|   Name| Age|Salary|
+-----+-------+----+------+
|    1|Kaushal|  26| 42000|
|    2| Satyam|  24| 25000|
|    3| Shivam|  26|  null|
|    4|Ranjeet|  25| 40000|
|    5|Ghanavi|null|  null|
+-----+-------+----+------+



In [5]:
# Print each column's name, inferred type and nullability for the loaded frame.
df.printSchema()

root
 |-- Sr.No: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [19]:
# Rename the "Sr.No" column to "Index" (nothing is dropped here). The result is
# bound back to `df`, so cells run after this one see the renamed column.
df = df.withColumnRenamed("Sr.No","Index")
df.show()

+-----+-------+---+------+
|Index|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    3| Shivam| 26|    10|
|    4|Ranjeet| 25| 40000|
|    5|Ghanavi| 10|    10|
+-----+-------+---+------+



In [7]:
# how="all": drop a row only when every one of its columns is null.
# No row in this data is entirely null, so all five rows are kept.
df.na.drop(how="all").show()

+-----+-------+----+------+
|Index|   Name| Age|Salary|
+-----+-------+----+------+
|    1|Kaushal|  26| 42000|
|    2| Satyam|  24| 25000|
|    3| Shivam|  26|  null|
|    4|Ranjeet|  25| 40000|
|    5|Ghanavi|null|  null|
+-----+-------+----+------+



In [8]:
# how="any": drop a row as soon as at least one of its columns is null.
df.na.drop(how="any").show()

+-----+-------+---+------+
|Index|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    4|Ranjeet| 25| 40000|
+-----+-------+---+------+



In [9]:
# thresh=3: keep a row only if it has at least 3 non-null values; rows with fewer
# are dropped. When `thresh` is given it overrides `how`, so `how` is left out
# here rather than passing a value that would be silently ignored.
df.na.drop(thresh=3).show()

+-----+-------+---+------+
|Index|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    3| Shivam| 26|  null|
|    4|Ranjeet| 25| 40000|
+-----+-------+---+------+



In [18]:
# subset=['Age']: only the Age column is checked, so rows with a null Age are
# dropped while nulls in any other column are ignored.
df.na.drop(how="any",subset=['Age']).show()

+-----+-------+---+------+
|Index|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    3| Shivam| 26|    10|
|    4|Ranjeet| 25| 40000|
|    5|Ghanavi| 10|    10|
+-----+-------+---+------+



In [37]:
# Fill the missing values: replace nulls in the Salary and Age columns with 22.
# The column names are spelled exactly as in the schema ('Age', not 'age') so the
# call does not rely on Spark's case-insensitive column resolution.
# fill() returns a new DataFrame; `df` itself is left unchanged.
df.na.fill(22,['Salary','Age']).show()

+-----+-------+---+------+
|Sr.No|   Name|Age|Salary|
+-----+-------+---+------+
|    1|Kaushal| 26| 42000|
|    2| Satyam| 24| 25000|
|    3| Shivam| 26|    22|
|    4|Ranjeet| 25| 40000|
|    5|Ghanavi| 22|    22|
+-----+-------+---+------+



In [41]:
from pyspark.ml.feature import Imputer

# Columns to impute, listed once so inputCols and outputCols cannot drift apart.
impute_cols = ['Age','Salary']

# Mean imputation: nulls in each input column are replaced by that column's mean.
# The result goes to a new "<column>_imputed" column; the originals are untouched.
imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
    strategy='mean',
)

In [42]:
# Fit the imputer on df to compute the per-column fill values, then apply it to
# the same frame; the *_imputed columns are added next to the original columns.
imputer.fit(df).transform(df).show()

+-----+-------+----+------+-----------+--------------+
|Sr.No|   Name| Age|Salary|Age_imputed|Salary_imputed|
+-----+-------+----+------+-----------+--------------+
|    1|Kaushal|  26| 42000|         26|         42000|
|    2| Satyam|  24| 25000|         24|         25000|
|    3| Shivam|  26|  null|         26|         35666|
|    4|Ranjeet|  25| 40000|         25|         40000|
|    5|Ghanavi|null|  null|         25|         35666|
+-----+-------+----+------+-----------+--------------+

