In [87]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise create one named "Handlings".
spark = (
    SparkSession.builder
    .appName("Handlings")
    .getOrCreate()
)

In [88]:
# Load the CSV: the first row supplies the column names and Spark infers the column types.
df_pyspark = spark.read.csv(
    "test2.csv",
    header=True,
    inferSchema=True,
)

# Display the loaded rows, nulls included.
df_pyspark.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  12|         2| 10000|
|   b|  13|         4| 30000|
|   c|  14|         2|  4000|
|   d|  15|         6| 50000|
|   e|  21|      null| 14000|
|   f|  34|         8|  null|
|null|  32|         1|  2400|
|null|null|         5| 13000|
|   i|null|      null|  null|
|null|   2|      null|  3400|
+----+----+----------+------+



In [89]:
# Drop a column. `drop` returns a new DataFrame; `df_pyspark` itself is not modified.
# The name is spelled exactly as in the schema ("Salary"): `drop` silently does
# nothing for a name it cannot match, which is what a lowercase "salary" would
# run into if spark.sql.caseSensitive were enabled.

df_pyspark.drop("Salary").show()

+----+----+----------+
|Name| Age|Experience|
+----+----+----------+
|   a|  12|         2|
|   b|  13|         4|
|   c|  14|         2|
|   d|  15|         6|
|   e|  21|      null|
|   f|  34|         8|
|null|  32|         1|
|null|null|         5|
|   i|null|      null|
|null|   2|      null|
+----+----+----------+



In [90]:
# Show the original DataFrame again: the Salary column is still present because
# the result of the earlier `drop` call was not assigned back to `df_pyspark`.
df_pyspark.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  12|         2| 10000|
|   b|  13|         4| 30000|
|   c|  14|         2|  4000|
|   d|  15|         6| 50000|
|   e|  21|      null| 14000|
|   f|  34|         8|  null|
|null|  32|         1|  2400|
|null|null|         5| 13000|
|   i|null|      null|  null|
|null|   2|      null|  3400|
+----+----+----------+------+



In [91]:
# Drop rows: with no arguments, `na.drop()` removes every row that contains at least one null.

complete_rows = df_pyspark.na.drop()
complete_rows.show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 12|         2| 10000|
|   b| 13|         4| 30000|
|   c| 14|         2|  4000|
|   d| 15|         6| 50000|
+----+---+----------+------+



In [92]:
# how="any" (the default) drops a row as soon as it contains a single null value.
df_pyspark.na.drop(how="any").show()

# how="all" would instead drop a row only when every one of its values is null.

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 12|         2| 10000|
|   b| 13|         4| 30000|
|   c| 14|         2|  4000|
|   d| 15|         6| 50000|
+----+---+----------+------+



In [93]:
# `thresh=N` keeps only the rows that have at least N non-null values.
# `how` is not passed here: when `thresh` is given it overrides `how`, so the
# previous `how="any"` argument had no effect on these calls.

# thresh=2: a row is kept if at least 2 of its values are non-null.
df_pyspark.na.drop(thresh=2).show()

# thresh=3: a row is kept if at least 3 of its values are non-null.
df_pyspark.na.drop(thresh=3).show()

# thresh=1 would remove only the rows that are entirely null.

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  12|         2| 10000|
|   b|  13|         4| 30000|
|   c|  14|         2|  4000|
|   d|  15|         6| 50000|
|   e|  21|      null| 14000|
|   f|  34|         8|  null|
|null|  32|         1|  2400|
|null|null|         5| 13000|
|null|   2|      null|  3400|
+----+----+----------+------+

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 12|         2| 10000|
|   b| 13|         4| 30000|
|   c| 14|         2|  4000|
|   d| 15|         6| 50000|
|   e| 21|      null| 14000|
|   f| 34|         8|  null|
|null| 32|         1|  2400|
+----+---+----------+------+



In [94]:
# Only the Experience column is inspected when deciding which rows to drop.
# The column is named as it appears in the schema instead of relying on
# Spark's default case-insensitive name matching.
df_pyspark.na.drop(how="any", subset=["Experience"]).show()


+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  12|         2| 10000|
|   b|  13|         4| 30000|
|   c|  14|         2|  4000|
|   d|  15|         6| 50000|
|   f|  34|         8|  null|
|null|  32|         1|  2400|
|null|null|         5| 13000|
+----+----+----------+------+



In [95]:
# Filling the missing values: replace nulls in the Age column with 0.
# The column is named as it appears in the schema ("Age") so the fill does not
# depend on Spark's default case-insensitive name matching.

df_pyspark.na.fill(0, subset=["Age"]).show()

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
|   a| 12|         2| 10000|
|   b| 13|         4| 30000|
|   c| 14|         2|  4000|
|   d| 15|         6| 50000|
|   e| 21|      null| 14000|
|   f| 34|         8|  null|
|null| 32|         1|  2400|
|null|  0|         5| 13000|
|   i|  0|      null|  null|
|null|  2|      null|  3400|
+----+---+----------+------+



In [97]:
# The source DataFrame still contains its nulls: `na.fill` returned a new
# DataFrame that was only displayed, not assigned back to `df_pyspark`.
df_pyspark.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
|   a|  12|         2| 10000|
|   b|  13|         4| 30000|
|   c|  14|         2|  4000|
|   d|  15|         6| 50000|
|   e|  21|      null| 14000|
|   f|  34|         8|  null|
|null|  32|         1|  2400|
|null|null|         5| 13000|
|   i|null|      null|  null|
|null|   2|      null|  3400|
+----+----+----------+------+



In [104]:
from pyspark.ml.feature import Imputer

# Numeric columns whose nulls are replaced by the column mean.
numeric_cols = ["Age", "Experience", "Salary"]

# Imputed values are written to new "<lowercased name>_im" columns.
imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=["{}_im".format(name.lower()) for name in numeric_cols],
    strategy="mean",
)


In [105]:
# Fit the imputer on the data, then apply the fitted model to the same
# DataFrame; the result carries the extra *_im columns next to the originals.
# Note: the displayed fill values are whole numbers (age_im shows 17 where the
# mean of the known ages is 17.875), i.e. they follow the integer input columns.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+----+----+----------+------+------+-------------+---------+
|Name| Age|Experience|Salary|age_im|experience_im|salary_im|
+----+----+----------+------+------+-------------+---------+
|   a|  12|         2| 10000|    12|            2|    10000|
|   b|  13|         4| 30000|    13|            4|    30000|
|   c|  14|         2|  4000|    14|            2|     4000|
|   d|  15|         6| 50000|    15|            6|    50000|
|   e|  21|      null| 14000|    21|            4|    14000|
|   f|  34|         8|  null|    34|            8|    15850|
|null|  32|         1|  2400|    32|            1|     2400|
|null|null|         5| 13000|    17|            5|    13000|
|   i|null|      null|  null|    17|            4|    15850|
|null|   2|      null|  3400|     2|            4|     3400|
+----+----+----------+------+------+-------------+---------+

