In [37]:
from pyspark.sql import SparkSession

## Reuse the active Spark session if one exists; otherwise start a new one
## registered under the application name 'Practise'.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [38]:
## Load test.csv into a DataFrame: the first line supplies the column names,
## and column types are inferred from the data instead of defaulting to string.
df_pyspark = spark.read.csv(
    'test.csv',
    header=True,
    inferSchema=True,
)

In [39]:
## Preview the raw data; note the null entries that the following cells handle.
df_pyspark.show()

+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
| null|  34|        10| 20000|
| null|  36|      null|  null|
+-----+----+----------+------+



In [40]:
## With no arguments, na.drop() removes every row that contains at least one null.
df_pyspark.na.drop().show()

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
+-----+---+----------+------+



In [41]:
## `how` selects the drop rule:
##   how='any' drops a row if ANY of its values is null;
##   how='all' drops a row only if ALL of its values are null.
## This data has no row that is entirely null, so how='all' returns it unchanged.
df_pyspark.na.drop(how='any').show()
df_pyspark.na.drop(how='all').show()

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
+-----+---+----------+------+

+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
| null|  34|        10| 20000|
| null|  36|      null|  null|
+-----+----+----------+------+



In [42]:
## `thresh` is another parameter of drop:
## a row is kept only if it has at least `thresh` non-null values;
## rows with fewer non-null values are removed.
## With thresh=3, the DORK row (2 non-null values) and the last row (1 non-null value) are dropped.

df_pyspark.show()
df_pyspark.na.drop(thresh = 3).show()


+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
| null|  34|        10| 20000|
| null|  36|      null|  null|
+-----+----+----------+------+

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
| null| 34|        10| 20000|
+-----+---+----------+------+



In [43]:
## subset:
## restricts the null check to the listed columns, so only rows with a null
## in EXPERIENCE are removed; nulls in other columns (e.g. NAME) are kept.
df_pyspark.show()
df_pyspark.na.drop(subset=['EXPERIENCE']).show()

+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
| null|  34|        10| 20000|
| null|  36|      null|  null|
+-----+----+----------+------+

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
| null| 34|        10| 20000|
+-----+---+----------+------+



In [47]:
## Filling the missing values:
## nulls in the listed numeric columns are replaced with 0; NAME is not in
## the subset, so its nulls stay as they are.

df_pyspark.na.fill(value=0,subset=['AGE','EXPERIENCE','SALARY']).show()

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
| DORK|  0|         0| 30000|
| null| 34|        10| 20000|
| null| 36|         0|     0|
+-----+---+----------+------+



In [48]:
## Filling missing values with a per-column statistic using the Imputer estimator
## (the cells below use the median first, then the mean).
## Show the original data again for comparison.
df_pyspark.show()

+-----+----+----------+------+
| NAME| AGE|EXPERIENCE|SALARY|
+-----+----+----------+------+
| JON |  31|        10| 30000|
|  VON|  53|        20| 60000|
|  RON|  25|         2| 10000|
|BJORK|  23|         1| 10000|
| DORK|null|      null| 30000|
| null|  34|        10| 20000|
| null|  36|      null|  null|
+-----+----+----------+------+



In [51]:
from pyspark.ml.feature import Imputer

## Median imputer: results go to new "<column>_imputed" columns so the
## original columns stay available for side-by-side comparison.
imputer = Imputer(
    strategy="median",
    inputCols=['AGE', 'EXPERIENCE', 'SALARY'],
    outputCols=[
        f"{col_name}_imputed"
        for col_name in ['AGE', 'EXPERIENCE', 'SALARY']
    ],
)

In [52]:
## fit() derives the per-column fill values from the data; transform() appends the imputed columns.
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-----+----+----------+------+-----------+------------------+--------------+
| NAME| AGE|EXPERIENCE|SALARY|AGE_imputed|EXPERIENCE_imputed|SALARY_imputed|
+-----+----+----------+------+-----------+------------------+--------------+
| JON |  31|        10| 30000|         31|                10|         30000|
|  VON|  53|        20| 60000|         53|                20|         60000|
|  RON|  25|         2| 10000|         25|                 2|         10000|
|BJORK|  23|         1| 10000|         23|                 1|         10000|
| DORK|null|      null| 30000|         31|                10|         30000|
| null|  34|        10| 20000|         34|                10|         20000|
| null|  36|      null|  null|         36|                10|         20000|
+-----+----+----------+------+-----------+------------------+--------------+



In [54]:
## Mean imputer whose output columns have the same names as the input columns,
## so the result replaces the original columns instead of adding new ones.
## NOTE(review): the displayed fill values (33, 8, 26666) look truncated, presumably
## because the columns are integer-typed; cast to double first if fractional means matter.
imputer = Imputer(
    strategy="mean",
    inputCols=['AGE', 'EXPERIENCE', 'SALARY'],
    outputCols=['AGE', 'EXPERIENCE', 'SALARY'],
)

imputer.fit(df_pyspark).transform(df_pyspark).show()

+-----+---+----------+------+
| NAME|AGE|EXPERIENCE|SALARY|
+-----+---+----------+------+
| JON | 31|        10| 30000|
|  VON| 53|        20| 60000|
|  RON| 25|         2| 10000|
|BJORK| 23|         1| 10000|
| DORK| 33|         8| 30000|
| null| 34|        10| 20000|
| null| 36|         8| 26666|
+-----+---+----------+------+

