In [5]:
import pyspark

In [6]:
from pyspark.sql import SparkSession

In [7]:
# Get the active SparkSession, or create one named "practise" if none exists yet.
spark = (
    SparkSession.builder
    .appName("practise")
    .getOrCreate()
)

In [8]:
# A bare expression on the last line of a cell is displayed as the cell's output.
spark

In [9]:
# Read test1.csv, treating the first line as column names (header=True) and
# letting Spark infer each column's type from the data (inferSchema=True).
df_py = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Print the loaded rows; the last three rows contain null values.
df_py.show()

+------+----+----------+------+
|  name| age|experience|salary|
+------+----+----------+------+
| krish|  31|        10| 30000|
| sudha|  30|         8| 25000|
| sunny|  29|         4| 20000|
|  paul|  24|         3| 20000|
|harsha|  21|         1| 15000|
|shubhi|  23|         2| 18000|
|mahesh|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



Drop rows that contain null values

In [9]:
# With no arguments, na.drop() removes every row that has a null in any column.
rows_without_nulls = df_py.na.drop()
rows_without_nulls.show()

+------+---+----------+------+
|  name|age|experience|salary|
+------+---+----------+------+
| krish| 31|        10| 30000|
| sudha| 30|         8| 25000|
| sunny| 29|         4| 20000|
|  paul| 24|         3| 20000|
|harsha| 21|         1| 15000|
|shubhi| 23|         2| 18000|
+------+---+----------+------+



# How
## any - a row is dropped if at least one of its values is null
## all - a row is dropped only if all of its values are null

In [12]:
# how="all": a row is dropped only when every one of its columns is null.
# No row in this data is entirely null, so all nine rows are kept.
fully_null_dropped = df_py.na.drop(how="all")
fully_null_dropped.show()

+------+----+----------+------+
|  name| age|experience|salary|
+------+----+----------+------+
| krish|  31|        10| 30000|
| sudha|  30|         8| 25000|
| sunny|  29|         4| 20000|
|  paul|  24|         3| 20000|
|harsha|  21|         1| 15000|
|shubhi|  23|         2| 18000|
|mahesh|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [13]:
# how="any": a row is dropped as soon as one of its columns is null
# (this yields the same rows as calling na.drop() with no arguments).
any_null_dropped = df_py.na.drop(how="any")
any_null_dropped.show()

+------+---+----------+------+
|  name|age|experience|salary|
+------+---+----------+------+
| krish| 31|        10| 30000|
| sudha| 30|         8| 25000|
| sunny| 29|         4| 20000|
|  paul| 24|         3| 20000|
|harsha| 21|         1| 15000|
|shubhi| 23|         2| 18000|
+------+---+----------+------+



## Threshold
### thresh=2 -- a row is kept only if it has at least two non-null values

In [19]:
# thresh=2 keeps only rows that have at least two non-null values.
# `how` is omitted on purpose: when `thresh` is given it overrides `how`,
# so passing how="all" alongside it had no effect on the result.
df_py.na.drop(thresh=2).show()

+------+----+----------+------+
|  name| age|experience|salary|
+------+----+----------+------+
| krish|  31|        10| 30000|
| sudha|  30|         8| 25000|
| sunny|  29|         4| 20000|
|  paul|  24|         3| 20000|
|harsha|  21|         1| 15000|
|shubhi|  23|         2| 18000|
|mahesh|null|      null| 40000|
|  null|  34|        10| 38000|
+------+----+----------+------+



# Subset
## drop rows that have a null value in the given column(s)

In [18]:
# subset restricts the null check to the listed columns: rows whose
# 'experience' is null are dropped, nulls in other columns are left alone.
# (With a single-column subset, how="all" and how="any" give the same result.)
experience_present = df_py.na.drop(how="all", subset=['experience'])
experience_present.show()

+------+---+----------+------+
|  name|age|experience|salary|
+------+---+----------+------+
| krish| 31|        10| 30000|
| sudha| 30|         8| 25000|
| sunny| 29|         4| 20000|
|  paul| 24|         3| 20000|
|harsha| 21|         1| 15000|
|shubhi| 23|         2| 18000|
|  null| 34|        10| 38000|
+------+---+----------+------+



### Filling the missing values

In [21]:
# A string fill value only replaces nulls in string columns, so just 'name'
# changes here; the numeric columns keep their nulls.
names_filled = df_py.na.fill('Missing values')
names_filled.show()

+--------------+----+----------+------+
|          name| age|experience|salary|
+--------------+----+----------+------+
|         krish|  31|        10| 30000|
|         sudha|  30|         8| 25000|
|         sunny|  29|         4| 20000|
|          paul|  24|         3| 20000|
|        harsha|  21|         1| 15000|
|        shubhi|  23|         2| 18000|
|        mahesh|null|      null| 40000|
|Missing values|  34|        10| 38000|
|Missing values|  36|      null|  null|
+--------------+----+----------+------+



In [25]:
# Fill only the listed columns.
# NOTE: 'experience' and 'age' are numeric, and a string fill value is ignored
# for non-string columns, so this call changes nothing (see the output below).
# To actually fill them, pass a number instead,
# e.g. df_py.na.fill(0, subset=['experience', 'age']).
df_py.na.fill('missing value', subset=['experience','age']).show()

+------+----+----------+------+
|  name| age|experience|salary|
+------+----+----------+------+
| krish|  31|        10| 30000|
| sudha|  30|         8| 25000|
| sunny|  29|         4| 20000|
|  paul|  24|         3| 20000|
|harsha|  21|         1| 15000|
|shubhi|  23|         2| 18000|
|mahesh|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



In [26]:
# df_py still holds the original data: the na.drop()/na.fill() calls above
# returned new DataFrames and did not modify it.
df_py.show()

+------+----+----------+------+
|  name| age|experience|salary|
+------+----+----------+------+
| krish|  31|        10| 30000|
| sudha|  30|         8| 25000|
| sunny|  29|         4| 20000|
|  paul|  24|         3| 20000|
|harsha|  21|         1| 15000|
|shubhi|  23|         2| 18000|
|mahesh|null|      null| 40000|
|  null|  34|        10| 38000|
|  null|  36|      null|  null|
+------+----+----------+------+



# Imputer
## Create an Imputer estimator
### Use "setStrategy" to choose one of 'mean', 'median' or 'mode'

In [27]:
from pyspark.ml.feature import Imputer

# Numeric columns to impute; each one gets a matching "<name>_imputed" output column.
columns_to_impute = ['age', 'experience', 'salary']
imputed_column_names = ["{}_imputed".format(name) for name in columns_to_impute]

# Nulls in the input columns will be replaced using the "mean" strategy.
imputer = Imputer(
    inputCols=columns_to_impute,
    outputCols=imputed_column_names,
).setStrategy("mean")

### Adding imputation columns
#### Null values are replaced by the mean of their column (shown as whole numbers in the output below, e.g. the mean age 28.5 appears as 28)

In [29]:
# fit() learns the replacement value for each input column from df_py;
# transform() then appends the *_imputed columns with the nulls filled in.
imputer_model = imputer.fit(df_py)
imputer_model.transform(df_py).show()

+------+----+----------+------+-----------+------------------+--------------+
|  name| age|experience|salary|age_imputed|experience_imputed|salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| krish|  31|        10| 30000|         31|                10|         30000|
| sudha|  30|         8| 25000|         30|                 8|         25000|
| sunny|  29|         4| 20000|         29|                 4|         20000|
|  paul|  24|         3| 20000|         24|                 3|         20000|
|harsha|  21|         1| 15000|         21|                 1|         15000|
|shubhi|  23|         2| 18000|         23|                 2|         18000|
|mahesh|null|      null| 40000|         28|                 5|         40000|
|  null|  34|        10| 38000|         34|                10|         38000|
|  null|  36|      null|  null|         36|                 5|         25750|
+------+----+----------+------+-----------+------------------+--