#### System environment variables

In [0]:
import os 
import sys
# Make the Spark driver and its Python workers use the same interpreter as
# this kernel, so both sides run an identical Python environment.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

#### Creating SparkSession

In [0]:
from pyspark.sql import SparkSession
# Reuse the active session when one already exists; otherwise start a new one
# registered under the application name 'fillna'.
spark = (
    SparkSession.builder
    .appName('fillna')
    .getOrCreate()
)
# Display the Spark version as the cell result.
spark.version

Out[2]: '3.3.2'

#### Read CSV

In [0]:
# Load the Titanic CSV, treating the first row as the header. No schema
# inference is requested, so every column is read as a string.
df1 = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/shared_uploads/komatisatya1729@gmail.com/titanic.csv")
)

In [0]:
# Preview the first 20 rows (the default) without truncating long cell values.
df1.show(n=20, truncate=False)

+-----------+-------------------------------------------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|Name                                                   |Pclass|Sex   |Age |SibSp|Parch|Ticket          |Fare   |Cabin|Embarked|Survived|
+-----------+-------------------------------------------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|1          |Braund, Mr. Owen Harris                                |null  |male  |22  |1    |0    |A/5 21171       |7.25   |null |S       |0       |
|2          |Cumings, Mrs. John Bradley (Florence Briggs Thayer)    |1     |female|38  |1    |0    |PC 17599        |71.2833|C85  |C       |1       |
|3          |Heikkinen, Miss. Laina                                 |3     |female|26  |0    |0    |STON/O2. 3101282|7.925  |null |S       |1       |
|4          |Futrelle, Mrs. Jacques Heath (Lily May Peel)           |1     |female|35  |1    |0    |

In [0]:
# Print column names and types. Every column is a string here because the CSV
# was loaded without schema inference.
df1.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Pclass: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- SibSp: string (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: string (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)
 |-- Survived: string (nullable = true)



#### Replacing nulls with a number (using `fillna`)

In [0]:
# A numeric fill value is only applied to numeric columns; columns of any
# other type are skipped. Every column in df1 is a string, so the nulls
# remain in the output below.
df1.fillna(0).show()

+-----------+--------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|                Name|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Survived|
+-----------+--------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|          1|Braund, Mr. Owen ...|  null|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|       0|
|          2|Cumings, Mrs. Joh...|     1|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|       1|
|          3|Heikkinen, Miss. ...|     3|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|       1|
|          4|Futrelle, Mrs. Ja...|     1|female|  35|    1|    0|          113803|   53.1| C123|       S|       1|
|          5|Allen, Mr. Willia...|     3|  male|  35|    0|    0|          373450|   8.05| null|       S|       0|
|          6|    Moran, Mr. James|     3|  male|null|    0|    0|          33087

#### Replacing nulls with a string (using `na.fill`)

In [0]:
# A string fill value is only applied to string columns.
# `subset` restricts the replacement to the listed columns; nulls in every
# other column are left untouched.
df1.na.fill('y', subset=['Age']).show()

+-----------+--------------------+------+------+---+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|                Name|Pclass|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Survived|
+-----------+--------------------+------+------+---+-----+-----+----------------+-------+-----+--------+--------+
|          1|Braund, Mr. Owen ...|  null|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|       0|
|          2|Cumings, Mrs. Joh...|     1|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|       1|
|          3|Heikkinen, Miss. ...|     3|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|       1|
|          4|Futrelle, Mrs. Ja...|     1|female| 35|    1|    0|          113803|   53.1| C123|       S|       1|
|          5|Allen, Mr. Willia...|     3|  male| 35|    0|    0|          373450|   8.05| null|       S|       0|
|          6|    Moran, Mr. James|     3|  male|  y|    0|    0|          330877| 8.4583

#### Replacing null with a specific aggregate value

In [0]:
# Import the functions module under an alias rather than
# `from pyspark.sql.functions import max`, which would shadow Python's
# built-in max() for every cell that runs after this one.
from pyspark.sql import functions as F

# Pclass is a string column (the CSV was loaded without schema inference), so
# this is a string maximum and the result is a string. It is deliberately left
# uncast: a string fill value is needed to fill the string Pclass column.
# A global aggregation always yields exactly one row, so first() is safe.
max_pclass = df1.agg(F.max("pclass")).first()[0]
print(max_pclass)

3


In [0]:
# Replace missing Pclass values with the previously computed maximum class.
df1.na.fill(max_pclass, subset=["pclass"]).show()

+-----------+--------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|PassengerId|                Name|Pclass|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Survived|
+-----------+--------------------+------+------+----+-----+-----+----------------+-------+-----+--------+--------+
|          1|Braund, Mr. Owen ...|     3|  male|  22|    1|    0|       A/5 21171|   7.25| null|       S|       0|
|          2|Cumings, Mrs. Joh...|     1|female|  38|    1|    0|        PC 17599|71.2833|  C85|       C|       1|
|          3|Heikkinen, Miss. ...|     3|female|  26|    0|    0|STON/O2. 3101282|  7.925| null|       S|       1|
|          4|Futrelle, Mrs. Ja...|     1|female|  35|    1|    0|          113803|   53.1| C123|       S|       1|
|          5|Allen, Mr. Willia...|     3|  male|  35|    0|    0|          373450|   8.05| null|       S|       0|
|          6|    Moran, Mr. James|     3|  male|null|    0|    0|          33087

#### Replacing nulls in multiple columns with different values (chained `na.fill`)

In [0]:
# Each na.fill call targets a single column, so chaining them assigns a
# different placeholder per column: missing cabins become "Unique" and
# missing ages become "Unknown" (previously misspelled as "Unknow").
(
    df1.na.fill("Unique", ['Cabin'])
       .na.fill("Unknown", ['Age'])
       .show()
)

+-----------+--------------------+------+------+------+-----+-----+----------------+-------+------+--------+--------+
|PassengerId|                Name|Pclass|   Sex|   Age|SibSp|Parch|          Ticket|   Fare| Cabin|Embarked|Survived|
+-----------+--------------------+------+------+------+-----+-----+----------------+-------+------+--------+--------+
|          1|Braund, Mr. Owen ...|  null|  male|    22|    1|    0|       A/5 21171|   7.25|Unique|       S|       0|
|          2|Cumings, Mrs. Joh...|     1|female|    38|    1|    0|        PC 17599|71.2833|   C85|       C|       1|
|          3|Heikkinen, Miss. ...|     3|female|    26|    0|    0|STON/O2. 3101282|  7.925|Unique|       S|       1|
|          4|Futrelle, Mrs. Ja...|     1|female|    35|    1|    0|          113803|   53.1|  C123|       S|       1|
|          5|Allen, Mr. Willia...|     3|  male|    35|    0|    0|          373450|   8.05|Unique|       S|       0|
|          6|    Moran, Mr. James|     3|  male|Unknow| 

#### Replacing nulls in multiple columns with different values using a dictionary

In [0]:
# Map each column name to its own replacement value; a single fillna call
# then fills every listed column with its corresponding placeholder.
rep_dict = dict(Age="Unknown", Cabin="Unique")
df1.fillna(value=rep_dict).show()

+-----------+--------------------+------+------+-------+-----+-----+----------------+-------+------+--------+--------+
|PassengerId|                Name|Pclass|   Sex|    Age|SibSp|Parch|          Ticket|   Fare| Cabin|Embarked|Survived|
+-----------+--------------------+------+------+-------+-----+-----+----------------+-------+------+--------+--------+
|          1|Braund, Mr. Owen ...|  null|  male|     22|    1|    0|       A/5 21171|   7.25|Unique|       S|       0|
|          2|Cumings, Mrs. Joh...|     1|female|     38|    1|    0|        PC 17599|71.2833|   C85|       C|       1|
|          3|Heikkinen, Miss. ...|     3|female|     26|    0|    0|STON/O2. 3101282|  7.925|Unique|       S|       1|
|          4|Futrelle, Mrs. Ja...|     1|female|     35|    1|    0|          113803|   53.1|  C123|       S|       1|
|          5|Allen, Mr. Willia...|     3|  male|     35|    0|    0|          373450|   8.05|Unique|       S|       0|
|          6|    Moran, Mr. James|     3|  male|