In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Obtain a Spark session running locally with one worker thread; getOrCreate()
# hands back an already-running session if there is one.
spark: SparkSession = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Sample rows: the Python None values show up as null in the DataFrame.
columns = ["name", "state", "gender"]
data = [
    ("James", None, "M"),
    ("Anna", "NY", "F"),
    ("Julia", None, None),
]
df = spark.createDataFrame(data, columns)

# Inspect the schema and the full contents before filtering.
df.printSchema()
df.show()

# Rows where `state` is null, then rows where both `state` and `gender` are
# null. Each condition is given once as a SQL expression string and then as an
# equivalent Column expression.
for null_check in (
    "state is NULL",
    df.state.isNull(),
    col("state").isNull(),
    "state IS NULL AND gender IS NULL",
    df.state.isNull() & df.gender.isNull(),
):
    df.filter(null_check).show()

# Rows where `state` is NOT null, again in string and Column form.
for null_check in (
    "state is not NULL",
    "NOT state is NULL",
    df.state.isNotNull(),
    col("state").isNotNull(),
):
    df.filter(null_check).show()

# Same result via the NA helper: drop rows whose `state` is null.
df.na.drop(subset=["state"]).show()

# Register the DataFrame as a temp view so it can be queried with SQL.
df.createOrReplaceTempView("DATA")
for query in (
    "SELECT * FROM DATA where STATE IS NULL",
    "SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL",
    "SELECT * FROM DATA where STATE IS NOT NULL",
):
    spark.sql(query).show()

root
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
| Anna|   NY|     F|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|James| null|     M|
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+-----+-----+------+
| name|state|gender|
+-----+-----+------+
|Julia| null|  null|
+-----+-----+------+

+----+-----+------+
|name|state|gender|
+----+-----+------+
|Anna|   NY|     F|
+----+-----+------+

+----+-----+------+
|name|state|gender|
+

In [0]:
#First, the code imports the necessary modules: SparkSession from pyspark.sql and the col function from pyspark.sql.functions.

#The code creates a SparkSession named spark with a specified configuration. This session will be used to interact with Spark and perform various operations on the data.

#The code defines the data as a list of tuples, where each tuple represents a row in the DataFrame. The columns list contains the names of the columns in the DataFrame.

#Using the spark.createDataFrame() method, the code creates a DataFrame named df from the data and columns.

#The df.printSchema() method is called to print the schema of the DataFrame, which shows the column names and their data types.

#The df.show() method is called to display the content of the DataFrame. This will print the rows of the DataFrame in a tabular format.

#The code applies filters to the DataFrame using different approaches:

#df.filter("state is NULL"): This filter selects rows where the "state" column is null. The filter condition is specified as a string using SQL syntax.

#df.filter(df.state.isNull()): This filter selects rows where the "state" column is null. It uses the isNull() method on the column object df.state.

#df.filter(col("state").isNull()): This filter selects rows where the "state" column is null. It uses the col() function from pyspark.sql.functions to create a column object and then calls that column's isNull() method.

#Similarly, the code applies filters that combine conditions on more than one column:

#df.filter("state IS NULL AND gender IS NULL"): This filter selects rows where both the "state" and "gender" columns are null.

#df.filter(df.state.isNull() & df.gender.isNull()): This filter selects rows where both the "state" and "gender" columns are null. It uses the & operator to combine the filter conditions.

#The code applies filters to select rows where the "state" column is not null:

#df.filter("state is not NULL"): This filter selects rows where the "state" column is not null.

#df.filter("NOT state is NULL"): This filter selects rows where the "state" column is not null. It uses the NOT operator to negate the filter condition.

#df.filter(df.state.isNotNull()): This filter selects rows where the "state" column is not null. It uses the isNotNull() method on the column object df.state.

#df.filter(col("state").isNotNull()): This filter selects rows where the "state" column is not null. It uses the col() function from pyspark.sql.functions to create a column object and then calls that column's isNotNull() method.

#The df.na.drop(subset=["state"]) method is called to drop rows that have null values in the "state" column.

#The code creates a temporary view named "DATA" for the DataFrame using df.createOrReplaceTempView("DATA"). This allows executing SQL queries on the DataFrame.

#Spark SQL queries are executed using spark.sql():

#spark.sql("SELECT * FROM DATA where STATE IS NULL"): This query selects rows from the "DATA" view where the "STATE" column is null.

#spark.sql("SELECT * FROM DATA where STATE IS NULL AND GENDER IS NULL"): This query selects rows from the "DATA" view where both the "STATE"