In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col,array_contains

# Obtain (or reuse) the Spark session used for every DataFrame operation below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (name tuple, list of languages, state, gender).
# Some name parts are empty strings.
arrayStructureData = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Nested struct describing the three parts of a name; each is a nullable string.
name_struct = (
    StructType()
    .add('firstname', StringType(), True)
    .add('middlename', StringType(), True)
    .add('lastname', StringType(), True)
)

# Top-level schema: a nested name struct, an array of strings, and two strings.
# The 'name' field relies on the default nullable=True, as in the list form.
arrayStructureSchema = (
    StructType()
    .add('name', name_struct)
    .add('languages', ArrayType(StringType()), True)
    .add('state', StringType(), True)
    .add('gender', StringType(), True)
)

# Build the DataFrame with the explicit schema, then print its schema and
# all rows without truncating cell contents.
df = spark.createDataFrame(arrayStructureData, arrayStructureSchema)
df.printSchema()
df.show(truncate=False)

# --- Equality on a column accessed as an attribute ---
state_is_oh = df.state == "OH"
df.filter(state_is_oh).show(truncate=False)

# --- Inequality: negate the equality with ~, then use != directly ---
state_is_not_oh = ~(df.state == "OH")
df.filter(state_is_not_oh).show(truncate=False)
df.filter(df.state != "OH").show(truncate=False)

# --- Same equality test, with the column obtained through col() ---
df.filter(col("state") == "OH").show(truncate=False)

# --- Conditions given as SQL expression strings ---
df.filter("gender  == 'M'").show(truncate=False)
df.filter("gender  <> 'M'").show(truncate=False)

# --- Membership in a list of values (IS IN), then its negation (IS NOT IN) ---
li = ["OH", "CA", "DE"]
state_in_list = df.state.isin(li)
df.filter(state_in_list).show()
df.filter(~df.state.isin(li)).show()

# --- Two conditions combined with &; each side must be parenthesized ---
oh_and_male = (df.state == "OH") & (df.gender == "M")
df.filter(oh_and_male).show(truncate=False)

# --- Array column: keep rows whose languages array contains "Java" ---
knows_java = array_contains(df.languages, "Java")
df.filter(knows_java).show(truncate=False)

# --- Nested struct field: compare name.lastname ---
lastname_is_williams = df.name.lastname == "Williams"
df.filter(lastname_is_williams).show(truncate=False)

# --- String matching on state: prefix, suffix, and SQL LIKE pattern ---
df.filter(df.state.startswith("N")).show()
df.filter(df.state.endswith("H")).show()
df.filter(df.state.like("N%")).show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+

+----------------------+------------------+-----+------+
|name                  |langu

In [0]:
#First, the code imports the necessary modules: pyspark, SparkSession from pyspark.sql, StructType, StructField, StringType, IntegerType, ArrayType from pyspark.sql.types, and col, array_contains from pyspark.sql.functions.

#A SparkSession named spark is created using SparkSession.builder. This session will be used to interact with Spark and perform various operations on the data.

#The code defines the data as arrayStructureData, which is a list of tuples representing the rows of the DataFrame. Each tuple consists of the following elements: a nested tuple representing the name (first name, middle name, last name), a list of languages, state, and gender.

#The code defines the schema for the DataFrame using arrayStructureSchema, which is a StructType object. The schema specifies the structure of the DataFrame, including nested fields and their data types.

#The spark.createDataFrame() method is called to create a DataFrame named df from the provided data and schema.

#The df.printSchema() method is called to print the schema of the DataFrame. It displays the column names and their corresponding data types.

#The df.show(truncate=False) method is called to display the content of the DataFrame. This prints the rows of the DataFrame in a tabular format, without truncating the column values.

#Filtering rows where the "state" column is equal to "OH":

#The df.filter(df.state == "OH") method filters the DataFrame and selects rows where the "state" column is equal to "OH".
#The show(truncate=False) method is called to display the filtered DataFrame.
#Filtering rows where the "state" column is not equal to "OH":

#The ~ operator is used to negate the filter condition.
#The df.filter(~(df.state == "OH")) and df.filter(df.state != "OH") methods filter the DataFrame and select rows where the "state" column is not equal to "OH".
#The show(truncate=False) method is called to display the filtered DataFrame.
#Filtering rows using column expressions:

#The col("state") function is used to create a column object representing the "state" column.
#The df.filter(col("state") == "OH") call filters the DataFrame using a Column expression, while df.filter("gender == 'M'") and df.filter("gender <> 'M'") pass the condition as a SQL expression string.
#The show(truncate=False) method is called to display the filtered DataFrame.
#Filtering rows where the "state" column is in a list of values:

#The df.state.isin(li) method filters the DataFrame and selects rows where the "state" column's value is present in the list li.
#The show() method is called to display the filtered DataFrame.
#Filtering rows where the "state" column is not in a list of values:

#The ~ operator is used to negate the filter condition.
#The df.filter(~df.state.isin(li)) method filters the DataFrame and selects rows where the "state" column's value is not present in the list li.
#The show() method is called to display the filtered DataFrame.
#Combining multiple filter conditions using logical AND:

#The & operator is used to combine multiple filter conditions.
#The df.filter((df.state == "OH") & (df.gender == "M")) method filters the DataFrame