In [0]:
import os
import sys
# Point both PySpark interpreter variables at the Python executable running this kernel
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [0]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start one named "PythonCollect"
spark = (
    SparkSession.builder
    .appName("PythonCollect")
    .getOrCreate()
)

In [0]:
# Build a small department DataFrame from in-memory (name, id) tuples
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
columns = ["Dept_Name", "Dept_Id"]
deptDF = spark.createDataFrame(data=dept, schema=columns)

In [0]:
# Display the department DataFrame as a text table
deptDF.show()

+---------+-------+
|Dept_Name|Dept_Id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



#### Collect() function

In [0]:
# collect() returns all rows of the DataFrame as a Python list of Row objects;
# the list is kept in dataCollect so the next cell can iterate over it
dataCollect=deptDF.collect()
print(dataCollect)

[Row(Dept_Name='Finance', Dept_Id=10), Row(Dept_Name='Marketing', Dept_Id=20), Row(Dept_Name='Sales', Dept_Id=30), Row(Dept_Name='IT', Dept_Id=40)]


In [0]:
# Walk the collected rows and print id then name; each Row supports lookup by column name
for row in dataCollect:
    dept_id, dept_name = row["Dept_Id"], row["Dept_Name"]
    print(dept_id, dept_name)

10 Finance
20 Marketing
30 Sales
40 IT


In [0]:
# NOTE(review): each [:] only takes a shallow copy of the whole list, so this
# displays the same rows as a plain deptDF.collect(); element access would use
# integer indexes instead (e.g. [0][0]) — confirm which was intended
deptDF.collect()[:][:]

Out[11]: [Row(Dept_Name='Finance', Dept_Id=10),
 Row(Dept_Name='Marketing', Dept_Id=20),
 Row(Dept_Name='Sales', Dept_Id=30),
 Row(Dept_Name='IT', Dept_Id=40)]

In [0]:
# As the last expression in the cell, the collected list is displayed via its repr
deptDF.collect()

Out[12]: [Row(Dept_Name='Finance', Dept_Id=10),
 Row(Dept_Name='Marketing', Dept_Id=20),
 Row(Dept_Name='Sales', Dept_Id=30),
 Row(Dept_Name='IT', Dept_Id=40)]

In [0]:
# select("*") keeps every column, so this collects the same rows as deptDF.collect()
dataCollect2=deptDF.select("*").collect()
print(dataCollect2)

[Row(Dept_Name='Finance', Dept_Id=10), Row(Dept_Name='Marketing', Dept_Id=20), Row(Dept_Name='Sales', Dept_Id=30), Row(Dept_Name='IT', Dept_Id=40)]


#### Filter() Function

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

# Sample people data: (name struct, known languages, state, gender)
data = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]

# Explicit schema: a nested struct for the name and an array of strings for the languages
schema = StructType([
    StructField(
        "name",
        StructType([
            StructField("firstname", StringType(), nullable=True),
            StructField("middlename", StringType(), nullable=True),
            StructField("lastname", StringType(), nullable=True),
        ]),
        nullable=True,
    ),
    StructField("languages", ArrayType(StringType()), nullable=True),
    StructField("state", StringType(), nullable=True),
    StructField("gender", StringType(), nullable=True),
])

df = spark.createDataFrame(data=data, schema=schema)

In [0]:
# Print the column tree, including the nested name struct and the languages array
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)



In [0]:
# truncate=False prints every cell in full instead of shortening long values
df.show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY   |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY   |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY   |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



#### Equals and Not Equals Condition

In [0]:
# Equality filter using attribute-style column access (df.state)
df.filter(df.state == 'OH').show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [0]:
# Same equality filter, with the column looked up by name via df['state']
df.filter(df['state']=='OH').show(truncate=False)

+----------------------+------------------+-----+------+
|name                  |languages         |state|gender|
+----------------------+------------------+-----+------+
|{James, , Smith}      |[Java, Scala, C++]|OH   |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH   |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH   |M     |
+----------------------+------------------+-----+------+



In [0]:
# Not-equals filter: keep the rows whose state is anything other than 'OH'
df.filter(df.state!='OH').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [0]:
# Equivalent negation: ~ inverts the boolean column produced by the == comparison
df.filter(~(df.state=='OH')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [0]:
# col() refers to a column by name without going through the DataFrame object
from pyspark.sql.functions import col
df.filter(col('state')=='NY').show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [0]:
# col() also accepts a dotted path, here reaching the lastname field inside the name struct
# NOTE(review): col is already imported by the previous cell; this import is redundant
from pyspark.sql.functions import col
df.filter(col('name.lastname') == 'Williams').show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
| {Julia, , Williams}|[CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|[Python, VB]|   OH|     M|
+--------------------+------------+-----+------+



#### DataFrame filter using SQL expressions

In [0]:
# The condition can also be passed as a SQL expression string
df.filter("gender='M'").show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# <> is the SQL not-equal operator: keep rows whose gender is not 'M'
df.filter("gender<>'M'").show()

+-------------------+------------------+-----+------+
|               name|         languages|state|gender|
+-------------------+------------------+-----+------+
|     {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|   OH|     F|
+-------------------+------------------+-----+------+



#### Select and filter together

In [0]:
# Using select and filter together in a query.
# The filter runs first because it needs `state`, which the select does not keep;
# filtering after the select only works when Spark can pull the dropped column
# back in from the parent DataFrame. The displayed rows are the same either way.
(
    df.filter(df['state'] == 'OH')
      .select("name.firstname", "name.lastname", "languages")
      .show(truncate=False)
)

+---------+--------+------------------+
|firstname|lastname|languages         |
+---------+--------+------------------+
|James    |Smith   |[Java, Scala, C++]|
|Julia    |Williams|[CSharp, VB]      |
|Mike     |Williams|[Python, VB]      |
+---------+--------+------------------+



#### Filter with multiple conditions

In [0]:
# Combine conditions with & (AND); each comparison is parenthesised because
# & binds more tightly than == in Python
df.filter((df.state=='OH')& (df.gender=='F')).show()

+-------------------+------------+-----+------+
|               name|   languages|state|gender|
+-------------------+------------+-----+------+
|{Julia, , Williams}|[CSharp, VB]|   OH|     F|
+-------------------+------------+-----+------+



#### Filter based on list values

In [0]:
# isin() keeps rows whose state matches any value in the list;
# the list l is reused by the negated filter in the next cell
l=["OA","LI","OH"]
df.filter(df.state.isin(l)).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# ~ negates isin(): keep rows whose state is not in l (defined in the previous cell)
df.filter(~(df.state.isin(l))).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



#### Filters based on startswith, endswith, and contains

In [0]:
# startswith(): keep rows whose state begins with 'O'
df.filter(df.state.startswith('O')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    {James, , Smith}|[Java, Scala, C++]|   OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|   OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



In [0]:
# endswith(): keep rows whose state ends with 'Y'
df.filter(df.state.endswith('Y')).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|   NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [0]:
# contains(): keep rows whose first name (nested field name.firstname) includes the substring 'n'
df.filter(df.name.firstname.contains('n')).show()

+------------------+------------------+-----+------+
|              name|         languages|state|gender|
+------------------+------------------+-----+------+
|    {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
|{Jen, Mary, Brown}|      [CSharp, VB]|   NY|     M|
+------------------+------------------+-----+------+



#### Filters using like and rlike

In [0]:
# Second sample frame for the pattern-matching filters: (id, name) pairs
data2 = [(2, "Michael Raj"), (3, "Santhanam"), (4, "Rolex"), (5, "Leo")]

# Only column names are supplied here; the column types come from the data itself
schema = ["id", "name"]

df2 = spark.createDataFrame(data2, schema)

In [0]:
# Show the resulting column types (id comes out as long, name as string)
df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [0]:
# like() does case-sensitive SQL pattern matching; 'M%' matches names starting with a capital M
df2.filter(df2.name.like('M%')).show()

+---+-----------+
| id|       name|
+---+-----------+
|  2|Michael Raj|
+---+-----------+



In [0]:
# ilike() is the case-insensitive variant of like(); '%m%' matches names containing m or M
df2.filter(df2.name.ilike('%m%')).show()

+---+-----------+
| id|       name|
+---+-----------+
|  2|Michael Raj|
|  3|  Santhanam|
+---+-----------+



In [0]:
# rlike() matches against a regular expression; 'x$' keeps names ending in x
df2.filter(df2.name.rlike('x$')).show()

+---+-----+
| id| name|
+---+-----+
|  4|Rolex|
+---+-----+



#### Filter on array Type Column

In [0]:
# array_contains() checks whether an array column holds the given element
from pyspark.sql.functions import array_contains
df.filter(array_contains(df.languages, 'C++')).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+



In [0]:
# To require several elements at once, combine one array_contains() per element with & (AND)
df.filter(array_contains(df.languages, 'C++') & array_contains(df.languages, 'Scala')).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|{James, , Smith}|[Java, Scala, C++]|   OH|     M|
+----------------+------------------+-----+------+

