<a href="https://colab.research.google.com/github/Matteo-Artuso/pyspark_exemple/blob/main/2_spark_data_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 46.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=1ce40a4713703bf05aca5cf5e7046e6e8658f8eb93ee261d26f93c5b83cc9b76
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


<h4>Operations</h4>
<p>Spark supports two different types of operations</p>
<ul>
    <li><b>Transformations</b> on RDDs return another RDD as a result (e.g., filter()). They are lazy operations: Spark only records them and does not compute anything until an action needs the result.</li>
    <li><b>Actions</b> trigger the actual computation and return values from RDDs (e.g., collect(), count()).</li>
</ul>

In [3]:
# Import the basic spark library
from pyspark.sql import SparkSession

# Entry point to the PySpark application: a session running on the "local"
# master; getOrCreate() reuses an already active session if there is one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("MyFirstSparkApplication")
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType, ArrayType

# Create the schema: one StructField(name, type, nullable) per column
schema = StructType(
    [
        StructField("Pizza Name", StringType(), True),
        StructField("Price", FloatType(), True),
        StructField("Ingredients", ArrayType(StringType()), True),
    ]
)

# One tuple per row, in the same column order as the schema
df_data = [
    ("Margherita", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Basil"]),
    ("Calzone", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Diavola", 5.95, ["Tomato Sauce", "Mozzarella Cheese", "Spicy Salame"]),
    ("Prosciutto", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Prosciutto Cotto"]),
    ("Speck & Brie", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Speck", "Brie"]),
    ("Tonno & Cipolle", 7.95, ["Tomato Sauce", "Mozzarella Cheese", "Tuna", "Onions"]),
    ("Fries", 3.95, ["Potatoes"]),
]

# Build the DataFrame, then display its schema and its full (untruncated) content
df = spark.createDataFrame(df_data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- Pizza Name: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Ingredients: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Margherita     |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola        |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
|Fries          |3.95 |[Potatoes]                                         |
+---------------+-----+---------------------------------------------------+

<h4>Filtering operations (i.e., WHERE conditions)</h4>

In [5]:
# Equality filter: keep only the rows whose Price is 7.95
# NOTE(review): Price is a FloatType column and the literal is given as a
# string; comparing against the Python float 7.95 may not match because of
# 32-bit rounding -- confirm before changing the literal's type.
price_is_795 = df.Price == "7.95"
df.filter(price_is_795).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [6]:
# Inequality filter: keep only the rows whose Price differs from 7.95
price_is_not_795 = df.Price != "7.95"
df.filter(price_is_not_795).show(truncate=False)

+----------+-----+-----------------------------------------------+
|Pizza Name|Price|Ingredients                                    |
+----------+-----+-----------------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]       |
|Diavola   |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]|
|Fries     |3.95 |[Potatoes]                                     |
+----------+-----+-----------------------------------------------+



In [7]:
# Same equality filter, but the column is referenced by name through col()
# instead of through an attribute of the DataFrame
from pyspark.sql.functions import col

price_column = col("Price")
df.filter(price_column == "7.95").show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [8]:
# The condition can also be passed to filter() as a SQL expression string
sql_condition = "Price == '7.95'"
df.filter(sql_condition).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+



In [9]:
# Filtering with multiple conditions combined with & (logical AND)
# N.B. The parentheses around each condition are essential: in Python, &
# binds more tightly than ==, so without them the expression is parsed
# differently.
df.filter(
    (df.Price == "7.95")
    & (col("Pizza Name") == "Calzone")
).show(truncate=False)

+----------+-----+---------------------------------------------------+
|Pizza Name|Price|Ingredients                                        |
+----------+-----+---------------------------------------------------+
|Calzone   |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
+----------+-----+---------------------------------------------------+



In [10]:
# Filtering w.r.t. a list of elements
favourite_pizzas = ["Speck & Brie", "Tonno & Cipolle"]

# Boolean column expression: true where the pizza name is one of the favourites
is_favourite = col("Pizza Name").isin(favourite_pizzas)

# "is in" filtering
df.filter(is_favourite).show(truncate=False)

# "is not in" filtering: negate the condition with ~ instead of comparing it
# to False
df.filter(~is_favourite).show(truncate=False)

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie] |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+

+----------+-----+---------------------------------------------------+
|Pizza Name|Price|Ingredients                                        |
+----------+-----+---------------------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone   |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola   |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto|7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Fries     |3.95 |[Potatoes]                                         |
+----------+-----+---------------------------------------------------+

In [11]:
# Filtering w.r.t. a list of values produced by another query

# collect() is an action: it runs the query and returns the selected rows of
# the resulting DataFrame as a Python list of Row objects
expensive_rows = df.filter(col("Price") == "7.95").select("Pizza Name").collect()

# Extract the pizza name from each Row (kept under a separate name so that
# expensive_pizzas is only ever a list of strings)
expensive_pizzas = [row["Pizza Name"] for row in expensive_rows]

# Boolean column expression: true where the pizza is one of the expensive ones
is_expensive = col("Pizza Name").isin(expensive_pizzas)

# "is in" filtering
df.filter(is_expensive).show(truncate=False)

# "is not in" filtering: negate the condition with ~ instead of comparing it
# to False
df.filter(~is_expensive).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+----------+-----+-----------------------------------------------+
|Pizza Name|Price|Ingredients                                    |
+----------+-----+-----------------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]       |
|Diavola   |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]|
|Fries     |3.95 |[Potatoes]                                     |
+----------+-----+-----------------------------------------------+

In [12]:
# Filtering on the text content of a column
pizza_name = col("Pizza Name")

# Names that begin with the given prefix
df.filter(pizza_name.startswith("To")).show(truncate=False)

# Names that end with the given suffix
df.filter(pizza_name.endswith("one")).show(truncate=False)

# Names that contain the given substring anywhere
df.filter(pizza_name.contains("&")).show(truncate=False)

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+

+----------+-----+---------------------------------------------------+
|Pizza Name|Price|Ingredients                                        |
+----------+-----+---------------------------------------------------+
|Calzone   |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
+----------+-----+---------------------------------------------------+

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
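|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie] |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+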

In [13]:
# Filtering using like (i.e., SQL LIKE): % stands for any sequence of characters
df.filter(col("Pizza Name").like("%on%")).show(truncate=False)

# Filtering using rlike (i.e., REGEX LIKE): two words joined by "&".
# [A-Za-z] is used instead of [A-z], because the A-z range also covers the
# punctuation characters that sit between "Z" and "a". The + quantifier makes
# a word mandatory on each side (with * the pattern reduces to "contains &"),
# and \s* allows the optional spaces around "&".
df.filter(col("Pizza Name").rlike(r"[A-Za-z]+\s*&\s*[A-Za-z]+")).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+---------------+-----+-----------------------------------------------+
|Pizza Name     |Price|Ingredients                                    |
+---------------+-----+-----------------------------------------------+
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie] |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]|
+---------------+-----+-----------------------------------------------+



In [14]:
# Filtering on array columns
from pyspark.sql.functions import array_contains

# Rows whose Ingredients array contains a single given value
has_tomato_sauce = array_contains(df.Ingredients, "Tomato Sauce")
df.filter(has_tomato_sauce).show(truncate=False)

# Rows whose Ingredients array contains every one of several values
has_basil = array_contains(df.Ingredients, "Basil")
df.filter(has_tomato_sauce & has_basil).show(truncate=False)

+---------------+-----+---------------------------------------------------+
|Pizza Name     |Price|Ingredients                                        |
+---------------+-----+---------------------------------------------------+
|Margherita     |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone        |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola        |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie   |7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
|Tonno & Cipolle|7.95 |[Tomato Sauce, Mozzarella Cheese, Tuna, Onions]    |
+---------------+-----+---------------------------------------------------+

+----------+-----+----------------------------------------+
|Pizza Name|Price|Ingredients                             |
+----------+-----+----------------------------------------+
|Margherita|5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]|
+----------+-----+----------------------------------------+

In [15]:
# Keep at most the first 5 rows of the DataFrame and display them
first_five = df.limit(5)
first_five.show(truncate=False)

+------------+-----+---------------------------------------------------+
|Pizza Name  |Price|Ingredients                                        |
+------------+-----+---------------------------------------------------+
|Margherita  |5.95 |[Tomato Sauce, Mozzarella Cheese, Basil]           |
|Calzone     |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Diavola     |5.95 |[Tomato Sauce, Mozzarella Cheese, Spicy Salame]    |
|Prosciutto  |7.95 |[Tomato Sauce, Mozzarella Cheese, Prosciutto Cotto]|
|Speck & Brie|7.95 |[Tomato Sauce, Mozzarella Cheese, Speck, Brie]     |
+------------+-----+---------------------------------------------------+

