In [1]:
import pyspark
from pyspark.sql import SparkSession

import pyspark.sql.functions as f

from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, LongType, FloatType, DoubleType,
    StringType, DateType, TimestampType, BooleanType,
    MapType, ArrayType
)

In [2]:
spark = SparkSession.builder.appName("pyspark-processing").getOrCreate()

In [12]:
df = spark.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/products.csv")
df.createOrReplaceTempView("products")
df.show(5)

+---+----------+--------+---+----------+--------+
| id|      name|cost_net|vat|cost_gross|category|
+---+----------+--------+---+----------+--------+
|  1|Bindownica| 2003.46|  8|   2163.74|       B|
|  2| Telewizor| 1144.24|  5|   1201.45|       H|
|  3|     Torba|   75.66|  5|     79.44|       E|
|  4| Podkładka| 2653.38|  8|   2865.65|       D|
|  5|   Hub USB|  416.74| 23|    512.59|       G|
+---+----------+--------+---+----------+--------+
only showing top 5 rows



In [16]:
df.select(
    f.col("id").alias("ID"),
    f.col("name").alias("Name"),
    f.col("cost_net").alias("Net Cost"),
    f.col("category").alias("Cat")
).show(5)

+---+----------+--------+---+
| ID|      Name|Net Cost|Cat|
+---+----------+--------+---+
|  1|Bindownica| 2003.46|  B|
|  2| Telewizor| 1144.24|  H|
|  3|     Torba|   75.66|  E|
|  4| Podkładka| 2653.38|  D|
|  5|   Hub USB|  416.74|  G|
+---+----------+--------+---+
only showing top 5 rows



In [17]:
df.select(
    f.col("id").alias("ID"),
    f.col("name").alias("Name"),
    f.col("cost_net").alias("Net Cost"),
    f.col("category").alias("Cat")
).filter((f.col("Cat") == "G")).show(5)

+---+----------+--------+---+
| ID|      Name|Net Cost|Cat|
+---+----------+--------+---+
|  5|   Hub USB|  416.74|  G|
| 27| Słuchawki| 4954.44|  G|
| 32| Konwerter|  116.13|  G|
| 35|    Laptop| 4285.69|  G|
| 36|Smartwatch| 4937.37|  G|
+---+----------+--------+---+
only showing top 5 rows



In [18]:
df.select(
    f.col("id").alias("ID"),
    f.col("name").alias("Name"),
    f.col("cost_net").alias("Net Cost"),
    f.col("category").alias("Cat")
).filter((f.col("Cat").isin(["E", "G"]))).show(5)

+---+---------+--------+---+
| ID|     Name|Net Cost|Cat|
+---+---------+--------+---+
|  3|    Torba|   75.66|  E|
|  5|  Hub USB|  416.74|  G|
|  6|   Pralka| 4507.47|  E|
| 21|    Kabel| 1703.73|  E|
| 22|Ładowarka| 2114.18|  E|
+---+---------+--------+---+
only showing top 5 rows



In [21]:
df.select(
    f.col("id").alias("ID"),
    f.col("name").alias("Name"),
    f.col("cost_net").alias("Net Cost"),
    f.col("category").alias("Cat")
).filter((f.col("cost_net").isNull())).show(5)

+---+----+--------+---+
| ID|Name|Net Cost|Cat|
+---+----+--------+---+
+---+----+--------+---+



In [22]:
spark2 = SparkSession.builder.appName("pyspark-cust").getOrCreate()

In [23]:
df2 = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/customers.csv")
df2.createOrReplaceTempView("cust")
df2.show(5)

+---+----------+-----------+---------+--------------------+------------+------+
| id|first_name|  last_name|    phone|               email|    acc_type|points|
+---+----------+-----------+---------+--------------------+------------+------+
|  1|    Julian|     Siatka|828223683|juliansiatka@onet.pl|    Ultimate|233101|
|  2|     Tymon|Buszkiewicz|069601377|tymonbuszkiewicz@...|    Ultimate| 61371|
|  3|    Karina|        Roś|160231488| karinaroś@yahoo.com|Professional|215880|
|  4|     Karol|     Marcol|469824688|karolmarcol@inter...|Professional|495433|
|  5| Marcelina|     Gamrat|899504259|marcelinagamrat@o...|Professional|264302|
+---+----------+-----------+---------+--------------------+------------+------+
only showing top 5 rows



In [24]:
df2.select(
    f.col("first_name").alias("Imie"),
    f.col("last_name").alias("Nazwisko"),
    f.col("phone").alias("Telefon"),
    f.col("email").alias("Email")
).show(5)

+---------+-----------+---------+--------------------+
|     Imie|   Nazwisko|  Telefon|               Email|
+---------+-----------+---------+--------------------+
|   Julian|     Siatka|828223683|juliansiatka@onet.pl|
|    Tymon|Buszkiewicz|069601377|tymonbuszkiewicz@...|
|   Karina|        Roś|160231488| karinaroś@yahoo.com|
|    Karol|     Marcol|469824688|karolmarcol@inter...|
|Marcelina|     Gamrat|899504259|marcelinagamrat@o...|
+---------+-----------+---------+--------------------+
only showing top 5 rows



In [26]:
df2.select(
    f.col("first_name").alias("Imie").cast(StringType()),
    f.col("last_name").alias("Nazwisko").cast(StringType()),
    f.col("phone").alias("Telefon").cast(StringType()),
    f.col("email").alias("Email").cast(StringType())
).show(5)

+---------+-----------+---------+--------------------+
|     Imie|   Nazwisko|  Telefon|               Email|
+---------+-----------+---------+--------------------+
|   Julian|     Siatka|828223683|juliansiatka@onet.pl|
|    Tymon|Buszkiewicz|069601377|tymonbuszkiewicz@...|
|   Karina|        Roś|160231488| karinaroś@yahoo.com|
|    Karol|     Marcol|469824688|karolmarcol@inter...|
|Marcelina|     Gamrat|899504259|marcelinagamrat@o...|
+---------+-----------+---------+--------------------+
only showing top 5 rows



In [29]:
df2.select(
    f.col("first_name").alias("Imie").cast(StringType()),
    f.col("last_name").alias("Nazwisko").cast(StringType()),
    f.col("phone").alias("Telefon").cast(StringType()),
    f.col("email").alias("Email").cast(StringType()),
    f.col("points").alias("Punkty").cast(IntegerType()),
    f.col("acc_type").alias("Typ").cast(StringType())
).filter(
    f.col("points").between(1000,2000)
    ).show(5)

+-------+--------+---------+--------------------+------+------------+
|   Imie|Nazwisko|  Telefon|               Email|Punkty|         Typ|
+-------+--------+---------+--------------------+------+------------+
|Cyprian|     Gaś|992307787|    cypriangaś@o2.pl|  1716|       Basic|
|  Filip|  Lesner|675828213|filiplesner@inter...|  1318|    Complete|
|   Emil| Andziak|424375441|emilandziak@hotma...|  1157|    Ultimate|
| Hubert|   Raźny|056316091|hubertraźny@inter...|  1848|       Basic|
|Kajetan|    Sapa|799263020| kajetansapa@onet.pl|  1878|Professional|
+-------+--------+---------+--------------------+------+------------+
only showing top 5 rows



In [31]:
df2.select(
    f.col("first_name").alias("Imie").cast(StringType()),
    f.col("last_name").alias("Nazwisko").cast(StringType()),
    f.col("phone").alias("Telefon").cast(StringType()),
    f.col("email").alias("Email").cast(StringType()),
    f.col("points").alias("Punkty").cast(IntegerType()),
    f.col("acc_type").alias("Typ").cast(StringType())
).where(
    f.col("acc_type").isin(["Basic"])
    ).show(5)

+---------+--------+---------+--------------------+------+-----+
|     Imie|Nazwisko|  Telefon|               Email|Punkty|  Typ|
+---------+--------+---------+--------------------+------+-----+
|   Wiktor|   Kinal|226926214|wiktorkinal@gmail...|496938|Basic|
|     Alex|Kastelik|853740246|alexkastelik@gmai...|177683|Basic|
|Krzysztof| Noculak|031145569|krzysztofnoculak@...|469200|Basic|
|Agnieszka|   Sojda|620859467|agnieszkasojda@in...|358865|Basic|
|Tymoteusz|   Drewa|438936771|tymoteuszdrewa@ho...|112632|Basic|
+---------+--------+---------+--------------------+------+-----+
only showing top 5 rows



In [32]:
df2.select(
    #f.col("first_name").alias("Imie").cast(StringType()),
    #f.col("last_name").alias("Nazwisko").cast(StringType()),
    #f.col("phone").alias("Telefon").cast(StringType()),
    #f.col("email").alias("Email").cast(StringType()),
    #f.col("points").alias("Punkty").cast(IntegerType()),
    f.col("acc_type").alias("Typ").cast(StringType())
).distinct().show()

+------------+
|         Typ|
+------------+
|    Ultimate|
|    Complete|
|Professional|
|       Basic|
+------------+



In [35]:
df2.select(
    f.col("first_name").alias("Imie").cast(StringType()),
    f.col("last_name").alias("Nazwisko").cast(StringType()),
    f.col("phone").alias("Telefon").cast(StringType()),
    f.col("email").alias("Email").cast(StringType()),
    f.col("points").alias("Punkty").cast(IntegerType()),
    f.col("acc_type").alias("Typ").cast(StringType())
).describe().show()

+-------+------+---------+--------------------+--------------------+------------------+--------+
|summary|  Imie| Nazwisko|             Telefon|               Email|            Punkty|     Typ|
+-------+------+---------+--------------------+--------------------+------------------+--------+
|  count| 50000|    50000|               50000|               50000|             50000|   50000|
|   mean|  NULL|     NULL|   5.0021823133304E8|                NULL|      250557.01444|    NULL|
| stddev|  NULL|     NULL|2.8812862863572043E8|                NULL|143853.34462022746|    NULL|
|    min|   Ada|Abramczuk|           000001719|adaadamaszek@inte...|               126|   Basic|
|    max|Łukasz|   Żółtek|           999994166|łukaszżurawik@gma...|            499985|Ultimate|
+-------+------+---------+--------------------+--------------------+------------------+--------+



In [36]:

df3 = spark.read.format("json").options(multiLine=True, inferSchema=False).load("./work/pyspark_v1/from_slack/books.json")
df3.show()
df3.printSchema()

+-------------------+------------+-------------+--------------------------------+--------------------+
|              autor|data_wydania|srednia_ocena|timestamp_ostatniej_aktualizacji|       tytul_ksiazki|
+-------------------+------------+-------------+--------------------------------+--------------------+
|      George Orwell|  1949-06-08|          4.7|            2023-10-01T12:00:00Z|                1984|
|       J.K. Rowling|  1997-06-26|          4.8|            2023-09-25T15:45:00Z|Harry Potter and ...|
|F. Scott Fitzgerald|  1925-04-10|          4.5|            2023-10-03T10:30:00Z|    The Great Gatsby|
+-------------------+------------+-------------+--------------------------------+--------------------+

root
 |-- autor: string (nullable = true)
 |-- data_wydania: string (nullable = true)
 |-- srednia_ocena: double (nullable = true)
 |-- timestamp_ostatniej_aktualizacji: string (nullable = true)
 |-- tytul_ksiazki: string (nullable = true)



In [37]:
book_schema = StructType(
        [
            StructField("autor", StringType(), False),
            StructField("data_wydania", DateType(), False),
            StructField("srednia_ocena", DoubleType(), False),
            StructField("timestamp_ostatniej_aktualizacji", TimestampType(), False),
            StructField("tytul_ksiazki", StringType(), False)
        ]
)

df3.select(
    f.col("autor").alias("autor"),
    f.col("data_wydania").alias("data"),
    f.col("srednia_ocena").alias("AVG ocena"),
    f.col("timestamp_ostatniej_aktualizacji").alias("Data aktualizacji"),
    f.col("tytul_ksiazki").alias("tytul")
).show(5)

+-------------------+----------+---------+--------------------+--------------------+
|              autor|      data|AVG ocena|   Data aktualizacji|               tytul|
+-------------------+----------+---------+--------------------+--------------------+
|      George Orwell|1949-06-08|      4.7|2023-10-01T12:00:00Z|                1984|
|       J.K. Rowling|1997-06-26|      4.8|2023-09-25T15:45:00Z|Harry Potter and ...|
|F. Scott Fitzgerald|1925-04-10|      4.5|2023-10-03T10:30:00Z|    The Great Gatsby|
+-------------------+----------+---------+--------------------+--------------------+



In [None]:
book_schema = StructType(
        [
            StructField("autor", StringType(), False),
            StructField("data_wydania", DateType(), False),
            StructField("srednia_ocena", DoubleType(), False),
            StructField("timestamp_ostatniej_aktualizacji", TimestampType(), False),
            StructField("tytul_ksiazki", StringType(), False)
        ]
)

#Lepiej nie uzywac bo pozniej jest problem z nullable na false - czyli pozwala na nule 
df4 = spark.read.format("json").options(multiLine=True).schema(book_schema).load("./work/pyspark_v1/from_slack/books.json")
df4.show()
df4.printSchema()

+-------------------+------------+-------------+--------------------------------+--------------------+
|              autor|data_wydania|srednia_ocena|timestamp_ostatniej_aktualizacji|       tytul_ksiazki|
+-------------------+------------+-------------+--------------------------------+--------------------+
|      George Orwell|  1949-06-08|          4.7|             2023-10-01 12:00:00|                1984|
|       J.K. Rowling|  1997-06-26|          4.8|             2023-09-25 15:45:00|Harry Potter and ...|
|F. Scott Fitzgerald|  1925-04-10|          4.5|             2023-10-03 10:30:00|    The Great Gatsby|
+-------------------+------------+-------------+--------------------------------+--------------------+

root
 |-- autor: string (nullable = true)
 |-- data_wydania: date (nullable = true)
 |-- srednia_ocena: double (nullable = true)
 |-- timestamp_ostatniej_aktualizacji: timestamp (nullable = true)
 |-- tytul_ksiazki: string (nullable = true)



## **Grupowanie, agregacja i sortowanie**


In [39]:
df5 = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/customers.csv")
df5.show(5)


+---+----------+-----------+---------+--------------------+------------+------+
| id|first_name|  last_name|    phone|               email|    acc_type|points|
+---+----------+-----------+---------+--------------------+------------+------+
|  1|    Julian|     Siatka|828223683|juliansiatka@onet.pl|    Ultimate|233101|
|  2|     Tymon|Buszkiewicz|069601377|tymonbuszkiewicz@...|    Ultimate| 61371|
|  3|    Karina|        Roś|160231488| karinaroś@yahoo.com|Professional|215880|
|  4|     Karol|     Marcol|469824688|karolmarcol@inter...|Professional|495433|
|  5| Marcelina|     Gamrat|899504259|marcelinagamrat@o...|Professional|264302|
+---+----------+-----------+---------+--------------------+------------+------+
only showing top 5 rows



In [54]:
df5.groupBy("acc_type").agg(
    f.count("id").alias("count_po_id")
).orderBy(
    f.col("acc_type").desc()
    ).show(5)

+------------+-----------+
|    acc_type|count_po_id|
+------------+-----------+
|    Ultimate|      12382|
|Professional|      12442|
|    Complete|      12652|
|       Basic|      12524|
+------------+-----------+



# **Funkcje Okna**

In [40]:
from pyspark.sql import Window

In [42]:
df6 = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/products.csv")
df6.show(5)

+---+----------+--------+---+----------+--------+
| id|      name|cost_net|vat|cost_gross|category|
+---+----------+--------+---+----------+--------+
|  1|Bindownica| 2003.46|  8|   2163.74|       B|
|  2| Telewizor| 1144.24|  5|   1201.45|       H|
|  3|     Torba|   75.66|  5|     79.44|       E|
|  4| Podkładka| 2653.38|  8|   2865.65|       D|
|  5|   Hub USB|  416.74| 23|    512.59|       G|
+---+----------+--------+---+----------+--------+
only showing top 5 rows



In [49]:
window_prod = Window.partitionBy(f.col("category"))

df6.select(
    f.col("id"),
    f.col("category"),
    f.avg(f.col("cost_gross")).over(window_prod).alias("avg_per_cat")
).show(5)

+---+--------+-----------------+
| id|category|      avg_per_cat|
+---+--------+-----------------+
| 16|       A|2795.846015773731|
| 59|       A|2795.846015773731|
| 89|       A|2795.846015773731|
|101|       A|2795.846015773731|
|109|       A|2795.846015773731|
+---+--------+-----------------+
only showing top 5 rows



## JOINY


In [55]:
order_details = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/order_details.csv")
orders = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/orders.csv")

order_details.show(5)
orders.show(5)

+--------+----------+--------+
|order_id|product_id|quantity|
+--------+----------+--------+
|       1|     14490|       4|
|       1|     28697|       5|
|       2|     24995|       1|
|       2|      6036|       4|
|       3|      6258|       2|
+--------+----------+--------+
only showing top 5 rows

+---+------+----------+--------------------+--------------+
| id|client|order_date|              status|payment_method|
+---+------+----------+--------------------+--------------+
|  1| 20601|2022-01-27|Częściowo zrealiz...|      Transfer|
|  2| 35536|2022-03-10|        W realizacji|          Cash|
|  3| 24865|2017-04-01|         Dostarczone|          Cash|
|  4|  9502|2017-03-18|         Dostarczone|      Transfer|
|  5|  2948|2016-02-08|             Wysłane|          Cash|
+---+------+----------+--------------------+--------------+
only showing top 5 rows



In [None]:
orders.join(
    order_details,
    orders["id"] == order_details["order_id"],
    "inner"
).show(truncate=False)



+---+------+----------+----------------------+--------------+--------+----------+--------+
|id |client|order_date|status                |payment_method|order_id|product_id|quantity|
+---+------+----------+----------------------+--------------+--------+----------+--------+
|1  |20601 |2022-01-27|Częściowo zrealizowane|Transfer      |1       |28697     |5       |
|1  |20601 |2022-01-27|Częściowo zrealizowane|Transfer      |1       |14490     |4       |
|2  |35536 |2022-03-10|W realizacji          |Cash          |2       |6036      |4       |
|2  |35536 |2022-03-10|W realizacji          |Cash          |2       |24995     |1       |
|3  |24865 |2017-04-01|Dostarczone           |Cash          |3       |6258      |2       |
|4  |9502  |2017-03-18|Dostarczone           |Transfer      |4       |23255     |3       |
|4  |9502  |2017-03-18|Dostarczone           |Transfer      |4       |14012     |2       |
|5  |2948  |2016-02-08|Wysłane               |Cash          |5       |2171      |4       |

JAK ROBIC LEPIEJ

In [63]:
## tylko dla jednaj kolumny
pre_join_orders = orders.select(
    f.col("id").alias("order_id"),
    f.col("client").alias("client"),
    f.col("order_date").alias("order_date")
)

order_det = pre_join_orders.join(
    order_details,
    ["order_id"],
    "inner"
).show(truncate=False)


+--------+------+----------+----------+--------+
|order_id|client|order_date|product_id|quantity|
+--------+------+----------+----------+--------+
|1       |20601 |2022-01-27|14490     |4       |
|1       |20601 |2022-01-27|28697     |5       |
|2       |35536 |2022-03-10|24995     |1       |
|2       |35536 |2022-03-10|6036      |4       |
|3       |24865 |2017-04-01|6258      |2       |
|4       |9502  |2017-03-18|14012     |2       |
|4       |9502  |2017-03-18|23255     |3       |
|5       |2948  |2016-02-08|16558     |1       |
|5       |2948  |2016-02-08|2171      |4       |
|6       |32727 |2018-08-13|5092      |4       |
|6       |32727 |2018-08-13|7515      |2       |
|7       |43006 |2014-10-12|8706      |3       |
|8       |2941  |2019-09-20|17724     |5       |
|9       |38611 |2022-10-17|20695     |3       |
|10      |21556 |2019-08-22|24368     |4       |
|10      |21556 |2019-08-22|19999     |2       |
|11      |26579 |2014-02-05|13492     |1       |
|11      |26579 |201

In [None]:
## dla wiekszej ilosci conditions w zapytaniu, jakby mialo byc jeszcze data> od cos tam to lepiej tak
orders.join(
    order_details,
    orders["id"] == order_details["order_id"],
    "inner"
).drop(f.col("id")).show(truncate=False)


+------+----------+----------------------+--------------+--------+----------+--------+
|client|order_date|status                |payment_method|order_id|product_id|quantity|
+------+----------+----------------------+--------------+--------+----------+--------+
|20601 |2022-01-27|Częściowo zrealizowane|Transfer      |1       |28697     |5       |
|20601 |2022-01-27|Częściowo zrealizowane|Transfer      |1       |14490     |4       |
|35536 |2022-03-10|W realizacji          |Cash          |2       |6036      |4       |
|35536 |2022-03-10|W realizacji          |Cash          |2       |24995     |1       |
|24865 |2017-04-01|Dostarczone           |Cash          |3       |6258      |2       |
|9502  |2017-03-18|Dostarczone           |Transfer      |4       |23255     |3       |
|9502  |2017-03-18|Dostarczone           |Transfer      |4       |14012     |2       |
|2948  |2016-02-08|Wysłane               |Cash          |5       |2171      |4       |
|2948  |2016-02-08|Wysłane               |C

In [61]:
products = spark2.read.format("csv").options(header=True, delimeter=",").load("./work/pyspark_v1/from_slack/products.csv")
products.show()

+---+------------------+--------+---+----------+--------+
| id|              name|cost_net|vat|cost_gross|category|
+---+------------------+--------+---+----------+--------+
|  1|        Bindownica| 2003.46|  8|   2163.74|       B|
|  2|         Telewizor| 1144.24|  5|   1201.45|       H|
|  3|             Torba|   75.66|  5|     79.44|       E|
|  4|         Podkładka| 2653.38|  8|   2865.65|       D|
|  5|           Hub USB|  416.74| 23|    512.59|       G|
|  6|            Pralka| 4507.47| 12|   5048.37|       E|
|  7|             Tuner| 3951.41|  8|   4267.52|       D|
|  8|          Soundbar| 3513.14|  5|    3688.8|       D|
|  9|      Płyta główna| 4339.85| 12|   4860.63|       D|
| 10|Wzmacniacz sygnału| 2203.85| 12|   2468.31|       B|
| 11|              Mysz|   487.9| 23|    600.12|       B|
| 12|           Adapter|  4200.7|  5|   4410.73|       D|
| 13|             Modem| 4841.92|  5|   5084.02|       F|
| 14|          Zmywarka| 2925.31|  5|   3071.58|       H|
| 15|         

In [91]:
pre_products = products.select(
    f.col("id").alias("product_id"),
    f.col("cost_net"),
    f.col("cost_gross")
)

pre_join_orders = orders.select(
    f.col("id").alias("order_id"),
    f.col("client").alias("client"),
    f.col("order_date").alias("order_date")
)

window_opt = Window.partitionBy(f.col("order_id"), f.col("order_date"))

full = pre_join_orders.join(
    order_details,
    ["order_id"],
    "inner"
).join(
    pre_products,
    ["product_id"],
    "inner"
    )
#full.show(5)

full.groupBy(
    f.col("order_id"),
    f.col("order_date")
).agg(
    f.count(f.col("order_id")).alias("count_of_positions"),
    f.round(f.sum( f.col("cost_gross") * f.col("quantity") ), 2).alias("total_cost")
).orderBy(f.col("order_id")).show(5)

full.select(
        f.col("order_id"),
        f.col("order_date"),
        f.count("product_id").over(window_opt).alias("count"),
        f.sum(f.col("cost_gross") * f.col("quantity")).over(window_opt).alias("suma")
        ).orderBy("order_id").show(5)

+--------+----------+------------------+----------+
|order_id|order_date|count_of_positions|total_cost|
+--------+----------+------------------+----------+
|       1|2022-01-27|                 2|  30297.21|
|      10|2019-08-22|                 2|   6255.12|
|     100|2014-09-25|                 2|   1863.62|
|    1000|2015-07-13|                 1|  12019.59|
|   10000|2019-11-18|                 1|  13585.84|
+--------+----------+------------------+----------+
only showing top 5 rows

+--------+----------+-----+------------------+
|order_id|order_date|count|              suma|
+--------+----------+-----+------------------+
|       1|2022-01-27|    2|30297.210000000003|
|       1|2022-01-27|    2|30297.210000000003|
|      10|2019-08-22|    2|           6255.12|
|      10|2019-08-22|    2|           6255.12|
|     100|2014-09-25|    2|1863.6200000000001|
+--------+----------+-----+------------------+
only showing top 5 rows



In [None]:
spark.stop()
spark2.stop()