In [1]:
from glob import glob
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import pandas as pd

In [2]:
# Create a session
spark = SparkSession.builder.appName("PySpark Introduction").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/27 14:46:25 WARN Utils: Your hostname, minhhuunguyen-macbookpro.local, resolves to a loopback address: 127.0.0.1; using 10.48.60.70 instead (on interface en0)
25/11/27 14:46:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/27 14:46:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
CSV_PATH = './data/datacamp_ecommerce.csv'

In [4]:
df = spark.read.csv(CSV_PATH)
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string]

In [5]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|      _c0|      _c1|                 _c2|     _c3|           _c4|      _c5|       _c6|           _c7|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.

In [6]:
df = spark.read.csv(CSV_PATH, header=True)
df

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: string, InvoiceDate: string, UnitPrice: string, CustomerID: string, Country: string]

In [7]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [8]:
schema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", StringType(), True),
    StructField("UnitPrice", DoubleType(), True),
    StructField("CustomerID", StringType(), True),
    StructField("Country", StringType(), True)
])

df = spark.read.csv(CSV_PATH, header=True, schema=schema)
df

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: string, Country: string]

In [9]:
df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [10]:
df.count()

541909

## 1. How many unique customers are present in the dataframe?

In [11]:
df.select('CustomerID').distinct().count()

                                                                                

4373

## 2. What country do most purchases come from?

In [12]:
from pyspark.sql.functions import countDistinct

df.groupBy('Country').agg(countDistinct('InvoiceNo').alias('country_count')).show()

+------------------+-------------+
|           Country|country_count|
+------------------+-------------+
|            Sweden|           46|
|         Singapore|           10|
|           Germany|          603|
|               RSA|            1|
|            France|          461|
|            Greece|            6|
|European Community|            5|
|           Belgium|          119|
|           Finland|           48|
|             Malta|           10|
|       Unspecified|           13|
|             Italy|           55|
|              EIRE|          360|
|         Lithuania|            4|
|            Norway|           40|
|             Spain|          105|
|           Denmark|           21|
|         Hong Kong|           15|
|            Israel|            9|
|           Iceland|            7|
+------------------+-------------+
only showing top 20 rows


In [13]:
from pyspark.sql.functions import desc

# with sort
df.groupBy('Country').agg(countDistinct('InvoiceNo').alias('country_count')).orderBy(desc('country_count')).show()

+---------------+-------------+
|        Country|country_count|
+---------------+-------------+
| United Kingdom|        23494|
|        Germany|          603|
|         France|          461|
|           EIRE|          360|
|        Belgium|          119|
|          Spain|          105|
|    Netherlands|          101|
|    Switzerland|           74|
|       Portugal|           71|
|      Australia|           69|
|          Italy|           55|
|        Finland|           48|
|         Sweden|           46|
|         Norway|           40|
|Channel Islands|           33|
|          Japan|           28|
|         Poland|           24|
|        Denmark|           21|
|         Cyprus|           20|
|        Austria|           19|
+---------------+-------------+
only showing top 20 rows


## 3. When was the most recent/most early purchase made by a customer on the platform?

In [14]:
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'double'),
 ('CustomerID', 'string'),
 ('Country', 'string')]

In [15]:
from pyspark.sql.functions import to_timestamp

spark.sql('set spark.sql.legacy.timeParserPolicy=LEGACY')
df = df.withColumn('InvoiceDate', to_timestamp('InvoiceDate', 'MM/dd/yyyy HH:mm'))
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [16]:
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'timestamp'),
 ('UnitPrice', 'double'),
 ('CustomerID', 'string'),
 ('Country', 'string')]

In [17]:
from pyspark.sql.functions import min as sql_func_min, max as sql_func_max

df.select(sql_func_min("InvoiceDate")).show()

+-------------------+
|   min(InvoiceDate)|
+-------------------+
|2010-12-01 08:26:00|
+-------------------+



In [18]:
df.select(sql_func_max("InvoiceDate")).show()

+-------------------+
|   max(InvoiceDate)|
+-------------------+
|2011-12-09 12:50:00|
+-------------------+



## 4. What was the highest/lowest purchase made by a customer on the platform?

In [19]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [20]:
df = df.withColumn('TotalPrice', df.Quantity * df.UnitPrice)
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|        TotalPrice|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|15.299999999999999|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|              22.0|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|    

In [21]:
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'timestamp'),
 ('UnitPrice', 'double'),
 ('CustomerID', 'string'),
 ('Country', 'string'),
 ('TotalPrice', 'double')]

In [22]:
from pyspark.sql.functions import sum as sql_func_sum, asc

df.groupby('InvoiceNo') \
    .agg(sql_func_sum(df.Quantity * df.UnitPrice).alias('InvoiceTotalPrice')) \
    .orderBy(asc('InvoiceTotalPrice')).show()

+---------+-----------------+
|InvoiceNo|InvoiceTotalPrice|
+---------+-----------------+
|  C581484|        -168469.6|
|  C541433|         -77183.6|
|  C556445|         -38970.0|
|  C550456|         -22998.4|
|  C580605|        -17836.46|
|  C540117|        -16888.02|
|  C540118|        -16453.71|
|  C537651|        -13541.33|
|  C537630|        -13541.33|
|  C537644|        -13474.79|
|  C570556|        -11816.64|
|  C580604|         -11586.5|
|  A563187|        -11062.06|
|  A563186|        -11062.06|
|  C573079|         -8322.12|
|  C574902|         -8286.22|
|  C551685|         -8142.75|
|  C566899|         -7427.97|
|  C553355|         -7006.83|
|  C551699|          -6930.0|
+---------+-----------------+
only showing top 20 rows


In [23]:
df.groupby('InvoiceNo') \
    .agg(sql_func_sum('TotalPrice').alias('InvoiceTotalPrice')) \
    .orderBy(desc('InvoiceTotalPrice')).show()

+---------+------------------+
|InvoiceNo| InvoiceTotalPrice|
+---------+------------------+
|   581483|          168469.6|
|   541431|           77183.6|
|   574941| 52940.93999999999|
|   576365|50653.909999999996|
|   556444|           38970.0|
|   567423|31698.159999999996|
|   556917|22775.930000000008|
|   572209|           22206.0|
|   567381|           22104.8|
|   563614|21880.439999999995|
|   548203|          21627.72|
|   550461|           21535.9|
|   572035|          20277.92|
|   563076| 19150.65999999999|
|   562439|18841.480000000003|
|   539750|18745.859999999997|
|   573585|16874.579999999936|
|   541220|16774.719999999998|
|   545475|          16726.84|
|   562955|16677.260000000002|
+---------+------------------+
only showing top 20 rows


## 5. Other syntax

### 5.1. SQL

In [24]:
df.createOrReplaceTempView("CUSTOMER_DATA")
sql_df = spark.sql("SELECT * from CUSTOMER_DATA")
sql_df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|        TotalPrice|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|15.299999999999999|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|              22.0|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|             20.34|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|    

In [25]:
sql_df = spark.sql(
    '''
    SELECT InvoiceDate, CustomerID, TotalPrice
    FROM CUSTOMER_DATA
    WHERE CustomerID == 13047
    '''
)
sql_df.show()

+-------------------+----------+------------------+
|        InvoiceDate|CustomerID|        TotalPrice|
+-------------------+----------+------------------+
|2010-12-01 08:34:00|     13047|             54.08|
|2010-12-01 08:34:00|     13047|12.600000000000001|
|2010-12-01 08:34:00|     13047|12.600000000000001|
|2010-12-01 08:34:00|     13047|              30.0|
|2010-12-01 08:34:00|     13047| 9.899999999999999|
|2010-12-01 08:34:00|     13047|              25.5|
|2010-12-01 08:34:00|     13047|14.850000000000001|
|2010-12-01 08:34:00|     13047|              19.9|
|2010-12-01 08:34:00|     13047|             17.85|
|2010-12-01 08:34:00|     13047|             17.85|
|2010-12-01 08:34:00|     13047|              31.8|
|2010-12-01 08:34:00|     13047|              31.8|
|2010-12-01 08:34:00|     13047|              25.5|
|2010-12-01 08:34:00|     13047|14.850000000000001|
|2010-12-01 08:34:00|     13047|14.850000000000001|
|2010-12-01 08:34:00|     13047|14.850000000000001|
|2010-12-01 

### 5.2. Parquet file

In [26]:
!ls ./data/dummy_parquet_dataset/part-00000.parquet

./data/dummy_parquet_dataset/part-00000.parquet


In [27]:
PARQUET_PATHS = glob('./data/dummy_parquet_dataset/*.parquet')
PARQUET_PATHS

['./data/dummy_parquet_dataset/part-00000.parquet',
 './data/dummy_parquet_dataset/part-00001.parquet']

In [28]:
par_df = spark.read.parquet(PARQUET_PATHS[0])
par_df.show()

+-------------+--------------------+--------------------+------+-----+-------+--------+-------------------+
|    SAMPLE_ID|                 URL|                TEXT|HEIGHT|WIDTH|LICENSE|    NSFW|         similarity|
+-------------+--------------------+--------------------+------+-----+-------+--------+-------------------+
|1581282014547|http://media.righ...|View EPC Rating G...|   109|  100|      ?|  UNSURE|  0.312813401222229|
|1060015003169|https://thumbs.eb...|Silverline Air Fr...|   225|  225|      ?|UNLIKELY| 0.3124845325946808|
|3372497001913|https://farm1.sta...|     Anhui Mountains|   800|  514|      ?|UNLIKELY| 0.3165116608142853|
| 382020002775|https://t2.ftcdn....|Acute pain in a w...|   257|  240|      ?|UNLIKELY| 0.3442777097225189|
|2928456001411|https://findingbl...|Venison – Sour Ch...|   764|  577|      ?|    NSFW|0.30439671874046326|
|2179119014444|https://i.pinimg....|Essentials Barnwo...|   236|  236|      ?|UNLIKELY|0.33279865980148315|
| 147688009067|http://d25hqt

                                                                                

In [29]:
par_df.count()

12933524

#### PySpark vs Pandas

1. Read and concat 2 parquet files
2. Drop `URL` column
3. Filter `similarity` value higher than 0.4
4. Create new `image_size` column by `HEIGHT * WIDTH`

In [30]:
spark_df_list = [spark.read.parquet(p) for p in PARQUET_PATHS]
spark_df_list[0].count()

12933524

In [31]:
from pyspark.sql import DataFrame
from functools import reduce

In [32]:
spark_df = reduce(DataFrame.unionAll, spark_df_list)

In [33]:
spark_df = spark_df.drop('URL')
spark_df.show()

+-------------+--------------------+------+-----+-------+--------+-------------------+
|    SAMPLE_ID|                TEXT|HEIGHT|WIDTH|LICENSE|    NSFW|         similarity|
+-------------+--------------------+------+-----+-------+--------+-------------------+
|1581282014547|View EPC Rating G...|   109|  100|      ?|  UNSURE|  0.312813401222229|
|1060015003169|Silverline Air Fr...|   225|  225|      ?|UNLIKELY| 0.3124845325946808|
|3372497001913|     Anhui Mountains|   800|  514|      ?|UNLIKELY| 0.3165116608142853|
| 382020002775|Acute pain in a w...|   257|  240|      ?|UNLIKELY| 0.3442777097225189|
|2928456001411|Venison – Sour Ch...|   764|  577|      ?|    NSFW|0.30439671874046326|
|2179119014444|Essentials Barnwo...|   236|  236|      ?|UNLIKELY|0.33279865980148315|
| 147688009067|Actimel vanilla -...|   140|  140|      ?|UNLIKELY| 0.4013020098209381|
| 172745002348|Ben Affleck Could...|   320|  320|      ?|UNLIKELY|0.35330262780189514|
|3138980000177|Minnesota Departm...|   200|

In [34]:
spark_df = spark_df.filter(spark_df.similarity > 0.4)
spark_df.show()

+-------------+--------------------+------+-----+-------+--------+-------------------+
|    SAMPLE_ID|                TEXT|HEIGHT|WIDTH|LICENSE|    NSFW|         similarity|
+-------------+--------------------+------+-----+-------+--------+-------------------+
| 147688009067|Actimel vanilla -...|   140|  140|      ?|UNLIKELY| 0.4013020098209381|
|  67366007169|Surfer Happy Twel...|   324|  324|      ?|UNLIKELY|0.40045371651649475|
|3929056008929|Total Art Match B...|   300|  156|      ?|UNLIKELY| 0.4210386574268341|
|3594076029003|HARRY CHAPIN - So...|   169|  200|      ?|UNLIKELY| 0.4186709523200989|
|2120422005488|Abstract Geometri...|   400|  400|      ?|UNLIKELY|0.40669721364974976|
|1386086017355|Budapest PopOut M...|   500|  364|      ?|UNLIKELY| 0.4046134948730469|
|  68429000530|SIBLINGS COLLECTI...|   152|  152|      ?|UNLIKELY| 0.4129495918750763|
|3381798019465|Alpha-gal Awarene...|   500|  500|      ?|UNLIKELY| 0.4030238687992096|
|2036761003991|Keep Austin Nuts ...|   455|

In [35]:
spark_df.count()

717074

In [36]:
spark_df = spark_df.withColumn('image_size', spark_df.HEIGHT * spark_df.WIDTH)
spark_df.show()

+-------------+--------------------+------+-----+-------+--------+-------------------+----------+
|    SAMPLE_ID|                TEXT|HEIGHT|WIDTH|LICENSE|    NSFW|         similarity|image_size|
+-------------+--------------------+------+-----+-------+--------+-------------------+----------+
| 147688009067|Actimel vanilla -...|   140|  140|      ?|UNLIKELY| 0.4013020098209381|     19600|
|  67366007169|Surfer Happy Twel...|   324|  324|      ?|UNLIKELY|0.40045371651649475|    104976|
|3929056008929|Total Art Match B...|   300|  156|      ?|UNLIKELY| 0.4210386574268341|     46800|
|3594076029003|HARRY CHAPIN - So...|   169|  200|      ?|UNLIKELY| 0.4186709523200989|     33800|
|2120422005488|Abstract Geometri...|   400|  400|      ?|UNLIKELY|0.40669721364974976|    160000|
|1386086017355|Budapest PopOut M...|   500|  364|      ?|UNLIKELY| 0.4046134948730469|    182000|
|  68429000530|SIBLINGS COLLECTI...|   152|  152|      ?|UNLIKELY| 0.4129495918750763|     23104|
|3381798019465|Alpha

In [37]:
from pyspark.sql.functions import col

In [38]:
%%timeit

spark_df = reduce(DataFrame.unionAll, [spark.read.parquet(p) for p in PARQUET_PATHS]) \
    .drop('URL') \
    .filter(col('similarity') > 0.4) \
    .withColumn('image_size', col('HEIGHT') * col('WIDTH'))
spark_df.show()

+-------------+--------------------+------+-----+-------+--------+-------------------+----------+
|    SAMPLE_ID|                TEXT|HEIGHT|WIDTH|LICENSE|    NSFW|         similarity|image_size|
+-------------+--------------------+------+-----+-------+--------+-------------------+----------+
| 147688009067|Actimel vanilla -...|   140|  140|      ?|UNLIKELY| 0.4013020098209381|     19600|
|  67366007169|Surfer Happy Twel...|   324|  324|      ?|UNLIKELY|0.40045371651649475|    104976|
|3929056008929|Total Art Match B...|   300|  156|      ?|UNLIKELY| 0.4210386574268341|     46800|
|3594076029003|HARRY CHAPIN - So...|   169|  200|      ?|UNLIKELY| 0.4186709523200989|     33800|
|2120422005488|Abstract Geometri...|   400|  400|      ?|UNLIKELY|0.40669721364974976|    160000|
|1386086017355|Budapest PopOut M...|   500|  364|      ?|UNLIKELY| 0.4046134948730469|    182000|
|  68429000530|SIBLINGS COLLECTI...|   152|  152|      ?|UNLIKELY| 0.4129495918750763|     23104|
|3381798019465|Alpha

In [39]:
spark_df.count()

717074

``` bash
pip install pyarrow
```
to handle parquet file

In [40]:
# pip install pyarrow

In [41]:
import pandas as pd
from glob import glob

In [42]:
%%timeit

df = pd.concat([
    pd.read_parquet(path) for path in PARQUET_PATHS
])
df = df.drop(columns=['URL'])
df = df.loc[df.similarity > 0.4]
df['image_size'] = df.HEIGHT * df.WIDTH

The slowest run took 4.84 times longer than the fastest. This could mean that an intermediate result is being cached.
2min 22s ± 1min 36s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
df

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: string, Country: string, TotalPrice: double]

In [44]:
spark_df.write.parquet('output_1_pyspark')

25/11/27 15:11:37 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/11/27 15:11:39 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/11/27 15:11:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/11/27 15:11:40 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [45]:
spark_df.coalesce(1).write.parquet('output_2_pyspark')

                                                                                