In [0]:
df_n = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")

In [0]:
df = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")

In [0]:
print(f"October 2019 - Total Events: {df.count():,}")
print("\n" + "="*60)
print("SCHEMA:")
print("="*60)
df.printSchema()

October 2019 - Total Events: 42,448,765

SCHEMA:
root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [0]:
print("\n" + "="*60)
print("SAMPLE DATA (First 5 rows):")
print("="*60)
df.show(5, truncate=False)


SAMPLE DATA (First 5 rows):
+-----------------------+----------+----------+-------------------+-----------------------------------+--------+------+---------+------------------------------------+
|_c0                    |_c1       |_c2       |_c3                |_c4                                |_c5     |_c6   |_c7      |_c8                                 |
+-----------------------+----------+----------+-------------------+-----------------------------------+--------+------+---------+------------------------------------+
|event_time             |event_type|product_id|category_id        |category_code                      |brand   |price |user_id  |user_session                        |
|2019-10-01 00:00:00 UTC|view      |44600062  |2103807459595387724|NULL                               |shiseido|35.79 |541312140|72d76fde-8bb3-4e00-8c23-a032dfed738c|
|2019-10-01 00:00:00 UTC|view      |3900821   |2053013552326770905|appliances.environment.water_heater|aqua    |33.20 |554748717|9333dfb

## Hands-on small file problem

In [0]:
# Reading OCT data
df_oct = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")

df_oct.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
# Convert to delta
df_oct.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("events_delta")

In [0]:
# Append Nov Events
df_nov = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")

In [0]:
# appending nov data in small chunks
for i in range(5):
    df_nov.limit(10000) \
        .write \
        .format("delta") \
        .mode("append") \
        .saveAsTable("events_delta")

In [0]:
spark.sql("DESCRIBE DETAIL events_delta").show(truncate=False)

+------+------------------------------------+------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+-------------------------------------+----------------+----------------+-----------------------------------------+---------------------------------------------------------------+-------------+
|format|id                                  |name                          |description|location|createdAt              |lastModified       |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties                           |minReaderVersion|minWriterVersion|tableFeatures                            |statistics                                                     |clusterByAuto|
+------+------------------------------------+------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+-----------------------

In [0]:
%time spark.sql("SELECT COUNT(*) FROM events_delta").show()

+--------+
|COUNT(*)|
+--------+
|42498764|
+--------+

CPU times: user 13.4 ms, sys: 7.3 ms, total: 20.7 ms
Wall time: 595 ms


In [0]:
%sql
DESCRIBE DETAIL events_delta;

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,89ce5580-363e-446d-a707-d6a74961ec6f,workspace.default.events_delta,,,2026-02-21T02:17:35.606Z,2026-02-21T02:19:13.000Z,List(),List(),48,1406560013,Map(delta.enableDeletionVectors -> true),3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


OPTIMIZE - compacts small files into larger ones.

In [0]:
%sql
OPTIMIZE events_delta; 

path,metrics
,"List(16, 37, List(43276646, 69707156, 6.63100180625E7, 16, 1060960289), List(263047, 34458526, 2.7775174675675675E7, 37, 1027681463), 0, null, null, 0, 1, 48, 11, true, 0, 0, 1771640706141, 1771640714799, 8, 16, null, List(0, 0), null, 9, 9, 38683, 0, null, null)"


In [0]:
%time spark.sql("SELECT COUNT(*) FROM events_delta").show()

+--------+
|COUNT(*)|
+--------+
|42498764|
+--------+

CPU times: user 6.13 ms, sys: 11.5 ms, total: 17.6 ms
Wall time: 784 ms
