In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Data Integration').getOrCreate()

In [4]:
customer_df = spark.read.csv('/data/olist/olist_customers_dataset.csv', header=True, inferSchema=True)
geolocation_df = spark.read.csv('/data/olist/olist_geolocation_dataset.csv', header=True, inferSchema=True)
order_items_df = spark.read.csv('/data/olist/olist_order_items_dataset.csv', header=True, inferSchema=True)
order_payments_df = spark.read.csv('/data/olist/olist_order_payments_dataset.csv', header=True, inferSchema=True)
order_reviews_df = spark.read.csv('/data/olist/olist_order_reviews_dataset.csv', header=True, inferSchema=True)
orders_df = spark.read.csv('/data/olist/olist_orders_dataset.csv', header=True, inferSchema=True)
products_df = spark.read.csv('/data/olist/olist_products_dataset.csv', header=True, inferSchema=True)
sellers_df = spark.read.csv('/data/olist/olist_sellers_dataset.csv', header=True, inferSchema=True)

                                                                                

In [5]:
#caching frequently used data
orders_df.cache()
customer_df.cache()
order_items_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [6]:
#combining datasets to get a full dataset

orders_items_joined_df = orders_df.join(order_items_df, 'order_id', 'inner')
orders_items_product_df = orders_items_joined_df.join(products_df, 'product_id', 'inner')
orders_items_product_seller_df = orders_items_product_df.join(sellers_df, 'seller_id', 'inner')
full_order_df = orders_items_product_seller_df.join(customer_df, 'customer_id', 'inner')

In [7]:
#join geo-location data and all other datasets
full_order_df = full_order_df.join(geolocation_df, full_order_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix, 'left') # left join : if some order don't have zipcode, we don't want to lose those order

In [8]:
full_order_df = full_order_df.join(order_reviews_df, 'order_id', 'left')
full_order_df = full_order_df.join(order_payments_df, 'order_id', 'left')

In [9]:
full_order_df.show(5)

25/06/26 03:11:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+
|            order_id|         customer_id|           seller_id|          product_id|order_status|order_purchase_timestamp|  order_approved_

                                                                                

In [10]:
full_order_df.cache()

DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [11]:
#total revenue per seller
from pyspark.sql.functions import *
seller_revenue_df = full_order_df.groupBy('seller_id').agg(sum('price').alias('total_revenue'))

In [12]:
seller_revenue_df.show(3)



+--------------------+------------------+
|           seller_id|     total_revenue|
+--------------------+------------------+
|d650b663c3b5f6fb3...|         2253869.1|
|cd06602b43d8800bd...|353150.98000000033|
|3c487ae8f8d7542be...|1618845.7000000055|
+--------------------+------------------+
only showing top 3 rows



                                                                                

In [13]:
full_order_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [14]:
#total orders per customer
orders_per_customer = full_order_df.groupBy('customer_id').agg(count('order_id').alias('total_orders_per_customer')).orderBy(desc('total_orders_per_customer'))
orders_per_customer.show(5)

[Stage 54:>                                                         (0 + 4) / 4]

+--------------------+-------------------------+
|         customer_id|total_orders_per_customer|
+--------------------+-------------------------+
|351e40989da90e704...|                    11427|
|50920f8cd0681fd86...|                    10752|
|9b43e2a62de9bab3a...|                     8556|
|270c23a11d024a44c...|                     8001|
|5c87184371002d49e...|                     6876|
+--------------------+-------------------------+
only showing top 5 rows



                                                                                

In [15]:
#average review score per seller
avg_review_per_seller_df = full_order_df.groupBy('seller_id').agg(avg('review_score').alias('avg review'))
avg_review_per_seller_df.show(5)



+--------------------+-----------------+
|           seller_id|       avg review|
+--------------------+-----------------+
|7a67c85e85bb2ce85...|4.258920734844587|
|9d213f303afae4983...|              5.0|
|d2374cbcbb3ca4ab1...| 3.71899382437114|
|1835b56ce799e6a4d...|3.448500717288149|
|d650b663c3b5f6fb3...|4.276244626265428|
+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [16]:
#top 10 sold products
most_sold_prod_df = full_order_df.groupBy('product_id').agg(count('product_id').alias('quantity_sold')).orderBy(desc('quantity_sold'))
most_sold_prod_df.show(10)



+--------------------+-------------+
|          product_id|quantity_sold|
+--------------------+-------------+
|aca2eb7d00ea1a7b8...|        86740|
|422879e10f4668299...|        81110|
|99a4788cb24856965...|        78775|
|389d119b48cf3043d...|        60248|
|d1c427060a0f73f6b...|        59274|
|368c6c730842d7801...|        58358|
|53759a2ecddad2bb8...|        52654|
|53b36df67ebb7c415...|        52105|
|154e7e31ebfa09220...|        42700|
|3dd2a17168ec895c7...|        40787|
+--------------------+-------------+
only showing top 10 rows



                                                                                

In [17]:
#top customers by spending
from pyspark.sql.functions import col
filtered_df = full_order_df.filter(col('order_status') == 'delivered') #spends should be analyze only on delivered products
top_cust_df = filtered_df.groupBy('customer_id').agg(sum('payment_value').alias('total_spends')).withColumn('total_spends_in_INR', (col('total_spends') * 15.69).cast('int')).orderBy(desc('total_spends'))
top_cust_df.show(10)

[Stage 75:>                                                         (0 + 4) / 4]

+--------------------+--------------------+-------------------+
|         customer_id|        total_spends|total_spends_in_INR|
+--------------------+--------------------+-------------------+
|1ff773612ab8934db...| 1.756825199999893E7|          275645873|
|05455dfa7cd02f13d...|1.3282083359999327E7|          208395887|
|ec5b2ba62e5743423...|1.0388528640000112E7|          162996014|
|0c792d32a3251b4f6...|   8254681.600000529|          129515954|
|78fc46047c4a639e8...|   7488519.999999339|          117494878|
|1617b1357756262bf...|   7433259.520000033|          116627841|
|1dbc055ccab23ed89...|   7216273.400000708|          113223329|
|d5f2b3f597c7ccafb...|   6800018.119998923|          106692284|
|dd3f1762eb601f41c...|  6746388.4800006235|          105850835|
|10de381f8a8d23fff...|   5184499.500000076|           81344797|
+--------------------+--------------------+-------------------+
only showing top 10 rows



                                                                                

In [18]:
from pyspark.sql.functions import col

full_order_df.select(col('order_status')).distinct().show()



+------------+
|order_status|
+------------+
|   delivered|
|    canceled|
|     shipped|
|    invoiced|
|  processing|
| unavailable|
|    approved|
+------------+



                                                                                

## Data Aggregation & Data Enrichment

In [19]:
full_order_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [20]:
#Total revenue and AOV per customer
customer_spending_df = full_order_df.groupBy('customer_id')\
                                    .agg(
                                        count('order_id').alias('total_orders'),
                                        sum('price').alias('total_spent'),
                                        round(avg('price'), 2).alias('AOV')
                                    ).orderBy(desc('total_spent'))
customer_spending_df.show(5)



+--------------------+------------+-----------+------+
|         customer_id|total_orders|total_spent|   AOV|
+--------------------+------------+-----------+------+
|d3e82ccec3cb5f956...|        6876|  6662844.0| 969.0|
|df55c14d1476a9a34...|         743|  3565657.0|4799.0|
|fe5113a38e3575c04...|        2292|  3293604.0|1437.0|
|ec5b2ba62e5743423...|        1428|  2556120.0|1790.0|
|63b964e79dee32a35...|        6072|  2501664.0| 412.0|
+--------------------+------------+-----------+------+
only showing top 5 rows



                                                                                

In [21]:
#Seller Performance Metrics (Revenue, Average Review, Order Count)
seller_performance_df = full_order_df.groupBy('seller_id')\
                                    .agg(
                                        count('order_id').alias('total_orders'),
                                        sum('price').alias('total_revenue'),
                                        round(avg('review_score'), 2).alias('avg_review_score'),
                                        round(stddev('price'), 2).alias('price_variability')
                                    ).orderBy(desc('total_revenue'))
seller_performance_df.show(5)



+--------------------+------------+--------------------+----------------+-----------------+
|           seller_id|total_orders|       total_revenue|avg_review_score|price_variability|
+--------------------+------------+--------------------+----------------+-----------------+
|4869f7a5dfa277a7d...|      184587| 3.613871732000001E7|            4.09|           111.65|
|53243585a1d6dc264...|       54514| 3.429159294999999E7|            4.12|           499.65|
|4a3ca9315b744ce9f...|      330661| 3.375957084000009E7|            3.77|            59.37|
|7c67e1448b00f6e96...|      233306|3.2282321789999764E7|            3.42|            50.39|
|fa1c13f2614d7b5c4...|       87686|3.0139386309999976E7|            4.38|            307.7|
+--------------------+------------+--------------------+----------------+-----------------+
only showing top 5 rows



                                                                                

In [22]:
#Product popularity metrics
prod_metrics_df = full_order_df.groupBy('product_id')\
                               .agg(
                                    count('order_id').alias('total_sales'),
                               ).orderBy(desc('total_sales'))
prod_metrics_df.show(5)



+--------------------+-----------+
|          product_id|total_sales|
+--------------------+-----------+
|aca2eb7d00ea1a7b8...|      86740|
|422879e10f4668299...|      81110|
|99a4788cb24856965...|      78775|
|389d119b48cf3043d...|      60248|
|d1c427060a0f73f6b...|      59274|
+--------------------+-----------+
only showing top 5 rows



                                                                                

In [23]:
#Customer Retention
from pyspark.sql.functions import * 
cust_retention_df = (
    full_order_df
    .groupBy('customer_id')
    .agg(
        first('order_purchase_timestamp').alias('first_order'),
        last('order_purchase_timestamp').alias('last_order'),
        count('order_id').alias('total_order'),
        round(avg('price'), 2).alias('aov')
    )
    .orderBy(desc('total_order'))
)

cust_retention_df.show()


[Stage 110:>                                                        (0 + 4) / 4]

+--------------------+-------------------+-------------------+-----------+------+
|         customer_id|        first_order|         last_order|total_order|   aov|
+--------------------+-------------------+-------------------+-----------+------+
|351e40989da90e704...|2017-07-13 10:42:37|2017-07-13 10:42:37|      11427| 85.99|
|50920f8cd0681fd86...|2018-01-27 11:28:32|2018-01-27 11:28:32|      10752| 43.82|
|9b43e2a62de9bab3a...|2017-05-25 22:27:50|2017-05-25 22:27:50|       8556|  26.4|
|270c23a11d024a44c...|2017-08-08 20:26:31|2017-08-08 20:26:31|       8001| 36.59|
|5c87184371002d49e...|2018-01-05 19:15:37|2018-01-05 19:15:37|       6876| 12.49|
|d3e82ccec3cb5f956...|2017-03-18 14:28:34|2017-03-18 14:28:34|       6876| 969.0|
|d5f2b3f597c7ccafb...|2017-12-13 14:21:15|2017-12-13 14:21:15|       6706|  59.0|
|c2f18647725395af4...|2018-03-06 19:21:47|2018-03-06 19:21:47|       6612|  34.9|
|24e7dc2ff8c071263...|2017-11-24 16:16:45|2017-11-24 16:16:45|       6597|  59.2|
|7bb57d182bdc116

                                                                                

# Advance Enrichment

In [24]:
#order status flag
full_order_df = full_order_df.withColumn('is_delivered', when(col('order_status') == 'delivered', lit(1)).otherwise(lit(0)))\
                             .withColumn('is_canceled', when(col('order_status') == 'canceled', lit(1)).otherwise(lit(0)))
full_order_df.where(full_order_df['order_status'] == 'canceled').select('order_status', 'is_delivered', 'is_canceled').show(10)

+------------+------------+-----------+
|order_status|is_delivered|is_canceled|
+------------+------------+-----------+
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
+------------+------------+-----------+
only showing top 10 rows



In [25]:
#order revenue
full_order_df = full_order_df.withColumn('order_revenue', col('price') + col('freight_value'))
full_order_df.select('price', 'freight_value', 'order_revenue').show()

+-----+-------------+------------------+
|price|freight_value|     order_revenue|
+-----+-------------+------------------+
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
+-----+-------------+------------------+
only showing top

In [26]:
customer_spending_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- total_orders: long (nullable = false)
 |-- total_spent: double (nullable = true)
 |-- AOV: double (nullable = true)



In [27]:
#customer segment based on spending
customer_spending_df = customer_spending_df.withColumn(
        'customer_segment',
        when(col('AOV') >= 1200, 'High-Value')
        .when((col('AOV') < 1200) & (col('AOV') >= 500), 'Medium-Value')
        .otherwise('low-value')
    )

customer_spending_df.show()

[Stage 123:>                                                        (0 + 4) / 4]

+--------------------+------------+------------------+-------+----------------+
|         customer_id|total_orders|       total_spent|    AOV|customer_segment|
+--------------------+------------+------------------+-------+----------------+
|d3e82ccec3cb5f956...|        6876|         6662844.0|  969.0|    Medium-Value|
|df55c14d1476a9a34...|         743|         3565657.0| 4799.0|      High-Value|
|fe5113a38e3575c04...|        2292|         3293604.0| 1437.0|      High-Value|
|ec5b2ba62e5743423...|        1428|         2556120.0| 1790.0|      High-Value|
|63b964e79dee32a35...|        6072|         2501664.0|  412.0|       low-value|
|46bb3c0b1a65c8399...|         748|         2336752.0| 3124.0|      High-Value|
|05455dfa7cd02f13d...|        2184| 2160194.400000087|  989.1|    Medium-Value|
|3690e975641f01bd0...|         802|         2124498.0| 2649.0|      High-Value|
|349509b216bd5ec11...|         743|         1923627.0| 2589.0|      High-Value|
|695476b5848d64ba0...|         687|18205

                                                                                

In [28]:
#join cust_spending with full_order_df 
full_order_df = full_order_df.join(customer_spending_df, 'customer_id', 'left')
full_order_df.show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+-----+-------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+----------------------+-----------+------------+--------------------+------------------------+-------------+--------------+---------------------------+-------------------+-------------------+----------------+-----------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+------------------+------------+--------------------+-------------+------------+-----------+------------------+------------+------------------+-----+----------------+
|         customer_id|            order_

In [29]:
full_order_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [30]:
#full_order_df = full_order_df.withColumn('freight_category')
full_order_df = full_order_df.withColumn('freight_category', 
                        when(col('freight_value') >= 20, 'high')
                        .when((col('freight_value') < 20) &  (col('freight_value') >= 10), 'medium')
                        .when(col('freight_value') < 10, 'low')
                    )
full_order_df.select('freight_value', 'freight_category').show()

+-------------+----------------+
|freight_value|freight_category|
+-------------+----------------+
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
|         7.46|             low|
+-------------+----------------+
only showing top 20 rows



In [31]:
full_order_df.groupBy('freight_value').count().show()



+-------------+------+
|freight_value| count|
+-------------+------+
|        38.61|   406|
|         14.9|  8418|
|        26.83|   474|
|        22.35|  2518|
|        54.02|   845|
|        10.96|116369|
|        21.18| 13366|
|        18.23|270229|
|        13.71| 46665|
|        31.76|  2289|
|        21.01|  3172|
|       107.02|   119|
|        15.56| 50561|
|         9.01|  2380|
|         18.7|  6828|
|        15.23|197525|
|        17.68|  3662|
|        56.38|   375|
|        17.94| 20332|
|         3.26|   245|
+-------------+------+
only showing top 20 rows



                                                                                

In [33]:
full_order_df.write.mode('overwrite').parquet('/olist/olist_processed_data/')

                                                                                

In [39]:
!hdfs dfs -ls -h /data/processed

Found 11 items
-rw-r--r--   2 root hadoop          0 2025-06-26 03:40 /data/processed/_SUCCESS
-rw-r--r--   2 root hadoop      9.0 M 2025-06-26 03:39 /data/processed/part-00000-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      7.8 M 2025-06-26 03:39 /data/processed/part-00001-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      7.5 M 2025-06-26 03:39 /data/processed/part-00002-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      9.2 M 2025-06-26 03:39 /data/processed/part-00003-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      9.0 M 2025-06-26 03:39 /data/processed/part-00004-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      8.2 M 2025-06-26 03:39 /data/processed/part-00005-bb40510d-b32d-4a74-81dc-0856f5b06be7-c000.snappy.parquet
-rw-r--r--   2 root hadoop      9.1 M 2025-06-26 03:39 /data/processed/

In [37]:
!hdfs dfs -ls /data/

Found 14 items
-rw-r--r--   2 root              hadoop    1044577 2025-06-15 12:27 /data/customer_csv_for_project.csv
-rw-r--r--   2 root              hadoop    1048576 2025-06-03 03:00 /data/customers10.csv
drwxr-xr-x   - root              hadoop          0 2025-06-13 03:02 /data/customers150.csv
drwxr-xr-x   - root              hadoop          0 2025-06-13 03:10 /data/customers150_clean.csv
-rw-r--r--   2 root              hadoop    1048576 2025-06-04 03:32 /data/customers500.csv
drwxr-xr-x   - root              hadoop          0 2025-06-10 05:01 /data/external_data
-rw-r--r--   2 root              hadoop        511 2025-06-06 02:33 /data/malformed_customers.csv
drwxr-xr-x   - harishankargiri16 hadoop          0 2025-06-18 03:47 /data/olist
drwxr-xr-x   - root              hadoop          0 2025-06-21 14:46 /data/olist_data
drwxr-xr-x   - root              hadoop          0 2025-06-21 14:48 /data/olist_processed_data
-rw-r--r--   2 root              hadoop     863293 2025-06-16 03:45

In [38]:
full_order_df.write.mode('overwrite').parquet('/data/processed')

                                                                                

In [41]:
!hdfs dfs -ls /data/

Found 14 items
-rw-r--r--   2 root              hadoop    1044577 2025-06-15 12:27 /data/customer_csv_for_project.csv
-rw-r--r--   2 root              hadoop    1048576 2025-06-03 03:00 /data/customers10.csv
drwxr-xr-x   - root              hadoop          0 2025-06-13 03:02 /data/customers150.csv
drwxr-xr-x   - root              hadoop          0 2025-06-13 03:10 /data/customers150_clean.csv
-rw-r--r--   2 root              hadoop    1048576 2025-06-04 03:32 /data/customers500.csv
drwxr-xr-x   - root              hadoop          0 2025-06-10 05:01 /data/external_data
-rw-r--r--   2 root              hadoop        511 2025-06-06 02:33 /data/malformed_customers.csv
drwxr-xr-x   - harishankargiri16 hadoop          0 2025-06-18 03:47 /data/olist
drwxr-xr-x   - root              hadoop          0 2025-06-21 14:46 /data/olist_data
drwxr-xr-x   - root              hadoop          0 2025-06-21 14:48 /data/olist_processed_data
-rw-r--r--   2 root              hadoop     863293 2025-06-16 03:45

In [40]:
#Save in google cloud storage
full_order_df.write.mode('overwrite').parquet('gs://dataproc-staging-us-central1-95287947350-8qmatc83/')

                                                                                

In [43]:
#save as csv
full_order_df.write.mode('overwrite').option('header', 'true').csv('/data/olist_processed_data')

25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_424 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_308 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_162 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_933 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_67 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_598 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_273 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_380 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_815 !
25/06/26 04:21:09 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_176_875 !
25/06/26 04:21:09 WARN BlockManagerMaster