In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName('OlistData')\
.enableHiveSupport() \
.getOrCreate()

25/09/08 22:35:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
hdfs_path = '/user/olist/'

In [3]:
customers_df = spark.read.csv(hdfs_path + 'olist_customers_dataset.csv', header = True, inferSchema = True)
orders_df = spark.read.csv(hdfs_path + 'olist_orders_dataset.csv', header = True, inferSchema = True)
order_item_df = spark.read.csv(hdfs_path + 'olist_order_items_dataset.csv', header = True, inferSchema = True)
payments_df = spark.read.csv(hdfs_path + 'olist_order_payments_dataset.csv', header = True, inferSchema = True)
reviews_df = spark.read.csv(hdfs_path + 'olist_order_reviews_dataset.csv', header = True, inferSchema = True)
products_df = spark.read.csv(hdfs_path + 'olist_products_dataset.csv', header = True, inferSchema = True)
sellers_df = spark.read.csv(hdfs_path + 'olist_sellers_dataset.csv', header = True, inferSchema = True)
geolocation_df = spark.read.csv(hdfs_path + 'olist_geolocation_dataset.csv', header = True, inferSchema = True)
category_transformation_df = spark.read.csv(hdfs_path + 'product_category_name_translation.csv', header = True, inferSchema = True)

                                                                                

In [4]:
orders_df.cache()
customers_df.cache()
order_item_df.cache()

DataFrame[order_id: string, order_item_id: int, product_id: string, seller_id: string, shipping_limit_date: timestamp, price: double, freight_value: double]

In [5]:
orders_item_joined_df = orders_df.join(order_item_df,'order_id','inner')

In [6]:
orders_items_products_df = orders_item_joined_df.join(products_df,'product_id','inner')

In [7]:
orders_items_products_sellers_df = orders_items_products_df.join(sellers_df,'seller_id','inner')

In [8]:
full_orders_df = orders_items_products_sellers_df.join(customers_df,'customer_id','inner')

In [9]:
# Geolocation Data

full_orders_df = full_orders_df.join(geolocation_df,full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix,'left')

In [10]:
full_orders_df = full_orders_df.join(reviews_df,'order_id','left')

In [11]:
full_orders_df = full_orders_df.join(payments_df,'order_id','left')

In [12]:
full_orders_df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (null

In [13]:
full_orders_df.cache()

25/09/08 22:36:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DataFrame[order_id: string, customer_id: string, seller_id: string, product_id: string, order_status: string, order_purchase_timestamp: timestamp, order_approved_at: timestamp, order_delivered_carrier_date: timestamp, order_delivered_customer_date: timestamp, order_estimated_delivery_date: timestamp, order_item_id: int, shipping_limit_date: timestamp, price: double, freight_value: double, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, seller_zip_code_prefix: int, seller_city: string, seller_state: string, customer_unique_id: string, customer_zip_code_prefix: int, customer_city: string, customer_state: string, geolocation_zip_code_prefix: int, geolocation_lat: double, geolocation_lng: double, geolocation_city: string, geolocation_state: string, review_id: string, review_score: string, review_comment_title: string, review_commen

In [14]:
from pyspark.sql.functions import *

In [15]:
# Total Revenue per seller

seller_revenue_df = full_orders_df.groupBy('seller_id').agg(sum('price').alias('total_revenue'))

In [16]:
seller_revenue_df.show(5)



+--------------------+--------------------+
|           seller_id|       total_revenue|
+--------------------+--------------------+
|7a67c85e85bb2ce85...|2.0312794890000053E7|
|9d213f303afae4983...|   2321.400000000004|
|d2374cbcbb3ca4ab1...|    3375517.55000001|
|1835b56ce799e6a4d...|   6097995.110000006|
|d650b663c3b5f6fb3...|           2253869.1|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [17]:
# Total Orders Per Customer

total_orders_per_customer = full_orders_df.groupBy('customer_id').agg(count('order_id').alias('total_orders'))
total_orders_per_customer.show(10)



+--------------------+------------+
|         customer_id|total_orders|
+--------------------+------------+
|9c8410ae611b0c9fd...|          60|
|1e49d70b227359f77...|          64|
|2e3160c0e701d9d91...|          83|
|423b14adf6348b595...|         166|
|f81bb64a1e672e6cb...|         454|
|a2903717cca50b4ac...|          82|
|d81067f5a2e50e54c...|          83|
|e6e14f3e3173398dc...|          52|
|1b2c1c9762e9d1b8c...|          83|
|57e96b01256ca49dc...|          83|
+--------------------+------------+
only showing top 10 rows



                                                                                

In [18]:
# Average Review Score Per Seller
average_review = full_orders_df.groupBy('seller_id').agg(avg('review_score').alias('review_score'))
average_review.show(10)



+--------------------+------------------+
|           seller_id|      review_score|
+--------------------+------------------+
|d650b663c3b5f6fb3...| 4.276244626265428|
|cd06602b43d8800bd...|3.7134436635683516|
|3c487ae8f8d7542be...| 4.276648299448984|
|d354c38a7182125a7...| 4.266468180126535|
|e9779976487b77c6d...|4.2127329046350255|
|9616352088dcf83a7...| 4.341055718475073|
|de722cd6dad950a92...| 3.955182683228009|
|01cf7e3d21494c41f...| 4.378505552418596|
|ef0ace09169ac0905...|  4.32053782202607|
|fe2032dab1a61af87...| 4.427486696253918|
+--------------------+------------------+
only showing top 10 rows



                                                                                

In [19]:
# Top 10 Most Sold Products
top_products_df = full_orders_df.groupBy('product_id')\
.agg(count('order_id').alias('total_sold'))\
.orderBy(desc('total_sold'))\
.limit(10)

In [20]:
top_products_df.show()



+--------------------+----------+
|          product_id|total_sold|
+--------------------+----------+
|aca2eb7d00ea1a7b8...|     86740|
|422879e10f4668299...|     81110|
|99a4788cb24856965...|     78775|
|389d119b48cf3043d...|     60248|
|d1c427060a0f73f6b...|     59274|
|368c6c730842d7801...|     58358|
|53759a2ecddad2bb8...|     52654|
|53b36df67ebb7c415...|     52105|
|154e7e31ebfa09220...|     42700|
|3dd2a17168ec895c7...|     40787|
+--------------------+----------+



                                                                                

In [21]:
# Top 10 Customer by Spending
top_customer_spending = full_orders_df.groupBy('customer_id')\
.agg(sum('payment_value').alias('total_spent'))\
.orderBy(desc('total_spent'))\
.limit(10)

In [22]:
top_customer_spending.show()

[Stage 66:>                                                         (0 + 4) / 4]

+--------------------+--------------------+
|         customer_id|         total_spent|
+--------------------+--------------------+
|1ff773612ab8934db...| 1.756825199999893E7|
|05455dfa7cd02f13d...|1.3282083359999327E7|
|ec5b2ba62e5743423...|1.0388528640000112E7|
|0c792d32a3251b4f6...|   8254681.600000529|
|78fc46047c4a639e8...|   7488519.999999339|
|1617b1357756262bf...|   7433259.520000033|
|1dbc055ccab23ed89...|   7216273.400000708|
|d5f2b3f597c7ccafb...|   6800018.119998923|
|dd3f1762eb601f41c...|  6746388.4800006235|
|10de381f8a8d23fff...|   5184499.500000076|
+--------------------+--------------------+





# Window Function and Ranking

In [23]:
from pyspark.sql.window import Window

In [24]:
# Rank top Selling products per seller
window_spec = Window.partitionBy('seller_id').orderBy(desc('price'))

In [25]:
# Dense Rank for sellers based on revenue
top_seller_products_df = full_orders_df.withColumn('rank',rank().over(window_spec)).filter(col('rank')<=5)

top_seller_products_df.select('seller_id','price','rank').show()

[Stage 73:>                                                         (0 + 1) / 1]

+--------------------+-----+----+
|           seller_id|price|rank|
+--------------------+-----+----+
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
|0015a82c2db000af6...|895.0|   1|
+--------------------+-----+----+
only showing top 20 rows



                                                                                

# Optimized joins for data integration

In [26]:
orders_items_products_sellers_df = orders_items_products_df.join(broadcast(sellers_df),'seller_id','inner')

In [27]:
#full_orders_df = full_orders_df.join(broadcast(geolocation_df),full_orders_df.customer_zip_code_prefix == geolocation_df.geolocation_zip_code_prefix,'left')

# Advance Aggregation and Enrichment

In [28]:
# Total Revenue & Average Order Value (AOV) per Customer

customer_spending_df = full_orders_df.groupBy('customer_id')\
.agg(
count('order_id').alias('total_orders'),
sum('price').alias('total_spent'),
round(avg('price'),2).alias('AOV'))\
.orderBy(desc('total_spent'))

customer_spending_df.show()

[Stage 80:>                                                         (0 + 4) / 4]

+--------------------+------------+------------------+-------+
|         customer_id|total_orders|       total_spent|    AOV|
+--------------------+------------+------------------+-------+
|d3e82ccec3cb5f956...|        6876|         6662844.0|  969.0|
|df55c14d1476a9a34...|         743|         3565657.0| 4799.0|
|fe5113a38e3575c04...|        2292|         3293604.0| 1437.0|
|ec5b2ba62e5743423...|        1428|         2556120.0| 1790.0|
|63b964e79dee32a35...|        6072|         2501664.0|  412.0|
|46bb3c0b1a65c8399...|         748|         2336752.0| 3124.0|
|05455dfa7cd02f13d...|        2184| 2160194.400000087|  989.1|
|3690e975641f01bd0...|         802|         2124498.0| 2649.0|
|349509b216bd5ec11...|         743|         1923627.0| 2589.0|
|695476b5848d64ba0...|         687|1820543.1299999943|2649.99|
|73236a0796f53d60d...|         832|         1755520.0| 2110.0|
|cc803a2c412833101...|         762|         1676400.0| 2200.0|
|1ff773612ab8934db...|        5820|1658641.7999999512| 



In [29]:
# Seller Performance Metrics (Revenue, Average Review, Order Count)

seller_performance_df = full_orders_df.groupBy('seller_id')\
.agg(
count('order_id').alias('total_orders'),
sum('price').alias('total_revenue'),
round(avg('review_score'),2).alias('avg_review_score'),
round(stddev('price'),2).alias('price_variability')
)\
.orderBy(desc('total_revenue'))

seller_performance_df.show()



+--------------------+------------+--------------------+----------------+-----------------+
|           seller_id|total_orders|       total_revenue|avg_review_score|price_variability|
+--------------------+------------+--------------------+----------------+-----------------+
|4869f7a5dfa277a7d...|      184587| 3.613871731999998E7|            4.09|           111.65|
|53243585a1d6dc264...|       54514| 3.429159294999999E7|            4.12|           499.65|
|4a3ca9315b744ce9f...|      330661| 3.375957084000014E7|            3.77|            59.37|
|7c67e1448b00f6e96...|      233306| 3.228232178999976E7|            3.42|            50.39|
|fa1c13f2614d7b5c4...|       87686|3.0139386309999954E7|            4.38|            307.7|
|da8622b14eb17ae28...|      264433| 2.985766973000004E7|            3.98|            72.92|
|7e93a43ef30c4f03f...|       50226|2.6315706299999952E7|            4.15|           377.24|
|1025f0e2d44d7041d...|      229587|2.2937518519999966E7|            3.89|       

                                                                                

In [30]:
# Product Popularity Metrics

product_metrics_df = full_orders_df.groupBy('product_id')\
.agg(
count('order_id').alias('total_sales'),
sum('price').alias('total_revenue'),
round(avg('price'),2).alias('avg_price'),
round(stddev('price'),2).alias('price_volatality'),\
collect_set('seller_id').alias('unique_sellers')
)\
.orderBy(desc('total_sales'))

product_metrics_df.show(5)



+--------------------+-----------+------------------+---------+----------------+--------------------+
|          product_id|total_sales|     total_revenue|avg_price|price_volatality|      unique_sellers|
+--------------------+-----------+------------------+---------+----------------+--------------------+
|aca2eb7d00ea1a7b8...|      86740| 6164630.300000018|    71.07|            3.17|[955fee9216a65b61...|
|422879e10f4668299...|      81110|4442791.5100000175|    54.77|            4.46|[1f50f920176fa81d...|
|99a4788cb24856965...|      78775| 6921762.710000014|    87.87|            4.08|[4a3ca9315b744ce9...|
|389d119b48cf3043d...|      60248|3280533.1300000106|    54.45|            4.37|[1f50f920176fa81d...|
|d1c427060a0f73f6b...|      59274|  8220103.32999999|   138.68|           16.58|[a1043bafd471dff5...|
+--------------------+-----------+------------------+---------+----------------+--------------------+
only showing top 5 rows



                                                                                

In [31]:
# Monthly Revenue and Order Count Trend
#order purchase timestamp ---> month
# Total orders
# Total Revenue
# Avg order value
# Min order value
# Max order value

monthly_revenue = full_orders_df.withColumn('year_month',date_format('order_purchase_timestamp','yyyy-MM'))\
.groupBy('year_month')\
.agg(count('order_id').alias('total_orders'),
round(sum('price'),2).alias('total_revenue'),
round(avg('price'),2).alias('avg_order_value'),
round(min('price'),2).alias('min_order_value'),
round(max('price'),2).alias('max_order_value'))\
.orderBy('year_month')

In [32]:
monthly_revenue.show()



+----------+------------+--------------+---------------+---------------+---------------+
|year_month|total_orders| total_revenue|avg_order_value|min_order_value|max_order_value|
+----------+------------+--------------+---------------+---------------+---------------+
|   2016-09|        1268|       57431.0|          45.29|           32.9|           59.5|
|   2016-10|       62812|    8301215.43|         132.16|            6.0|         1399.0|
|   2016-12|         304|        3313.6|           10.9|           10.9|           10.9|
|   2017-01|      154814| 1.749501938E7|         113.01|            2.9|         2999.0|
|   2017-02|      314901| 3.771518492E7|         119.77|            5.3|         6735.0|
|   2017-03|      493268| 6.410433591E7|         129.96|            4.9|         3999.9|
|   2017-04|      428106| 5.830563201E7|         136.19|            4.9|         4799.0|
|   2017-05|      681522| 8.320509538E7|         122.09|            3.5|         6499.0|
|   2017-06|      578

                                                                                

In [33]:
# Customer Retention Analysis ( First and Last Order )

customer_retention_df = full_orders_df.groupBy('customer_id')\
.agg(first('order_purchase_timestamp').alias('first_order_date'),
    last('order_purchase_timestamp').alias('last_order_date'),
    count('order_id').alias('total_orders'),
    round(avg('price'),2).alias('aov')
    )\
.orderBy(desc('total_orders'))

In [34]:
customer_retention_df.show()



+--------------------+-------------------+-------------------+------------+------+
|         customer_id|   first_order_date|    last_order_date|total_orders|   aov|
+--------------------+-------------------+-------------------+------------+------+
|351e40989da90e704...|2017-07-13 10:42:37|2017-07-13 10:42:37|       11427| 85.99|
|50920f8cd0681fd86...|2018-01-27 11:28:32|2018-01-27 11:28:32|       10752| 43.82|
|9b43e2a62de9bab3a...|2017-05-25 22:27:50|2017-05-25 22:27:50|        8556|  26.4|
|270c23a11d024a44c...|2017-08-08 20:26:31|2017-08-08 20:26:31|        8001| 36.59|
|5c87184371002d49e...|2018-01-05 19:15:37|2018-01-05 19:15:37|        6876| 12.49|
|d3e82ccec3cb5f956...|2017-03-18 14:28:34|2017-03-18 14:28:34|        6876| 969.0|
|d5f2b3f597c7ccafb...|2017-12-13 14:21:15|2017-12-13 14:21:15|        6706|  59.0|
|c2f18647725395af4...|2018-03-06 19:21:47|2018-03-06 19:21:47|        6612|  34.9|
|24e7dc2ff8c071263...|2017-11-24 16:16:45|2017-11-24 16:16:45|        6597|  59.2|
|7bb

                                                                                

# Extended Enrichment

In [35]:
# Order Status Flags

In [36]:
full_orders_df = full_orders_df.withColumn('is_delivered', when(col('order_status')=='delivered',lit(1)).otherwise(lit(0)))\
.withColumn('is_canceled', when(col('order_status')=='canceled',lit(1)).otherwise(lit(0)))

In [37]:
full_orders_df.where(full_orders_df['order_status']=='canceled').select('order_status','is_delivered','is_canceled').show(100)

+------------+------------+-----------+
|order_status|is_delivered|is_canceled|
+------------+------------+-----------+
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|
|    canceled|           0|          1|


In [38]:
# Order Revenue Calculation

full_orders_df = full_orders_df.withColumn('order_revenue',col('price')+col('freight_value'))

In [39]:
full_orders_df.select('price','freight_value','order_revenue').show()

+-----+-------------+------------------+
|price|freight_value|     order_revenue|
+-----+-------------+------------------+
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
|28.99|         7.46|36.449999999999996|
+-----+-------------+------------------+
only showing top

In [40]:
customer_spending_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- total_orders: long (nullable = false)
 |-- total_spent: double (nullable = true)
 |-- AOV: double (nullable = true)



In [41]:
# Customer segmentation based on spending

customer_spending_df = customer_spending_df.withColumn(
'customer_segment',
when(col('AOV') >= 1200,'High-Value')
.when((col('AOV') < 1200) & (col('AOV')>=700),'Medium-Value')
.otherwise('Low-Value'))

In [42]:
customer_spending_df.show(10)



+--------------------+------------+------------------+-------+----------------+
|         customer_id|total_orders|       total_spent|    AOV|customer_segment|
+--------------------+------------+------------------+-------+----------------+
|d3e82ccec3cb5f956...|        6876|         6662844.0|  969.0|    Medium-Value|
|df55c14d1476a9a34...|         743|         3565657.0| 4799.0|      High-Value|
|fe5113a38e3575c04...|        2292|         3293604.0| 1437.0|      High-Value|
|ec5b2ba62e5743423...|        1428|         2556120.0| 1790.0|      High-Value|
|63b964e79dee32a35...|        6072|         2501664.0|  412.0|       Low-Value|
|46bb3c0b1a65c8399...|         748|         2336752.0| 3124.0|      High-Value|
|05455dfa7cd02f13d...|        2184| 2160194.400000087|  989.1|    Medium-Value|
|3690e975641f01bd0...|         802|         2124498.0| 2649.0|      High-Value|
|349509b216bd5ec11...|         743|         1923627.0| 2589.0|      High-Value|
|695476b5848d64ba0...|         687|18205

[Stage 121:>                                                        (0 + 4) / 4]                                                                                

In [43]:
full_orders_df = full_orders_df.join(customer_spending_df.select('customer_id','customer_segment'),on = 'customer_id',how = 'left')

In [44]:
full_orders_df.select('customer_id','customer_segment').show()

                                                                                

+--------------------+----------------+
|         customer_id|customer_segment|
+--------------------+----------------+
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
|dbef5eb24f60585a8...|       Low-Value|
+--------------------+----------------+
only showing top 20 rows



In [45]:
full_orders_df.select('order_purchase_timestamp').show()

+------------------------+
|order_purchase_timestamp|
+------------------------+
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
|     2018-08-07 23:19:16|
+------------------------+
only showing top 20 rows



In [46]:
# Hourly Order Distribution
full_orders_df = full_orders_df.withColumn('hour_of_day',expr('hour(order_purchase_timestamp)'))

In [47]:
full_orders_df.select('order_purchase_timestamp','hour_of_day').show()

+------------------------+-----------+
|order_purchase_timestamp|hour_of_day|
+------------------------+-----------+
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
|     2018-08-07 23:19:16|         23|
+------------------------+-----------+
only showing top 20 rows



In [48]:
# Weekday vs Weekend Order
full_orders_df = full_orders_df.withColumn('order_day_type',\
                                          when(dayofweek(col('order_purchase_timestamp')).isin(1,7),lit('Weekend')).otherwise(lit('Weekday')))

In [49]:
full_orders_df.select('order_purchase_timestamp','order_day_type').show()

+------------------------+--------------+
|order_purchase_timestamp|order_day_type|
+------------------------+--------------+
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
|     2018-08-07 23:19:16|       Weekday|
+------------------------+--------

In [50]:
# A new column Freight category value, low - med or high
full_orders_df = full_orders_df.withColumn(
'freight_category',
when(col('freight_value') < 50, 'Low')
.when((col('freight_value') >= 50) & (col('freight_value') < 150), 'Medium')
.otherwise('High')
)

In [51]:
full_orders_df.select('freight_category').show()

+----------------+
|freight_category|
+----------------+
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
|             Low|
+----------------+
only showing top 20 rows



In [52]:
# Order Volume by Customer State

order_volume_state = full_orders_df.groupBy('customer_state').count()
order_volume_state.show()



+--------------+-------+
|customer_state|  count|
+--------------+-------+
|            CE|  74749|
|            DF| 109466|
|            SP|6742255|
|            RJ|3626875|
|            PR| 746540|
|            SC| 644930|
|            RS| 971696|
|            MG|3433239|
|            MT| 155233|
|            BA| 443992|
|            AL|  37742|
|            PE| 132005|
|            ES| 367217|
|            GO| 162430|
|            RN|  24820|
|            AM|   6488|
|            TO|  22361|
|            PI|  27696|
|            PB|  33381|
|            PA|  96279|
+--------------+-------+
only showing top 20 rows



                                                                                

In [53]:
!hadoop fs -mkdir /user/olist/processed/

mkdir: `/user/olist/processed': File exists


In [54]:
full_orders_df.write.mode('overwrite').parquet('/user/olist/processed')

                                                                                

In [55]:
spark.stop()