# Hypotesis 23 & 24

### H23: The freight value is directly proportional to the order's time of delivery
### H24: The freight value is directly proportional to the order's items quantity

In [1]:
from pyspark.sql import SparkSession, functions as F, types as T
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
orders_df = spark.read \
                      .option('quote', '\"') \
                      .option('escape', '\"') \
                      .csv('./dataset/olist_orders_dataset.csv', header=True, multiLine=True, inferSchema=True)

order_items_df = spark.read \
                      .option('quote', '\"') \
                      .option('escape', '\"') \
                      .csv('./dataset/olist_order_items_dataset.csv', header=True, multiLine=True, inferSchema=True)

orders_df.printSchema()

order_items_df.printSchema()

df = orders_df.join(order_items_df, 'order_id').filter(F.col('order_delivered_customer_date').isNotNull())
df = df.join(df.groupBy('order_id')
                 .agg(F.sum('freight_value').alias('total_freight')),
                                             how='left', on='order_id')

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [4]:
df.limit(5).toPandas()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,total_freight
0,014405982914c2cde2796ddcf0b8703d,2de342d6e5905a5a8bb3a991c855f3e2,delivered,2017-07-26 17:38:47,2017-07-26 17:50:17,2017-07-27 19:39:52,2017-07-31 15:53:33,2017-08-17,1,6782d593f63105318f46bbf7633279bf,325f3178fb58e2a9778334621eecdbf9,2017-08-01 17:50:17,27.9,3.81,29.2
1,014405982914c2cde2796ddcf0b8703d,2de342d6e5905a5a8bb3a991c855f3e2,delivered,2017-07-26 17:38:47,2017-07-26 17:50:17,2017-07-27 19:39:52,2017-07-31 15:53:33,2017-08-17,2,e95ee6822b66ac6058e2e4aff656071a,a17f621c590ea0fab3d5d883e1630ec6,2017-08-01 17:50:17,21.33,25.39,29.2
2,019886de8f385a39b75bedbb726fd4ef,8cf88d7ba142365ef2ca619ef06f9a0f,delivered,2018-02-10 12:52:51,2018-02-10 13:08:12,2018-02-14 15:28:51,2018-02-23 02:03:03,2018-03-14,1,e9a69340883a438c3f91739d14d3a56d,1b4c3a6f53068f0b6944d2d005c9fc89,2018-02-15 13:08:12,159.9,28.5,28.5
3,01a6ad782455876aa89081449d49c452,71accffbcbdf8e02f67a469f65cdbf73,delivered,2018-01-18 10:07:52,2018-01-18 10:17:29,2018-01-22 22:37:04,2018-02-01 21:02:22,2018-02-20,1,036734b5a58d5d4f46b0616ddc047ced,ea8482cd71df3c1969d7b9473ff13abc,2018-01-24 10:17:29,34.99,15.1,15.1
4,01d907b3e209269e120a365fc2b97524,d02cc92f5e33eb58d9ff4d5cce6ae901,delivered,2017-08-09 16:21:06,2017-08-10 10:25:08,2017-08-11 19:05:53,2017-08-16 22:34:11,2017-08-29,1,b1434a8f79cb3528540d9b21e686e823,d1c281d3ae149232351cd8c8cc885f0d,2017-08-16 10:25:08,151.99,17.77,17.77


## H23

In [5]:
aux_df = df.select(F.col('total_freight'),
                   F.col('order_purchase_timestamp').alias('purchase'),
                   F.col('order_delivered_customer_date').alias('deliver'))
aux_df = aux_df.withColumn('order_duration', F.datediff(F.col('deliver'), F.col('purchase')))
aux_df = aux_df.drop('purchase', 'deliver')
aux_df.show()

+-------------+--------------+
|total_freight|order_duration|
+-------------+--------------+
|         29.2|             5|
|         29.2|             5|
|         28.5|            13|
|         15.1|            14|
|        17.77|             7|
|        11.73|             2|
|        14.14|             9|
|         8.88|             4|
|        14.73|             7|
|        45.34|            27|
|        45.34|            27|
|        24.52|            27|
|        11.37|             2|
|        57.33|            27|
|        15.15|             9|
|        37.38|             3|
|        37.38|             3|
|        33.87|             8|
|        21.51|             6|
|         28.0|            10|
+-------------+--------------+
only showing top 20 rows



In [6]:
aux_df.stat.corr('total_freight', 'order_duration')

0.12590689743280645

# Conclusion H23

### The hypotesis 23 is not valid, as there is nearly no correlation between the freight value and the time of delivery

## H24

In [None]:
aux_df = order_items_df.groupBy('order_id').agg(F.max('order_item_id').alias('total_items'))