# Review score by delivery in same and different states

In [47]:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()


def _load_csv(path):
    """Read one CSV file of the dataset into a Spark DataFrame.

    The double quote serves as both quote and escape character, the first
    row is used as header, quoted fields may span several lines and the
    column types are inferred by Spark.
    """
    reader = spark.read.option('escape', '\"').option('quote', '\"')
    return reader.csv(path, header=True, multiLine=True, inferSchema=True)


orders_items_df = _load_csv('./dataset/olist_order_items_dataset.csv')
orders_df = _load_csv('./dataset/olist_orders_dataset.csv')
reviews_df = _load_csv('./dataset/olist_order_reviews_dataset.csv')
customers_df = _load_csv('./dataset/olist_customers_dataset.csv')
sellers_df = _load_csv('./dataset/olist_sellers_dataset.csv')

# Orders enriched with their customer and their review (inner joins, so
# orders without a matching customer or review are dropped).
reviewed_orders_df = orders_df.join(customers_df, 'customer_id')
reviewed_orders_df = reviewed_orders_df.join(reviews_df, 'order_id')

# Attach the seller through the order-items table and keep only the columns
# needed for the comparison.
# NOTE(review): the join runs at order-items granularity, so an order
# presumably contributes one row per item (and per review) - confirm this
# weighting is intended.
items_with_reviews_df = orders_items_df.join(reviewed_orders_df, 'order_id')
items_with_sellers_df = items_with_reviews_df.join(sellers_df, 'seller_id')
data_df = items_with_sellers_df.select('customer_state', 'seller_state', 'review_score')

data_df.show()

+--------------+------------+------------+
|customer_state|seller_state|review_score|
+--------------+------------+------------+
|            SP|          SP|           4|
|            BA|          SP|           4|
|            GO|          SP|           5|
|            RN|          MG|           5|
|            SP|          SP|           5|
|            PR|          SP|           4|
|            RS|          SP|           2|
|            RJ|          SP|           5|
|            RS|          SP|           1|
|            SP|          SP|           5|
|            RJ|          ES|           1|
|            RJ|          ES|           1|
|            SP|          RS|           4|
|            MG|          SP|           5|
|            SP|          SP|           5|
|            SP|          SP|           4|
|            GO|          DF|           5|
|            RJ|          SP|           4|
|            SC|          SP|           5|
|            SP|          SP|           5|
+--------------+------------+------------+

## Same state

In [42]:
# Mean and sample standard deviation of the review score over the rows of
# data_df whose customer and seller states are equal.
same_state_reviews = data_df.where(F.col('seller_state') == F.col('customer_state'))
same_state_reviews.agg(
    F.mean('review_score'),
    F.stddev('review_score'),
).show()

+-----------------+-------------------------+
|avg(review_score)|stddev_samp(review_score)|
+-----------------+-------------------------+
| 4.13092594851775|       1.3257080150521428|
+-----------------+-------------------------+



## Different state

In [43]:
# Mean and sample standard deviation of the review score over the rows of
# data_df whose customer and seller states differ.
cross_state_reviews = data_df.where(F.col('seller_state') != F.col('customer_state'))
cross_state_reviews.agg(
    F.mean('review_score'),
    F.stddev('review_score'),
).show()

+-----------------+-------------------------+
|avg(review_score)|stddev_samp(review_score)|
+-----------------+-------------------------+
|3.950053223108506|        1.436542696903658|
+-----------------+-------------------------+



# Calculating shipment delay time and comparing between states

In [44]:
# Build the late-delivery table: one row per joined order item with the delay
# in days plus the customer and seller states.
#
# F.datediff(end, start) returns end - start in days, so
# delivered - estimated is positive exactly when the order arrived after the
# estimated date. Computing it in this direction avoids the negate-afterwards
# step and the need to reference a column through the pre-join DataFrame.
#
# NOTE: this cell rebinds `orders_df` to the reduced three-column table, so
# re-running it requires reloading the orders CSV first.
late_days = F.datediff('order_delivered_customer_date', 'order_estimated_delivery_date')

# Only delivered orders that were actually late (delay > 0) are kept; rows
# with a missing delivery date yield a null delay and are filtered out too.
orders_df = orders_df.filter(F.col('order_status') == 'delivered') \
                     .withColumn('shipment_delay_time', late_days) \
                     .filter(F.col('shipment_delay_time') > 0)

orders_df = orders_df.join(customers_df, 'customer_id')

# Longest delays first.
orders_df = orders_items_df.join(orders_df, 'order_id') \
                           .join(sellers_df, 'seller_id') \
                           .select('shipment_delay_time', 'customer_state', 'seller_state') \
                           .orderBy(F.col('shipment_delay_time').desc())

orders_df.show()

+-------------------+--------------+------------+
|shipment_delay_time|customer_state|seller_state|
+-------------------+--------------+------------+
|                188|            RJ|          SP|
|                181|            ES|          MG|
|                175|            SP|          SP|
|                167|            SP|          RS|
|                166|            SE|          MG|
|                165|            PA|          MG|
|                162|            MG|          SP|
|                161|            SP|          SP|
|                161|            PI|          SP|
|                161|            SP|          SP|
|                159|            SE|          SP|
|                155|            PI|          SP|
|                155|            RS|          SP|
|                155|            RS|          SP|
|                153|            SP|          SP|
|                153|            RS|          SP|
|                152|            GO|          SP|


## Same state

In [45]:
# Mean and sample standard deviation of the shipment delay over the rows of
# orders_df whose customer and seller states are equal.
same_state_delays = orders_df.where(F.col('seller_state') == F.col('customer_state'))
same_state_delays.agg(
    F.mean('shipment_delay_time'),
    F.stddev('shipment_delay_time'),
).show()

+------------------------+--------------------------------+
|avg(shipment_delay_time)|stddev_samp(shipment_delay_time)|
+------------------------+--------------------------------+
|       7.678510998307953|              12.856139921192154|
+------------------------+--------------------------------+



## Different state

In [46]:
# Mean and sample standard deviation of the shipment delay over the rows of
# orders_df whose customer and seller states differ.
cross_state_delays = orders_df.where(F.col('seller_state') != F.col('customer_state'))
cross_state_delays.agg(
    F.mean('shipment_delay_time'),
    F.stddev('shipment_delay_time'),
).show()

+------------------------+--------------------------------+
|avg(shipment_delay_time)|stddev_samp(shipment_delay_time)|
+------------------------+--------------------------------+
|      11.395920597341103|              14.881237613184904|
+------------------------+--------------------------------+



# Conclusion

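### It is not possible to conclude that the review score or the shipment delay time depends on whether the customer and the seller are in the same state: in both cases the difference between the two group means (about 0.18 review points and about 3.7 days of delay) is small compared with the standard deviations (about 1.3–1.4 points and 13–15 days).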