# Importing PySpark and opening files

In [2]:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.getOrCreate()

# Folder that holds the CSV files read below.
DATASET_DIR = './dataset'


def read_olist_csv(file_name):
    """Read one CSV file from DATASET_DIR into a Spark DataFrame.

    The double quote is used as both the quote and the escape character, the
    first row is treated as the header, records may span several physical
    lines (multiLine), and column types are inferred from the data.

    Args:
        file_name: Name of the CSV file inside DATASET_DIR.

    Returns:
        The DataFrame returned by ``spark.read.csv`` for that file.
    """
    return (
        spark.read
        .option('escape', '"')
        .option('quote', '"')
        .csv(DATASET_DIR + '/' + file_name, header=True, multiLine=True, inferSchema=True)
    )


# One DataFrame per source file; every file is read with identical options.
orders_items_df = read_olist_csv('olist_order_items_dataset.csv')
orders_df = read_olist_csv('olist_orders_dataset.csv')
reviews_df = read_olist_csv('olist_order_reviews_dataset.csv')
customers_df = read_olist_csv('olist_customers_dataset.csv')
sellers_df = read_olist_csv('olist_sellers_dataset.csv')

# Enrich every order with its customer columns and its review columns
# (default inner joins, first on customer_id and then on order_id).
orders_df = (
    orders_df
    .join(customers_df, on='customer_id')
    .join(reviews_df, on='order_id')
)

# Start from the order items, attach the enriched orders and the sellers,
# and keep only the three columns the comparison cells use.
# NOTE(review): the left side is the items table, so if an order has several
# item rows its review_score appears once per item in data_df — confirm that
# this per-item weighting is intended.
data_df = (
    orders_items_df
    .join(orders_df, on='order_id')
    .join(sellers_df, on='seller_id')
    .select('customer_state', 'seller_state', 'review_score')
)

## Same state

In [3]:
# Rows whose customer and seller are in the same state: mean and sample
# standard deviation of review_score over the whole filtered frame.
(
    data_df
    .where(F.col('customer_state') == F.col('seller_state'))
    .agg(F.mean('review_score'), F.stddev('review_score'))
    .show()
)

+-----------------+-------------------------+
|avg(review_score)|stddev_samp(review_score)|
+-----------------+-------------------------+
| 4.13092594851775|       1.3257080150521428|
+-----------------+-------------------------+



## Different state

In [4]:
# Rows whose customer and seller are in different states: mean and sample
# standard deviation of review_score over the whole filtered frame.
(
    data_df
    .where(F.col('customer_state') != F.col('seller_state'))
    .agg(F.mean('review_score'), F.stddev('review_score'))
    .show()
)

+-----------------+-------------------------+
|avg(review_score)|stddev_samp(review_score)|
+-----------------+-------------------------+
|3.950053223108506|        1.436542696903658|
+-----------------+-------------------------+



# Conclusion

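### It is not possible to conclude that the review score changes depending on whether the customer state and the seller state are the same or different: the mean scores (about 4.13 for the same state versus about 3.95 for different states) differ by only about 0.18, which is small compared with the standard deviations (about 1.33 and 1.44). Note that this is an informal comparison; no significance test was performed.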