In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import sum, col, avg, first, round
import json

In [2]:
# Let findspark make a local Spark installation importable (per the findspark README).
# NOTE(review): pyspark is already imported in the first cell, before this call
# runs; findspark.init() is normally invoked *before* importing pyspark — confirm
# whether this call is still needed or should move above the pyspark imports.
findspark.init()

In [3]:
# Reuse the active SparkSession if one exists, otherwise create one.
# No app name, master or extra configuration is set on the builder here.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Read the CSV locations from config.json (resolved against the working directory).
# JSON is UTF-8 by specification, so the encoding is set explicitly instead of
# being inherited from the platform locale.
with open("config.json", "r", encoding="utf-8") as config_file:
    config_data = json.load(config_file)

# A missing section or key raises KeyError here, before any Spark job is started.
csv_paths = config_data["csv_paths"]
path_products = csv_paths["path_products"]
path_sales = csv_paths["path_sales"]
path_sellers = csv_paths["path_sellers"]

In [5]:
# Explicit schema for the products CSV (no type inference on read).
# NOTE: the nullable=False flags are not enforced by this read — the
# printSchema() output reports nullable = true for every column.
products_schema = (
    StructType()
    .add("product_id", IntegerType(), False)
    .add("product_name", StringType(), False)
    .add("price", IntegerType(), False)
)
products = spark.read.csv(
    path_products,
    schema=products_schema,
    header=True,
    sep=",",
)

products.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: integer (nullable = true)



In [6]:
# Explicit schema for the sales CSV (no type inference on read).
# NOTE: the nullable=False flags are not enforced by this read — the
# printSchema() output reports nullable = true for every column.
sales_schema = (
    StructType()
    .add("order_id", IntegerType(), False)
    .add("product_id", IntegerType(), False)
    .add("seller_id", IntegerType(), False)
    .add("date", DateType(), False)
    .add("num_pieces_sold", IntegerType(), False)
    .add("bill_raw_text", StringType(), False)
    .add("product_id_num", IntegerType(), False)
)

sales = spark.read.csv(
    path_sales,
    schema=sales_schema,
    header=True,
    sep=",",
)

sales.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- num_pieces_sold: integer (nullable = true)
 |-- bill_raw_text: string (nullable = true)
 |-- product_id_num: integer (nullable = true)



In [7]:
# Explicit schema for the sellers CSV (no type inference on read).
# NOTE: seller_id is declared non-nullable, but the printSchema() output
# reports nullable = true for it as well, so the flag is not enforced here.
sellers_schema = (
    StructType()
    .add("seller_id", IntegerType(), nullable=False)
    .add("seller_name", StringType(), True)
    .add("daily_target", IntegerType(), True)
)
sellers = spark.read.csv(
    path_sellers,
    schema=sellers_schema,
    header=True,
    sep=",",
)

sellers.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- daily_target: integer (nullable = true)



#### With the three datasets loaded, we show the first 5 rows of each one

In [8]:
# Preview the first five rows of each dataset, in the order they were loaded.
for preview_df in (products, sales, sellers):
    preview_df.show(5)

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
+----------+------------+-----+
only showing top 5 rows

+--------+----------+---------+----------+---------------+--------------------+--------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|product_id_num|
+--------+----------+---------+----------+---------------+--------------------+--------------+
| 2998575|     39495|        8|2020-07-09|             69|cbgztjphaqxaolwgd...|         39495|
| 3981313|     35340|        9|2020-07-04|             47|nedrhdkrndwhonulx...|         35340|
| 3992645|     11870|        6|2020-07-08|             66|pjjfnblolzqdzzxbj...|         11870|
| 6475605|      7915|        8|2020-07-03|             12|wmsofzpxwlyhorqwf...|          7915|
|12492936|     85210| 

#### How many products, sales and sellers are there in total?

Counting products and sellers is trivial, but a single sale can contain more than one item. To get the total amount of sales, we sum the number of pieces sold.

In [9]:
# Total number of pieces sold across all sales rows.
# `sum` is the Spark SQL aggregate imported in the first cell, not the Python builtin.
total_sales = sales.agg(sum("num_pieces_sold")).first()[0]

In [10]:
# Print one "<label> <value>" line per dataset; "Sales" is the summed piece
# count held in total_sales, the other two are row counts.
summary_rows = (
    ("Products:", products.count()),
    ("Sales:", total_sales),
    ("Sellers:", sellers.count()),
)
for label, value in summary_rows:
    print(label, value)

Products: 100000
Sales: 71194
Sellers: 10


#### How many products have been sold at least once?

In [11]:
# Number of distinct product ids that appear in at least one sales row.
sales.select("product_id").dropDuplicates().count()

1376

#### Which product has been sold in the most orders?

In [22]:
# Highest number of sales rows recorded for any single product.
# A max aggregation replaces the original descending sort + first()[1]: only the
# top count is needed, so sorting every group is unnecessary, and the value is
# read from a single-column row rather than by position in (product_id, count).
# With an empty `sales` frame this yields None instead of raising TypeError.
max_sales = (
    sales.groupBy("product_id")
    .count()
    .agg({"count": "max"})
    .first()[0]
)
print("The product sold in the most orders has been sold", max_sales, "times.")

The product sold in the most orders has been sold 2 times.


In the following query we can see the most frequent products among all orders

In [23]:
# Show every product whose number of sales rows equals max_sales
# (several products can tie for the maximum).
orders_per_product = sales.groupBy("product_id").count()
orders_per_product.filter(col("count") == max_sales).show()

+----------+-----+
|product_id|count|
+----------+-----+
|      2999|    2|
|     20913|    2|
|     72638|    2|
|     30418|    2|
|     91069|    2|
|     62258|    2|
|     42883|    2|
|     22297|    2|
|     71771|    2|
|     81906|    2|
|     75721|    2|
|     96545|    2|
+----------+-----+



#### How many different products have been sold each day?

In [24]:
# Count the distinct products sold on each date, listed in chronological order.
# De-duplicating (date, product_id) pairs first makes the per-date row count a
# distinct-product count.
daily_product_pairs = sales.select("date", "product_id").dropDuplicates()
daily_product_pairs.groupBy("date").count().orderBy("date").show()

+----------+-----+
|      date|count|
+----------+-----+
|2020-07-01|  142|
|2020-07-02|  132|
|2020-07-03|  159|
|2020-07-04|  147|
|2020-07-05|  142|
|2020-07-06|  145|
|2020-07-07|  123|
|2020-07-08|  114|
|2020-07-09|  146|
|2020-07-10|  138|
+----------+-----+



#### How much is the average expense per order?

In [21]:
# Average revenue (pieces sold x unit price) over the sales rows that match a product.
# NOTE(review): this is a per-order average only if every sales row is its own
# order (order_id unique); order_id is selected but not used in the aggregate — confirm.
order_costs = sales.join(
    products, sales.product_id == products.product_id, "inner"
).select("order_id", (col("num_pieces_sold") * col("price")).alias("cost"))
avg_order_cost = order_costs.agg(avg("cost")).first()[0]
print ('Average cost of a order is', avg_order_cost)

Average cost of a order is 3916.516570605187


### We calculate the average percentage that a sales order contributes to the quota of each seller

In [17]:
# For each seller, average the percentage of the daily target covered by a single
# sales row, rounded to 4 decimals. `round` and `avg` are the Spark SQL functions
# imported in the first cell, not the Python builtins.
# The inner join keeps only sales rows with a matching seller, so sellers without
# sales do not appear in the result.
order_quota_pct = col("num_pieces_sold") / col("daily_target") * 100
sales_with_targets = sales.join(
    sellers, sales.seller_id == sellers.seller_id, "inner"
)
(
    sales_with_targets
    .groupBy(sales.seller_id)
    .agg(round(avg(order_quota_pct), 4).alias("quota"))
    .orderBy(sales.seller_id)
    .show()
)

+---------+------+
|seller_id| quota|
+---------+------+
|        1|0.0189|
|        2|0.0069|
|        3|0.0168|
|        4|0.0035|
|        5|0.0041|
|        6|0.0048|
|        7|0.0029|
|        8|0.0094|
|        9|0.0036|
+---------+------+



### We calculate the total sold

In [18]:
# Total revenue: unit price times pieces sold, summed over every sales row that
# matches a product (join() without a `how` argument is an inner join).
order_spend = col("price") * col("num_pieces_sold")
total_spent = (
    sales.join(products, sales.product_id == products.product_id)
    .agg(sum(order_spend))
    .first()[0]
)
print ("The total sold is", total_spent)

The total sold is 5436125


Now we'll check the total sold by seller 

In [19]:
# Revenue per seller (unit price x pieces sold, summed per seller_id), highest first.
# The original also computed first('spend_per_order') and then discarded it with
# a select(); that aggregate was unused and non-deterministic, so only the sum is
# kept. The displayed columns (seller_id, spend_per_seller) are unchanged.
(
    sales.join(products, sales.product_id == products.product_id)
    .withColumn("spend_per_order", col("price") * col("num_pieces_sold"))
    .groupBy("seller_id")
    .agg(sum("spend_per_order").alias("spend_per_seller"))
    .sort("spend_per_seller", ascending=False)
    .show()
)

+---------+----------------+
|seller_id|spend_per_seller|
+---------+----------------+
|        7|          775363|
|        5|          646894|
|        8|          627624|
|        2|          624802|
|        3|          620734|
|        4|          567151|
|        1|          561092|
|        9|          534637|
|        6|          477828|
+---------+----------------+

