In [1]:
import findspark
import pyspark.pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import sum, col
from pyspark import sql
import json



In [2]:
# NOTE(review): pyspark is already imported in the first cell, before this call runs.
# If findspark is what makes pyspark importable in this environment, this call
# has to come before those imports — confirm and reorder if so.
findspark.init()

In [3]:
# Obtain the notebook-wide SparkSession; no builder options are set here,
# so the session uses whatever defaults/configuration the environment provides.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Load the notebook configuration from config.json (resolved against the
# current working directory). The encoding is given explicitly so decoding the
# JSON does not depend on the platform's locale default.
with open("config.json", "r", encoding="utf-8") as config_file:
    config_data = json.load(config_file)

# Locations of the three input CSV files, all stored under the "csv_paths" key.
path_products = config_data["csv_paths"]["path_products"]
path_sales = config_data["csv_paths"]["path_sales"]
path_sellers = config_data["csv_paths"]["path_sellers"]

1. Read the three datasets and show 5 records with all columns

In [5]:
# Explicit schema for the products CSV so every column is typed on read.
# The fields are declared non-nullable, but printSchema() below still reports
# nullable = true for each column, so do not rely on that flag for validation.
products_schema = (
    StructType()
    .add("product_id", IntegerType(), False)
    .add("product_name", StringType(), False)
    .add("price", IntegerType(), False)
)

# Comma-delimited file whose first line is a header row.
products = (
    spark.read
    .schema(products_schema)
    .options(header=True, delimiter=",")
    .csv(path_products)
)

products.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- price: integer (nullable = true)



In [6]:
# Explicit schema for the sales CSV so every column is typed on read.
# The fields are declared non-nullable, but printSchema() below still reports
# nullable = true for each column, so do not rely on that flag for validation.
sales_schema = (
    StructType()
    .add("order_id", IntegerType(), False)
    .add("product_id", IntegerType(), False)
    .add("seller_id", IntegerType(), False)
    .add("date", DateType(), False)
    .add("num_pieces_sold", IntegerType(), False)
    .add("bill_raw_text", StringType(), False)
    .add("product_id_num", IntegerType(), False)
)

# Comma-delimited file whose first line is a header row.
sales = (
    spark.read
    .schema(sales_schema)
    .options(header=True, delimiter=",")
    .csv(path_sales)
)

sales.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- seller_id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- num_pieces_sold: integer (nullable = true)
 |-- bill_raw_text: string (nullable = true)
 |-- product_id_num: integer (nullable = true)



In [7]:
# Explicit schema for the sellers CSV so every column is typed on read.
# Only seller_id is declared non-nullable, but printSchema() below reports
# nullable = true for all three columns, so the flag is informational here.
sellers_schema = (
    StructType()
    .add("seller_id", IntegerType(), nullable=False)
    .add("seller_name", StringType(), True)
    .add("daily_target", IntegerType(), True)
)

# Comma-delimited file whose first line is a header row.
sellers = (
    spark.read
    .schema(sellers_schema)
    .options(header=True, delimiter=",")
    .csv(path_sellers)
)

sellers.printSchema()

root
 |-- seller_id: integer (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- daily_target: integer (nullable = true)



### Read the three datasets and display 5 records with all columns

In [8]:
# Preview the first five rows of each dataset, in load order.
products.show(n=5)
sales.show(n=5)
sellers.show(n=5)

+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   22|
|         1|   product_1|   30|
|         2|   product_2|   91|
|         3|   product_3|   37|
|         4|   product_4|  145|
+----------+------------+-----+
only showing top 5 rows

+--------+----------+---------+----------+---------------+--------------------+--------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|product_id_num|
+--------+----------+---------+----------+---------------+--------------------+--------------+
| 2998575|     39495|        8|2020-07-09|             69|cbgztjphaqxaolwgd...|         39495|
| 3981313|     35340|        9|2020-07-04|             47|nedrhdkrndwhonulx...|         35340|
| 3992645|     11870|        6|2020-07-08|             66|pjjfnblolzqdzzxbj...|         11870|
| 6475605|      7915|        8|2020-07-03|             12|wmsofzpxwlyhorqwf...|          7915|
|12492936|     85210| 

### How many products, sales and sellers are there in total?

The totals of products and sellers are simple row counts, but a single sale can include more than one unit of a product. To measure the total volume of sales, we sum the number of pieces sold across all sales.

In [9]:
# Total units sold: add up num_pieces_sold over every sales row.
# `sum` is the Spark column aggregate imported from pyspark.sql.functions;
# first() returns the single aggregate row and [0] extracts its only column.
total_sales = (
    sales
    .select(sum(col("num_pieces_sold")))
    .first()[0]
)

In [10]:
# Report the three totals; "Sales" is the summed piece count held in total_sales,
# while products and sellers are plain row counts.
print(f"Products: {products.count()}")
print(f"Sales: {total_sales}")
print(f"Sellers: {sellers.count()}")

Products: 100000
Sales: 71194
Sellers: 10


### How many products have been sold at least once?

In [11]:
# Number of distinct product_id values that appear in the sales data.
(
    sales
    .select(col("product_id"))
    .distinct()
    .count()
)

1376

### Which product has been sold in the most orders?

In [12]:
# Count the sales rows per product, order by that count descending, and read
# the "count" column of the top row: the highest number of sales rows recorded
# for any single product (assumes one sales row per order — TODO confirm).
max_sales = (
    sales
    .groupBy("product_id")
    .count()
    .orderBy(col("count").desc())
    .first()["count"]
)
print(f"The product sold in the most orders has been sold {max_sales} times.")

The product sold in the most orders has been sold 2 times.


Several products can share that maximum, so the following query lists every product that appears in the highest number of orders.

In [13]:
# List every product whose per-product row count equals max_sales,
# i.e. all products tied for the highest count.
(
    sales
    .groupBy("product_id")
    .count()
    .filter(col("count") == max_sales)
    .show()
)

+----------+-----+
|product_id|count|
+----------+-----+
|      2999|    2|
|     20913|    2|
|     72638|    2|
|     30418|    2|
|     91069|    2|
|     62258|    2|
|     42883|    2|
|     22297|    2|
|     71771|    2|
|     81906|    2|
|     75721|    2|
|     96545|    2|
+----------+-----+

