In [46]:
from pyspark.sql import SparkSession


# Initialize Spark Session
spark = SparkSession.builder.appName("cycles").getOrCreate()

In [47]:
# Base path to csv files
base_path = "../raw-files/Person Data/"

# only keep the columns we need for the person data
required_cols = [
    "PersonID",
    "Title",
    "FirstName",
    "MiddleName",
    "LastName",
    "NameStyle",
    "Demographics",
    "Suffix",
    "EmailAddress",
    "AddressLine1",
    "AddressLine2",
    "PhoneNumber",
]

# Read required files
person_df = spark.read.csv(base_path + "Person Person.csv", header=True)
emailAddress_df = spark.read.csv(base_path + "Person EmailAddress.csv", header=True)
businessEntity_df = spark.read.csv(base_path + "Person BusinessEntity.csv", header=True)
businessEntityAddress_df = spark.read.csv(base_path + "Person BusinessEntityAddress.csv", header=True)
address_df = spark.read.csv(base_path + "Person Address.csv", header=True)
personPhone_df = spark.read.csv(base_path + "Person PersonPhone.csv", header=True)

person_details_df = (
    person_df.join(emailAddress_df, "BusinessEntityID", "left")
    .join(businessEntity_df, "BusinessEntityID", "left")
    .join(businessEntityAddress_df, "BusinessEntityID", "left")
    .join(address_df, "AddressID", "left")
    .join(personPhone_df, "BusinessEntityID", "left")
)

person_details_df = person_details_df.withColumnRenamed("BusinessEntityID", "PersonID")

person_details_df = person_details_df.select(required_cols)

In [48]:
# Base path to csv files
base_path = "../raw-files/Sales Data/"

# only keep the columns we need for the sales data
required_cols = [
    "CustomerKey",
    "SalesPersonID",
    "PersonID",
    "Region",
    "SalesOrderNumber",
    "ProductID",
    "OrderQty",
    "OrderDate",
    "ShipDate",
    "UnitPrice",
    "UnitPriceDiscount",
]

# Read required files
customer_df = spark.read.csv(base_path + "Sales Customer.csv", header=True)
salesOrderHeader_df = spark.read.csv(base_path + "Sales SalesOrderHeader.csv", header=True)
salesOrderDetail_df = spark.read.csv(base_path + "Sales SalesOrderDetail.csv", header=True)
salesTerritory_df = spark.read.csv(base_path + "Sales SalesTerritory.csv", header=True)

sales_details_df = (
    customer_df.join(salesOrderHeader_df, "CustomerID", "left")
    .join(salesOrderDetail_df, "SalesOrderID", "left")
    .join(salesTerritory_df, "TerritoryID", "left")
)

# rename columns : Group -> Region CustomerID -> CustomerKey
sales_details_df = (sales_details_df
                    .withColumnRenamed("Group", "Region")
                    .withColumnRenamed("CustomerID", "CustomerKey")
)

sales_details_df = sales_details_df.select(required_cols)

In [49]:
from pyspark.sql.functions import coalesce

# Base path to csv files
base_path = "../raw-files/Production Data/"

# only keep the columns we need for the production data
required_cols = ["ProductID", "ProductCategoryName", "Model"]

# Read required files
product_df = spark.read.csv(base_path + "Production Product.csv", header=True)
productSubcategory_df = spark.read.csv(base_path + "Production ProductSubcategory.csv", header=True)
productCategory_df = spark.read.csv(base_path + "Production ProductCategory.csv", header=True)
productModel_df = spark.read.csv(base_path + "Production ProductModel.csv", header=True)


product_details_df = product_df.join(productSubcategory_df, "ProductSubCategoryID", "left")

# rename column to SubCategoryName
product_details_df = product_details_df.withColumnRenamed("Name", "SubCategoryName")

product_details_df = product_details_df.join(productCategory_df, "ProductCategoryID", "left")

# rename name column to ProductCategoryName
product_details_df = product_details_df.withColumnRenamed("Name", "ProductCategoryName")

product_details_df = product_details_df.join(productModel_df, "ProductModelID", "left")

# Add column Model which is a coalesce of Name and ProductCategoryName
product_details_df = product_details_df.withColumn(
    "Model",
    coalesce(product_details_df["Name"], product_details_df["ProductCategoryName"]),
)

product_details_df = product_details_df.select(required_cols)

In [50]:
# save files to csv
person_output_path = "../workshop-files/person.csv"
sales_output_path = "../workshop-files/sales.csv"
products_output_path = "../workshop-files/products.csv"

# Convert Spark DataFrame to pandas DataFrame and save
person_details_df.toPandas().to_csv(person_output_path, index=False, header=True)
sales_details_df.toPandas().to_csv(sales_output_path, index=False, header=True)
product_details_df.toPandas().to_csv(products_output_path, index=False, header=True)

In [51]:
# join sales and person data
customer_details = sales_details_df.join(person_details_df, "PersonID", "inner")

cust_prod_details = customer_details.join(product_details_df, "ProductID", "inner")

# filter where model or product category name is bikes
cust_prod_details = cust_prod_details.filter(
    (cust_prod_details["Model"] == "Bikes") | (cust_prod_details["ProductCategoryName"] == "Bikes")
).select("PersonID", "Region", "ProductCategoryName", "Model", "Demographics")

In [52]:
cust_prod_details.show()

                                                                                

+--------+-------------+-------------------+--------------+--------------------+
|PersonID|       Region|ProductCategoryName|         Model|        Demographics|
+--------+-------------+-------------------+--------------+--------------------+
|    4937|North America|              Bikes|  Mountain-200|"<IndividualSurve...|
|    4937|North America|              Bikes|  Mountain-200|"<IndividualSurve...|
|    6731|       Europe|              Bikes|    Road-550-W|"<IndividualSurve...|
|    7273|      Pacific|              Bikes|  Mountain-200|"<IndividualSurve...|
|    7273|      Pacific|              Bikes|  Touring-1000|"<IndividualSurve...|
|    7273|      Pacific|              Bikes|  Mountain-100|"<IndividualSurve...|
|   11722|North America|              Bikes|      Road-650|"<IndividualSurve...|
|   11722|North America|              Bikes|  Mountain-200|"<IndividualSurve...|
|   12394|       Europe|              Bikes|  Mountain-200|"<IndividualSurve...|
|   12394|       Europe|    