In [1]:
#
# Name:         nb-spark-lake-database-code
#

#
# Design Phase:
#     Author:   John Miner
#     Date:     09-15-2022
#     Purpose:  Data can either be processed into final files or exposed as views.
#


In [None]:
%%sql

-- ---------------------------------------------------------------
-- Lake database: saleslt
--
--   1. start from the default database
--   2. create saleslt when it is not there yet
--   3. make saleslt the current database for the cells that follow
-- ---------------------------------------------------------------

USE default;

CREATE DATABASE IF NOT EXISTS saleslt;

USE saleslt;

In [None]:
%%sql

--
--  Create Table - dim.currency
--
--  Defines dim_currency over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_currency
(
    CurrencyKey INT,
    CurrencyAlternateKey STRING,
    CurrencyName STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimCurrency.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)


In [None]:
%%sql

--
--  Create Table - dim.customer
--
--  Defines dim_customer over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--
--  Notes:
--    * IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--      instead of failing because the table already exists.
--    * YearlyIncome is declared DECIMAL(19,4). A bare DECIMAL is
--      DECIMAL(10,0) in Spark SQL and would round the value to a whole
--      number. If the table was already created with the old type, drop
--      it first so the new definition takes effect.
--

CREATE TABLE IF NOT EXISTS dim_customer
(
    CustomerKey INT, 
    GeographyKey INT, 
    CustomerAlternateKey STRING, 
    Title STRING, 
    FirstName STRING, 
    MiddleName STRING, 
    LastName STRING, 
    NameStyle INT, 
    BirthDate DATE, 
    MaritalStatus STRING, 
    Suffix STRING, 
    Gender STRING, 
    EmailAddress STRING, 
    YearlyIncome DECIMAL(19,4), 
    TotalChildren INT, 
    NumberChildrenAtHome INT, 
    EnglishEducation STRING, 
    SpanishEducation STRING, 
    FrenchEducation STRING, 
    EnglishOccupation STRING, 
    SpanishOccupation STRING, 
    FrenchOccupation STRING, 
    HouseOwnerFlag STRING, 
    NumberCarsOwned INT, 
    AddressLine1 STRING, 
    AddressLine2 STRING, 
    Phone STRING, 
    DateFirstPurchase DATE, 
    CommuteDistance STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimCustomer.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.date
--
--  Defines dim_date over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_date
(
    DateKey INT, 
    FullDateAlternateKey TIMESTAMP,
    DayNumberOfWeek SHORT,
    EnglishDayNameOfWeek STRING,
    SpanishDayNameOfWeek STRING,
    FrenchDayNameOfWeek STRING,
    DayNumberOfMonth SHORT,
    DayNumberOfYear SHORT,
    WeekNumberOfYear SHORT,
    EnglishMonthName STRING,
    SpanishMonthName STRING,
    FrenchMonthName STRING,
    MonthNumberOfYear SHORT,
    CalendarQuarter SHORT,
    CalendarYear SHORT,
    CalendarSemester SHORT,
    FiscalQuarter SHORT,
    FiscalYear SHORT,
    FiscalSemester SHORT
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimDate.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.geography
--
--  Defines dim_geography over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_geography
(
    GeographyKey INT,
    City STRING,
    StateProvinceCode STRING,
    StateProvinceName STRING,
    CountryRegionCode STRING,
    EnglishCountryRegionName STRING,
    SpanishCountryRegionName STRING,
    FrenchCountryRegionName STRING,
    PostalCode STRING,
    SalesTerritoryKey INT,
    IpAddressLocator STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimGeography.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.product
--
--  Defines dim_product over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_product
(
    ProductKey INTEGER,
    ProductAlternateKey STRING,
    ProductSubcategoryKey INTEGER,
    WeightUnitMeasureCode STRING,
    SizeUnitMeasureCode STRING,
    EnglishProductName STRING,
    SpanishProductName STRING,
    FrenchProductName STRING,
    StandardCost decimal(19,4),
    FinishedGoodsFlag BOOLEAN,
    Color STRING,
    SafetyStockLevel SHORT,
    ReorderPoint SHORT,
    ListPrice decimal(19,4),
    Size STRING,
    SizeRange STRING,
    Weight decimal(19,4),
    DaysToManufacture INTEGER,
    ProductLine STRING,
    DealerPrice decimal(19,4),
    Class STRING,
    Style STRING,
    ModelName STRING,
    StartDate TIMESTAMP,
    EndDate TIMESTAMP,
    Status STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimProduct.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.product_category
--
--  Defines dim_product_category over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_product_category
(
    ProductCategoryKey INT,
    ProductCategoryAlternateKey INT,
    EnglishProductCategoryName STRING,
    SpanishProductCategoryName STRING,
    FrenchProductCategoryName STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimProductCategory.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.product_subcategory
--
--  Defines dim_product_subcategory over the pipe-delimited CSV file at
--  LOCATION (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_product_subcategory
(
    ProductSubcategoryKey INT,
    ProductSubcategoryAlternateKey INT,
    EnglishProductSubcategoryName STRING,
    SpanishProductSubcategoryName STRING,
    FrenchProductSubcategoryName STRING,
    ProductCategoryKey INT
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimProductSubcategory.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - dim.sales_reason
--
--  Defines dim_sales_reason over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_sales_reason
(
    SalesReasonKey INT,
    SalesReasonAlternateKey INT,
    SalesReasonName STRING,
    SalesReasonReasonType STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimSalesReason.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [6]:
%%sql

--
--  Create Table - dim.sales_territory
--
--  Defines dim_sales_territory over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS dim_sales_territory
(
    SalesTerritoryKey INT,
    SalesTerritoryAlternateKey INT,
    SalesTerritoryRegion STRING,
    SalesTerritoryCountry STRING,
    SalesTerritoryGroup STRING
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/DimSalesTerritory.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - fact.internet_sales
--
--  Defines fact_internet_sales over the pipe-delimited CSV file at LOCATION
--  (the file has no header row, so column names come from this DDL).
--
--  Notes:
--    * All amount columns are declared DECIMAL(19,4), the same precision
--      dim_product uses for its price columns. A bare DECIMAL is
--      DECIMAL(10,0) in Spark SQL, which rounds every amount to a whole
--      number and turns UnitPriceDiscountPct into 0 or 1.
--    * Because of IF NOT EXISTS, a table that was already created with the
--      old column types must be dropped before re-running this cell.
--

CREATE TABLE IF NOT EXISTS fact_internet_sales
(
    ProductKey int,
    OrderDateKey int,
    DueDateKey int,
    ShipDateKey int,
    CustomerKey int,
    PromotionKey int,
    CurrencyKey int,
    SalesTerritoryKey int,
    SalesOrderNumber string,
    SalesOrderLineNumber short,
    RevisionNumber short,
    OrderQuantity short,
    UnitPrice decimal(19,4),
    ExtendedAmount decimal(19,4),
    UnitPriceDiscountPct decimal(19,4),
    DiscountAmount decimal(19,4),
    ProductStandardCost decimal(19,4),
    TotalProductCost decimal(19,4),
    SalesAmount decimal(19,4),
    TaxAmt decimal(19,4),
    Freight decimal(19,4),
    CarrierTrackingNumber string,
    CustomerPONumber string,
    OrderDate timestamp,
    DueDate timestamp,
    ShipDate timestamp
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/FactInternetSales.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [None]:
%%sql

--
--  Create Table - fact.internet_sales_reason
--
--  Defines fact_internet_sales_reason over the pipe-delimited CSV file at
--  LOCATION (the file has no header row, so column names come from this DDL).
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

CREATE TABLE IF NOT EXISTS fact_internet_sales_reason
(
    SalesOrderNumber STRING,
    SalesOrderLineNumber SHORT,
    SalesReasonKey INT
)
USING CSV
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/csv-files/FactInternetSalesReason.csv'
OPTIONS 
(
    header = "false", 
    delimiter = "|"
)

In [3]:
%%sql

--
-- !~ peek at data ~!
-- 

-- select * from dim_currency limit 5
-- select * from dim_customer limit 5
-- select * from dim_date limit 5
-- select * from dim_geography limit 5
-- select * from dim_product limit 5
-- select * from dim_product_category limit 5
-- select * from dim_product_subcategory limit 5
-- select * from dim_sales_reason limit 5
-- select * from dim_sales_territory limit 5
-- select * from fact_internet_sales limit 5
-- select * from fact_internet_sales_reason limit 5

-- select 'finished' as msg

--
-- !~ drop table if needed ~!
--

-- drop table dim_product
-- drop view rpt_prepared_data

-- use saleslt;
-- drop table dim_sales_territory;

In [2]:
%%sql

--
--  Check table counts
--
--  Returns one row per table (Label, Total), sorted by label, so the row
--  counts can be compared against the reference model linked below.
--
--  UNION ALL is used instead of UNION: every branch yields exactly one row
--  with a distinct label, so there is nothing to de-duplicate and the extra
--  distinct step UNION implies is unnecessary.
--

-- https://dataedo.com/samples/html/Data_warehouse/doc/AdventureWorksDW_4/modules/Internet_Sales_101/module.html

USE saleslt;

SELECT 'dim.currency' as Label, COUNT(*) as Total FROM dim_currency
UNION ALL
SELECT 'dim.customer' as Label, COUNT(*) as Total FROM dim_customer
UNION ALL
SELECT 'dim.date' as Label, COUNT(*) as Total FROM dim_date
UNION ALL
SELECT 'dim.geography' as Label, COUNT(*) as Total FROM dim_geography
UNION ALL
SELECT 'dim.product' as Label, COUNT(*) as Total FROM dim_product
UNION ALL
SELECT 'dim.product_category' as Label, COUNT(*) as Total FROM dim_product_category
UNION ALL
SELECT 'dim.product_subcategory' as Label, COUNT(*) as Total FROM dim_product_subcategory
UNION ALL
SELECT 'dim.sales_reason' as Label, COUNT(*) as Total FROM dim_sales_reason
UNION ALL
SELECT 'dim.sales_territory' as Label, COUNT(*) as Total FROM dim_sales_territory
UNION ALL
SELECT 'fact.internet_sales' as Label, COUNT(*) as Total FROM fact_internet_sales
UNION ALL
SELECT 'fact.internet_sales_reason' as Label, COUNT(*) as Total FROM fact_internet_sales_reason

ORDER BY Label


In [None]:
%%sql

--
-- Drop database - saleslt
--
-- The only statement that runs here is USE default; presumably it is there
-- so that saleslt is not the current database when it gets dropped.
-- The DROP itself is left commented out on purpose: uncomment it only when
-- the whole database should be removed (CASCADE also drops the objects
-- inside it).
--
-- NOTE(review): after this cell the current database is `default`, so any
-- later cell that uses unqualified table or view names needs its own
-- USE saleslt (or qualified names) - confirm when running top to bottom.
--

USE default;
-- DROP DATABASE saleslt CASCADE;


In [None]:
%%sql

--
-- Create a view 
-- 
-- rpt_prepared_data joins each internet sales line to its order date,
-- product, product subcategory/category, customer, geography and sales
-- territory, and exposes a flat, report-friendly column set.
--
-- Notes:
--   * USE saleslt is issued first: the preceding cell switches the session
--     to `default`, where none of the tables referenced below exist.
--   * CREATE OR REPLACE keeps the cell re-runnable and makes sure an older
--     definition of the view is overwritten.
--   * Age is the number of full years between the birth date and today:
--     current year minus birth year, less one when this year's birthday
--     has not been reached yet. (The operands were previously swapped,
--     which produced negative ages.)
--   * IncomeGroup: below 40000 is 'Low', above 60000 is 'High', anything
--     else (including a NULL income) is 'Moderate'.
--

USE saleslt;

CREATE OR REPLACE VIEW rpt_prepared_data
AS
SELECT
   pc.EnglishProductCategoryName
  ,Coalesce(p.ModelName, p.EnglishProductName) AS Model
  ,c.CustomerKey
  ,s.SalesTerritoryGroup AS Region
  ,CASE
    -- birthday month is still ahead of us this year
    WHEN month(current_timestamp) < month(c.BirthDate) THEN 
      year(current_timestamp) - year(c.BirthDate) - 1
    -- birthday month is now, but the day has not come yet
    WHEN month(current_timestamp) = month(c.BirthDate) AND day(current_timestamp) < day(c.BirthDate) THEN 
      year(current_timestamp) - year(c.BirthDate) - 1
    ELSE 
      year(current_timestamp) - year(c.BirthDate)
  END AS Age
  ,CASE
      WHEN c.YearlyIncome < 40000 THEN 'Low'
      WHEN c.YearlyIncome > 60000 THEN 'High'
      ELSE 'Moderate'
  END AS IncomeGroup
  ,d.CalendarYear
  ,d.FiscalYear
  ,d.MonthNumberOfYear AS Month
  ,f.SalesOrderNumber AS OrderNumber
  ,f.SalesOrderLineNumber AS LineNumber
  ,f.OrderQuantity AS Quantity
  ,f.ExtendedAmount AS Amount   
FROM
  fact_internet_sales as f
INNER JOIN 
  dim_date as d
ON 
  f.OrderDateKey = d.DateKey

INNER JOIN 
  dim_product as p
ON 
  f.ProductKey = p.ProductKey
  
INNER JOIN 
  dim_product_subcategory as psc
ON 
  p.ProductSubcategoryKey = psc.ProductSubcategoryKey

INNER JOIN 
  dim_product_category as pc
ON 
  psc.ProductCategoryKey = pc.ProductCategoryKey
  
INNER JOIN 
  dim_customer as c
ON 
  f.CustomerKey = c.CustomerKey

INNER JOIN 
  dim_geography as g
ON 
  c.GeographyKey = g.GeographyKey

INNER JOIN 
  dim_sales_territory as s
ON 
  g.SalesTerritoryKey = s.SalesTerritoryKey 

In [None]:
%%sql

--
-- Use view in aggregation
-- 
-- Total quantity and amount per calendar year, month, region and model.
--
-- Notes:
--   * The view is referenced as saleslt.rpt_prepared_data so the query does
--     not depend on which database happens to be current.
--   * Model is part of ORDER BY so that rows within the same year, month
--     and region always come back in the same order.
--

SELECT 
  CalendarYear as RptYear,
  Month as RptMonth,
  Region as RptRegion,
  Model as ModelNo,
  SUM(Quantity) as TotalQty,
  SUM(Amount) as TotalAmt
FROM 
  saleslt.rpt_prepared_data 
GROUP BY
  CalendarYear,
  Month,
  Region,
  Model
ORDER BY
  CalendarYear,
  Month,
  Region,
  Model

In [None]:
%%sql

--
--  Create Table - dim.date2 (parquet)
--
--  Defines dim_date2 over the parquet data at LOCATION. No column list is
--  given, so the table schema is taken from the parquet files themselves.
--  IF NOT EXISTS keeps the cell re-runnable: a second run is a no-op
--  instead of failing because the table already exists.
--

USE saleslt;

CREATE TABLE IF NOT EXISTS dim_date2
USING PARQUET
LOCATION 'abfss://sc4adls2030@sa4adls2030.dfs.core.windows.net/synapse/parquet-files/DimDate';

In [None]:
%%sql

--
--  Show the contents of the parquet-backed date table
--

SELECT
  *
FROM
  dim_date2