In [1]:
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
import config

In [2]:
# Project root path exposed by the local `config` module; the raw and processed
# storage paths used below are built relative to it.
ROOT_DIR = config.ROOT_DIR

#### Transform AdventureWorks_Sales

In [3]:
# Read the three yearly raw sales extracts (2015, 2016, 2017) in order and bind
# each one to its own per-year frame.
aw_sales_2015, aw_sales_2016, aw_sales_2017 = [
    pd.read_csv(rf"{ROOT_DIR}/../storage/raw/AdventureWorks_Sales_{sales_year}.csv")
    for sales_year in (2015, 2016, 2017)
]

In [4]:
# Stack the 2015, 2016 and 2017 extracts row-wise into one fact table (the pandas
# counterpart of a SQL UNION ALL) and renumber the index from 0.
aw_sales_fact = pd.concat(
    objs=[aw_sales_2015, aw_sales_2016, aw_sales_2017],
    axis=0,
    ignore_index=True,
)

In [5]:
# Preview the first rows of the combined sales frame.
aw_sales_fact.head()

Unnamed: 0,OrderDate,StockDate,OrderNumber,ProductKey,CustomerKey,TerritoryKey,OrderLineItem,OrderQuantity
0,1/1/2015,9/21/2001,SO45080,332,14657,1,1,1
1,1/1/2015,12/5/2001,SO45079,312,29255,4,1,1
2,1/1/2015,10/29/2001,SO45082,350,11455,9,1,1
3,1/1/2015,11/16/2001,SO45081,338,26782,6,1,1
4,1/2/2015,12/15/2001,SO45083,312,14947,10,1,1


In [6]:
# Show column dtypes and non-null counts of the combined frame before any
# transformation is applied.
aw_sales_fact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56046 entries, 0 to 56045
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OrderDate      56046 non-null  object
 1   StockDate      56046 non-null  object
 2   OrderNumber    56046 non-null  object
 3   ProductKey     56046 non-null  int64 
 4   CustomerKey    56046 non-null  int64 
 5   TerritoryKey   56046 non-null  int64 
 6   OrderLineItem  56046 non-null  int64 
 7   OrderQuantity  56046 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 3.4+ MB


In [7]:
# Label every order line by whether more than one unit was ordered.
aw_sales_fact["QuantityType"] = np.where(
    aw_sales_fact["OrderQuantity"].gt(1),
    "Multiple Items",
    "Single Item",
)

In [8]:
# Parse the month/day/year text in OrderDate and StockDate into datetime64[ns];
# assign() overwrites both columns in their existing positions.
aw_sales_fact = aw_sales_fact.assign(
    **{
        date_column: pd.to_datetime(aw_sales_fact[date_column], format="%m/%d/%Y")
        for date_column in ("OrderDate", "StockDate")
    }
)

In [9]:
# Preview the frame again to check the parsed dates and the new QuantityType column.
aw_sales_fact.head()

Unnamed: 0,OrderDate,StockDate,OrderNumber,ProductKey,CustomerKey,TerritoryKey,OrderLineItem,OrderQuantity,QuantityType
0,2015-01-01,2001-09-21,SO45080,332,14657,1,1,1,Single Item
1,2015-01-01,2001-12-05,SO45079,312,29255,4,1,1,Single Item
2,2015-01-01,2001-10-29,SO45082,350,11455,9,1,1,Single Item
3,2015-01-01,2001-11-16,SO45081,338,26782,6,1,1,Single Item
4,2015-01-02,2001-12-15,SO45083,312,14947,10,1,1,Single Item


In [10]:
# Confirm the date columns are now datetime64[ns] and that QuantityType was added.
aw_sales_fact.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56046 entries, 0 to 56045
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   OrderDate      56046 non-null  datetime64[ns]
 1   StockDate      56046 non-null  datetime64[ns]
 2   OrderNumber    56046 non-null  object        
 3   ProductKey     56046 non-null  int64         
 4   CustomerKey    56046 non-null  int64         
 5   TerritoryKey   56046 non-null  int64         
 6   OrderLineItem  56046 non-null  int64         
 7   OrderQuantity  56046 non-null  int64         
 8   QuantityType   56046 non-null  object        
dtypes: datetime64[ns](2), int64(5), object(2)
memory usage: 3.8+ MB


In [11]:
# Write the transformed aw_sales_fact from the raw to the processed layer for
# Power BI consumption.
# to_csv raises OSError when the target directory is missing, so create the
# processed directory first (a no-op when it already exists); this keeps a
# fresh checkout runnable end to end.
os.makedirs(rf"{ROOT_DIR}/../storage/processed", exist_ok=True)
# index=False keeps the RangeIndex out of the exported file.
aw_sales_fact.to_csv(rf"{ROOT_DIR}/../storage/processed/aw_sales_fact.csv", index=False)