In [48]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
import config 

In [49]:
# bind the ROOT_DIR attribute of the project-local config module to a notebook-level name;
# the data paths in the read/write cells below are built relative to it
ROOT_DIR = config.ROOT_DIR

#### Transform AdventureWorks_Products.csv from raw to processed

In [50]:
# load the raw products table from the raw storage layer
aw_products_dim = pd.read_csv(
    filepath_or_buffer=rf"{ROOT_DIR}/../storage/raw/AdventureWorks_Products.csv"
)

In [51]:
# preview the first rows of the raw products table
aw_products_dim.head()

Unnamed: 0,ProductKey,ProductSubcategoryKey,ProductSKU,ProductName,ModelName,ProductDescription,ProductColor,ProductSize,ProductStyle,ProductCost,ProductPrice
0,214,31,HL-U509-R,"Sport-100 Helmet, Red",Sport-100,"Universal fit, well-vented, lightweight , snap...",Red,0,0,13.0863,34.99
1,215,31,HL-U509,"Sport-100 Helmet, Black",Sport-100,"Universal fit, well-vented, lightweight , snap...",Black,0,0,12.0278,33.6442
2,218,23,SO-B909-M,"Mountain Bike Socks, M",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,M,U,3.3963,9.5
3,219,23,SO-B909-L,"Mountain Bike Socks, L",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,L,U,3.3963,9.5
4,220,31,HL-U509-B,"Sport-100 Helmet, Blue",Sport-100,"Universal fit, well-vented, lightweight , snap...",Blue,0,0,12.0278,33.6442


In [52]:
# inspect column dtypes and non-null counts of the raw products table
aw_products_dim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293 entries, 0 to 292
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ProductKey             293 non-null    int64  
 1   ProductSubcategoryKey  293 non-null    int64  
 2   ProductSKU             293 non-null    object 
 3   ProductName            293 non-null    object 
 4   ModelName              293 non-null    object 
 5   ProductDescription     293 non-null    object 
 6   ProductColor           243 non-null    object 
 7   ProductSize            293 non-null    object 
 8   ProductStyle           293 non-null    object 
 9   ProductCost            293 non-null    float64
 10  ProductPrice           293 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 25.3+ KB


In [53]:
# drop the ProductSize column and sort rows by ProductKey (ProductKey itself is kept)
# errors="ignore" plus reassignment keeps this cell safe to re-run: an in-place drop
# raises KeyError on the second run because ProductSize is already gone
aw_products_dim = (
    aw_products_dim
    .drop(columns="ProductSize", errors="ignore")
    .sort_values(by="ProductKey")
)

In [54]:
# add DiscountPrice: the list price reduced by 15% (computed from the unrounded ProductPrice)
aw_products_dim["DiscountPrice"] = 0.85 * aw_products_dim["ProductPrice"]

# round the three monetary columns (ProductCost, ProductPrice, DiscountPrice) to 2 decimal places
money_cols = ["ProductCost", "ProductPrice", "DiscountPrice"]
aw_products_dim.loc[:, money_cols] = aw_products_dim[money_cols].round(decimals=2)

In [55]:
# split every ProductSKU on '-' into one column per part
product_sku_split = aw_products_dim["ProductSKU"].str.split(pat="-", expand=True)
sku_first_part, sku_second_part = product_sku_split[0], product_sku_split[1]

# SKUType: the text before the second '-' (first two parts re-joined with '-')
aw_products_dim["SKUType"] = sku_first_part.str.cat(sku_second_part, sep="-")
# SKUCategory: the text before the first '-'
aw_products_dim["SKUCategory"] = sku_first_part

In [56]:
# replace "0" placeholder strings in ProductStyle column with null values
# assign the result back instead of calling replace(..., inplace=True) on the selected column:
# that chained in-place call warns on pandas 2.2 and, under Copy-on-Write, acts on a
# temporary object so the DataFrame itself is never updated
aw_products_dim["ProductStyle"] = aw_products_dim["ProductStyle"].replace("0", np.nan)

In [57]:
# add new column PricePoint, a price tier derived from ProductPrice:
#   > 500 -> "High", > 100 -> "Mid-Range", everything else -> "Low"
# np.select uses the first condition that matches, so the > 500 check must stay first
product_price = aw_products_dim["ProductPrice"]
cond_list = [product_price > 500, product_price > 100]
choice_list = ["High", "Mid-Range"]

aw_products_dim["PricePoint"] = np.select(
    condlist=cond_list,
    choicelist=choice_list,
    default="Low",
)

In [58]:
# check products after making all requisite transformations:
# preview the first rows, which should now show the added DiscountPrice, SKUType,
# SKUCategory and PricePoint columns and no ProductSize column
aw_products_dim.head()

Unnamed: 0,ProductKey,ProductSubcategoryKey,ProductSKU,ProductName,ModelName,ProductDescription,ProductColor,ProductStyle,ProductCost,ProductPrice,DiscountPrice,SKUType,SKUCategory,PricePoint
0,214,31,HL-U509-R,"Sport-100 Helmet, Red",Sport-100,"Universal fit, well-vented, lightweight , snap...",Red,,13.09,34.99,29.74,HL-U509,HL,Low
1,215,31,HL-U509,"Sport-100 Helmet, Black",Sport-100,"Universal fit, well-vented, lightweight , snap...",Black,,12.03,33.64,28.6,HL-U509,HL,Low
2,218,23,SO-B909-M,"Mountain Bike Socks, M",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,U,3.4,9.5,8.07,SO-B909,SO,Low
3,219,23,SO-B909-L,"Mountain Bike Socks, L",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,U,3.4,9.5,8.07,SO-B909,SO,Low
4,220,31,HL-U509-B,"Sport-100 Helmet, Blue",Sport-100,"Universal fit, well-vented, lightweight , snap...",Blue,,12.03,33.64,28.6,HL-U509,HL,Low


In [59]:
# confirm the final column list, dtypes and non-null counts before saving
aw_products_dim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293 entries, 0 to 292
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ProductKey             293 non-null    int64  
 1   ProductSubcategoryKey  293 non-null    int64  
 2   ProductSKU             293 non-null    object 
 3   ProductName            293 non-null    object 
 4   ModelName              293 non-null    object 
 5   ProductDescription     293 non-null    object 
 6   ProductColor           243 non-null    object 
 7   ProductStyle           209 non-null    object 
 8   ProductCost            293 non-null    float64
 9   ProductPrice           293 non-null    float64
 10  DiscountPrice          293 non-null    float64
 11  SKUType                293 non-null    object 
 12  SKUCategory            293 non-null    object 
 13  PricePoint             293 non-null    object 
dtypes: float64(3), int64(2), object(9)
memory usage: 34.3+ KB


In [60]:
# write the transformed products dimension table to the processed storage layer
# (consumed by Power BI); index=False keeps the DataFrame index out of the file
aw_products_dim.to_csv(
    path_or_buf=rf"{ROOT_DIR}/../storage/processed/aw_products_dim.csv",
    index=False,
)