In [153]:
import pandas as pd
import numpy as np
import datetime 
import config 

In [154]:
# Take ROOT_DIR from the project-local `config` module; the raw and processed
# storage paths in the cells below are built from it.
ROOT_DIR = config.ROOT_DIR

In [155]:
# findspark.init() runs before the pyspark import in the next cell.
# NOTE(review): presumably this makes the local Spark installation importable —
# confirm against the findspark documentation.
import findspark
findspark.init()

In [156]:
# Create the Spark session for the processing layer, or reuse the one that is
# already running (getOrCreate).
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("adventure_works_analysis")
    .getOrCreate()
)

#### Transform AdventureWorks_Products.csv from raw to processed

In [157]:
# Load the raw products file into a pandas DataFrame.
aw_products_lookup = pd.read_csv(
    filepath_or_buffer=rf"{ROOT_DIR}/storage/raw/AdventureWorks_Products.csv"
)

In [158]:
# Preview the first rows of the products table as loaded from the raw file.
aw_products_lookup.head()

Unnamed: 0,ProductKey,ProductSubcategoryKey,ProductSKU,ProductName,ModelName,ProductDescription,ProductColor,ProductSize,ProductStyle,ProductCost,ProductPrice
0,214,31,HL-U509-R,"Sport-100 Helmet, Red",Sport-100,"Universal fit, well-vented, lightweight , snap...",Red,0,0,13.0863,34.99
1,215,31,HL-U509,"Sport-100 Helmet, Black",Sport-100,"Universal fit, well-vented, lightweight , snap...",Black,0,0,12.0278,33.6442
2,218,23,SO-B909-M,"Mountain Bike Socks, M",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,M,U,3.3963,9.5
3,219,23,SO-B909-L,"Mountain Bike Socks, L",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,L,U,3.3963,9.5
4,220,31,HL-U509-B,"Sport-100 Helmet, Blue",Sport-100,"Universal fit, well-vented, lightweight , snap...",Blue,0,0,12.0278,33.6442


In [159]:
# Inspect dtypes and non-null counts before transforming. In the recorded run
# ProductColor is the only column with missing values (243 of 293 non-null).
aw_products_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293 entries, 0 to 292
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ProductKey             293 non-null    int64  
 1   ProductSubcategoryKey  293 non-null    int64  
 2   ProductSKU             293 non-null    object 
 3   ProductName            293 non-null    object 
 4   ModelName              293 non-null    object 
 5   ProductDescription     293 non-null    object 
 6   ProductColor           243 non-null    object 
 7   ProductSize            293 non-null    object 
 8   ProductStyle           293 non-null    object 
 9   ProductCost            293 non-null    float64
 10  ProductPrice           293 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 25.3+ KB


In [160]:
# Remove the ProductSize column and order the rows by ProductKey.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError because ProductSize has already been dropped.
aw_products_lookup = (
    aw_products_lookup
    .drop(columns="ProductSize", errors="ignore")
    .sort_values(by="ProductKey")
)

In [161]:
# Round ProductCost and ProductPrice to 2 decimal places first, so the discount
# below is derived from the rounded price.
for price_column in ("ProductCost", "ProductPrice"):
    aw_products_lookup[price_column] = aw_products_lookup[price_column].round(2)

# DiscountPrice is the rounded ProductPrice with a 10% discount applied
# (multiplied by 0.9), itself rounded to 2 decimal places.
aw_products_lookup["DiscountPrice"] = (aw_products_lookup["ProductPrice"] * 0.9).round(2)

In [162]:
# Check the transformed table: ProductSize should be gone, cost/price rounded
# to 2 decimals, and DiscountPrice present.
aw_products_lookup.head()

Unnamed: 0,ProductKey,ProductSubcategoryKey,ProductSKU,ProductName,ModelName,ProductDescription,ProductColor,ProductStyle,ProductCost,ProductPrice,DiscountPrice
0,214,31,HL-U509-R,"Sport-100 Helmet, Red",Sport-100,"Universal fit, well-vented, lightweight , snap...",Red,0,13.09,34.99,31.49
1,215,31,HL-U509,"Sport-100 Helmet, Black",Sport-100,"Universal fit, well-vented, lightweight , snap...",Black,0,12.03,33.64,30.28
2,218,23,SO-B909-M,"Mountain Bike Socks, M",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,U,3.4,9.5,8.55
3,219,23,SO-B909-L,"Mountain Bike Socks, L",Mountain Bike Socks,Combination of natural and synthetic fibers st...,White,U,3.4,9.5,8.55
4,220,31,HL-U509-B,"Sport-100 Helmet, Blue",Sport-100,"Universal fit, well-vented, lightweight , snap...",Blue,0,12.03,33.64,30.28


In [163]:
# Write the processed products table; index=False leaves the DataFrame index
# out of the file.
aw_products_lookup.to_csv(
    path_or_buf=rf"{ROOT_DIR}/storage/processed/aw_products_lookup.csv",
    index=False,
)

#### Transform AdventureWorks_Customers.csv from raw to processed

In [164]:
# Load the raw customers file into a pandas DataFrame.
aw_customers_lookup = pd.read_csv(
    filepath_or_buffer=rf"{ROOT_DIR}/storage/raw/AdventureWorks_Customers.csv"
)

In [165]:
# Preview the raw customers table. In the recorded output the name columns are
# upper-case and AnnualIncome is text such as "$90,000".
aw_customers_lookup.head()

Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,EmailAddress,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner
0,11000,MR.,JON,YANG,4/8/1966,M,M,jon24@adventure-works.com,"$90,000",2,Bachelors,Professional,Y
1,11001,MR.,EUGENE,HUANG,5/14/1965,S,M,eugene10@adventure-works.com,"$60,000",3,Bachelors,Professional,N
2,11002,MR.,RUBEN,TORRES,8/12/1965,M,M,ruben35@adventure-works.com,"$60,000",3,Bachelors,Professional,Y
3,11003,MS.,CHRISTY,ZHU,2/15/1968,S,F,christy12@adventure-works.com,"$70,000",0,Bachelors,Professional,N
4,11004,MRS.,ELIZABETH,JOHNSON,8/8/1968,S,F,elizabeth5@adventure-works.com,"$80,000",5,Bachelors,Professional,Y


In [166]:
# Inspect dtypes and non-null counts before transforming. In the recorded run
# Prefix and Gender have missing values (18018 of 18148 non-null) and
# AnnualIncome is still an object (string) column.
aw_customers_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18148 entries, 0 to 18147
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerKey     18148 non-null  int64 
 1   Prefix          18018 non-null  object
 2   FirstName       18148 non-null  object
 3   LastName        18148 non-null  object
 4   BirthDate       18148 non-null  object
 5   MaritalStatus   18148 non-null  object
 6   Gender          18018 non-null  object
 7   EmailAddress    18148 non-null  object
 8   AnnualIncome    18148 non-null  object
 9   TotalChildren   18148 non-null  int64 
 10  EducationLevel  18148 non-null  object
 11  Occupation      18148 non-null  object
 12  HomeOwner       18148 non-null  object
dtypes: int64(2), object(11)
memory usage: 1.8+ MB


In [167]:
# use series string methods with transform to capitalize Prefix, FirstName, LastName
aw_customers_lookup.loc[:, ["Prefix", "FirstName", "LastName"]] = aw_customers_lookup.loc[:, ["Prefix", "FirstName", "LastName"]].transform(lambda x: x.str.capitalize())

In [168]:
# Split each e-mail address at "@" into the part before it (UserName) and the
# part after it, then keep the text preceding ".com" as Domain.
email_split = aw_customers_lookup["EmailAddress"].str.split("@", expand=True)
aw_customers_lookup["UserName"] = email_split[0]
# The dot is escaped on purpose: pandas interprets a multi-character split
# pattern as a regular expression, so an unescaped ".com" would let "." match
# any character (e.g. "telecom.org" would be cut down to "tel").
aw_customers_lookup["Domain"] = email_split[1].str.split(r"\.com", expand=True)[0]

In [169]:
# Convert AnnualIncome from text such as "$90,000" to float.
# - Removing "$" and "," with a single character class (rather than splitting
#   on "$" and keeping the right-hand piece) preserves a leading sign and also
#   parses values that carry no currency symbol.
# - The dtype guard makes the cell safe to re-run: once the column is numeric
#   the .str accessor would raise AttributeError, so the conversion is skipped.
income_raw = aw_customers_lookup["AnnualIncome"]
if not pd.api.types.is_numeric_dtype(income_raw):
    aw_customers_lookup["AnnualIncome"] = (
        income_raw.str.replace(r"[$,]", "", regex=True).astype(float)
    )

In [170]:
# Check the transformed table: re-cased name columns, numeric AnnualIncome, and
# the new UserName and Domain columns.
aw_customers_lookup.head()

Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,EmailAddress,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner,UserName,Domain
0,11000,Mr.,Jon,Yang,4/8/1966,M,M,jon24@adventure-works.com,90000.0,2,Bachelors,Professional,Y,jon24,adventure-works
1,11001,Mr.,Eugene,Huang,5/14/1965,S,M,eugene10@adventure-works.com,60000.0,3,Bachelors,Professional,N,eugene10,adventure-works
2,11002,Mr.,Ruben,Torres,8/12/1965,M,M,ruben35@adventure-works.com,60000.0,3,Bachelors,Professional,Y,ruben35,adventure-works
3,11003,Ms.,Christy,Zhu,2/15/1968,S,F,christy12@adventure-works.com,70000.0,0,Bachelors,Professional,N,christy12,adventure-works
4,11004,Mrs.,Elizabeth,Johnson,8/8/1968,S,F,elizabeth5@adventure-works.com,80000.0,5,Bachelors,Professional,Y,elizabeth5,adventure-works


In [171]:
# Re-check dtypes after the transforms: AnnualIncome should now be float64 and
# UserName / Domain should appear as additional columns.
aw_customers_lookup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18148 entries, 0 to 18147
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CustomerKey     18148 non-null  int64  
 1   Prefix          18018 non-null  object 
 2   FirstName       18148 non-null  object 
 3   LastName        18148 non-null  object 
 4   BirthDate       18148 non-null  object 
 5   MaritalStatus   18148 non-null  object 
 6   Gender          18018 non-null  object 
 7   EmailAddress    18148 non-null  object 
 8   AnnualIncome    18148 non-null  float64
 9   TotalChildren   18148 non-null  int64  
 10  EducationLevel  18148 non-null  object 
 11  Occupation      18148 non-null  object 
 12  HomeOwner       18148 non-null  object 
 13  UserName        18148 non-null  object 
 14  Domain          18148 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 2.1+ MB


In [172]:
# Write the processed customers table; index=False leaves the DataFrame index
# out of the file.
aw_customers_lookup.to_csv(
    path_or_buf=rf"{ROOT_DIR}/storage/processed/aw_customers_lookup.csv",
    index=False,
)