In [21]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
import config 

In [22]:
# get module variable for root directory
ROOT_DIR = config.ROOT_DIR

#### Transform AdventureWorks_Customers.csv from raw to processed

In [23]:
aw_customers_dim = pd.read_csv(rf"{ROOT_DIR}/../storage/raw/AdventureWorks_Customers.csv")

In [24]:
aw_customers_dim.head()

Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,EmailAddress,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner
0,11000,MR.,JON,YANG,4/8/1966,M,M,jon24@adventure-works.com,"$90,000",2,Bachelors,Professional,Y
1,11001,MR.,EUGENE,HUANG,5/14/1965,S,M,eugene10@adventure-works.com,"$60,000",3,Bachelors,Professional,N
2,11002,MR.,RUBEN,TORRES,8/12/1965,M,M,ruben35@adventure-works.com,"$60,000",3,Bachelors,Professional,Y
3,11003,MS.,CHRISTY,ZHU,2/15/1968,S,F,christy12@adventure-works.com,"$70,000",0,Bachelors,Professional,N
4,11004,MRS.,ELIZABETH,JOHNSON,8/8/1968,S,F,elizabeth5@adventure-works.com,"$80,000",5,Bachelors,Professional,Y


In [25]:
aw_customers_dim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18148 entries, 0 to 18147
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   CustomerKey     18148 non-null  int64 
 1   Prefix          18018 non-null  object
 2   FirstName       18148 non-null  object
 3   LastName        18148 non-null  object
 4   BirthDate       18148 non-null  object
 5   MaritalStatus   18148 non-null  object
 6   Gender          18018 non-null  object
 7   EmailAddress    18148 non-null  object
 8   AnnualIncome    18148 non-null  object
 9   TotalChildren   18148 non-null  int64 
 10  EducationLevel  18148 non-null  object
 11  Occupation      18148 non-null  object
 12  HomeOwner       18148 non-null  object
dtypes: int64(2), object(11)
memory usage: 1.8+ MB


In [26]:
# use series string methods with transform to capitalize (proper case) Prefix, FirstName, LastName
aw_customers_dim.loc[:, ["Prefix", "FirstName", "LastName"]] = aw_customers_dim.loc[:, ["Prefix", "FirstName", "LastName"]].transform(lambda x: x.str.capitalize())

In [27]:
# concatenate prefix, first name, and last name to create full name
aw_customers_dim["FullName"] = aw_customers_dim["Prefix"].str.cat(aw_customers_dim["FirstName"], sep=" ").str.cat(aw_customers_dim["LastName"], sep=" ")

In [28]:
# create UserName and Domain columns by splitting EmailAddress
email_split = aw_customers_dim["EmailAddress"].str.split("@", expand=True)
aw_customers_dim["UserName"] = email_split[0]
aw_customers_dim["Domain"] = email_split[1].str.split(".com", expand=True)[0].str.replace("-", " ").str.title()

In [29]:
# convert annual income to float data type
aw_customers_dim["AnnualIncome"] = aw_customers_dim["AnnualIncome"].str.split("$", expand=True)[1].str.replace(",", "").astype(float)

In [30]:
# convert birth date to datetime data type, extract birth year, and calculate current age
aw_customers_dim["BirthDate"] = pd.to_datetime(aw_customers_dim["BirthDate"], format="%m/%d/%Y")
aw_customers_dim["BirthYear"] = aw_customers_dim["BirthDate"].dt.year
aw_customers_dim["CurrentAge"] = (pd.Timestamp.now() - aw_customers_dim["BirthDate"]).astype("m8[Y]")

In [31]:
# is the customer a parent?
aw_customers_dim["Parent"] = np.where(aw_customers_dim["TotalChildren"] > 0, "Yes", "No")

In [32]:
# set customer priority based on age and income
aw_customers_dim["CustomerPriority"] = np.where((aw_customers_dim["CurrentAge"] < 50) & (aw_customers_dim["AnnualIncome"] > 100000), "Priority", "Standard")

In [33]:
aw_customers_dim.head()

Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,EmailAddress,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner,FullName,UserName,Domain,BirthYear,CurrentAge,Parent,CustomerPriority
0,11000,Mr.,Jon,Yang,1966-04-08,M,M,jon24@adventure-works.com,90000.0,2,Bachelors,Professional,Y,Mr. Jon Yang,jon24,Adventure Works,1966,57.0,Yes,Standard
1,11001,Mr.,Eugene,Huang,1965-05-14,S,M,eugene10@adventure-works.com,60000.0,3,Bachelors,Professional,N,Mr. Eugene Huang,eugene10,Adventure Works,1965,58.0,Yes,Standard
2,11002,Mr.,Ruben,Torres,1965-08-12,M,M,ruben35@adventure-works.com,60000.0,3,Bachelors,Professional,Y,Mr. Ruben Torres,ruben35,Adventure Works,1965,57.0,Yes,Standard
3,11003,Ms.,Christy,Zhu,1968-02-15,S,F,christy12@adventure-works.com,70000.0,0,Bachelors,Professional,N,Ms. Christy Zhu,christy12,Adventure Works,1968,55.0,No,Standard
4,11004,Mrs.,Elizabeth,Johnson,1968-08-08,S,F,elizabeth5@adventure-works.com,80000.0,5,Bachelors,Professional,Y,Mrs. Elizabeth Johnson,elizabeth5,Adventure Works,1968,54.0,Yes,Standard


In [34]:
aw_customers_dim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18148 entries, 0 to 18147
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   CustomerKey       18148 non-null  int64         
 1   Prefix            18018 non-null  object        
 2   FirstName         18148 non-null  object        
 3   LastName          18148 non-null  object        
 4   BirthDate         18148 non-null  datetime64[ns]
 5   MaritalStatus     18148 non-null  object        
 6   Gender            18018 non-null  object        
 7   EmailAddress      18148 non-null  object        
 8   AnnualIncome      18148 non-null  float64       
 9   TotalChildren     18148 non-null  int64         
 10  EducationLevel    18148 non-null  object        
 11  Occupation        18148 non-null  object        
 12  HomeOwner         18148 non-null  object        
 13  FullName          18018 non-null  object        
 14  UserName          1814

In [35]:
# write transformed customers data to storage - processed layer for powerbi consumption
aw_customers_dim.to_csv(rf"{ROOT_DIR}/../storage/processed/aw_customers_dim.csv", index=False)