In [1]:
# Directory holding the raw Person CSV exports (relative to this notebook).
base_path = "../raw-files/Person Data/"

# CSV files to load, one per source table. The loader cell derives each
# DataFrame key from the file name (spaces -> '_', '.csv' dropped, lowercased).
file_names = [
    "Person Address.csv",
    "Person AddressType.csv",
    "Person BusinessEntity.csv",
    "Person BusinessEntityAddress.csv",
    "Person BusinessEntityContact.csv",
    "Person ContactType.csv",
    "Person CountryRegion.csv",
    "Person EmailAddress.csv",
    "Person Password.csv",
    "Person Person.csv",
    "Person PersonPhone.csv",
    "Person PhoneNumberType.csv",
    "Person StateProvince.csv",
]

In [2]:
from pathlib import Path

from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("person").getOrCreate()

# Load every CSV into a DataFrame, keyed by a normalized form of the file
# name, e.g. 'Person Address.csv' -> 'person_address'.
# Only the 'header' option is set, so all other CSV reader options keep
# Spark's defaults.
dataframes = {}
for file_name in file_names:
    df_name = file_name.replace(' ', '_').replace('.csv', '').lower()
    # base_path may or may not end with '/'; strip it first so the joined
    # path always contains exactly one separator before the file name.
    csv_path = f"{base_path.rstrip('/')}/{file_name}"
    dataframes[df_name] = spark.read.format('csv').option('header', 'true').load(csv_path)

# Bind each loaded table to its own variable for readability in later cells.
address_df = dataframes['person_address']
addressType_df = dataframes['person_addresstype']
businessEntity_df = dataframes['person_businessentity']
businessEntityAddress_df = dataframes['person_businessentityaddress']
businessEntityContact_df = dataframes['person_businessentitycontact']
contactType_df = dataframes['person_contacttype']
countryRegion_df = dataframes['person_countryregion']
emailAddress_df = dataframes['person_emailaddress']
password_df = dataframes['person_password']
person_df = dataframes['person_person']
personPhone_df = dataframes['person_personphone']
phoneNumberType_df = dataframes['person_phonenumbertype']
stateProvince_df = dataframes['person_stateprovince']

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/24 16:09:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Rename columns

def rename_columns(df, rename_mappings):
    """Return ``df`` with columns renamed according to ``rename_mappings``.

    Args:
        df: Object exposing ``withColumnRenamed(old, new)`` that returns the
            renamed object (a Spark DataFrame in this notebook).
        rename_mappings: Dict of ``{current_column_name: new_column_name}``.

    Returns:
        The result of applying one ``withColumnRenamed`` call per mapping
        entry, in the mapping's iteration order. With an empty mapping the
        input ``df`` itself is returned.
    """
    renamed = df
    for pair in rename_mappings.items():
        # pair is (old_name, new_name); each call yields the next frame.
        renamed = renamed.withColumnRenamed(*pair)
    return renamed

# Per-table column renames. The generic column names 'Name', 'rowguid' and
# 'ModifiedDate' are prefixed with the table name (presumably so they stay
# distinguishable once the tables are joined). Convention used throughout:
#   Name -> <Table>Name, rowguid -> <Table>rowguid, ModifiedDate -> <Table>ModifiedDate
rename_mappings = {
    'address_df': {'rowguid': 'Addressrowguid', 'ModifiedDate': 'AddressModifiedDate'},
    'addressType_df': {'Name': 'AddressTypeName', 'rowguid': 'AddressTyperowguid', 'ModifiedDate': 'AddressTypeModifiedDate'},
    'businessEntity_df': {'rowguid': 'BusinessEntityrowguid', 'ModifiedDate': 'BusinessEntityModifiedDate'},
    'businessEntityAddress_df': {'rowguid': 'BusinessEntityAddressrowguid', 'ModifiedDate': 'BusinessEntityAddressModifiedDate'},
    'businessEntityContact_df': {'rowguid': 'BusinessEntityContactrowguid', 'ModifiedDate': 'BusinessEntityContactModifiedDate'},
    'contactType_df': {'Name': 'ContactTypeName', 'ModifiedDate': 'ContactTypeModifiedDate'},
    'countryRegion_df': {'Name': 'CountryRegionName', 'ModifiedDate': 'CountryRegionModifiedDate'},
    'person_df': {'rowguid': 'Personrowguid', 'ModifiedDate': 'PersonModifiedDate'},
    # Was 'PhoneNumberTypeModified'; aligned with the <Table>ModifiedDate
    # convention every other entry follows.
    'phoneNumberType_df': {'Name': 'PhoneNumberTypeName', 'ModifiedDate': 'PhoneNumberTypeModifiedDate'},
    'stateProvince_df': {'Name': 'StateProvinceName', 'rowguid': 'StateProvincerowguid', 'ModifiedDate': 'StateProvinceModifiedDate'},
    'password_df': {'rowguid': 'Passwordrowguid', 'ModifiedDate': 'PasswordModifiedDate'},
    'emailAddress_df': {'rowguid': 'EmailAddressrowguid', 'ModifiedDate': 'EmailAddressModifiedDate'},
    'personPhone_df': {'rowguid': 'PersonPhonerowguid', 'ModifiedDate': 'PersonPhoneModifiedDate'},
}

# Apply each table's mapping, rebinding the variable to the renamed frame.
address_df = rename_columns(address_df, rename_mappings['address_df'])
addressType_df = rename_columns(addressType_df, rename_mappings['addressType_df'])
businessEntity_df = rename_columns(businessEntity_df, rename_mappings['businessEntity_df'])
businessEntityAddress_df = rename_columns(businessEntityAddress_df, rename_mappings['businessEntityAddress_df'])
businessEntityContact_df = rename_columns(businessEntityContact_df, rename_mappings['businessEntityContact_df'])
contactType_df = rename_columns(contactType_df, rename_mappings['contactType_df'])
countryRegion_df = rename_columns(countryRegion_df, rename_mappings['countryRegion_df'])
person_df = rename_columns(person_df, rename_mappings['person_df'])
phoneNumberType_df = rename_columns(phoneNumberType_df, rename_mappings['phoneNumberType_df'])
stateProvince_df = rename_columns(stateProvince_df, rename_mappings['stateProvince_df'])
password_df = rename_columns(password_df, rename_mappings['password_df'])
emailAddress_df = rename_columns(emailAddress_df, rename_mappings['emailAddress_df'])
personPhone_df = rename_columns(personPhone_df, rename_mappings['personPhone_df'])

In [4]:
# Join dataframes (disabled: every join in this cell is commented out)

# person_details_df = person_df.join(password_df, "BusinessEntityID", "left")
# person_details_df = person_details_df.join(emailAddress_df,"BusinessEntityID", "left")
# person_details_df = person_details_df.join(personPhone_df,"BusinessEntityID", "left")
# person_details_df = person_details_df.join(businessEntityContact_df,"BusinessEntityID", "left") 
# person_details_df = person_details_df.join(businessEntity_df,"BusinessEntityID", "left")
# person_details_df = person_details_df.join(businessEntityAddress_df,"BusinessEntityID", "left")

#Join more data frames
# person_details_df = person_details_df.join(contactType_df,"ContactTypeID", "left")
# person_details_df = person_details_df.join(phoneNumberType_df,"PhoneNumberTypeID", "left")
# person_details_df = person_details_df.join(address_df,"AddressID", "left")
# person_details_df = person_details_df.join(addressType_df,"AddressTypeID", "left")
# person_details_df = person_details_df.join(stateProvince_df,"StateProvinceID", "left")
# person_details_df = person_details_df.join(countryRegion_df,"CountryRegionCode", "left")

In [15]:
# Columns retained in the denormalized person output, in output order.
required_cols = [
    "BusinessEntityID",
    "Title", "FirstName", "MiddleName", "LastName",
    "NameStyle", "Demographics", "Suffix",
    "EmailAddress",
    "AddressLine1", "AddressLine2",
    "PhoneNumber",
]

# TODO: columns still to add: customerkey, geographykey, birthdate,
# maritalstatus, gender, yearlyincome, totalchildren, incomegroup, education,
# numberofchildrenathome, homeownerflag, numberofcarsowned, age, region

# Left-join the related tables onto person_df, then keep only required_cols.
# AddressID is supplied by the businessEntityAddress_df join, so that join
# must precede the address_df join.
person_details_df = (
    person_df
    .join(emailAddress_df, on='BusinessEntityID', how='left')
    .join(businessEntity_df, on='BusinessEntityID', how='left')
    .join(businessEntityAddress_df, on='BusinessEntityID', how='left')
    .join(address_df, on='AddressID', how='left')
    .join(personPhone_df, on='BusinessEntityID', how='left')
    .select(required_cols)
)

In [16]:
# Display the column names of the selected frame as a quick sanity check.
person_details_df.columns

['BusinessEntityID',
 'Title',
 'FirstName',
 'MiddleName',
 'LastName',
 'NameStyle',
 'Demographics',
 'Suffix',
 'EmailAddress',
 'AddressLine1',
 'AddressLine2',
 'PhoneNumber']

In [17]:
# save to csv file
output_path = "../denormalized-files/person.csv"

# Make sure the target directory exists so the write below does not fail on
# a fresh checkout; exist_ok=True keeps re-runs of this cell harmless.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Convert Spark DataFrame to pandas DataFrame.
# NOTE: toPandas() collects the whole result into the driver's memory; this
# is done deliberately here so pandas can emit one single CSV file.
person_details_pd_df = person_details_df.toPandas()

# Save to CSV using pandas, ensuring it's a single file
person_details_pd_df.to_csv(output_path, index=False, header=True)



CodeCache: size=131072Kb used=31687Kb max_used=31699Kb free=99384Kb
 bounds [0x00000001061e8000, 0x0000000108118000, 0x000000010e1e8000]
 total_blobs=11529 nmethods=10609 adapters=831
 compilation: disabled (not enough contiguous free space left)
