In [3]:
# Folder the CSV files are loaded from
base_path = "../raw-files/Purchasing Data/"

# The part of each file name between the "Purchasing " prefix and ".csv"
_purchasing_tables = (
    'ProductVendor',
    'PurchaseOrderDetail',
    'PurchaseOrderHeader',
    'ShipMethod',
    'Vendor',
    'vVendorWithAddresses',
    'vVendorWithContacts',
)

# File names, one "Purchasing <name>.csv" per entry above (same order)
file_names = [f'Purchasing {table}.csv' for table in _purchasing_tables]

In [4]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("adventure-works").getOrCreate()

# base_path may already end with '/'; strip it so the joined path does not
# contain a doubled separator ('.../Purchasing Data//Purchasing Vendor.csv').
source_dir = base_path.rstrip('/')

# Read every CSV (first row is the header) into a dict keyed by a normalized
# name, e.g. 'Purchasing ProductVendor.csv' -> 'purchasing_productvendor'.
# No schema is supplied or inferred here, so every column is loaded as a string.
dataframes = {}
for file_name in file_names:
    df_name = file_name.replace(' ', '_').replace('.csv', '').lower()
    dataframes[df_name] = (
        spark.read.format('csv')
        .option('header', 'true')
        .load(f'{source_dir}/{file_name}')
    )

# Bind each loaded frame to its own variable for the cells that follow.
productVendor_df = dataframes['purchasing_productvendor']
purchaseOrderDetail_df = dataframes['purchasing_purchaseorderdetail']
purchaseOrderHeader_df = dataframes['purchasing_purchaseorderheader']
shipMethod_df = dataframes['purchasing_shipmethod']
vendor_df = dataframes['purchasing_vendor']
vendorWithAddress_df = dataframes['purchasing_vvendorwithaddresses']
vendorWithContacts_df = dataframes['purchasing_vvendorwithcontacts']

                                                                                

In [5]:
def rename_columns(df, rename_mappings):
    """Return `df` with columns renamed according to `rename_mappings`.

    Args:
        df: Object exposing `withColumnRenamed(old, new)` that returns a new
            object of the same kind (a Spark DataFrame in this notebook).
        rename_mappings: Dict of ``{current_column_name: new_column_name}``.
            Renames are applied one after another in the dict's iteration order.

    Returns:
        The result of chaining one `withColumnRenamed` call per mapping entry;
        `df` itself when the mapping is empty.
    """
    renamed = df
    for current_name in rename_mappings:
        renamed = renamed.withColumnRenamed(current_name, rename_mappings[current_name])
    return renamed

# Column renames per DataFrame: every table's ModifiedDate (and Name, where it
# is listed below) gets a table-specific name.
rename_mappings = {
    'productVendor_df': {
        'ModifiedDate': 'ProductVendorModifiedDate',
    },
    'purchaseOrderDetail_df': {
        'ModifiedDate': 'PurchaseOrderDetailModifiedDate',
    },
    'purchaseOrderHeader_df': {
        'ModifiedDate': 'PurchaseOrderHeaderModifiedDate',
    },
    'shipMethod_df': {
        'Name': 'ShipMethodName',
        'ModifiedDate': 'ShipMethodModifiedDate',
    },
    'vendor_df': {
        'Name': 'VendorName',
        'ModifiedDate': 'VendorModifiedDate',
    },
    'vendorWithAddress_df': {
        'ModifiedDate': 'VendorWithAddressesModifiedDate',
    },
    'vendorWithContacts_df': {
        'ModifiedDate': 'VendorWithContactsModifiedDate',
    },
}

# Apply each mapping to its DataFrame and rebind the variables in one step;
# the unpacking targets are in the same order as the (key, frame) pairs.
(
    productVendor_df,
    purchaseOrderDetail_df,
    purchaseOrderHeader_df,
    shipMethod_df,
    vendor_df,
    vendorWithAddress_df,
    vendorWithContacts_df,
) = [
    rename_columns(frame, rename_mappings[mapping_key])
    for mapping_key, frame in (
        ('productVendor_df', productVendor_df),
        ('purchaseOrderDetail_df', purchaseOrderDetail_df),
        ('purchaseOrderHeader_df', purchaseOrderHeader_df),
        ('shipMethod_df', shipMethod_df),
        ('vendor_df', vendor_df),
        ('vendorWithAddress_df', vendorWithAddress_df),
        ('vendorWithContacts_df', vendorWithContacts_df),
    )
]

In [6]:
# Join purchaseOrderDetail_df and purchaseOrderHeader_df.
# The header's VendorID is additionally exposed as BusinessEntityID so that
# the vendor-side tables below can be matched on the order's own vendor.
# NOTE(review): assumes the AdventureWorks schema, where
# PurchaseOrderHeader.VendorID refers to Vendor.BusinessEntityID — confirm.
order_header_keyed_df = purchaseOrderHeader_df.withColumn(
    "BusinessEntityID", purchaseOrderHeader_df["VendorID"]
)
purchase_details_df = purchaseOrderDetail_df.join(order_header_keyed_df, "PurchaseOrderID", "left")

# Join other data frames.
# ProductVendor holds one row per (ProductID, BusinessEntityID) pair, so it is
# joined on both keys. Joining on ProductID alone would repeat every order line
# once per vendor that offers the product and would attach those vendors'
# details instead of the details of the vendor the order was placed with.
purchase_details_df = purchase_details_df.join(
    productVendor_df, ["ProductID", "BusinessEntityID"], "left"
)
# BusinessEntityID now comes from the order header, so the vendor lookup still
# works for lines that have no matching ProductVendor row.
purchase_details_df = purchase_details_df.join(vendor_df, "BusinessEntityID", "left")
purchase_details_df = purchase_details_df.join(shipMethod_df, "ShipMethodID", "left")

In [7]:
# Show the joined DataFrame.
# NOTE(review): the output recorded for this cell is only the
# `DataFrame[col: type, ...]` summary, not rows; if a data preview is wanted,
# something like `.show(n)` or `.limit(n).toPandas()` would be needed instead.
display(purchase_details_df)

DataFrame[ShipMethodID: string, BusinessEntityID: string, ProductID: string, PurchaseOrderID: string, PurchaseOrderDetailID: string, DueDate: string, OrderQty: string, UnitPrice: string, LineTotal: string, ReceivedQty: string, RejectedQty: string, StockedQty: string, PurchaseOrderDetailModifiedDate: string, RevisionNumber: string, Status: string, EmployeeID: string, VendorID: string, OrderDate: string, ShipDate: string, SubTotal: string, TaxAmt: string, Freight: string, TotalDue: string, PurchaseOrderHeaderModifiedDate: string, AverageLeadTime: string, StandardPrice: string, LastReceiptCost: string, LastReceiptDate: string, MinOrderQty: string, MaxOrderQty: string, OnOrderQty: string, UnitMeasureCode: string, ProductVendorModifiedDate: string, AccountNumber: string, VendorName: string, CreditRating: string, PreferredVendorStatus: string, ActiveFlag: string, PurchasingWebServiceURL: string, VendorModifiedDate: string, ShipMethodName: string, ShipBase: string, ShipRate: string, rowguid: 

In [8]:
from pathlib import Path

# save to csv file
output_path = "../denormalized-files/purchasing.csv"

# Create the destination folder if needed: pandas' to_csv does not create
# missing parent directories and raises an OSError instead.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Convert Spark DataFrame to pandas DataFrame.
# toPandas() collects every row onto the driver, so the joined result has to
# fit in driver memory.
purchasing_details_pd_df = purchase_details_df.toPandas()

# Save to CSV using pandas, ensuring it's a single file
purchasing_details_pd_df.to_csv(output_path, index=False, header=True)

24/06/24 13:53:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CodeCache: size=131072Kb used=26775Kb max_used=26787Kb free=104296Kb
 bounds [0x000000010a9e8000, 0x000000010c438000, 0x00000001129e8000]
 total_blobs=10266 nmethods=9343 adapters=834
 compilation: disabled (not enough contiguous free space left)
