In [1758]:
import pandas as pd
import matplotlib.pyplot as plt
import os

*Importing the tables*

In [1759]:
Address = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/Address.csv')
CountryRegion = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/CountryRegion.csv')
Product = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/Product.csv')
ProductSubcategory = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/ProductSubcategory.csv')
SalesOrderDetail = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/SalesOrderDetail.csv')
SalesOrderHeader = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/SalesOrderHeader.csv')
StateProvince = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/StateProvince.csv')
ProductCategory = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/ProductCategory.csv')
ProductUnitPrice = pd.read_csv('/Users/khuenguyen/Desktop/data_warehouse/tables/Product_UnitPrice.csv')

In [1760]:
current_directory = os.getcwd()
output_folder_path = os.path.join(current_directory, "transformed_tables")
if not os.path.exists(output_folder_path):
    os.makedirs(output_folder_path)
print(output_folder_path)

/Users/khuenguyen/Desktop/data_warehouse/transformed_tables


**1. Address**

In [1761]:
Address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   AddressID        1000 non-null   int64 
 1   AddressLine1     1000 non-null   object
 2   AddressLine2     48 non-null     object
 3   City             1000 non-null   object
 4   StateProvinceID  1000 non-null   int64 
 5   PostalCode       1000 non-null   object
 6   SpatialLocation  1000 non-null   object
 7   rowguid          1000 non-null   object
 8   ModifiedDate     1000 non-null   object
dtypes: int64(2), object(7)
memory usage: 70.4+ KB


Remove unecessary columns

In [1762]:
Address = Address.drop(columns=['SpatialLocation','rowguid','ModifiedDate'])

In [1763]:
object_columns = Address.select_dtypes(include='object').columns
numeric_columns = Address.select_dtypes(include='int64').columns

Check duplicated values

In [1764]:
print(Address.duplicated().sum())

0


Check null values

In [1765]:
Address.isnull().sum()

AddressID            0
AddressLine1         0
AddressLine2       952
City                 0
StateProvinceID      0
PostalCode           0
dtype: int64

Because "AddressLine2" is just a complement address used to support "AddressLine1" (apartment number, block number, etc.), we can accept null value in this column

In [1766]:
Address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   AddressID        1000 non-null   int64 
 1   AddressLine1     1000 non-null   object
 2   AddressLine2     48 non-null     object
 3   City             1000 non-null   object
 4   StateProvinceID  1000 non-null   int64 
 5   PostalCode       1000 non-null   object
dtypes: int64(2), object(4)
memory usage: 47.0+ KB


In [1767]:
Address.describe(include=['object'])

Unnamed: 0,AddressLine1,AddressLine2,City,PostalCode
count,1000,48,1000,1000
unique,989,45,421,480
top,Horizon Outlet Center,# 14,Seattle,98104
freq,3,2,45,45


In [1768]:
Address.to_csv(os.path.join(output_folder_path,'AddressLineDim.csv'),index=False)

**2.StateProvice**


In [1769]:
StateProvince.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   StateProvinceID          181 non-null    int64 
 1   StateProvinceCode        181 non-null    object
 2   CountryRegionCode        181 non-null    object
 3   IsOnlyStateProvinceFlag  181 non-null    int64 
 4   Name                     181 non-null    object
 5   TerritoryID              181 non-null    int64 
 6   rowguid                  181 non-null    object
 7   ModifiedDate             181 non-null    object
dtypes: int64(3), object(5)
memory usage: 11.4+ KB


Remove unwanted columns

In [1770]:
StateProvince = StateProvince.drop(columns=['IsOnlyStateProvinceFlag','rowguid','ModifiedDate'])

Check duplicated

In [1771]:
print(StateProvince.duplicated().sum())

0


Check null values

In [1772]:
StateProvince.isnull().sum()

StateProvinceID      0
StateProvinceCode    0
CountryRegionCode    0
Name                 0
TerritoryID          0
dtype: int64

Convert CountryRegionCode into CountryRegionName

In [1773]:
StateProvince['CountryRegionCode'] = StateProvince['CountryRegionCode'].map(CountryRegion.set_index('CountryRegionCode')['Name'])

In [1774]:
StateProvince = StateProvince.rename(columns={'CountryRegionCode': 'CountryRegionName'})

In [1775]:
StateProvince.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   StateProvinceID    181 non-null    int64 
 1   StateProvinceCode  181 non-null    object
 2   CountryRegionName  181 non-null    object
 3   Name               181 non-null    object
 4   TerritoryID        181 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 7.2+ KB


In [1776]:
StateProvince.head()

Unnamed: 0,StateProvinceID,StateProvinceCode,CountryRegionName,Name,TerritoryID
0,1,AB,Canada,Alberta,6
1,2,AK,United States,Alaska,1
2,3,AL,United States,Alabama,5
3,4,AR,United States,Arkansas,3
4,5,AS,American Samoa,American Samoa,1


In [1777]:
StateProvince.to_csv(os.path.join(output_folder_path,'StateProvinceDim.csv'),index=False)

**3.Product**

In [1778]:
Product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ProductID              504 non-null    int64  
 1   Name                   504 non-null    object 
 2   ProductNumber          504 non-null    object 
 3   MakeFlag               504 non-null    int64  
 4   FinishedGoodsFlag      504 non-null    int64  
 5   Color                  256 non-null    object 
 6   SafetyStockLevel       504 non-null    int64  
 7   ReorderPoint           504 non-null    int64  
 8   StandardCost           504 non-null    object 
 9   ListPrice              504 non-null    object 
 10  Size                   211 non-null    object 
 11  SizeUnitMeasureCode    176 non-null    object 
 12  WeightUnitMeasureCode  205 non-null    object 
 13  Weight                 205 non-null    float64
 14  DaysToManufacture      504 non-null    int64  
 15  Produc

Remove unwanted columns

In [1779]:
selected_columns = ['ProductID', 'Name', 'Color', 'ProductSubcategoryID', 'DaysToManufacture']
Product = Product.loc[:, selected_columns]


Check duplicated values

In [1780]:
print(Product.duplicated().sum())

0


Check null values

In [1781]:
Product.isnull().sum()

ProductID                 0
Name                      0
Color                   248
ProductSubcategoryID    209
DaysToManufacture         0
dtype: int64

Some products doesn't belong to any subcategory so its "ProductSubcategoryID" is null, we will keep that. 
For color, we will assume that the null value for colors are because the products have no distinguishing color, therefor we willl fill in those columns with "No color" value

In [1782]:
Product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ProductID             504 non-null    int64  
 1   Name                  504 non-null    object 
 2   Color                 256 non-null    object 
 3   ProductSubcategoryID  295 non-null    float64
 4   DaysToManufacture     504 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 19.8+ KB


As we can see, black color exist in 93 rows in all 256 non null colors, which is a significant number of appearance. For that reason, we are going to impute those null value using most frequently appeared color - Black

In [1783]:
Product['Color'] = Product['Color'].fillna('No color')

In [1784]:
Product.to_csv(os.path.join(output_folder_path,'ProductDim.csv'),index=False)

**4.ProductSubcategory**

In [1785]:
ProductSubcategory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ProductSubcategoryID  37 non-null     int64 
 1   ProductCategoryID     37 non-null     int64 
 2   Name                  37 non-null     object
 3   rowguid               37 non-null     object
 4   ModifiedDate          37 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.6+ KB


In [1786]:
ProductSubcategory.head()

Unnamed: 0,ProductSubcategoryID,ProductCategoryID,Name,rowguid,ModifiedDate
0,1,1,Mountain Bikes,2d364ade-264a-433c-b092-4fcbf3804e01,2008-04-30 00:00:00.000
1,2,1,Road Bikes,000310c0-bcc8-42c4-b0c3-45ae611af06b,2008-04-30 00:00:00.000
2,3,1,Touring Bikes,02c5061d-ecdc-4274-b5f1-e91d76bc3f37,2008-04-30 00:00:00.000
3,4,2,Handlebars,3ef2c725-7135-4c85-9ae6-ae9a3bdd9283,2008-04-30 00:00:00.000
4,5,2,Bottom Brackets,a9e54089-8a1e-4cf5-8646-e3801f685934,2008-04-30 00:00:00.000


In [1787]:
ProductSubcategory['ProductCategoryName'] = ProductSubcategory['ProductCategoryID'].map(ProductCategory.set_index('ProductCategoryID')['Name'])

Remove unwanted columns

In [1788]:
selected_columns = ['ProductSubcategoryID', 'Name', 'ProductCategoryName']
ProductSubcategory = ProductSubcategory.loc[:, selected_columns]
ProductSubcategory = ProductSubcategory.rename(columns={'Name': 'ProductSubcategoryName'})

In [1789]:
ProductSubcategory.to_csv(os.path.join(output_folder_path,'ProductSubcategoryDim.csv'),index=False)

**5.SalesOrderHeader**

In [1790]:
SalesOrderHeader.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31465 entries, 0 to 31464
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SalesOrderID            31465 non-null  int64  
 1   RevisionNumber          31465 non-null  int64  
 2   OrderDate               31465 non-null  object 
 3   DueDate                 31465 non-null  object 
 4   ShipDate                31465 non-null  object 
 5   Status                  31465 non-null  int64  
 6   OnlineOrderFlag         31465 non-null  int64  
 7   SalesOrderNumber        31465 non-null  object 
 8   PurchaseOrderNumber     3806 non-null   object 
 9   AccountNumber           31465 non-null  object 
 10  CustomerID              31465 non-null  int64  
 11  SalesPersonID           3806 non-null   float64
 12  TerritoryID             31465 non-null  int64  
 13  BillToAddressID         31465 non-null  int64  
 14  ShipToAddressID         31465 non-null

Remove unwanted columns

In [1791]:
selected_columns = ['SalesOrderID', 'OrderDate', 'DueDate','ShipDate','CustomerID','ShipToAddressID','TaxAmt','Freight','OnlineOrderFlag','Status']
SalesOrderHeader = SalesOrderHeader.loc[:, selected_columns]

In [1792]:
SalesOrderHeader['TaxAmt'] = SalesOrderHeader['TaxAmt'].astype('str').str.replace(',', '.').astype(float)
SalesOrderHeader['Freight'] = SalesOrderHeader['Freight'].astype('str').str.replace(',', '.').astype(float)
SalesOrderHeader['OnlineOrderFlag'] = SalesOrderHeader['OnlineOrderFlag'].astype('bool')


Check null values and duplicated

In [1793]:
SalesOrderHeader.isnull().sum()

SalesOrderID       0
OrderDate          0
DueDate            0
ShipDate           0
CustomerID         0
ShipToAddressID    0
TaxAmt             0
Freight            0
OnlineOrderFlag    0
Status             0
dtype: int64

In [1794]:
print(SalesOrderHeader.duplicated().sum())

0


In [1797]:
SalesOrderHeader = SalesOrderHeader[(SalesOrderHeader['Status']!=6)]

In [1798]:
SalesOrderHeader[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])]


Unnamed: 0,SalesOrderID,OrderDate,DueDate,ShipDate,CustomerID,ShipToAddressID,TaxAmt,Freight,OnlineOrderFlag,Status
21,43680,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29489,1069,1093.6394,341.7623,False,5
22,43681,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29661,955,1323.0668,413.4584,False,5


In [1799]:
SalesOrderHeader[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])]


Unnamed: 0,SalesOrderID,OrderDate,DueDate,ShipDate,CustomerID,ShipToAddressID,TaxAmt,Freight,OnlineOrderFlag,Status
21,43680,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29489,1069,1093.6394,341.7623,False,5
22,43681,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29661,955,1323.0668,413.4584,False,5


In [1800]:
SalesOrderHeader['OrderDate'] = pd.to_datetime(SalesOrderHeader['OrderDate'])
SalesOrderHeader['DueDate'] = pd.to_datetime(SalesOrderHeader['DueDate'])
SalesOrderHeader['ShipDate'] = pd.to_datetime(SalesOrderHeader['ShipDate'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader['OrderDate'] = pd.to_datetime(SalesOrderHeader['OrderDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader['DueDate'] = pd.to_datetime(SalesOrderHeader['DueDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader['ShipDate'] = pd.to_datetime(SalesOr

Examine the difference between OrderDate and DueDate

In [1801]:
(SalesOrderHeader['DueDate'] - SalesOrderHeader['OrderDate']).value_counts()


12 days      31139
13 days          9
-354 days        2
Name: count, dtype: int64

In [1802]:
(SalesOrderHeader['ShipDate'] - SalesOrderHeader['OrderDate']).value_counts()

7 days       31139
8 days           9
-359 days        2
Name: count, dtype: int64

2 rows have negative time interval, majority are 12 days and some are 13 days from DueDate, while 7 and 8 days from ShipDate. Therefore, we will subtract 12 days from DueDate or 7 days from ShipDate to get the OrderDate of negative intervals respectively

In [1803]:
SalesOrderHeader.loc[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate']), 'OrderDate']=SalesOrderHeader[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])]['ShipDate'] - pd.Timedelta(days=7)


Create Date dimension

    Extract unique date values

In [1804]:
all_dates = pd.concat([
    SalesOrderHeader['OrderDate'],
    SalesOrderHeader['DueDate'],
    SalesOrderHeader['ShipDate']
]).sort_values().reset_index(drop=True)
date_records = pd.DataFrame({
    'DateID': all_dates.dt.strftime('%Y%m%d').astype('int64'),  # Generating sequential DateID starting from 1
    'FullDate': all_dates.dt.strftime('%Y-%m-%d'),  # Converting datetime to string
    'IsWeekDay': all_dates.dt.weekday < 5,  # Weekdays are less than 5
    'DayOfWeek': all_dates.dt.day_name() 
})

date_records = date_records.drop_duplicates()
date_records.to_csv(os.path.join(output_folder_path,'DateDim.csv'),index=False) 

In [1805]:
date_records

Unnamed: 0,DateID,FullDate,IsWeekDay,DayOfWeek
0,20110531,2011-05-31,True,Tuesday
42,20110601,2011-06-01,True,Wednesday
46,20110602,2011-06-02,True,Thursday
51,20110603,2011-06-03,True,Friday
53,20110604,2011-06-04,False,Saturday
...,...,...,...,...
93297,20140708,2014-07-08,True,Tuesday
93325,20140709,2014-07-09,True,Wednesday
93357,20140710,2014-07-10,True,Thursday
93388,20140711,2014-07-11,True,Friday


In [1806]:
def date_to_id(date):
    return date.strftime('%Y%m%d')

In [1807]:
for column in ['OrderDate','DueDate','ShipDate']:
    SalesOrderHeader[column] = SalesOrderHeader[column].apply(date_to_id)
    SalesOrderHeader[column] = SalesOrderHeader[column].astype('int64')
    SalesOrderHeader.rename(columns={column: column+'ID'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader[column] = SalesOrderHeader[column].apply(date_to_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader[column] = SalesOrderHeader[column].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SalesOrderHeader.rename(columns={column: column+'ID'}, inplace=True)
A value is trying to be set on a copy of a slice from a

In [1808]:
SalesOrderHeader.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31150 entries, 0 to 31464
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SalesOrderID     31150 non-null  int64  
 1   OrderDateID      31150 non-null  int64  
 2   DueDateID        31150 non-null  int64  
 3   ShipDateID       31150 non-null  int64  
 4   CustomerID       31150 non-null  int64  
 5   ShipToAddressID  31150 non-null  int64  
 6   TaxAmt           31150 non-null  float64
 7   Freight          31150 non-null  float64
 8   OnlineOrderFlag  31150 non-null  bool   
 9   Status           31150 non-null  int64  
dtypes: bool(1), float64(2), int64(7)
memory usage: 2.4 MB


**6.SalesOrderDetail**

In [1809]:
SalesOrderDetail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121317 entries, 0 to 121316
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   SalesOrderID           121317 non-null  int64  
 1   SalesOrderDetailID     121317 non-null  int64  
 2   CarrierTrackingNumber  60919 non-null   object 
 3   OrderQty               121302 non-null  float64
 4   ProductID              121317 non-null  int64  
 5   SpecialOfferID         121317 non-null  int64  
 6   UnitPrice              121314 non-null  object 
 7   UnitPriceDiscount      121317 non-null  object 
 8   LineTotal              121317 non-null  float64
 9   rowguid                121317 non-null  object 
 10  ModifiedDate           121317 non-null  object 
dtypes: float64(2), int64(4), object(5)
memory usage: 10.2+ MB


In [1810]:
selected_columns = ['SalesOrderID', 'SalesOrderDetailID','OrderQty','ProductID','UnitPrice','UnitPriceDiscount','LineTotal']
SalesOrderDetail = SalesOrderDetail.loc[:, selected_columns]

In [1811]:
SalesOrderDetail['UnitPrice'] = SalesOrderDetail['UnitPrice'].astype('str').str.replace(',', '.').astype(float)
SalesOrderDetail['LineTotal'] = SalesOrderDetail['LineTotal'].astype('str').str.replace(',', '.').astype(float)
SalesOrderDetail['UnitPriceDiscount'] = SalesOrderDetail['UnitPriceDiscount'].astype('str').str.replace(',', '.').astype(float)

In [1812]:
def get_most_frequent_price(product_id):
    return SalesOrderDetail[SalesOrderDetail['ProductID'] == product_id]['UnitPrice'].mode()

In [1813]:
SalesOrderDetail.loc[(SalesOrderDetail['UnitPrice'] <= 0)| (SalesOrderDetail['UnitPrice'].isna())  ,'UnitPrice'] = SalesOrderDetail.loc[(SalesOrderDetail['UnitPrice'] <= 0)| (SalesOrderDetail['UnitPrice'].isna())]['ProductID'].apply(get_most_frequent_price)

In [1814]:
SalesOrderDetail.loc[(SalesOrderDetail['OrderQty'] <= 0),'OrderQty'] = - SalesOrderDetail.loc[(SalesOrderDetail['OrderQty'] <= 0)]['OrderQty']

In [1815]:
SalesOrderDetail = SalesOrderDetail.drop_duplicates().dropna()

In [1816]:
SalesOrderDetail['LineTotal']  = (SalesOrderDetail['OrderQty'] * SalesOrderDetail["UnitPrice"]*(1-SalesOrderDetail['UnitPriceDiscount']))

In [1817]:
SalesOrderDetail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121268 entries, 0 to 121316
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   SalesOrderID        121268 non-null  int64  
 1   SalesOrderDetailID  121268 non-null  int64  
 2   OrderQty            121268 non-null  float64
 3   ProductID           121268 non-null  int64  
 4   UnitPrice           121268 non-null  float64
 5   UnitPriceDiscount   121268 non-null  float64
 6   LineTotal           121268 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 7.4 MB


In [1818]:
SalesOrderDetail.rename(columns={'UnitPriceDiscount': 'PercentDiscount'}, inplace=True)

In [1819]:
SalesOrderDetail.rename(columns={'SalesOrderDetailID': 'SaleItemFactID'}, inplace=True)

In [1820]:
SalesOrderDetail.to_csv(os.path.join(output_folder_path,'SaleItemFact.csv'),index=False)

In [1821]:
# Assuming 'LineTotal' is the column representing the line total in the merged DataFrame 'SalesOrderHeader'
# Step 1: Aggregate the sum of 'LineTotal' for each 'SalesOrderID'
aggregate_totals = SalesOrderDetail.groupby('SalesOrderID')['LineTotal'].sum().rename('SubTotal').reset_index()

# Step 2: Merge this aggregated data back with the original 'SalesOrderHeader' DataFrame
SalesOrderHeader = pd.merge(SalesOrderHeader, aggregate_totals, on='SalesOrderID', how='inner')
# This will give you a DataFrame 'aggregated_sales_order' with two columns: 'SalesOrderID' and the sum of 'LineTotal' for each 'SalesOrderID'.


In [1822]:
SalesOrderHeader

Unnamed: 0,SalesOrderID,OrderDateID,DueDateID,ShipDateID,CustomerID,ShipToAddressID,TaxAmt,Freight,OnlineOrderFlag,Status,SubTotal
0,43659,20110531,20110612,20110607,29825,985,1971.5149,616.0984,False,5,20565.6206
1,43660,20110531,20110612,20110607,29672,921,124.2483,38.8276,False,5,1294.2529
2,43661,20110531,20110612,20110607,29734,517,3153.7696,985.5530,False,5,32726.4786
3,43662,20110531,20110612,20110607,29994,482,2775.1646,867.2389,False,5,28832.5289
4,43663,20110531,20110612,20110607,29565,1073,40.2681,12.5838,False,5,419.4589
...,...,...,...,...,...,...,...,...,...,...,...
31141,75119,20140630,20140712,20140707,11981,17649,3.3824,1.0570,True,5,42.2800
31142,75120,20140630,20140712,20140707,18749,28374,6.7968,2.1240,True,5,84.9600
31143,75121,20140630,20140712,20140707,15251,26553,5.9984,1.8745,True,5,74.9800
31144,75122,20140630,20140712,20140707,15868,14616,2.4776,0.7743,True,5,30.9700


In [1827]:
SalesOrderHeader

Unnamed: 0,SalesOrderID,OrderDateID,DueDateID,ShipDateID,CustomerID,ShipToAddressID,SubTotal,TaxAmt,Freight,OnlineOrderFlag
0,43659,20110531,20110612,20110607,29825,985,20565.6206,1971.5149,616.0984,False
1,43660,20110531,20110612,20110607,29672,921,1294.2529,124.2483,38.8276,False
2,43661,20110531,20110612,20110607,29734,517,32726.4786,3153.7696,985.5530,False
3,43662,20110531,20110612,20110607,29994,482,28832.5289,2775.1646,867.2389,False
4,43663,20110531,20110612,20110607,29565,1073,419.4589,40.2681,12.5838,False
...,...,...,...,...,...,...,...,...,...,...
31141,75119,20140630,20140712,20140707,11981,17649,42.2800,3.3824,1.0570,True
31142,75120,20140630,20140712,20140707,18749,28374,84.9600,6.7968,2.1240,True
31143,75121,20140630,20140712,20140707,15251,26553,74.9800,5.9984,1.8745,True
31144,75122,20140630,20140712,20140707,15868,14616,30.9700,2.4776,0.7743,True


In [1826]:
selected_columns = ['SalesOrderID', 'OrderDateID', 'DueDateID','ShipDateID','CustomerID','ShipToAddressID','SubTotal','TaxAmt','Freight','OnlineOrderFlag']
SalesOrderHeader = SalesOrderHeader.loc[:, selected_columns]

In [None]:
SalesOrderHeader.to_csv(os.path.join(output_folder_path,'OrderDim.csv'),index=False)