In [542]:
import pandas as pd
import matplotlib.pyplot as plt
import os

*Importing the tables*

In [543]:
# Directory holding the raw CSV extracts. The default is the original location;
# set DATA_WAREHOUSE_TABLES_DIR to run the notebook against another copy of the data.
tables_dir = os.environ.get(
    'DATA_WAREHOUSE_TABLES_DIR',
    '/Users/khuenguyen/Desktop/data_warehouse/tables',
)


def read_table(file_name):
    """Read one raw CSV extract from ``tables_dir`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(tables_dir, file_name))


Address = read_table('Address.csv')
CountryRegion = read_table('CountryRegion.csv')
Product = read_table('Product.csv')
ProductSubcategory = read_table('ProductSubcategory.csv')
SalesOrderDetail = read_table('SalesOrderDetail.csv')
SalesOrderHeader = read_table('SalesOrderHeader.csv')
StateProvince = read_table('StateProvince.csv')
ProductCategory = read_table('ProductCategory.csv')
ProductUnitPrice = read_table('Product_UnitPrice.csv')

In [544]:
# Transformed tables are written to a "transformed_tables" folder inside the
# current working directory.
current_directory = os.getcwd()
output_folder_path = os.path.join(current_directory, "transformed_tables")
# exist_ok=True replaces the exists()-then-create check, so there is no race
# between the check and the creation and the cell can be re-run safely.
os.makedirs(output_folder_path, exist_ok=True)
print(output_folder_path)

/Users/khuenguyen/Desktop/data_warehouse/transformed_tables


**1. Address**

In [545]:
# Inspect column dtypes and non-null counts of the raw Address table.
Address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   AddressID        1000 non-null   int64 
 1   AddressLine1     1000 non-null   object
 2   AddressLine2     48 non-null     object
 3   City             1000 non-null   object
 4   StateProvinceID  1000 non-null   int64 
 5   PostalCode       1000 non-null   object
 6   SpatialLocation  1000 non-null   object
 7   rowguid          1000 non-null   object
 8   ModifiedDate     1000 non-null   object
dtypes: int64(2), object(7)
memory usage: 70.4+ KB


Remove unnecessary columns

In [546]:
# Columns that are not carried into the address dimension.
address_unused_columns = ['SpatialLocation', 'rowguid', 'ModifiedDate']
Address = Address.drop(columns=address_unused_columns)

In [547]:
# Column names of Address split by dtype: object (text) columns and int64 columns.
# NOTE(review): neither variable appears to be read later in the notebook — confirm before removing.
object_columns = Address.select_dtypes(include='object').columns
numeric_columns = Address.select_dtypes(include='int64').columns

Check duplicated values

In [548]:
# Number of rows that are exact duplicates of an earlier row.
print(Address.duplicated().sum())

0


Check null values

In [549]:
# Null count per column.
Address.isnull().sum()

AddressID            0
AddressLine1         0
AddressLine2       952
City                 0
StateProvinceID      0
PostalCode           0
dtype: int64

Because "AddressLine2" is only a supplementary line that supports "AddressLine1" (apartment number, block number, etc.), null values in this column are acceptable.

In [550]:
# Re-inspect Address after the unused columns were dropped.
Address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   AddressID        1000 non-null   int64 
 1   AddressLine1     1000 non-null   object
 2   AddressLine2     48 non-null     object
 3   City             1000 non-null   object
 4   StateProvinceID  1000 non-null   int64 
 5   PostalCode       1000 non-null   object
dtypes: int64(2), object(4)
memory usage: 47.0+ KB


In [551]:
# Summary (count / unique / top / freq) of the text columns.
Address.describe(include=['object'])

Unnamed: 0,AddressLine1,AddressLine2,City,PostalCode
count,1000,48,1000,1000
unique,989,45,421,480
top,Horizon Outlet Center,# 14,Seattle,98104
freq,3,2,45,45


In [552]:
# Persist the cleaned address table as the AddressLineDim file.
address_dim_path = os.path.join(output_folder_path, 'AddressLineDim.csv')
Address.to_csv(address_dim_path, index=False)

**2. StateProvince**


In [553]:
# Inspect column dtypes and non-null counts of the raw StateProvince table.
StateProvince.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   StateProvinceID          181 non-null    int64 
 1   StateProvinceCode        181 non-null    object
 2   CountryRegionCode        181 non-null    object
 3   IsOnlyStateProvinceFlag  181 non-null    int64 
 4   Name                     181 non-null    object
 5   TerritoryID              181 non-null    int64 
 6   rowguid                  181 non-null    object
 7   ModifiedDate             181 non-null    object
dtypes: int64(3), object(5)
memory usage: 11.4+ KB


Remove unwanted columns

In [554]:
# Columns that are not carried into the state/province dimension.
state_unused_columns = ['IsOnlyStateProvinceFlag', 'rowguid', 'ModifiedDate']
StateProvince = StateProvince.drop(columns=state_unused_columns)

Check duplicated

In [555]:
# Number of rows that are exact duplicates of an earlier row.
print(StateProvince.duplicated().sum())

0


Check null values

In [556]:
# Null count per column.
StateProvince.isnull().sum()

StateProvinceID      0
StateProvinceCode    0
CountryRegionCode    0
Name                 0
TerritoryID          0
dtype: int64

Convert CountryRegionCode into CountryRegionName

In [557]:
# Lookup from country/region code to its name, taken from the CountryRegion table.
country_name_by_code = CountryRegion.set_index('CountryRegionCode')['Name']
# Replace each code with the matching name; the column itself is renamed in a later cell.
StateProvince['CountryRegionCode'] = StateProvince['CountryRegionCode'].map(country_name_by_code)

In [558]:
# The column now holds country names, so give it a matching label.
StateProvince = StateProvince.rename(
    {'CountryRegionCode': 'CountryRegionName'},
    axis='columns',
)

In [559]:
# Re-inspect StateProvince after dropping columns and mapping the country name.
StateProvince.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   StateProvinceID    181 non-null    int64 
 1   StateProvinceCode  181 non-null    object
 2   CountryRegionName  181 non-null    object
 3   Name               181 non-null    object
 4   TerritoryID        181 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 7.2+ KB


In [560]:
# Preview the first rows of the transformed table.
StateProvince.head()

Unnamed: 0,StateProvinceID,StateProvinceCode,CountryRegionName,Name,TerritoryID
0,1,AB,Canada,Alberta,6
1,2,AK,United States,Alaska,1
2,3,AL,United States,Alabama,5
3,4,AR,United States,Arkansas,3
4,5,AS,American Samoa,American Samoa,1


In [561]:
# Persist the transformed table as the StateProvinceDim file.
state_dim_path = os.path.join(output_folder_path, 'StateProvinceDim.csv')
StateProvince.to_csv(state_dim_path, index=False)

**3.Product**

In [562]:
# Inspect column dtypes and non-null counts of the raw Product table.
Product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ProductID              504 non-null    int64  
 1   Name                   504 non-null    object 
 2   ProductNumber          504 non-null    object 
 3   MakeFlag               504 non-null    int64  
 4   FinishedGoodsFlag      504 non-null    int64  
 5   Color                  256 non-null    object 
 6   SafetyStockLevel       504 non-null    int64  
 7   ReorderPoint           504 non-null    int64  
 8   StandardCost           504 non-null    object 
 9   ListPrice              504 non-null    object 
 10  Size                   211 non-null    object 
 11  SizeUnitMeasureCode    176 non-null    object 
 12  WeightUnitMeasureCode  205 non-null    object 
 13  Weight                 205 non-null    float64
 14  DaysToManufacture      504 non-null    int64  
 15  Produc

Remove unwanted columns

In [563]:
# Keep only the attributes used by the product dimension.
selected_columns = [
    'ProductID',
    'Name',
    'Color',
    'ProductSubcategoryID',
    'DaysToManufacture',
]
Product = Product[selected_columns]


Check duplicated values

In [564]:
# Number of rows that are exact duplicates of an earlier row.
print(Product.duplicated().sum())

0


Check null values

In [565]:
# Null count per column.
Product.isnull().sum()

ProductID                 0
Name                      0
Color                   248
ProductSubcategoryID    209
DaysToManufacture         0
dtype: int64

Some products don't belong to any subcategory, so their "ProductSubcategoryID" is null; we will keep those nulls.
For Color, we will impute missing values with the most frequently occurring color.

In [566]:
# mode() returns the most common value(s) in sorted order; take the first one.
color_modes = Product['Color'].mode()
most_frequent_color = color_modes.iloc[0]
print(most_frequent_color)

Black


In [567]:
# Show the products whose Color is 'Black'.
Product[Product['Color']=='Black']

Unnamed: 0,ProductID,Name,Color,ProductSubcategoryID,DaysToManufacture
5,317,LL Crankarm,Black,,0
6,318,ML Crankarm,Black,,0
7,319,HL Crankarm,Black,,0
10,322,Chainring,Black,,0
209,680,"HL Road Frame - Black, 58",Black,14.0,1
...,...,...,...,...,...
496,992,"Mountain-500 Black, 48",Black,1.0,4
497,993,"Mountain-500 Black, 52",Black,1.0,4
501,997,"Road-750 Black, 44",Black,2.0,4
502,998,"Road-750 Black, 48",Black,2.0,4


In [568]:
# Re-inspect Product after column selection (Color still contains nulls here).
Product.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ProductID             504 non-null    int64  
 1   Name                  504 non-null    object 
 2   Color                 256 non-null    object 
 3   ProductSubcategoryID  295 non-null    float64
 4   DaysToManufacture     504 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 19.8+ KB


As we can see, Black appears in 93 of the 256 rows that have a non-null color, which is a significant share. For that reason, we will impute the null values with the most frequent color, Black.

In [569]:
# Fill missing Color values with the most frequent color found above.
Product = Product.fillna({'Color': most_frequent_color})

In [570]:
# ProductSubcategoryID is float64 only because it contains nulls, so a plain export
# writes keys as "14.0". Export through the nullable Int64 dtype so keys are written
# as "14", matching the integer ProductSubcategoryID in the subcategory dimension.
# Nulls are still written as empty fields. Product itself is left unchanged.
product_dim = Product.astype({'ProductSubcategoryID': 'Int64'})
product_dim.to_csv(os.path.join(output_folder_path, 'ProductDim.csv'), index=False)

**4.ProductSubcategory**

In [571]:
# Inspect column dtypes and non-null counts of the raw ProductSubcategory table.
ProductSubcategory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ProductSubcategoryID  37 non-null     int64 
 1   ProductCategoryID     37 non-null     int64 
 2   Name                  37 non-null     object
 3   rowguid               37 non-null     object
 4   ModifiedDate          37 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.6+ KB


In [572]:
# Preview the first rows of the raw table.
ProductSubcategory.head()

Unnamed: 0,ProductSubcategoryID,ProductCategoryID,Name,rowguid,ModifiedDate
0,1,1,Mountain Bikes,2d364ade-264a-433c-b092-4fcbf3804e01,2008-04-30 00:00:00.000
1,2,1,Road Bikes,000310c0-bcc8-42c4-b0c3-45ae611af06b,2008-04-30 00:00:00.000
2,3,1,Touring Bikes,02c5061d-ecdc-4274-b5f1-e91d76bc3f37,2008-04-30 00:00:00.000
3,4,2,Handlebars,3ef2c725-7135-4c85-9ae6-ae9a3bdd9283,2008-04-30 00:00:00.000
4,5,2,Bottom Brackets,a9e54089-8a1e-4cf5-8646-e3801f685934,2008-04-30 00:00:00.000


In [573]:
# Lookup from ProductCategoryID to the category name, taken from ProductCategory.
category_name_by_id = ProductCategory.set_index('ProductCategoryID')['Name']
# Add the category name to each subcategory row.
ProductSubcategory['ProductCategoryName'] = (
    ProductSubcategory['ProductCategoryID'].map(category_name_by_id)
)

Remove unwanted columns

In [574]:
# Keep the key, the subcategory name and the category name, then relabel the
# generic 'Name' column.
selected_columns = ['ProductSubcategoryID', 'Name', 'ProductCategoryName']
ProductSubcategory = (
    ProductSubcategory[selected_columns]
    .rename(columns={'Name': 'ProductSubcategoryName'})
)

In [575]:
# Persist the transformed table as the ProductSubcategoryDim file.
subcategory_dim_path = os.path.join(output_folder_path, 'ProductSubcategoryDim.csv')
ProductSubcategory.to_csv(subcategory_dim_path, index=False)

**5.SalesOrderHeader**

In [576]:
# Inspect column dtypes and non-null counts of the raw SalesOrderHeader table.
SalesOrderHeader.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31465 entries, 0 to 31464
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SalesOrderID            31465 non-null  int64  
 1   RevisionNumber          31465 non-null  int64  
 2   OrderDate               31465 non-null  object 
 3   DueDate                 31465 non-null  object 
 4   ShipDate                31465 non-null  object 
 5   Status                  31465 non-null  int64  
 6   OnlineOrderFlag         31465 non-null  int64  
 7   SalesOrderNumber        31465 non-null  object 
 8   PurchaseOrderNumber     3806 non-null   object 
 9   AccountNumber           31465 non-null  object 
 10  CustomerID              31465 non-null  int64  
 11  SalesPersonID           3806 non-null   float64
 12  TerritoryID             31465 non-null  int64  
 13  BillToAddressID         31465 non-null  int64  
 14  ShipToAddressID         31465 non-null

Remove unwanted columns

In [577]:
# Keep only the order attributes used downstream.
selected_columns = [
    'SalesOrderID',
    'OrderDate',
    'DueDate',
    'ShipDate',
    'CustomerID',
    'ShipToAddressID',
    'SubTotal',
    'TaxAmt',
    'Freight',
    'OnlineOrderFlag',
]
SalesOrderHeader = SalesOrderHeader[selected_columns]

In [578]:
# Turn ',' into '.' in the text form of each amount and parse the result as float.
# Only TaxAmt and Freight are converted here; SubTotal is left as loaded.
for amount_column in ('TaxAmt', 'Freight'):
    amount_text = SalesOrderHeader[amount_column].astype('str')
    SalesOrderHeader[amount_column] = amount_text.str.replace(',', '.').astype(float)


Check null values and duplicated

In [579]:
# Null count per column.
SalesOrderHeader.isnull().sum()

SalesOrderID       0
OrderDate          0
DueDate            0
ShipDate           0
CustomerID         0
ShipToAddressID    0
SubTotal           0
TaxAmt             0
Freight            0
OnlineOrderFlag    0
dtype: int64

In [580]:
# Number of rows that are exact duplicates of an earlier row.
print(SalesOrderHeader.duplicated().sum())

0


In [581]:
# Orders whose OrderDate is not strictly before DueDate or ShipDate.
# The date columns are not yet converted with pd.to_datetime at this point.
# NOTE(review): presumably they are still text here and the comparison relies on the
# fixed 'YYYY-MM-DD hh:mm:ss' layout sorting chronologically — confirm.
SalesOrderHeader[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])]


Unnamed: 0,SalesOrderID,OrderDate,DueDate,ShipDate,CustomerID,ShipToAddressID,SubTotal,TaxAmt,Freight,OnlineOrderFlag
21,43680,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29489,1069,0,1093.6394,341.7623,0
22,43681,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29661,955,0,1323.0668,413.4584,0


In [582]:
# Same filter as the previous cell (orders with OrderDate >= DueDate or ShipDate), displayed again.
SalesOrderHeader[(SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate']) | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])]


Unnamed: 0,SalesOrderID,OrderDate,DueDate,ShipDate,CustomerID,ShipToAddressID,SubTotal,TaxAmt,Freight,OnlineOrderFlag
21,43680,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29489,1069,0,1093.6394,341.7623,0
22,43681,2012-05-31 00:00:00.000,2011-06-12 00:00:00.000,2011-06-07 00:00:00.000,29661,955,0,1323.0668,413.4584,0


In [583]:
# Parse the three date columns into datetime values.
for date_column in ('OrderDate', 'DueDate', 'ShipDate'):
    SalesOrderHeader[date_column] = pd.to_datetime(SalesOrderHeader[date_column])

Examine the difference between OrderDate and DueDate

In [584]:
# Distribution of the interval between OrderDate and DueDate.
(SalesOrderHeader['DueDate'] - SalesOrderHeader['OrderDate']).value_counts()


12 days      31454
13 days          9
-354 days        2
Name: count, dtype: int64

In [585]:
# Distribution of the interval between OrderDate and ShipDate.
(SalesOrderHeader['ShipDate'] - SalesOrderHeader['OrderDate']).value_counts()

7 days       31454
8 days           9
-359 days        2
Name: count, dtype: int64

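Two rows have a negative time interval. For the majority of orders the DueDate is 12 days (occasionally 13) after the OrderDate, and the ShipDate is 7 days (occasionally 8) after it. Therefore, for the two rows with negative intervals we reconstruct the OrderDate by subtracting 7 days from the ShipDate (which, for these rows, gives the same date as subtracting 12 days from the DueDate).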

In [586]:
# Rows whose OrderDate is on or after the DueDate or the ShipDate.
order_date_invalid = (
    (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['DueDate'])
    | (SalesOrderHeader['OrderDate'] >= SalesOrderHeader['ShipDate'])
)
# Rebuild the OrderDate of those rows as ShipDate minus 7 days.
corrected_order_dates = SalesOrderHeader.loc[order_date_invalid, 'ShipDate'] - pd.Timedelta(days=7)
SalesOrderHeader.loc[order_date_invalid, 'OrderDate'] = corrected_order_dates


Create Date dimension

    Extract unique date values

In [587]:
# Pool every order, due and ship date into one sorted series.
all_dates = pd.concat(
    [SalesOrderHeader[name] for name in ('OrderDate', 'DueDate', 'ShipDate')]
).sort_values().reset_index(drop=True)

# Build the date attributes for every pooled value, then collapse repeated rows.
date_records = pd.DataFrame({
    'DateID': all_dates.dt.strftime('%Y%m%d').astype('int64'),  # date written as a YYYYMMDD integer
    'FullDate': all_dates.dt.strftime('%Y-%m-%d'),  # date as a 'YYYY-MM-DD' string
    'IsWeekDay': all_dates.dt.weekday < 5,  # weekday numbers 0-4 are Monday-Friday
    'DayOfWeek': all_dates.dt.day_name(),
}).drop_duplicates()

date_dim_path = os.path.join(output_folder_path, 'DateDim.csv')
date_records.to_csv(date_dim_path, index=False)

In [588]:
# Display the date dimension built above.
date_records

Unnamed: 0,DateID,FullDate,IsWeekDay,DayOfWeek
0,20110531,2011-05-31,True,Tuesday
43,20110601,2011-06-01,True,Wednesday
47,20110602,2011-06-02,True,Thursday
52,20110603,2011-06-03,True,Friday
54,20110604,2011-06-04,False,Saturday
...,...,...,...,...
94240,20140708,2014-07-08,True,Tuesday
94269,20140709,2014-07-09,True,Wednesday
94301,20140710,2014-07-10,True,Thursday
94332,20140711,2014-07-11,True,Friday


In [589]:
def date_to_id(date):
    """Return ``date`` formatted as a 'YYYYMMDD' string (e.g. 2011-05-31 -> '20110531').

    ``date`` may be any object providing ``strftime``.
    """
    date_key_format = '%Y%m%d'
    return date.strftime(date_key_format)

In [590]:
# Replace each date column by its integer YYYYMMDD key and give it an 'ID' suffix.
date_key_names = {name: name + 'ID' for name in ('OrderDate', 'DueDate', 'ShipDate')}
for source_name in date_key_names:
    SalesOrderHeader[source_name] = (
        SalesOrderHeader[source_name].apply(date_to_id).astype('int64')
    )
SalesOrderHeader = SalesOrderHeader.rename(columns=date_key_names)

In [591]:
# Re-inspect SalesOrderHeader after the date columns were replaced by integer keys.
SalesOrderHeader.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31465 entries, 0 to 31464
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SalesOrderID     31465 non-null  int64  
 1   OrderDateID      31465 non-null  int64  
 2   DueDateID        31465 non-null  int64  
 3   ShipDateID       31465 non-null  int64  
 4   CustomerID       31465 non-null  int64  
 5   ShipToAddressID  31465 non-null  int64  
 6   SubTotal         31465 non-null  object 
 7   TaxAmt           31465 non-null  float64
 8   Freight          31465 non-null  float64
 9   OnlineOrderFlag  31465 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 2.4+ MB


**6.SalesOrderDetail**

In [592]:
# Display the raw SalesOrderDetail table.
SalesOrderDetail

Unnamed: 0,SalesOrderID,SalesOrderDetailID,CarrierTrackingNumber,OrderQty,ProductID,SpecialOfferID,UnitPrice,UnitPriceDiscount,LineTotal,rowguid,ModifiedDate
0,43659,1,4911-403C-98,1.0,776,1,2024994,000,2024.994,b207c96d-d9e6-402b-8470-2cc176c42283,2011-05-31 00:00:00.000
1,43659,2,4911-403C-98,3.0,777,1,2024994,000,6074.982,7abb600d-1e77-41be-9fe5-b9142cfc08fa,2011-05-31 00:00:00.000
2,43659,3,4911-403C-98,1.0,778,1,2024994,000,2024.994,475cf8c6-49f6-486e-b0ad-afc6a50cdd2f,2011-05-31 00:00:00.000
3,43659,4,4911-403C-98,1.0,771,1,2039994,000,2039.994,04c4de91-5815-45d6-8670-f462719fbce3,2011-05-31 00:00:00.000
4,43659,5,4911-403C-98,1.0,772,1,2039994,000,2039.994,5a74c7d2-e641-438e-a7ac-37bf23280301,2011-05-31 00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...
121312,75122,121313,,1.0,878,1,2198,000,21.980,8cad6675-18cc-4f47-8287-97b41a8ee47d,2014-06-30 00:00:00.000
121313,75122,121314,,1.0,712,1,899,000,8.990,84f1c363-1c50-4442-be16-541c59b6e12c,2014-06-30 00:00:00.000
121314,75123,121315,,1.0,878,1,2198,000,21.980,c18b6476-429f-4bb1-828e-2be5f82a0a51,2014-06-30 00:00:00.000
121315,75123,121316,,1.0,879,1,15900,000,159.000,75a89c6a-c60a-47ea-8a52-b52a9c435b64,2014-06-30 00:00:00.000


In [593]:
# Keep only the line-item attributes used downstream.
selected_columns = [
    'SalesOrderID',
    'SalesOrderDetailID',
    'OrderQty',
    'ProductID',
    'UnitPrice',
    'UnitPriceDiscount',
    'LineTotal',
]
SalesOrderDetail = SalesOrderDetail[selected_columns]

In [594]:
# Turn ',' into '.' in the text form of each numeric column and parse it as float.
# Every column is converted from its own values; previously LineTotal was
# overwritten with a copy of UnitPrice because of a copy-paste slip.
for numeric_column in ['UnitPrice', 'LineTotal', 'UnitPriceDiscount']:
    SalesOrderDetail[numeric_column] = (
        SalesOrderDetail[numeric_column].astype('str').str.replace(',', '.').astype(float)
    )

In [595]:
def get_most_frequent_price(product_id, details=None):
    """Return the most frequent valid UnitPrice recorded for ``product_id``.

    Only strictly positive, non-null prices are counted, so the invalid values this
    function is used to replace cannot become the answer themselves.

    Parameters
    ----------
    product_id
        Value to look up in the ``ProductID`` column.
    details : DataFrame, optional
        Frame with ``ProductID`` and ``UnitPrice`` columns. Defaults to the
        notebook-level ``SalesOrderDetail`` table.

    Returns
    -------
    A single price (the smallest one when several are tied), or NaN when the
    product has no valid price. A scalar is returned, rather than the whole
    ``mode()`` Series, so the function can be used with ``Series.apply``.
    """
    if details is None:
        details = SalesOrderDetail
    product_prices = details.loc[details['ProductID'] == product_id, 'UnitPrice']
    # NaN > 0 is False, so missing prices are excluded together with prices <= 0.
    valid_prices = product_prices[product_prices > 0]
    price_modes = valid_prices.mode()  # sorted ascending; empty when nothing is valid
    if price_modes.empty:
        return float('nan')
    return price_modes.iloc[0]

In [596]:
# Rows whose UnitPrice is missing or not positive.
unit_price_invalid = (SalesOrderDetail['UnitPrice'] <= 0) | (SalesOrderDetail['UnitPrice'].isna())
# Look up a replacement price per affected row from its ProductID.
replacement_prices = SalesOrderDetail.loc[unit_price_invalid, 'ProductID'].apply(get_most_frequent_price)
SalesOrderDetail.loc[unit_price_invalid, 'UnitPrice'] = replacement_prices

In [597]:
# Flip the sign of every quantity that is zero or negative.
qty_not_positive = SalesOrderDetail['OrderQty'] <= 0
SalesOrderDetail.loc[qty_not_positive, 'OrderQty'] = -SalesOrderDetail.loc[qty_not_positive, 'OrderQty']

In [598]:
# Rows of product 715 that still have no UnitPrice. Each comparison is parenthesised
# because `&` binds tighter than `==`; without the parentheses the expression was
# evaluated as ProductID == (715 & isna-mask).
SalesOrderDetail[(SalesOrderDetail['ProductID'] == 715) & (SalesOrderDetail['UnitPrice'].isna())]

Unnamed: 0,SalesOrderID,SalesOrderDetailID,OrderQty,ProductID,UnitPrice,UnitPriceDiscount,LineTotal


In [599]:
# Remove exact duplicate rows, then any row that still has a null in any column.
SalesOrderDetail = SalesOrderDetail.drop_duplicates()
SalesOrderDetail = SalesOrderDetail.dropna()

In [600]:
# LineTotal = quantity x unit price x (1 - discount fraction).
discount_multiplier = 1 - SalesOrderDetail['UnitPriceDiscount']
gross_amount = SalesOrderDetail['OrderQty'] * SalesOrderDetail["UnitPrice"]
SalesOrderDetail['LineTotal'] = gross_amount * discount_multiplier

In [601]:
# Inspect SalesOrderDetail after cleaning and recomputing LineTotal.
SalesOrderDetail.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121268 entries, 0 to 121316
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   SalesOrderID        121268 non-null  int64  
 1   SalesOrderDetailID  121268 non-null  int64  
 2   OrderQty            121268 non-null  float64
 3   ProductID           121268 non-null  int64  
 4   UnitPrice           121268 non-null  float64
 5   UnitPriceDiscount   121268 non-null  float64
 6   LineTotal           121268 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 7.4 MB


In [602]:
# Relabel the discount column for the fact table.
SalesOrderDetail = SalesOrderDetail.rename(columns={'UnitPriceDiscount': 'PercentDiscount'})

In [603]:
# Relabel the detail key as the fact table's key.
SalesOrderDetail = SalesOrderDetail.rename(columns={'SalesOrderDetailID': 'SaleItemFactID'})

In [604]:
# Persist the cleaned line items as the SaleItemFact file.
sale_item_fact_path = os.path.join(output_folder_path, 'SaleItemFact.csv')
SalesOrderDetail.to_csv(sale_item_fact_path, index=False)

In [605]:
# Inner join on SalesOrderID: one row per (order, line item); orders with no
# remaining line items are not kept.
SalesOrderHeader = SalesOrderHeader.merge(
    SalesOrderDetail,
    how='inner',
    on='SalesOrderID',
)

In [606]:
# Back to one row per order: keep the first row seen for each SalesOrderID.
SalesOrderHeader = SalesOrderHeader.drop_duplicates(
    subset=['SalesOrderID'],
    keep='first',
)

In [607]:
# An order's subtotal is the sum of the LineTotal of all its line items. The
# LineTotal column present on SalesOrderHeader at this point belongs only to the
# first line item kept by drop_duplicates, so the sum is taken from the cleaned
# SalesOrderDetail table and looked up by SalesOrderID.
order_subtotals = SalesOrderDetail.groupby('SalesOrderID')['LineTotal'].sum()
SalesOrderHeader['Subtotal'] = SalesOrderHeader['SalesOrderID'].map(order_subtotals)

In [608]:
# Final column set of the order table; the line-item columns from the merge are dropped.
selected_columns = [
    'SalesOrderID',
    'OrderDateID',
    'DueDateID',
    'ShipDateID',
    'CustomerID',
    'ShipToAddressID',
    'Subtotal',
    'TaxAmt',
    'Freight',
    'OnlineOrderFlag',
]
SalesOrderHeader = SalesOrderHeader[selected_columns]

In [609]:
# Persist the order table as the OrderDim file.
order_dim_path = os.path.join(output_folder_path, 'OrderDim.csv')
SalesOrderHeader.to_csv(order_dim_path, index=False)