# - Data Warehouse -

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [2]:
import pandas as pd
import pyodbc

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

#### 1 - Opbouwen van dataframes voor elke database-tabel

In [5]:
# AenC: pull the order headers first, then their line items (same database).
aenc_order_df, aenc_order_item_df = (
    run_query(sql, "AenC")
    for sql in (
        "SELECT * FROM sales_order",
        "SELECT * FROM sales_order_item",
    )
)

# AdventureWorks: purchase-order detail lines.
aw_order_df = run_query(
    "SELECT * FROM Purchasing.PurchaseOrderDetail",
    "AdventureWorks",
)

# Northwind: order detail lines (table name contains a space, hence the brackets).
nw_order_df = run_query(
    "SELECT * FROM [Order Details]",
    "Northwind",
)

Je kunt vergelijkbare queries uitvoeren voor andere tabellen die je wilt opnemen.

#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [62]:
# DDL for the warehouse target table `OrderData`.
# The OBJECT_ID guard makes this cell safe under "Restart & Run All": a plain
# CREATE TABLE raises as soon as the table already exists from an earlier run.
create_table_query = """
IF OBJECT_ID(N'OrderData', N'U') IS NULL
BEGIN
    CREATE TABLE OrderData (
        IDSK INT PRIMARY KEY,
        id VARCHAR(255),
        cust_id VARCHAR(255),
        order_date DATE,
        region VARCHAR(255),
        sales_rep VARCHAR(255),
        line_id INT,
        prod_id VARCHAR(255),
        quantity INT,
        ship_date DATE,
        LineTotal DECIMAL(10, 2),
        ReceivedQty DECIMAL(10, 1),  -- Adjust precision and scale as needed
        RejectedQty DECIMAL(10, 1),  -- Adjust precision and scale as needed
        StockedQty DECIMAL(10, 1),
        ModifiedDate DATE,
        OrderID INT,
        ProductID_y INT,
        UnitPrice_y DECIMAL(10, 2),
        Discount DECIMAL(5, 2)
    );
END
"""

# Create the table in SQL Server (only if it is not there yet).
db_name = "test"
conn, cursor = create_connection(db_name)
try:
    cursor.execute(create_table_query)
    conn.commit()
finally:
    # Always release the connection, also when the DDL statement fails.
    conn.close()

### Eerste dataframe (AenC)

In [47]:
# Inner-join the AenC order headers with their order items on the shared 'id'.
merged_df = aenc_order_df.merge(aenc_order_item_df, on='id', how='inner')

# Store 'id' as object dtype and give the frame a fresh 0..n-1 index.
merged_df = (
    merged_df
    .assign(id=merged_df['id'].astype('object'))
    .reset_index(drop=True)
)

# Shift the index so it starts at 1 and label it 'IDSK'.
merged_df.index = (merged_df.index + 1).rename('IDSK')

merged_df

Unnamed: 0_level_0,id,cust_id,order_date,region,sales_rep,line_id,prod_id,quantity,ship_date
IDSK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,1,300,12,15-Sep-1996 12:00:00 AM
2,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM
3,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,3,302,12,14-Sep-1996 12:00:00 AM
4,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,1,400,24,18-Sep-1996 12:00:00 AM
5,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,2,401,24,18-Sep-1996 12:00:00 AM
...,...,...,...,...,...,...,...,...,...
1099,2652,174,26-Nov-1998 12:00:00 AM,Central,902,2,300,10,01-Dec-1998 12:00:00 AM
1100,2653,102,17-Jan-1999 12:00:00 AM,Eastern,148,1,301,0,17-Oct-1999 12:00:00 AM
1101,2654,105,26-Feb-1999 12:00:00 AM,South,148,1,301,15,29-Oct-1999 12:00:00 AM
1102,2654,105,26-Feb-1999 12:00:00 AM,South,148,2,302,3,29-Oct-1999 12:00:00 AM


### Tweede dataframe (AdventureWorks)

In [49]:
# Join every AenC order line to the AdventureWorks purchase-order detail rows
# whose PurchaseOrderID equals the line's IDSK (the index level of merged_df).
merged_df2 = pd.merge(merged_df, aw_order_df, left_on='IDSK', right_on='PurchaseOrderID', how='inner') # type: ignore

merged_df2['id'] = merged_df2['id'].astype('object')

# Renumber the rows starting from 1
merged_df2 = merged_df2.reset_index(drop=True)
merged_df2.index = merged_df2.index + 1

# Name the new index 'IDSK'. The original cell set the name on `merged_df`
# (already named) instead, which left this frame's index unnamed.
merged_df2.index.name = 'IDSK'

merged_df2


Unnamed: 0,id,cust_id,order_date,region,sales_rep,line_id,prod_id,quantity,ship_date,PurchaseOrderID,PurchaseOrderDetailID,DueDate,OrderQty,ProductID,UnitPrice,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate
1,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,1,300,12,15-Sep-1996 12:00:00 AM,1,1,2011-04-30,4,1,50.2600,201.0400,3.0,0.0,3.0,2011-04-23
2,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM,2,2,2011-04-30,3,359,45.1200,135.3600,3.0,0.0,3.0,2011-04-23
3,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM,2,3,2011-04-30,3,360,45.5805,136.7415,3.0,0.0,3.0,2011-04-23
4,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,3,302,12,14-Sep-1996 12:00:00 AM,3,4,2011-04-30,550,530,16.0860,8847.3000,550.0,0.0,550.0,2011-04-23
5,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,1,400,24,18-Sep-1996 12:00:00 AM,4,5,2011-04-30,3,4,57.0255,171.0765,2.0,1.0,1.0,2011-04-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484,2653,102,17-Jan-1999 12:00:00 AM,Eastern,148,1,301,0,17-Oct-1999 12:00:00 AM,1100,2484,2013-11-20,3,504,48.7620,146.2860,3.0,0.0,3.0,2013-11-13
2485,2654,105,26-Feb-1999 12:00:00 AM,South,148,1,301,15,29-Oct-1999 12:00:00 AM,1101,2485,2013-11-21,3,497,34.1880,102.5640,3.0,0.0,3.0,2013-11-14
2486,2654,105,26-Feb-1999 12:00:00 AM,South,148,2,302,3,29-Oct-1999 12:00:00 AM,1102,2486,2013-11-21,3,323,50.2635,150.7905,3.0,0.0,3.0,2013-11-14
2487,2654,105,26-Feb-1999 12:00:00 AM,South,148,3,400,8,29-Oct-1999 12:00:00 AM,1103,2487,2013-11-21,550,523,10.7310,5902.0500,550.0,0.0,550.0,2013-11-14


### Derde dataframe (Northwind)

In [46]:

# Cast the Northwind key to object dtype before joining on it.
nw_order_df['OrderID'] = nw_order_df['OrderID'].astype('object')

# Join on PurchaseOrderID == OrderID; only rows present on both sides survive.
merged_df_def = pd.merge(merged_df2, nw_order_df, left_on='PurchaseOrderID', right_on='OrderID', how='inner') # type: ignore

# Renumber the rows starting from 1
merged_df_def = merged_df_def.reset_index(drop=True)
merged_df_def.index = merged_df_def.index + 1

# Name the new index 'IDSK'. The original cell set the name on `merged_df`
# instead, which left this frame's index unnamed.
merged_df_def.index.name = 'IDSK'

merged_df_def


Unnamed: 0,id,cust_id,order_date,region,sales_rep,line_id,prod_id,quantity,ship_date,PurchaseOrderID,...,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate,OrderID,ProductID_y,UnitPrice_y,Quantity,Discount


In [56]:
# Rebuilds merged_df, merged_df2 and merged_df_def from the source frames,
# this time joining on row position via a surrogate key 'IDSK' instead of on
# business keys. The names assigned here replace any earlier values.
# NOTE(review): pandas is already imported in the notebook's import cell, so
# this re-import is redundant.
import pandas as pd

# Inner-join the AenC order headers with their order items on 'id'
merged_df = pd.merge(aenc_order_df, aenc_order_item_df, on='id', how='inner')

# Convert 'id' column to object type
merged_df['id'] = merged_df['id'].astype('object')

# Reset the index and start from 1
merged_df.reset_index(drop=True, inplace=True)
merged_df.index = merged_df.index + 1

# Expose the 1-based row number as a regular column 'IDSK'
merged_df['IDSK'] = merged_df.index

# Join the AdventureWorks rows: left 'IDSK' value against aw_order_df's own index.
# NOTE(review): IDSK starts at 1; if aw_order_df carries a default 0-based index,
# IDSK=1 pairs with its second row and its first row is never used. The displayed
# output (row 1 shows LineTotal 135.3600) is consistent with that. TODO confirm
# whether this offset is intended.
merged_df2 = pd.merge(merged_df, aw_order_df, left_on='IDSK', right_index=True, how='inner')

# Join the Northwind rows the same way (left 'IDSK' against nw_order_df's index).
# This adds an 'IDSK' column to nw_order_df in place, so the frame loaded in the
# extract cell is modified by running this cell.
nw_order_df['IDSK'] = nw_order_df.index
merged_df_def = pd.merge(merged_df2, nw_order_df, left_on='IDSK', right_index=True, how='inner')

# Both inputs carry an 'IDSK' column, so the merge yields 'IDSK_x' / 'IDSK_y';
# drop those two. The displayed output still lists a plain 'IDSK' column after
# this drop, and the load cell reads row['IDSK'] - presumably pandas re-adds the
# join key under its original name; verify before changing this merge.
merged_df_def.drop(columns=['IDSK_x', 'IDSK_y'], inplace=True)

# Display the final merged dataframe
merged_df_def

# List of columns to drop (TODO: optionally drop more columns here to clean up the frame)
#columns_to_drop = ['', '', '']

# Drop the specified columns
#merged_df_def.drop(columns=columns_to_drop, inplace=True)



Unnamed: 0,IDSK,id,cust_id,order_date,region,sales_rep,line_id,prod_id,quantity,ship_date,...,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate,OrderID,ProductID_y,UnitPrice_y,Quantity,Discount
1,1,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,1,300,12,15-Sep-1996 12:00:00 AM,...,135.3600,3.0,0.0,3.0,2011-04-23,10248,42,9.80,10,0.0
2,2,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM,...,136.7415,3.0,0.0,3.0,2011-04-23,10248,72,34.80,5,0.0
3,3,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,3,302,12,14-Sep-1996 12:00:00 AM,...,8847.3000,550.0,0.0,550.0,2011-04-23,10249,14,18.60,9,0.0
4,4,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,1,400,24,18-Sep-1996 12:00:00 AM,...,171.0765,2.0,1.0,1.0,2011-04-23,10249,51,42.40,40,0.0
5,5,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,2,401,24,18-Sep-1996 12:00:00 AM,...,20397.3000,550.0,0.0,550.0,2011-05-07,10250,41,7.70,10,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,1099,2652,174,26-Nov-1998 12:00:00 AM,Central,902,2,300,10,01-Dec-1998 12:00:00 AM,...,45558.9750,550.0,0.0,550.0,2013-08-15,10666,29,123.79,36,0.0
1100,1100,2653,102,17-Jan-1999 12:00:00 AM,Eastern,148,1,301,0,17-Oct-1999 12:00:00 AM,...,262.5000,1250.0,0.0,1250.0,2013-08-15,10666,65,21.05,10,0.0
1101,1101,2654,105,26-Feb-1999 12:00:00 AM,South,148,1,301,15,29-Oct-1999 12:00:00 AM,...,262.5000,1188.0,0.0,1188.0,2013-08-15,10667,69,36.00,45,0.2
1102,1102,2654,105,26-Feb-1999 12:00:00 AM,South,148,2,302,3,29-Oct-1999 12:00:00 AM,...,148.9320,3.0,0.0,3.0,2013-08-15,10667,71,21.50,14,0.2


#### 3 - Data loading

Na het transformeren van de data, kunnen we de resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [63]:
# Load merged_df_def into the OrderData table.
#
# The statement uses `?` parameter markers instead of interpolating values into
# the SQL text: a value containing a quote no longer breaks the statement, and
# missing values arrive as NULL instead of the literal text 'nan'.
# NOTE(review): `?` is the qmark paramstyle used by pyodbc; this assumes
# create_connection returns a pyodbc connection/cursor pair - confirm.

# Target columns of OrderData, in insert order; each one is read from the
# same-named column of merged_df_def.
order_data_columns = [
    'IDSK',
    'id',
    'cust_id',
    'order_date',
    'region',
    'sales_rep',
    'line_id',
    'prod_id',
    'quantity',
    'ship_date',
    'LineTotal',
    'ReceivedQty',
    'RejectedQty',
    'StockedQty',
    'ModifiedDate',
    'OrderID',
    'ProductID_y',
    'UnitPrice_y',
    'Discount',
]

insert_query = (
    f"INSERT INTO OrderData ({', '.join(order_data_columns)}) "
    f"VALUES ({', '.join('?' for _ in order_data_columns)})"
)

# Box every value as a plain Python object (the driver does not accept NumPy
# scalars) and turn NaN/NaT into None so they are stored as NULL.
load_frame = merged_df_def[order_data_columns].astype(object)
load_rows = load_frame.where(load_frame.notna(), None).values.tolist()

# Connect to the database
conn, cursor = create_connection('test')
try:
    # executemany rejects an empty parameter list, so skip it for an empty frame.
    if load_rows:
        cursor.executemany(insert_query, load_rows)
    # Commit only after every row was accepted
    conn.commit()
finally:
    # Always release the connection; closing without commit discards a failed load.
    conn.close()

**Note:** Voeg zoveel Markdown- of codeblokken toe als nodig is.

#### 4 -  Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [None]:
# Data-quality checks on the final merged frame.
# The original cell referenced `filtered_df`, which is not defined anywhere in
# this notebook; `merged_df_def` is the frame that gets loaded.

# Number of missing values per column
missing_values = merged_df_def.isnull().sum()

# Number of fully duplicated rows
duplicate_rows = merged_df_def.duplicated().sum()

# Show the results
print("Aantal ontbrekende waarden:", missing_values)
print("Aantal duplicaten:", duplicate_rows)

**Note:** Dit is optioneel, maar het leek mij op zich best handig om te doen.