# - Data Warehouse -

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [2]:
import pandas as pd
import pyodbc

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

#### 1 - Opbouwen van dataframes voor elke database-tabel

In [5]:
# Pull the raw source tables that feed the warehouse. Each call hands run_query a
# SQL string and a second argument that is presumably the database name (confirm
# against utils/database_utils.py); the results are used as pandas dataframes by
# the merge cells further down.

# AenC: order headers and their line items.
aenc_order_df = run_query("SELECT * FROM sales_order", "AenC")
aenc_order_item_df = run_query("SELECT * FROM sales_order_item", "AenC")

# AdventureWorks: purchase-order detail rows.
aw_order_df = run_query("SELECT * FROM Purchasing.PurchaseOrderDetail", "AdventureWorks")

# Northwind: order detail rows (the table name contains a space, hence the brackets).
nw_order_df = run_query("SELECT * FROM [Order Details]", "Northwind")

Je kunt vergelijkbare queries uitvoeren voor andere tabellen die je wilt opnemen.

#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [9]:
# DDL for the warehouse table that receives the combined order data.
#
# Price and quantity columns that carry fractional values in the source data
# (e.g. UnitPrice 50.2600, LineTotal 201.0400, ReceivedQty 3.0 in the merged
# output) are declared as DECIMAL rather than VARCHAR/INT, so they are stored
# as numbers and without truncation.
# NOTE(review): the DECIMAL precisions mirror the AdventureWorks
# Purchasing.PurchaseOrderDetail column types, and Discount is assumed to be a
# fraction such as 0.05 as in Northwind [Order Details] -- confirm both.
# NOTE(review): 'id' is the PRIMARY KEY, but the merged dataframes repeat the
# same id on several rows; decide on the key before loading.
create_table_query = """
CREATE TABLE OrderData (
    id INT PRIMARY KEY,
    cust_id INT,
    order_date DATE,
    region VARCHAR(255),
    sales_rep VARCHAR(255),
    ship_date DATE,
    UnitPrice DECIMAL(19, 4),
    Quantity INT,
    Discount DECIMAL(5, 4),
    OrderQty INT,
    LineTotal DECIMAL(19, 4),
    ReceivedQty DECIMAL(8, 2),
    RejectedQty DECIMAL(8, 2),
    StockedQty DECIMAL(9, 2)
);
"""

# Create the table in SQL Server
db_name = "test"
conn, cursor = create_connection(db_name)
try:
    cursor.execute(create_table_query)
    conn.commit()
finally:
    # Always release the connection, also when the CREATE TABLE fails
    # (for example because the table already exists on a re-run of this cell).
    conn.close()

### Eerste dataframe (AenC)

In [29]:
# Join every AenC order header to its line items on the shared 'id' column;
# only ids present in both frames are kept.
merged_df = aenc_order_df.merge(aenc_order_item_df, how='inner', on='id')

# Keep the order id as a generic object column.
merged_df['id'] = merged_df['id'].astype('object')

# Throw away the join's index and number the rows 1..n instead.
merged_df = merged_df.reset_index(drop=True)
merged_df.index = merged_df.index + 1

# This running number is the 'IDSK' index used by the next merge.
merged_df.index.name = 'IDSK'



### Tweede dataframe (AdventureWorks)

In [33]:
# Join the AenC rows to the AdventureWorks purchase-order details by matching the
# 'IDSK' index level of merged_df against the PurchaseOrderID column.
merged_df2 = pd.merge(merged_df, aw_order_df, left_on='IDSK', right_on='PurchaseOrderID', how='inner') # type: ignore

merged_df2['id'] = merged_df2['id'].astype('object')

# Reset the index and start from 1
merged_df2.reset_index(drop=True, inplace=True)
merged_df2.index = merged_df2.index + 1

# Name the fresh index 'IDSK'. This must target merged_df2: naming merged_df's
# index here (as before) left the merged_df2 index without a name.
merged_df2.index.name = 'IDSK'

# Display the merged result as the cell output.
merged_df2


Unnamed: 0,id,cust_id,order_date,region,sales_rep,line_id,prod_id,quantity,ship_date,PurchaseOrderID,PurchaseOrderDetailID,DueDate,OrderQty,ProductID,UnitPrice,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate
1,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,1,300,12,15-Sep-1996 12:00:00 AM,1,1,2011-04-30,4,1,50.2600,201.0400,3.0,0.0,3.0,2011-04-23
2,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM,2,2,2011-04-30,3,359,45.1200,135.3600,3.0,0.0,3.0,2011-04-23
3,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,2,301,12,14-Sep-1996 12:00:00 AM,2,3,2011-04-30,3,360,45.5805,136.7415,3.0,0.0,3.0,2011-04-23
4,2001,101,14-Mar-1996 12:00:00 AM,Eastern,299,3,302,12,14-Sep-1996 12:00:00 AM,3,4,2011-04-30,550,530,16.0860,8847.3000,550.0,0.0,550.0,2011-04-23
5,2002,102,18-Mar-1996 12:00:00 AM,Eastern,467,1,400,24,18-Sep-1996 12:00:00 AM,4,5,2011-04-30,3,4,57.0255,171.0765,2.0,1.0,1.0,2011-04-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2484,2653,102,17-Jan-1999 12:00:00 AM,Eastern,148,1,301,0,17-Oct-1999 12:00:00 AM,1100,2484,2013-11-20,3,504,48.7620,146.2860,3.0,0.0,3.0,2013-11-13
2485,2654,105,26-Feb-1999 12:00:00 AM,South,148,1,301,15,29-Oct-1999 12:00:00 AM,1101,2485,2013-11-21,3,497,34.1880,102.5640,3.0,0.0,3.0,2013-11-14
2486,2654,105,26-Feb-1999 12:00:00 AM,South,148,2,302,3,29-Oct-1999 12:00:00 AM,1102,2486,2013-11-21,3,323,50.2635,150.7905,3.0,0.0,3.0,2013-11-14
2487,2654,105,26-Feb-1999 12:00:00 AM,South,148,3,400,8,29-Oct-1999 12:00:00 AM,1103,2487,2013-11-21,550,523,10.7310,5902.0500,550.0,0.0,550.0,2013-11-14


In [25]:
# Alternative join attempt: match the AenC order 'id' directly against the
# AdventureWorks PurchaseOrderID, after casting both key columns to object dtype.
# NOTE(review): the recorded output of this cell is an empty dataframe, i.e. no
# key values matched -- possibly the two key columns hold values of different
# Python types; verify before relying on this join.
aw_order_df['PurchaseOrderID'] = aw_order_df['PurchaseOrderID'].astype('object')
merged_df['id'] = merged_df['id'].astype('object')

# Inner join of the two sources on the casted keys.
merged_df2 = merged_df.merge(
    aw_order_df,
    how='inner',
    left_on='id',
    right_on='PurchaseOrderID',
)
print(merged_df2.head())

# TODO: planned follow-up (previously kept here as commented-out code):
#   - merge merged_df2 with nw_order_df on id == OrderID into merged_df_def
#   - drop PurchaseOrderID, OrderID, PurchaseOrderDetailID, cust_id, ProductID_y,
#     ProductID_x, prod_id, UnitPrice_y and line_id from merged_df_def

Empty DataFrame
Columns: [id, cust_id, order_date, region, sales_rep, line_id, prod_id, quantity, ship_date, PurchaseOrderID, PurchaseOrderDetailID, DueDate, OrderQty, ProductID, UnitPrice, LineTotal, ReceivedQty, RejectedQty, StockedQty, ModifiedDate]
Index: []


#### 3 - Data loading

Na het transformeren van de data, kunnen we de resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [None]:
# Template for loading the transformed rows into the warehouse table.
# NOTE(review): this cell is an unfinished template:
#   - filtered_df is not defined in any visible cell of this notebook;
#   - the INSERT below has an empty column list and an empty VALUES list and
#     targets the placeholder name TemplateTable;
#   - 'row' is never used, so the same statement is executed for every row.
# Fill in the target table, the columns and the values before running it.

# Connect to the database
test_database_name = 'test'
conn, cursor = create_connection(test_database_name)

for index, row in filtered_df.iterrows():
    # Build the SQL insert statement
    query = f"""
        INSERT INTO TemplateTable (
        ) 
        VALUES (

        )
    """
    # Execute the statement
    cursor.execute(query)

# Persist all inserts at once, then release the connection.
conn.commit()
conn.close()

**Note:** Voeg zoveel Markdown- of codeblokken toe als nodig is.

#### 4 - Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [None]:
# Basic data quality report for the frame that is about to be loaded.
# NOTE(review): filtered_df is not defined in any visible cell of this notebook.

# Number of fully identical rows.
duplicate_rows = filtered_df.duplicated().sum()

# Number of missing values, counted per column.
missing_values = filtered_df.isna().sum()

# Report the results
print("Aantal ontbrekende waarden:", missing_values)
print("Aantal duplicaten:", duplicate_rows)

**Note:** Dit is optioneel, maar het leek mij op zich best handig om te doen.