# - Data Warehouse - Inserted

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [1]:
import pandas as pd
import pyodbc

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

#### 1 - Opbouwen van dataframes voor elke database-tabel

In [2]:
# Pull the two production source tables into DataFrames via the project's
# run_query helper; both come from the AdventureWorks2019 database.

# Work-order table.
Workorderdf = run_query(
    "SELECT * FROM Production.Workorder",
    "AdventureWorks2019",
)

# Work-order routing table (joined to the frame above in the next cell).
WorkOrderRoutingdf = run_query(
    "SELECT * FROM Production.WorkOrderRouting",
    "AdventureWorks2019",
)

Je kunt vergelijkbare queries uitvoeren voor andere tabellen die je wilt opnemen.

In [3]:
# Inner-join the work orders (left) with their routing rows (right) on the
# shared WorkOrderID key. Columns that exist in both frames keep pandas'
# default suffixes: _x for the left frame, _y for the right frame.
merged_df = pd.merge(
    Workorderdf,
    WorkOrderRoutingdf,
    how='inner',
    on='WorkOrderID',
)

# Last expression of the cell: render the merged frame.
merged_df

Unnamed: 0,WorkOrderID,ProductID_x,OrderQty,StockedQty,ScrappedQty,StartDate,EndDate,DueDate,ScrapReasonID,ModifiedDate_x,...,OperationSequence,LocationID,ScheduledStartDate,ScheduledEndDate,ActualStartDate,ActualEndDate,ActualResourceHrs,PlannedCost,ActualCost,ModifiedDate_y
0,13,747,4,4,0,2011-06-03,2011-06-19,2011-06-14,,2011-06-19,...,1,10,2011-06-03,2011-06-14,2011-06-03,2011-06-19,4.1,92.25,92.25,2011-06-19
1,13,747,4,4,0,2011-06-03,2011-06-19,2011-06-14,,2011-06-19,...,2,20,2011-06-03,2011-06-14,2011-06-03,2011-06-19,3.5,87.50,87.50,2011-06-19
2,13,747,4,4,0,2011-06-03,2011-06-19,2011-06-14,,2011-06-19,...,3,30,2011-06-03,2011-06-14,2011-06-03,2011-06-19,1.0,14.50,14.50,2011-06-19
3,13,747,4,4,0,2011-06-03,2011-06-19,2011-06-14,,2011-06-19,...,4,40,2011-06-03,2011-06-14,2011-06-03,2011-06-19,2.0,31.50,31.50,2011-06-19
4,13,747,4,4,0,2011-06-03,2011-06-19,2011-06-14,,2011-06-19,...,6,50,2011-06-03,2011-06-14,2011-06-03,2011-06-19,3.0,36.75,36.75,2011-06-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67126,72585,802,6,6,0,2014-06-02,2014-06-13,2014-06-13,,2014-06-13,...,6,50,2014-06-02,2014-06-13,2014-06-05,2014-06-13,3.0,36.75,36.75,2014-06-13
67127,72586,803,1,1,0,2014-06-02,2014-06-13,2014-06-13,,2014-06-13,...,1,10,2014-06-02,2014-06-13,2014-06-05,2014-06-13,4.1,92.25,92.25,2014-06-13
67128,72586,803,1,1,0,2014-06-02,2014-06-13,2014-06-13,,2014-06-13,...,6,50,2014-06-02,2014-06-13,2014-06-05,2014-06-13,3.0,36.75,36.75,2014-06-13
67129,72587,804,19,19,0,2014-06-02,2014-06-13,2014-06-13,,2014-06-13,...,1,10,2014-06-02,2014-06-13,2014-06-05,2014-06-13,4.1,92.25,92.25,2014-06-13


#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [4]:
# DDL for the warehouse target table.
#
# The CREATE TABLE is wrapped in an existence check so this cell can be re-run
# (e.g. Restart Kernel -> Run All) without failing once the table exists.
# NOTE(review): OBJECT_ID is T-SQL; this assumes create_connection() targets
# SQL Server -- confirm against utils/database_utils.py.
#
# WorkOrderID is the sole primary key, so the table can hold only one row per
# work order.
WorkOrder_sql = """
IF OBJECT_ID('WorkOrder', 'U') IS NULL
BEGIN
    CREATE TABLE WorkOrder (
        WorkOrderID INT PRIMARY KEY,
        ProductID INT,
        OrderQty INT,
        StockedQty INT,
        ScrappedQty INT,
        StartDate DATETIME,
        EndDate DATETIME,
        DueDate DATETIME,
        ModifiedDate DATETIME,
        OperationSequence INT,
        LocationID INT,
        ScheduledStartDate DATETIME,
        ScheduledEndDate DATETIME,
        ActualStartDate DATETIME,
        ActualEndDate DATETIME,
        ActualResourceHrs DECIMAL(10,2),
        PlannedCost DECIMAL(10,2),
        ActualCost DECIMAL(10,2)
    );
END
"""

# create_connection() is called without arguments, so the table is created in
# whatever database the helper connects to by default (an earlier revision
# left a commented-out "MustafaTest" database name here).
conn, cursor = create_connection()
try:
    cursor.execute(WorkOrder_sql)
    conn.commit()
finally:
    # Always release the connection, even when the DDL fails.
    conn.close()

In [30]:
# Show the merged column labels. Columns present in both source frames carry
# the _x (left frame, Workorderdf) / _y (right frame, WorkOrderRoutingdf)
# suffixes added by the merge.
print(merged_df.columns)

Index(['WorkOrderID', 'ProductID_x', 'OrderQty', 'StockedQty', 'ScrappedQty',
       'StartDate', 'EndDate', 'DueDate', 'ScrapReasonID', 'ModifiedDate_x',
       'ProductID_y', 'OperationSequence', 'LocationID', 'ScheduledStartDate',
       'ScheduledEndDate', 'ActualStartDate', 'ActualEndDate',
       'ActualResourceHrs', 'PlannedCost', 'ActualCost', 'ModifiedDate_y'],
      dtype='object')


#### 3 - Data loading

Na het transformeren van de data, kunnen we de resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [5]:
# Load merged_df into the WorkOrder warehouse table.
#
# Values are bound as query parameters instead of being formatted into the SQL
# text, so NaN/NaT become NULL and no value can break or alter the statement.
#
# NOTE: merged_df has one row per (WorkOrderID, OperationSequence), but rows
# are replaced by WorkOrderID alone. The previous row-by-row "delete, then
# insert" therefore left only the LAST routing row of each work order in the
# table. That outcome is kept here, but made explicit, because storing every
# routing step would require a different key on the target table.

# Target-table column -> merged_df column, in INSERT order.
column_map = {
    "WorkOrderID": "WorkOrderID",
    "ProductID": "ProductID_x",
    "OrderQty": "OrderQty",
    "StockedQty": "StockedQty",
    "ScrappedQty": "ScrappedQty",
    "StartDate": "StartDate",
    "EndDate": "EndDate",
    "DueDate": "DueDate",
    "ModifiedDate": "ModifiedDate_x",
    "OperationSequence": "OperationSequence",
    "LocationID": "LocationID",
    "ScheduledStartDate": "ScheduledStartDate",
    "ScheduledEndDate": "ScheduledEndDate",
    "ActualStartDate": "ActualStartDate",
    "ActualEndDate": "ActualEndDate",
    "ActualResourceHrs": "ActualResourceHrs",
    "PlannedCost": "PlannedCost",
    "ActualCost": "ActualCost",
}


def _to_sql_param(value):
    """Convert a pandas/NumPy scalar into a plain Python value for binding.

    Missing values (NaN/NaT/None) map to None so they are stored as NULL.
    """
    if pd.isna(value):
        return None
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    if hasattr(value, "item"):  # NumPy scalar -> Python int/float/bool
        return value.item()
    return value


# Keep the last row per WorkOrderID (same final table state as before).
rows_to_load = merged_df.drop_duplicates(subset="WorkOrderID", keep="last")

insert_params = [
    tuple(_to_sql_param(value) for value in record)
    for record in rows_to_load[list(column_map.values())].itertuples(index=False, name=None)
]
# WorkOrderID is the first column of every parameter tuple.
delete_params = [(params[0],) for params in insert_params]

delete_sql = "DELETE FROM WorkOrder WHERE WorkOrderID = ?"
insert_sql = (
    f"INSERT INTO WorkOrder ({', '.join(column_map)}) "
    f"VALUES ({', '.join(['?'] * len(column_map))})"
)

# Connect to the database (create_connection() is called without arguments,
# so the helper's default target database is used).
conn, cursor = create_connection()
try:
    # executemany must not be called with an empty parameter list.
    if insert_params:
        # Remove rows that are about to be reloaded; deleting a key that is
        # absent is a no-op, so no prior existence check is needed.
        cursor.executemany(delete_sql, delete_params)
        cursor.executemany(insert_sql, insert_params)
    conn.commit()
except Exception:
    # Leave the table untouched if any statement fails part-way through.
    conn.rollback()
    raise
finally:
    conn.close()

**Note:** Voeg zoveel Markdown- of codeblokken toe als nodig is.

#### 4 - Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [36]:
# Data-quality summary of the merged frame.

# Number of rows that are exact duplicates of an earlier row (all columns).
duplicate_rows = merged_df.duplicated().sum()

# Per-column count of missing (null/NaN) entries.
missing_values = merged_df.isna().sum()

# Report both results: missing values first, then the duplicate count.
print("Aantal ontbrekende waarden:", missing_values)
print("Aantal duplicaten:", duplicate_rows)

Aantal ontbrekende waarden: WorkOrderID               0
ProductID_x               0
OrderQty                  0
StockedQty                0
ScrappedQty               0
StartDate                 0
EndDate                   0
DueDate                   0
ScrapReasonID         66748
ModifiedDate_x            0
ProductID_y               0
OperationSequence         0
LocationID                0
ScheduledStartDate        0
ScheduledEndDate          0
ActualStartDate           0
ActualEndDate             0
ActualResourceHrs         0
PlannedCost               0
ActualCost                0
ModifiedDate_y            0
dtype: int64
Aantal duplicaten: 0


**Note:** Dit is optioneel; het leek mij op zich best handig om te doen.