# - Data Warehouse -

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [12]:
import pandas as pd
import pyodbc

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

##### 1 - Opbouwen van dataframes voor elke database-tabel

In [13]:
awProductdf = run_query("SELECT * FROM Production.Product", "AdventureWorks2019")
awProductInventorydf = run_query("SELECT * FROM Production.ProductInventory", "AdventureWorks2019")
awProductSubcategorydf = run_query("SELECT * FROM Production.ProductSubcategory", "AdventureWorks2019")
awproductCategorydf = run_query("SELECT * FROM Production.ProductCategory", "AdventureWorks2019")

AenCProductdf = run_query("SELECT * FROM PRODUCT", "AenC")

nwProductsdf = run_query("SELECT * FROM products", "northwind")





In [14]:
# Add a surrogate key to each DataFrame
awProductdf['key'] = range(1, len(awProductdf) + 1)
awProductInventorydf['key'] = range(len(awProductdf) + 1, len(awProductdf) + len(awProductInventorydf) + 1)
awProductSubcategorydf['key'] = range(len(awProductdf) + len(awProductInventorydf) + 1, len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + 1)
awproductCategorydf['key'] = range(len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + 1, len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + len(awproductCategorydf) + 1)
AenCProductdf['key'] = range(len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + len(awproductCategorydf) + 1, len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + len(awproductCategorydf) + len(AenCProductdf) + 1)
nwProductsdf['key'] = range(len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + len(awproductCategorydf) + len(AenCProductdf) + 1, len(awProductdf) + len(awProductInventorydf) + len(awProductSubcategorydf) + len(awproductCategorydf) + len(AenCProductdf) + len(nwProductsdf) + 1)

# Merge the DataFrames using the surrogate key
# Merge the DataFrames using the surrogate key and unique suffixes
merged_df = pd.merge(awProductdf, awProductInventorydf, on='key', suffixes=('_awProductdf', '_awProductInventorydf'))
merged_df = pd.merge(merged_df, awProductSubcategorydf, on='key', suffixes=('', '_awProductSubcategorydf'))
merged_df = pd.merge(merged_df, awproductCategorydf, on='key', suffixes=('', '_awproductCategorydf'))
merged_df = pd.merge(merged_df, AenCProductdf, on='key', suffixes=('', '_AenCProductdf'))
merged_df = pd.merge(merged_df, nwProductsdf, on='key', suffixes=('', '_nwProductsdf'))

print(awProductdf.columns)
print(awProductInventorydf.columns)
print(awProductSubcategorydf.columns)
print(awproductCategorydf.columns)
print(AenCProductdf.columns)
print(nwProductsdf.columns) 

Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode',
       'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate', 'key'],
      dtype='object')
Index(['ProductID', 'LocationID', 'Shelf', 'Bin', 'Quantity', 'rowguid',
       'ModifiedDate', 'key'],
      dtype='object')
Index(['ProductSubcategoryID', 'ProductCategoryID', 'Name', 'rowguid',
       'ModifiedDate', 'key'],
      dtype='object')
Index(['ProductCategoryID', 'Name', 'rowguid', 'ModifiedDate', 'key'], dtype='object')
Index(['id', 'name', 'description', 'prod_size', 'color', 'quantity',
       'unit_price', 'picture_name', 'Category', 'key'],
      dtype='object')
Index(['ProductID', 'ProductName', 'SupplierID', 'Catego

Je kunt vergelijkbare queries uitvoeren voor andere tabellen die je wilt opnemen.

#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [16]:

Products_sql = """
CREATE TABLE Products (
    SurrogateKey INT ,
    ProductID INT PRIMARY KEY,
    Name VARCHAR(255),
    ProductNumber VARCHAR(255),
    MakeFlag TINYINT,
    FinishedGoodsFlag TINYINT,
    Color VARCHAR(50),
    SafetyStockLevel INT,
    ReorderPoint INT,
    StandardCost DECIMAL(10,2),
    ListPrice DECIMAL(10,2),
    Size VARCHAR(50),
    SizeUnitMeasureCode VARCHAR(50),
    WeightUnitMeasureCode VARCHAR(50),
    Weight DECIMAL(10,2),
    DaysToManufacture INT,
    ProductLine VARCHAR(50),
    Class VARCHAR(50),
    Style VARCHAR(50),
    ProductSubcategoryID INT,
    ProductModelID INT,
    SellStartDate DATETIME,
    SellEndDate DATETIME,
    DiscontinuedDate DATETIME,
    rowguid VARCHAR(255),
    ModifiedDate DATETIME,
    ProductCategoryID INT,
    LocationID INT,
    Shelf VARCHAR(50),
    Bin INT,
    Quantity INT,
    description TEXT,
    prod_size VARCHAR(50),
    unit_price DECIMAL(10,2),
    picture_name VARCHAR(255),
    Category VARCHAR(255),
    ProductName VARCHAR(255),
    SupplierID INT,
    CategoryID INT,
    QuantityPerUnit VARCHAR(255),
    UnitPrice DECIMAL(10,2),
    UnitsInStock INT,
    UnitsOnOrder INT,
    ReorderLevel INT,
    Discontinued TINYINT
);
"""

db_name = "MustafaTest"
conn, cursor = create_connection(db_name)
cursor.execute(Products_sql)
conn.commit()
conn.close()

#### 3 - Data loading

Na het transformeren van de data, kunnen we de resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [17]:
# Verbinding maken met de database
test_database_name = 'MustafaTest'
conn, cursor = create_connection(test_database_name)

for index, row in merged_df.iterrows():
    # Opstellen van de SQL-invoegquery
    query = f"""
        INSERT INTO Product (
            SurrogateKey, ProductID, Name, ProductNumber, MakeFlag, FinishedGoodsFlag, Color, SafetyStockLevel, 
            ReorderPoint, StandardCost, ListPrice, Size, SizeUnitMeasureCode, WeightUnitMeasureCode, 
            Weight, DaysToManufacture, ProductLine, Class, Style, ProductSubcategoryID, ProductModelID, 
            SellStartDate, SellEndDate, DiscontinuedDate, rowguid, ModifiedDate, ProductCategoryID, 
            LocationID, Shelf, Bin, Quantity, description, prod_size, unit_price, picture_name, Category, 
            ProductName, SupplierID, CategoryID, QuantityPerUnit, UnitPrice, UnitsInStock, UnitsOnOrder, 
            ReorderLevel, Discontinued
        ) 
        VALUES (
            {row['SurrogateKey']}, {row['ProductID']}, {row['Name']}, {row['ProductNumber']}, {row['MakeFlag']}, {row['FinishedGoodsFlag']}, 
            {row['Color']}, {row['SafetyStockLevel']}, {row['ReorderPoint']}, {row['StandardCost']}, {row['ListPrice']}, 
            {row['Size']}, {row['SizeUnitMeasureCode']}, {row['WeightUnitMeasureCode']}, {row['Weight']}, 
            {row['DaysToManufacture']}, {row['ProductLine']}, {row['Class']}, {row['Style']}, {row['ProductSubcategoryID']}, 
            {row['ProductModelID']}, {row['SellStartDate']}, {row['SellEndDate']}, {row['DiscontinuedDate']}, 
            {row['rowguid']}, {row['ModifiedDate']}, {row['ProductCategoryID']}, {row['LocationID']}, {row['Shelf']}, 
            {row['Bin']}, {row['Quantity']}, {row['description']}, {row['prod_size']}, {row['unit_price']}, 
            {row['picture_name']}, {row['Category']}, {row['ProductName']}, {row['SupplierID']}, {row['CategoryID']}, 
            {row['QuantityPerUnit']}, {row['UnitPrice']}, {row['UnitsInStock']}, {row['UnitsOnOrder']}, {row['ReorderLevel']}, 
            {row['Discontinued']}
        )
    """
    # Uitvoeren van de query
    cursor.execute(query)

conn.commit()
conn.close()

**Note:** Voeg indien nodig zoveel Markdown- of codeblokken toe als nodig is.

#### 4 -  Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [18]:
# Controleren op ontbrekende waarden
missing_values = merged_df.isnull().sum()

# Controleren op duplicaten
duplicate_rows = merged_df.duplicated().sum()

# Weergave van resultaten
print("Aantal ontbrekende waarden:", missing_values)
print("Aantal duplicaten:", duplicate_rows)

Aantal ontbrekende waarden: ProductID_awProductdf    0
Name                     0
ProductNumber            0
MakeFlag                 0
FinishedGoodsFlag        0
                        ..
UnitPrice                0
UnitsInStock             0
UnitsOnOrder             0
ReorderLevel             0
Discontinued             0
Length: 61, dtype: int64
Aantal duplicaten: 0


**Note:** Dit is optioneel, het leek mij opzich best handig om te doen.