# - Data Warehouse -

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [1]:
import pandas as pd # type: ignore
import pyodbc # type: ignore

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

#### 1 - Opbouwen van dataframes voor elke database-tabel

In [5]:
# Pull the two source tables that are combined further down in the notebook.
# Both queries run against the same source database.
source_database = "AdventureWorks2019"
territory_sql = "SELECT * FROM Sales.SalesTerritory"
territory_history_sql = "SELECT * FROM Sales.SalesTerritoryHistory"

sales_territory_df = run_query(territory_sql, source_database)
sales_history_df = run_query(territory_history_sql, source_database)

#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen of aanpassen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [4]:
# DDL for the target table that receives the merged territory / territory-history rows.
# The OBJECT_ID guard makes this cell safe to re-run (e.g. Restart Kernel -> Run All):
# a bare CREATE TABLE raises an error as soon as the table already exists.
create_table_query = """
IF OBJECT_ID(N'SalesTerritoryData', N'U') IS NULL
CREATE TABLE SalesTerritoryData (
    BusinessEntityID INT,
    TerritoryID INT,
    Name VARCHAR(255),
    CountryRegionCode VARCHAR(255),
    GroupName VARCHAR(255),
    SalesYTD FLOAT,
    SalesLastYear FLOAT,
    CostYTD FLOAT,
    CostLastYear FLOAT,
    StartDate DATETIME,
    EndDate DATETIME
)
"""

# Create the table in SQL Server
db_name = "AdventureWorks2019"
conn, cursor = create_connection(db_name)
try:
    cursor.execute(create_table_query)
    conn.commit()
finally:
    # Always release the connection, also when the DDL statement fails.
    conn.close()

In [9]:
# Join every territory with its history rows on the shared TerritoryID key.
# No `how` is given, so pandas performs its default inner join; columns that
# exist in both frames (rowguid, ModifiedDate) get the _x / _y suffixes.
merged_df = sales_territory_df.merge(sales_history_df, on='TerritoryID')
merged_df

Unnamed: 0,TerritoryID,Name,CountryRegionCode,Group,SalesYTD,SalesLastYear,CostYTD,CostLastYear,rowguid_x,ModifiedDate_x,BusinessEntityID,StartDate,EndDate,rowguid_y,ModifiedDate_y
0,1,Northwest,US,North America,7887187.0,3298694.0,0.0,0.0,43689A10-E30B-497F-B0DE-11DE20267FF7,2008-04-30,280,2011-05-31,2012-09-29,FD3F5566-10E2-4960-BE12-0365E5665881,2012-09-22
1,1,Northwest,US,North America,7887187.0,3298694.0,0.0,0.0,43689A10-E30B-497F-B0DE-11DE20267FF7,2008-04-30,283,2011-05-31,NaT,009F7660-44A6-4ADF-BD4B-A5D1B79993F5,2011-05-24
2,1,Northwest,US,North America,7887187.0,3298694.0,0.0,0.0,43689A10-E30B-497F-B0DE-11DE20267FF7,2008-04-30,284,2012-09-30,NaT,ED12F921-8023-48EF-84BD-94D942F4C009,2012-09-23
3,2,Northeast,US,North America,2402177.0,3607149.0,0.0,0.0,00FB7309-96CC-49E2-8363-0A1BA72486F2,2008-04-30,275,2011-05-31,2012-11-29,8563CE6A-00FF-47D7-BA4D-3C3E1CDEF531,2012-11-22
4,2,Northeast,US,North America,2402177.0,3607149.0,0.0,0.0,00FB7309-96CC-49E2-8363-0A1BA72486F2,2008-04-30,277,2012-11-30,NaT,132E4721-32DD-4A73-B556-1837F3A2B9AE,2012-11-23
5,3,Central,US,North America,3072175.0,3205014.0,0.0,0.0,DF6E7FD8-1A8D-468C-B103-ED8ADDB452C1,2008-04-30,275,2012-11-30,NaT,2F44304C-EE87-4C72-813E-CA75C5F61F4C,2012-11-23
6,3,Central,US,North America,3072175.0,3205014.0,0.0,0.0,DF6E7FD8-1A8D-468C-B103-ED8ADDB452C1,2008-04-30,277,2011-05-31,2012-11-29,3E9F893D-5142-46C9-A76A-867D1E3D6F90,2012-11-22
7,4,Southwest,US,North America,10510850.0,5366576.0,0.0,0.0,DC3E9EA0-7950-4431-9428-99DBCBC33865,2008-04-30,276,2011-05-31,NaT,64BCB1B3-A793-40BA-9859-D90F78C3F167,2011-05-24
8,4,Southwest,US,North America,10510850.0,5366576.0,0.0,0.0,DC3E9EA0-7950-4431-9428-99DBCBC33865,2008-04-30,281,2011-05-31,NaT,9D8754B2-C320-40DB-A77F-FF5A1BC0F46B,2011-05-24
9,5,Southeast,US,North America,2538667.0,3925071.0,0.0,0.0,6DC4165A-5E4C-42D2-809D-4344E0AC75E7,2008-04-30,279,2011-05-31,NaT,57D1CDCF-62CE-499F-8BE8-1BB71C4BB7EF,2011-05-24


#### 3 - Data loading

Na het transformeren van de data kunnen we het resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [11]:
def _to_sql_param(value):
    """Return ``value`` as a plain Python object usable as a pyodbc ``?`` parameter.

    Missing values (None, NaN, NaT) become None so they are stored as SQL NULL,
    pandas Timestamps become ``datetime.datetime`` and numpy scalars are
    unwrapped to their native Python equivalent. Anything else is returned as is.
    """
    if pd.isnull(value):
        return None
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    if hasattr(value, "item"):
        # numpy scalars (int64, float64, ...) expose .item(); plain Python values do not.
        return value.item()
    return value


# Dataframe columns in the order of the INSERT column list below.
# Note that the source column 'Group' is stored in the target column GroupName.
source_columns = [
    'BusinessEntityID',
    'TerritoryID',
    'Name',
    'CountryRegionCode',
    'Group',
    'SalesYTD',
    'SalesLastYear',
    'CostYTD',
    'CostLastYear',
    'StartDate',
    'EndDate',
]

# Values are bound through ? placeholders instead of being formatted into the
# SQL text, so quotes in text values cannot break (or inject into) the statement
# and missing numbers/dates are written as NULL rather than as 'nan'.
insert_query = """
    INSERT INTO SalesTerritoryData (
        BusinessEntityID,
        TerritoryID,
        Name,
        CountryRegionCode,
        GroupName,
        SalesYTD,
        SalesLastYear,
        CostYTD,
        CostLastYear,
        StartDate,
        EndDate
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

rows = [
    tuple(_to_sql_param(value) for value in record)
    for record in merged_df[source_columns].itertuples(index=False, name=None)
]

# NOTE(review): running this cell twice inserts every row twice; decide whether
# the target table should be emptied before a reload.
conn, cursor = create_connection(db_name)
try:
    if rows:  # executemany must not be called with an empty parameter list
        cursor.executemany(insert_query, rows)
    conn.commit()
finally:
    # Always release the connection, also when an insert fails.
    conn.close()

**Note:** Voeg zoveel Markdown- of codeblokken toe als nodig is.

#### 4 - Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [12]:
# Count the missing values per column of the merged dataframe
missing_values = merged_df.isna().sum()

# Count the rows that are exact duplicates of an earlier row
duplicate_rows = merged_df.duplicated().sum()

# Report both results, each preceded by its label
for label, result in (
    ("Aantal ontbrekende waarden:", missing_values),
    ("Aantal duplicaten:", duplicate_rows),
):
    print(label, result)

Aantal ontbrekende waarden: TerritoryID           0
Name                  0
CountryRegionCode     0
Group                 0
SalesYTD              0
SalesLastYear         0
CostYTD               0
CostLastYear          0
rowguid_x             0
ModifiedDate_x        0
BusinessEntityID      0
StartDate             0
EndDate              13
rowguid_y             0
ModifiedDate_y        0
dtype: int64
Aantal duplicaten: 0


**Note:** Dit is optioneel, het leek mij op zich best handig om te doen.