# - Data Warehouse - Al inserted in DWH

### Het samenvoegen van bestaande tabellen voor het uiteindelijke 'Datawarehouse'

##### Importeren van benodigde dependencies

In [1]:
import pandas as pd
import pyodbc

import warnings
warnings.filterwarnings("ignore")

# Importeren van de create_connection en run_query functies uit de database_utils.py file
from utils.database_utils import create_connection, run_query

##### 1 - Opbouwen van dataframes voor elke database-tabel

In [2]:

# Pull the three person-related source tables into DataFrames.
# The frame names are kept as-is because the cells further down refer to them.
SOURCE_DB = "AdventureWorks2019"

aw_PersonDF = run_query("SELECT * FROM Person.person", SOURCE_DB)
aw_EmailAdressDF = run_query("SELECT * FROM Person.EmailAddress", SOURCE_DB)
aw_PasswordDF = run_query("SELECT * FROM Person.Password", SOURCE_DB)

# Show the column index of each frame, in load order
for source_frame in (aw_PersonDF, aw_EmailAdressDF, aw_PasswordDF):
    print(source_frame.columns)



Index(['BusinessEntityID', 'PersonType', 'NameStyle', 'Title', 'FirstName',
       'MiddleName', 'LastName', 'Suffix', 'EmailPromotion',
       'AdditionalContactInfo', 'Demographics', 'rowguid', 'ModifiedDate'],
      dtype='object')
Index(['BusinessEntityID', 'EmailAddressID', 'EmailAddress', 'rowguid',
       'ModifiedDate'],
      dtype='object')
Index(['BusinessEntityID', 'PasswordHash', 'PasswordSalt', 'rowguid',
       'ModifiedDate'],
      dtype='object')


Je kunt vergelijkbare queries uitvoeren voor andere tabellen die je wilt opnemen.

#### 2 - Data transformatie

Na het opbouwen van de dataframes voor elke database-tabel, kunnen we beginnen met het transformeren van de data. Dit omvat het samenvoegen van tabellen, het toepassen van filters, het uitvoeren van berekeningen, enzovoort.

In [3]:
# Build the combined person frame from the three source tables.
# The audit columns are dropped into *new* frames (no inplace=True), so the
# source frames stay untouched and this cell can be re-run without raising a
# KeyError on columns that were already removed.
AUDIT_COLUMNS = ['rowguid', 'ModifiedDate']

person_core = aw_PersonDF.drop(columns=AUDIT_COLUMNS)
email_core = aw_EmailAdressDF.drop(columns=AUDIT_COLUMNS)
password_core = aw_PasswordDF.drop(columns=AUDIT_COLUMNS)

# pd.merge defaults to an inner join: only BusinessEntityIDs present in all
# three tables end up in the result.
merged_df = pd.merge(person_core, email_core, on='BusinessEntityID')
merged_df = pd.merge(merged_df, password_core, on='BusinessEntityID')

# No filtering is applied here; filtered_df is the name the later cells consume.
filtered_df = merged_df
filtered_df


Unnamed: 0,BusinessEntityID,PersonType,NameStyle,Title,FirstName,MiddleName,LastName,Suffix,EmailPromotion,AdditionalContactInfo,Demographics,EmailAddressID,EmailAddress,PasswordHash,PasswordSalt
0,1,EM,False,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",1,ken0@adventure-works.com,pbFwXWE99vobT6g+vPWFy93NtUU/orrIWafF01hccfM=,bE3XiWw=
1,2,EM,False,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",2,terri0@adventure-works.com,bawRVNrZQYQ05qF05Gz6VLilnviZmrqBReTTAGAudm0=,EjJaC3U=
2,3,EM,False,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",3,roberto0@adventure-works.com,8BUXrZfDqO1IyHCWOYzYmqN1IhTUn3CJMpdx/UCQ3iY=,wbPZqMw=
3,4,EM,False,,Rob,,Walters,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",4,rob0@adventure-works.com,SjLXpiarHSlz+6AG+H+4QpB/IPRzras/+9q/5Wr7tf8=,PwSunQU=
4,5,EM,False,Ms.,Gail,A,Erickson,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",5,gail0@adventure-works.com,8FYdAiY6gWuBsgjCFdg0UibtsqOcWHf9TyaHIP7+paA=,qYhZRiM=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19967,20773,IN,False,,Crystal,,Guo,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",19968,crystal18@adventure-works.com,4gSNTcSKHtKW1k9te824egho2RixU5Gc+LRDNYyMDx4=,qh4YKRQ=
19968,20774,IN,False,,Isabella,F,Richardson,,2,,"<IndividualSurvey xmlns=""http://schemas.micros...",19969,isabella91@adventure-works.com,gOO6OEoRpCe9TiQ4+1fX1qXIzavOQ0Ccvl1JHS/Pseg=,s+EMJTA=
19969,20775,IN,False,,Crystal,S,He,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",19970,crystal19@adventure-works.com,r5nZct0C8mWL6KM0DE4pM8fO/0nmUYAtya8ref2efg8=,axcde7k=
19970,20776,IN,False,,Crystal,,Zheng,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",19971,crystal20@adventure-works.com,5eVmZbWYJXVwZkBkvpxlhA3/bKMLRReav9CgRP4NRbU=,nJbmm88=


In [4]:
# DDL for the warehouse target table.
# The OBJECT_ID guard turns the statement into a no-op when PersonInfo already
# exists, so re-running the notebook no longer fails with
# "There is already an object named 'PersonInfo' in the database".
create_table_query = """
IF OBJECT_ID(N'PersonInfo', N'U') IS NULL
BEGIN
    CREATE TABLE PersonInfo (
        BusinessEntityID int PRIMARY KEY,
        PersonType nchar(5),
        NameStyle bit,
        Title nvarchar(max),
        FirstName nvarchar(max),
        MiddleName nvarchar(max),
        LastName nvarchar(max),
        Suffix nvarchar(max),
        EmailPromotion int,
        AdditionalContactInfo xml,
        Demographics xml,
        EmailAddressID int,
        EmailAddress nvarchar(max),
        PasswordHash varchar(max),
        PasswordSalt varchar(max)
    );
END
"""

# Create the table in SQL Server. create_connection() is called without
# arguments, so the target database is whatever that helper selects by default.
conn, cursor = create_connection()
try:
    cursor.execute(create_table_query)
    conn.commit()
finally:
    # Always release the connection, also when the DDL statement raises.
    conn.close()

#### 3 - Data loading

Na het transformeren van de data, kunnen we de resulterende dataframe in de doeltabel van ons datawarehouse laden.

In [5]:
# Load filtered_df into the PersonInfo table using a parameterized INSERT.
#
# Values are passed as parameters instead of being formatted into the SQL text:
#   * missing values arrive as SQL NULL (string formatting stored the literal
#     text 'None' in the text and xml columns);
#   * no manual quote escaping is needed, and data can never alter the statement.

# Column order shared by the INSERT statement and the parameter tuples.
PERSON_INFO_COLUMNS = [
    'BusinessEntityID', 'PersonType', 'NameStyle', 'Title', 'FirstName',
    'MiddleName', 'LastName', 'Suffix', 'EmailPromotion',
    'AdditionalContactInfo', 'Demographics', 'EmailAddressID',
    'EmailAddress', 'PasswordHash', 'PasswordSalt',
]


def _to_sql_param(value):
    """Convert one DataFrame cell into a value usable as a query parameter.

    Missing values (None, NaN, pd.NA, NaT) become None so they are written as
    SQL NULL, NumPy scalars are unwrapped to native Python objects, and
    booleans become 0/1 for the bit column.
    """
    if pd.isna(value):
        return None
    if hasattr(value, 'item'):
        # NumPy scalar -> plain Python scalar
        value = value.item()
    if isinstance(value, bool):
        return int(value)
    return value


# NOTE(review): '?' placeholders assume the cursor returned by
# create_connection() is a pyodbc cursor (qmark paramstyle) - confirm in
# utils/database_utils.py.
insert_query = (
    f"INSERT INTO PersonInfo ({', '.join(PERSON_INFO_COLUMNS)}) "
    f"VALUES ({', '.join(['?'] * len(PERSON_INFO_COLUMNS))})"
)

# One parameter tuple per row, in the same order as PERSON_INFO_COLUMNS.
insert_params = [
    tuple(_to_sql_param(cell) for cell in record)
    for record in filtered_df[PERSON_INFO_COLUMNS].itertuples(index=False, name=None)
]

# Connect to the database
conn, cursor = create_connection()
try:
    # Guard against an empty frame: executemany is skipped when there is nothing to insert.
    if insert_params:
        cursor.executemany(insert_query, insert_params)
    conn.commit()
finally:
    # Always release the connection; without a commit nothing is persisted.
    conn.close()

**Note:** Voeg zoveel Markdown- of codeblokken toe als nodig is.

#### 4 -  Data Quality Checks

Voeg controles toe om de kwaliteit van de gegevens te waarborgen voordat ze worden geladen in het datawarehouse:

In [None]:
# Per-column count of missing (null/NaN) cells
missing_values = filtered_df.isna().sum()

# Number of rows that exactly repeat an earlier row
duplicate_rows = filtered_df.duplicated().sum()

# Report both results
for label, result in (
    ("Aantal ontbrekende waarden:", missing_values),
    ("Aantal duplicaten:", duplicate_rows),
):
    print(label, result)

**Note:** Dit is optioneel, maar het leek mij op zich best handig om te doen.