In [2]:
import pyodbc
import sqlite3 as sql
import pandas as pd
import warnings
import numpy as np
from datetime import datetime
import os
from dotenv import load_dotenv

# Suppress every warning for the rest of the kernel session.
# NOTE(review): presumably added to hide the pandas warning about passing a raw
# DBAPI connection to read_sql - confirm; it also hides all unrelated warnings.
warnings.simplefilter('ignore')

Load the .env file so database credentials stay out of the notebook

In [3]:
class EnvConfig:
    """Small accessor for settings read from the process environment.

    Creating an instance calls ``load_dotenv()`` once.
    """

    def __init__(self):
        load_dotenv()

    def get(self, key: str) -> str:
        """Look up ``key`` in the environment and return whatever is stored.

        A warning line is printed when the variable is unset or empty. The
        looked-up value is returned unchanged in every case, so an unset
        variable yields ``None`` despite the ``str`` annotation.
        """
        found = os.environ.get(key)
        if found is None or found == "":
            print(f"Warning: {key} not found in environment")
        return found

    def is_configured(self) -> bool:
        """Tell whether every required database setting has a non-empty value.

        Checking stops at the first missing key, so at most one warning is
        printed per call.
        """
        for name in ("DB_SERVER", "DB_NAME_SDM", "DB_NAME_DWH", "DB_USER", "DB_PASSWORD"):
            if not self.get(name):
                return False
        return True

Safe connection

In [4]:
env = EnvConfig()

# Fail fast: without this check a missing variable is interpolated into the
# connection string as the literal text "None" and only surfaces later as an
# opaque ODBC connection or login error.
if not env.is_configured():
    raise RuntimeError(
        "Database settings are incomplete; set DB_SERVER, DB_NAME_SDM, "
        "DB_NAME_DWH, DB_USER and DB_PASSWORD in the environment or .env file."
    )


def _build_conn_path(database_key: str) -> str:
    """Return the ODBC connection string for the database named by env var ``database_key``."""
    return (
        f"DRIVER={{SQL Server}};"
        f"SERVER={env.get('DB_SERVER')},1433;"
        f"DATABASE={env.get(database_key)};"
        f"UID={env.get('DB_USER')};"
        f"PWD={env.get('DB_PASSWORD')}"
    )


# Both databases share server, port and credentials; only the database differs.
SDM_CONN_PATH = _build_conn_path("DB_NAME_SDM")
DWH_CONN_PATH = _build_conn_path("DB_NAME_DWH")

# Connection and cursor for the database the notebook reads from.
FROM_DB = pyodbc.connect(SDM_CONN_PATH)
SDM_cursor = FROM_DB.cursor()

# Connection and cursor for the warehouse the notebook writes to.
TO_DB = pyodbc.connect(DWH_CONN_PATH)
DWH_cursor = TO_DB.cursor()

Dataframes

In [5]:
def create_dataframes_sql(connection):
    """Load every base table visible through ``connection`` into a DataFrame.

    The table list comes from ``INFORMATION_SCHEMA.TABLES``. Each table is
    read with a schema-qualified, bracket-quoted name so that tables outside
    the default schema, or with spaces or reserved words in their names, can
    still be selected.

    Args:
        connection: Open database connection accepted by ``pd.read_sql``.

    Returns:
        dict: Table name (without schema) mapped to a DataFrame with all rows
        of that table. If two schemas contain a table with the same name, the
        one listed later overwrites the earlier entry.
    """
    catalog_query = (
        "SELECT TABLE_SCHEMA, TABLE_NAME FROM INFORMATION_SCHEMA.TABLES "
        "WHERE TABLE_TYPE = 'BASE TABLE';"
    )
    catalog = pd.read_sql(catalog_query, connection)

    def quote(identifier) -> str:
        # T-SQL bracket quoting; a literal "]" inside a name is escaped by doubling it.
        return "[" + str(identifier).replace("]", "]]") + "]"

    frames: dict = {}
    for schema, table in zip(catalog["TABLE_SCHEMA"], catalog["TABLE_NAME"]):
        frames[table] = pd.read_sql(
            f"SELECT * FROM {quote(schema)}.{quote(table)}", connection
        )

    return frames

# Read every base table of the source connection into memory, keyed by table name.
sdm_frames = create_dataframes_sql(FROM_DB)

Queries

In [6]:
from typing import Iterable

def query_remove(table_name : str):
    """Delete all rows of ``table_name`` through the warehouse cursor and commit.

    Returns:
        ``None`` when the delete and commit succeeded; otherwise the table
        name, after printing the pyodbc error.
    """
    statement = f"DELETE FROM {table_name}"

    try:
        DWH_cursor.execute(statement)
        DWH_cursor.commit()
    except pyodbc.Error as e:
        # Report the failure and hand the name back so the caller can retry.
        print(f"ERROR: {table_name}: {e}")
        return table_name
    else:
        return None

def right_type(value, column_name, types):
    """Render one cell value as a T-SQL literal for an INSERT statement.

    Args:
        value: Cell value taken from a DataFrame row.
        column_name: Column the value belongs to; used to look up its dtype.
        types: Mapping from column name to dtype (e.g. ``DataFrame.dtypes``).

    Returns:
        str: ``NULL`` for missing values, ``1``/``0`` for booleans, a quoted
        ISO 8601 literal for datetimes, a quoted literal with single quotes
        doubled for text values and object/string columns, and the plain
        string form for everything else.

    Raises:
        KeyError: if ``column_name`` is not present in ``types``.
    """
    dtype = types[column_name]

    if pd.isna(value):
        return "NULL"

    # Bare True/False is not a valid T-SQL literal; use the numeric form.
    if isinstance(value, (bool, np.bool_)):
        return "1" if value else "0"

    if isinstance(value, np.datetime64):
        value = pd.Timestamp(value)

    # pd.Timestamp subclasses datetime, so both are handled here. The "T"
    # separated form with millisecond precision is emitted as one quoted literal.
    if isinstance(value, datetime):
        stamp = value.isoformat(timespec="milliseconds")
        return f"'{stamp}'"

    if isinstance(value, str) or dtype == "object" or dtype == "string":
        # Object columns can hold non-string values (dates, decimals, ...), so
        # convert with str() before escaping instead of assuming .replace exists.
        text = str(value).replace("'", "''")
        return f"'{text}'"

    return f"{value}"
    

def create_add_query(row, types):
    """Build the tail of an INSERT statement for one row.

    The result has the form ``col1, col2) VALUES (val1, val2)``; the caller is
    expected to prepend ``INSERT INTO <table> (``. An empty row produces an
    empty string.

    Args:
        row: Mapping-like row (e.g. a Series from ``iterrows``) of column -> value.
        types: Mapping from column name to dtype, forwarded to ``right_type``.
    """
    names = list(row.keys())

    # Nothing to render for a row without columns.
    if not names:
        return ""

    column_part = ", ".join(f"{name}" for name in names)
    value_part = ", ".join(right_type(row[name], name, types) for name in names)

    return f"{column_part}) VALUES ({value_part})"

def query_add(table_name : str, table_data : pd.DataFrame):
    """Return one INSERT statement per row of ``table_data``.

    Args:
        table_name: Target table named in each statement.
        table_data: Frame whose rows are rendered; its dtypes drive quoting.

    Returns:
        list[str]: Statements in row order; empty when the frame has no rows.
    """
    column_types = table_data.dtypes
    prefix : str = f"INSERT INTO {table_name} ("

    return [
        prefix + create_add_query(record, column_types)
        for _, record in table_data.iterrows()
    ]

Maak het DWH leeg

In [23]:
def empty_dwh():
    """Delete all rows from every warehouse table, retrying in passes.

    A table whose delete fails (``query_remove`` returns its name) is retried
    in the next pass, after the other tables have been emptied.
    NOTE(review): presumably the failures are foreign-key conflicts between
    the fact and dimension tables - confirm.

    Raises:
        RuntimeError: when a complete pass removes nothing, because further
            retries could never succeed and the loop would otherwise spin
            forever.
    """
    pending : list = [
        "DimProduct",
        "DimCustomer",
        "DimEmployee",
        "DimDate",
        "DimTerritory",
        "FactSales",
    ]

    while pending:
        # Collect failures in a separate list instead of removing items from
        # the list being iterated, which silently skips elements.
        still_blocked = []

        for table in pending:
            # query_remove handles pyodbc errors itself and signals failure
            # through its return value.
            if query_remove(table) is None:
                print(f"REMOVED {table}")
            else:
                still_blocked.append(table)
                print(f"NOT YET REMOVED: {table}")

        if len(still_blocked) == len(pending):
            raise RuntimeError(
                f"Could not empty tables: {', '.join(still_blocked)}"
            )

        pending = still_blocked

    print("All items are removed")

empty_dwh()

REMOVED DimProduct
REMOVED DimEmployee
REMOVED DimTerritory
REMOVED DimCustomer
REMOVED FactSales
REMOVED DimDate
All items are removed


Datum converter

Dataframes aanmaken

In [20]:
def create_territory():
    """Return the territory dimension as a copy of the source ``Territories`` table.

    A copy is returned, as in the other dimension builders, so later changes
    to the dimension frame cannot alter the cached source frame.
    """
    return sdm_frames["Territories"].copy()


# NOTE(review): create_product is disabled, so nothing in this notebook defines
# `product_dwh`, which the load cell further down references. Before re-enabling,
# check the column names below: the selection lists "StandardCosts" while the
# rename maps "StandardCost".
# def create_product():
#     dim_product = sdm_frames["Production_Product"].copy()

#     dim_product = pd.merge(dim_product, sdm_frames["Production_ProductCategoryID"], on="ProductCategoryID")

#     sdm_frames["Suppliers"]["SupplierID"] = sdm_frames["Suppliers"]["SupplierID"].astype(str)
#     dim_product["SupplierID"] = dim_product["SupplierID"].astype(str)

#     dim_product = pd.merge(dim_product, sdm_frames["Suppliers"], on="SupplierID", how="outer")

#     dim_product = dim_product[[
#         "ProductID","NWProductID", "Name", "description", "Category", "Color", "CompanyName", "StandardCosts", "UnitPrice"
#     ]].rename(columns={
#         "StandardCost": "Costs",
#         "description": "Description"
#     })

#     dim_product = dim_product.dropna(subset=["ProductID"])

#     return dim_product


def create_employee():
    """Build the employee dimension from the source ``Employee`` table.

    Adds ``EmpFullName`` (first and last name joined by a space), keeps the
    six dimension columns and renames the snake_case ones.
    """
    source = sdm_frames["Employee"]
    full_name = source["emp_fname"] + " " + source["emp_lname"]

    kept_columns = ["emp_id", "manager_id", "EmpFullName", "Title", "start_date", "dept_id"]
    new_names = {
        "emp_id" : "EmpID",
        "manager_id" : "ManagerID",
        "dept_id" : "DeptID",
        "start_date" : "StartDate",
    }

    # assign() works on a copy, so the cached source frame is left untouched.
    selected = source.assign(EmpFullName=full_name)[kept_columns]
    return selected.rename(columns=new_names)

def create_customer():
    """Build the customer dimension from the source ``Customer`` table.

    Adds ``FullName`` (first and last name joined by a space), keeps four
    columns, renames them and drops exact duplicate rows.
    """
    source = sdm_frames["Customer"]

    # assign() works on a copy, so the cached source frame is left untouched.
    with_name = source.assign(FullName=source["fname"] + " " + source["lname"])
    picked = with_name[["id", "FullName", "address", "company_name"]]

    # NOTE(review): "phone" is not among the selected columns, so its rename
    # entry has no effect - confirm whether the column was meant to be kept.
    renamed = picked.rename(columns={
        "address": "Address",
        "phone": "Phone",
        "company_name": "CompanyName"
    })

    return renamed.drop_duplicates()



# Build the three dimension frames in one step: territory, customer, employee.
territory_dwh, customer_dwh, employee_dwh = create_territory(), create_customer(), create_employee()


def create_fact_sales():
    """Build the sales fact frame from order details, order headers and product costs.

    Order details are joined to their headers on ``SalesOrderID``, product
    standard cost is attached with a left join on ``ProductID``, and the
    ``TotalSales`` / ``TotalProfits`` measures are derived per order line.
    """
    order_lines = sdm_frames["Sales_SalesOrderDetail"].merge(
        sdm_frames["Sales_SalesOrderHeader"], on="SalesOrderID"
    )

    costs = sdm_frames["Production_Product"][["ProductID", "StandardCost"]]
    order_lines = order_lines.merge(costs, on="ProductID", how="left")

    # NOTE(review): the discount is subtracted once per line as an absolute
    # amount - confirm that UnitPriceDiscount is neither a fraction nor a
    # per-unit amount.
    discount = order_lines["UnitPriceDiscount"]
    revenue = order_lines["UnitPrice"] * order_lines["OrderQty"]
    margin = (order_lines["UnitPrice"] - order_lines["StandardCost"]) * order_lines["OrderQty"]
    order_lines["TotalSales"] = revenue - discount
    order_lines["TotalProfits"] = margin - discount

    fact_columns = [
        "SalesOrderID",
        "CustomerID",
        "SalesCustomerID",
        "SalesPersonID",
        "ProductID",
        "TerritoryID",
        "OrderDate",
        "UnitPrice",
        "UnitPriceDiscount",
        "OrderQty",
        "TotalSales",
        "TotalProfits",
        "NWProductID",
        "StandardCost",
    ]

    # "quantity" is not among the selected columns, so that entry is a no-op.
    new_names = {
        "quantity" : "Quantity",
        "StandardCost" : "Costs",
        "SalesPersonID" : "EmpID",
    }

    return order_lines[fact_columns].rename(columns=new_names)

# Materialise the fact frame and show it as the cell output.
fact_sales_dwh = create_fact_sales()

fact_sales_dwh


Unnamed: 0,SalesOrderID,CustomerID,SalesCustomerID,EmpID,ProductID,TerritoryID,OrderDate,UnitPrice,UnitPriceDiscount,OrderQty,TotalSales,TotalProfits,NWProductID,Costs
0,43697,,21768.0,282,749,6.0,2021-05-31,3578.0,0.0,1,3578.0,1407.0,,2171.0
1,43698,,28389.0,290,773,7.0,2021-05-31,3400.0,0.0,1,3400.0,1488.0,,1912.0
2,43699,,25863.0,280,773,1.0,2021-05-31,3400.0,0.0,1,3400.0,1488.0,,1912.0
3,43700,,14501.0,276,767,4.0,2021-05-31,699.0,0.0,1,699.0,212.0,,487.0
4,43701,,11003.0,286,773,9.0,2021-05-31,3400.0,0.0,1,3400.0,1488.0,,1912.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60393,75122,,15868.0,282,878,6.0,2024-06-30,22.0,0.0,1,22.0,14.0,,8.0
60394,75122,,15868.0,282,712,6.0,2024-06-30,9.0,0.0,1,9.0,2.0,,7.0
60395,75123,,18759.0,282,878,6.0,2024-06-30,22.0,0.0,1,22.0,14.0,,8.0
60396,75123,,18759.0,282,879,6.0,2024-06-30,159.0,0.0,1,159.0,100.0,,59.0


In [55]:
# Clear the warehouse tables before loading.
empty_dwh()

DWH_cursor.commit()

# Insertion order matters: the dimensions are listed before FactSales.
# NOTE(review): `product_dwh` is not defined anywhere in the visible notebook
# (create_product is commented out), so this cell raises NameError on a fresh
# kernel until that builder is restored.
tables = {
    "DimProduct" : product_dwh,
    "DimCustomer" : customer_dwh,
    "DimEmployee" : employee_dwh,
    "DimTerritory" : territory_dwh,
    "FactSales" : fact_sales_dwh
}
allowed = True

for key in tables:
    for query in query_add(key, tables[key]):
        try:
            DWH_cursor.execute(query)
        except pyodbc.Error as e:
            # Any failed insert vetoes the commit below. The original assigned
            # to a misspelled name here, so the flag was never cleared.
            allowed = False
            print(query)
            print(e)

# Commit only when every insert succeeded; otherwise undo the whole load.
if allowed:
    DWH_cursor.commit()
    print("ITEMS INSERTED")
else:
    DWH_cursor.rollback()
    print("NOT ALLOWED TO COMMIT")

REMOVED DimProduct
REMOVED DimEmployee
REMOVED DimTerritory
REMOVED DimCustomer
REMOVED FactSales
REMOVED DimDate
All items are removed
ITEMS INSERTED
