In [1]:
from datetime import date
from urllib.parse import quote

import numpy as np
import pandas as pd
import yaml
from sqlalchemy import create_engine


# Database connections

In [2]:
# Database connections
# Read the connection settings from the shared YAML file and keep the two
# sections this notebook uses.
with open('../config_fill.yml', 'r') as f:
    config = yaml.safe_load(f)

config_aw = config['Adventure_Works']
config_etl = config['ETL_PRO']

# Source database (mssql+pyodbc). No user/password is placed in the URL; the
# ODBC driver name has its spaces replaced by "+" for the query string.
# NOTE(review): trusted_connection presumably enables Windows integrated
# authentication — confirm against the config file.
url_aw = (
    f"mssql+pyodbc://@{config_aw['host']}/{config_aw['dbname']}"
    f"?driver={config_aw['driver'].replace(' ', '+')}"
    f"&trusted_connection={config_aw['trusted_connection']}"
)

# Target database. User and password are percent-encoded before being placed
# in the URL: characters such as "@", ":", "/" or "#" inside a credential
# would otherwise be read as URL delimiters and corrupt host/database parsing.
etl_user = quote(str(config_etl['user']), safe="")
etl_password = quote(str(config_etl['password']), safe="")
url_etl = (
    f"{config_etl['drivername']}://{etl_user}:{etl_password}"
    f"@{config_etl['host']}:{config_etl['port']}/{config_etl['dbname']}"
)

aw_engine = create_engine(url_aw)
etl_engine = create_engine(url_etl)

# Extract

In [5]:
# Extract one flat customer record set from the source database: person names,
# e-mail, phone and address, plus state/province and country names.
# - The plain JOIN on PersonID keeps only customers linked to a person row.
# - All remaining joins are LEFT JOINs, so missing contact or address data
#   arrives as NULL instead of dropping the customer.
# - Addresses are restricted to AddressTypeID = 2 (labelled "Home" by the
#   inline SQL note).
# NOTE(review): e-mail, phone and address are joined without de-duplication, so
# a person with more than one of any of them would produce several rows per
# CustomerID — confirm the one-row-per-customer grain before loading.
query_customer = """
SELECT
    c.CustomerID,
    c.PersonID AS BusinessEntityID,
    pp.FirstName,
    pp.LastName,
    pp.MiddleName,
    ea.EmailAddress,
    ph.PhoneNumber,

    a.AddressLine1,
    a.City,
    sp.Name AS StateProvince,
    cr.Name AS CountryRegion,
    a.PostalCode

FROM Sales.Customer c
JOIN Person.Person pp
    ON c.PersonID = pp.BusinessEntityID   -- Solo clientes individuales

LEFT JOIN Person.EmailAddress ea
    ON pp.BusinessEntityID = ea.BusinessEntityID

LEFT JOIN Person.PersonPhone ph
    ON pp.BusinessEntityID = ph.BusinessEntityID

LEFT JOIN Person.BusinessEntityAddress bea
    ON pp.BusinessEntityID = bea.BusinessEntityID
    AND bea.AddressTypeID = 2 -- Home Address (filtra sobre "Home")

LEFT JOIN Person.Address a
    ON bea.AddressID = a.AddressID

LEFT JOIN Person.StateProvince sp
    ON a.StateProvinceID = sp.StateProvinceID

LEFT JOIN Person.CountryRegion cr
    ON sp.CountryRegionCode = cr.CountryRegionCode;
"""
# Run the query against the source engine and materialise it as a DataFrame.
df_customer = pd.read_sql(query_customer, aw_engine)
# Only the last expression of a cell is rendered, so this head() call produces
# no visible output; the shape below is what the cell displays.
df_customer.head()
df_customer.shape


(19119, 12)

# Transformations

In [6]:
# Replace missing values per column: a missing middle name becomes an empty
# string, every other optional contact/address field becomes "Unknown".
unknown_columns = [
    "EmailAddress",
    "PhoneNumber",
    "AddressLine1",
    "City",
    "StateProvince",
    "CountryRegion",
    "PostalCode",
]
fill_values = {"MiddleName": "", **dict.fromkeys(unknown_columns, "Unknown")}
df_customer = df_customer.fillna(value=fill_values)


In [7]:
# Derive the display name as "<FirstName> <LastName>"; the middle name is not
# part of it.
df_customer = df_customer.assign(
    full_name=lambda frame: frame["FirstName"] + " " + frame["LastName"]
)


In [8]:
# Map source column names to the snake_case names used in the target table.
# BusinessEntityID is not part of the mapping, so it keeps its source name.
column_names = {
    "CustomerID": "customer_id",
    "FirstName": "first_name",
    "LastName": "last_name",
    "MiddleName": "middle_name",
    "EmailAddress": "email",
    "PhoneNumber": "phone",
    "AddressLine1": "address",
    "StateProvince": "state",
    "CountryRegion": "country",
    "City": "city",
    "PostalCode": "postal_code",
}
df_customer = df_customer.rename(columns=column_names)


# Load

In [9]:
# Assign the surrogate key as the first column.
# - Any existing customer_key column is dropped first, so re-running this cell
#   does not fail on an already-present column.
# - Rows are ordered by the business key before numbering: the extract query
#   has no ORDER BY, so without this the key a customer receives depends on
#   whatever row order the database happened to return.
df_customer = (
    df_customer
    .drop(columns="customer_key", errors="ignore")
    .sort_values("customer_id", kind="stable")
    .reset_index(drop=True)
)
df_customer.insert(0, "customer_key", range(1, len(df_customer) + 1))


In [10]:
# Final (rows, columns) check before loading. A cell renders only its last
# expression, so the shape is the single value shown here.
df_customer.shape


(19119, 14)

In [11]:
# Write the dimension to the target database. With if_exists="replace" an
# existing dim_customer table is dropped and recreated on every run; the
# DataFrame index is not written as a column.
df_customer.to_sql(
    name="dim_customer",
    con=etl_engine,
    index=False,
    if_exists="replace",
)


119