## Import libraries

In [9]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Extract data from SQL Server

In [10]:
# Extract the source data from the Bronze layer in SQL Server

server = 'mohamedibrahim'
database = 'DataWarehouse'

# Build the SQLAlchemy URL from the variables above so that changing the
# server or database name in one place actually takes effect.
# Trusted_Connection=yes is passed to the ODBC driver; no username or
# password appears in the URL.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Read the whole bronze table into a DataFrame for inspection and cleaning.
query = "SELECT * FROM bronze.erp_loc_a101"
df = pd.read_sql(query, engine)

## Data overview

In [11]:
# Preview the first rows of the extracted table.
df.head()

Unnamed: 0,CID,CNTRY
0,AW-00011000,Australia
1,AW-00011001,Australia
2,AW-00011002,Australia
3,AW-00011003,Australia
4,AW-00011004,Australia


In [12]:
# Summarize the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18484 entries, 0 to 18483
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   CID     18484 non-null  object
 1   CNTRY   18152 non-null  object
dtypes: object(2)
memory usage: 288.9+ KB


In [13]:
# Count missing values in each column.
df.isnull().sum()

CID        0
CNTRY    332
dtype: int64

In [14]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

In [15]:
# Report every object column that contains at least one value with leading
# or trailing whitespace, or with two or more consecutive spaces.
for col in df.select_dtypes(include='object').columns:
    # Drop nulls first so missing values are not stringified and checked.
    values = df[col].dropna().astype(str)

    # Vectorized equivalents of the per-value checks:
    #   value != value.strip()  -> leading/trailing whitespace
    #   '  ' in value           -> literal double space (not a regex)
    has_edge_whitespace = values != values.str.strip()
    has_double_space = values.str.contains('  ', regex=False)

    if (has_edge_whitespace | has_double_space).any():
        print(f"Unwanted spaces found in column: {col}")

Unwanted spaces found in column: CNTRY


## Data Inspection & Cleaning

#### Column "CID"

In [16]:
# Check that no CID value occurs more than once.
df["CID"].is_unique

True

In [17]:
# Remove every hyphen from CID (literal replacement, not a regex),
# e.g. "AW-00011000" becomes "AW00011000".
df["CID"] = df["CID"].str.replace('-', '', regex=False)

In [18]:
# Show rows whose CID is not exactly "AW" followed by eight digits.
# fullmatch anchors the pattern at both ends; str.match only anchors the
# start, so an ID with trailing characters (e.g. extra digits) would have
# passed unnoticed. na=False makes a missing CID show up as invalid instead
# of breaking the boolean negation.
df[~df["CID"].str.fullmatch(r"AW\d{8}", na=False)]

Unnamed: 0,CID,CNTRY


#### Column "CNTRY"

In [19]:
# Trim leading and trailing whitespace from CNTRY; missing values stay missing.
df["CNTRY"] = df["CNTRY"].str.strip()

In [20]:
# List the distinct CNTRY values to spot inconsistent spellings and blanks.
df["CNTRY"].unique()

array(['Australia', 'US', 'Canada', 'DE', 'United Kingdom', 'France',
       'USA', 'Germany', None, '', 'United States'], dtype=object)

In [21]:
# Standardize CNTRY: expand country abbreviations to full names and label
# blank or missing entries as 'Unknown'.
country_fixes = {
    'US': 'United States',
    'USA': 'United States',
    'DE': 'Germany',
    '': 'Unknown',
}
df["CNTRY"] = df["CNTRY"].replace(country_fixes).fillna('Unknown')

## Load Transformed Data into SQL Server

In [22]:
# Load the transformed "loc_a101" data into the silver layer

server = 'mohamedibrahim'
database = 'DataWarehouse'

# Build the SQLAlchemy URL from the variables above so that changing the
# server or database name in one place actually takes effect.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' inserts the rows again on every run, so
# re-executing this cell duplicates data in silver.erp_loc_a101. Confirm the
# target table is emptied before each load, or switch to a truncate-and-load.
df.to_sql(
    name='erp_loc_a101',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False  # do not write the DataFrame's RangeIndex as a column
)

484