## Import libraries

In [23]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Extract data from SQL Server

In [24]:
# Extract the source data from the Bronze layer in SQL Server

# Connection target; the URL below is built from these two values so that
# changing the server or database only requires editing one place.
server = 'mohamedibrahim'
database = 'DataWarehouse'

# Windows-authenticated (Trusted_Connection) pyodbc URL using ODBC Driver 17
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Read the whole bronze table into a DataFrame
query = "SELECT * FROM bronze.erp_cust_az12"
df = pd.read_sql(query, engine)

## Data overview

In [25]:
# Preview the first rows of the extracted table
df.head()

Unnamed: 0,CID,BDATE,GEN
0,NASAW00011000,1971-10-06,Male
1,NASAW00011001,1976-05-10,Male
2,NASAW00011002,1971-02-09,Male
3,NASAW00011003,1973-08-14,Female
4,NASAW00011004,1979-08-05,Female


In [26]:
# Summarise the frame: row count, column dtypes, non-null counts, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18484 entries, 0 to 18483
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   CID     18484 non-null  object
 1   BDATE   18484 non-null  object
 2   GEN     17012 non-null  object
dtypes: object(3)
memory usage: 433.3+ KB


In [27]:
# Number of missing values in each column
df.isna().sum()

CID         0
BDATE       0
GEN      1472
dtype: int64

In [28]:
# Count rows that are exact duplicates of an earlier row (all columns compared)
df.duplicated().sum()

0

In [29]:
# Report every object-dtype column that holds at least one value with
# leading/trailing whitespace or a run of two or more consecutive spaces.
for col in df.select_dtypes(include='object').columns:
    # Stringify the non-null entries once, then test them all in one pass
    text = df[col].dropna().map(str)
    padded = text != text.str.strip()
    doubled = text.str.contains('  ', regex=False)

    if (padded | doubled).any():
        print(f"Unwanted spaces found in column: {col}")

Unwanted spaces found in column: GEN


## Data Inspection & Cleaning

#### Column "CID"

In [30]:
# True when no CID value occurs more than once
df["CID"].is_unique

True

In [31]:
# Drop the leading 'NAS' prefix from each customer ID. The pattern is anchored
# at the start of the string so an 'NAS' sequence appearing later in an ID is
# left untouched.
df["CID"] = df["CID"].str.replace(r'^NAS', '', regex=True)

In [32]:
# List rows whose CID is not exactly 'AW' followed by eight digits.
# fullmatch requires the entire string to match; str.match only anchors at the
# start, so IDs with extra trailing characters would have passed unnoticed.
df[~df["CID"].str.fullmatch(r"AW\d{8}")]

Unnamed: 0,CID,BDATE,GEN


#### Column "BDATE"

In [33]:
# Parse BDATE into datetimes; with errors='coerce', values that cannot be
# parsed become NaT instead of raising.
df['BDATE'] = pd.to_datetime(df['BDATE'], errors='coerce')


In [34]:
# Report the earliest and latest birth dates present in the column
bdate = df['BDATE']
min_date, max_date = bdate.min(), bdate.max()
print(f"Date range: {min_date} to {max_date}")

# Collect the rows whose birth date lies after the current timestamp
is_future = bdate > pd.Timestamp.now()
future_dates = df.loc[is_future]
print(f"future dates: {future_dates}")

Date range: 1916-02-10 00:00:00 to 2080-03-15 00:00:00
future dates:               CID      BDATE     GEN
257    AW00011257 2050-07-06  Female
410    AW00011410 2042-02-22    Male
551    AW00011551 2050-05-21    Male
562    AW00011562 2038-10-17    Male
581    AW00011581 2045-03-03  Female
775    AW00011775 2050-11-22  Female
912    AW00011912 2066-06-16  Female
1123   AW00012123 2065-12-12    Male
2417   AW00013417 2050-09-07    Male
9062   AW00020062 2080-03-15    Male
14441  AW00025441 2055-01-23  Female


In [35]:
# Remove rows whose 'BDATE' lies in the future. Rows with a missing (NaT)
# BDATE are dropped as well, because a comparison against NaT is False.
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not raise SettingWithCopyWarning.
df = df[df['BDATE'] <= pd.Timestamp.now()].copy()

#### Column "GEN"

In [36]:
# Trim leading and trailing whitespace from the GEN values
df["GEN"] = df["GEN"].str.strip()

In [37]:
df["GEN"].unique()

array(['Male', 'Female', '', 'M', 'F', None], dtype=object)

In [38]:
# Expand the single-letter codes to full labels; other values are unchanged
df["GEN"] = df["GEN"].replace({'M': 'Male', 'F': 'Female'})

In [39]:
# Label missing GEN values as 'Unknown'
df["GEN"] = df["GEN"].fillna('Unknown')

In [40]:
# Map empty-string GEN values to 'Unknown'
df["GEN"] = df["GEN"].replace({'': 'Unknown'})

## Load Transformed Data into SQL Server

In [41]:
# Load the transformed "cust_az12" data into the silver layer

# Connection target; the URL below is built from these two values so that
# changing the server or database only requires editing one place.
server = 'mohamedibrahim'
database = 'DataWarehouse'

# Windows-authenticated (Trusted_Connection) pyodbc URL using ODBC Driver 17
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' adds rows to whatever is already in
# silver.erp_cust_az12, so running this cell twice loads the data twice.
# Confirm the target table is emptied before each load.
df.to_sql(
    name='erp_cust_az12',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False,  # do not write the DataFrame index as a column
)

294