## Import libraries

In [9]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Extract data from SQL Server

In [10]:
# Extract the source data from the Bronze layer in SQL Server.
# The server and database names are defined once and interpolated into the
# connection URL, so retargeting the notebook means editing only these two lines.
server = 'mohamedibrahim'
database = 'DataWarehouse'

# Trusted_Connection=yes asks the ODBC driver for Windows integrated
# authentication, so no user name or password is embedded in the URL.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Pull the whole bronze table into a DataFrame for inspection.
query = "SELECT * FROM bronze.erp_px_cat_g1v2"
df = pd.read_sql(query, engine)

## Data overview

In [11]:
# Preview the first five rows to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,ID,CAT,SUBCAT,MAINTENANCE
0,AC_BR,Accessories,Bike Racks,Yes
1,AC_BS,Accessories,Bike Stands,No
2,AC_BC,Accessories,Bottles and Cages,No
3,AC_CL,Accessories,Cleaners,Yes
4,AC_FE,Accessories,Fenders,No


In [12]:
# Summarize the frame: row count, column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           37 non-null     object
 1   CAT          37 non-null     object
 2   SUBCAT       37 non-null     object
 3   MAINTENANCE  37 non-null     object
dtypes: object(4)
memory usage: 1.3+ KB


In [13]:
# Count the missing values in each column.
df.isna().sum()

ID             0
CAT            0
SUBCAT         0
MAINTENANCE    0
dtype: int64

In [14]:
# Count rows that repeat an earlier row across all columns.
df.duplicated().sum()

0

In [15]:
# Flag every object-dtype column that holds at least one value with
# leading/trailing whitespace or two or more consecutive spaces.
for col in df.select_dtypes(include='object').columns:
    # Nulls are skipped; every remaining value is compared in its string form.
    texts = map(str, df.loc[df[col].notna(), col])

    # any() stops at the first offending value, so each column is reported at most once.
    if any(text.strip() != text or '  ' in text for text in texts):
        print(f"Unwanted spaces found in column: {col}")

## Data Inspection & Cleaning

#### Column "ID"

In [16]:
# Check that no ID value occurs more than once.
df["ID"].is_unique

True

#### Column "CAT"

In [17]:
# List the distinct CAT values to spot misspelled or inconsistent labels.
df["CAT"].unique()

array(['Accessories', 'Bikes', 'Clothing', 'Components'], dtype=object)

#### Column "SUBCAT"

In [18]:
# List the distinct SUBCAT values to spot misspelled or inconsistent labels.
df["SUBCAT"].unique()

array(['Bike Racks', 'Bike Stands', 'Bottles and Cages', 'Cleaners',
       'Fenders', 'Helmets', 'Hydration Packs', 'Lights', 'Locks',
       'Panniers', 'Pumps', 'Tires and Tubes', 'Mountain Bikes',
       'Road Bikes', 'Touring Bikes', 'Bib-Shorts', 'Caps', 'Gloves',
       'Jerseys', 'Shorts', 'Socks', 'Tights', 'Vests', 'Bottom Brackets',
       'Brakes', 'Chains', 'Cranksets', 'Derailleurs', 'Forks',
       'Handlebars', 'Headsets', 'Mountain Frames', 'Pedals',
       'Road Frames', 'Saddles', 'Touring Frames', 'Wheels'], dtype=object)

#### Column "MAINTENANCE"

In [19]:
# List the distinct MAINTENANCE values to confirm the flag uses one consistent set of labels.
df["MAINTENANCE"].unique()

array(['Yes', 'No'], dtype=object)

## Load Transformed Data into SQL Server

In [20]:
# Load the "px_cat_g1v2" data into the silver layer.
# No cell in this notebook modifies df, so the rows are written as extracted.
# The server and database names are interpolated into the connection URL
# rather than being repeated inside it.
server = 'mohamedibrahim'
database = 'DataWarehouse'

# Trusted_Connection=yes asks the ODBC driver for Windows integrated
# authentication, so no user name or password is embedded in the URL.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' adds rows to whatever silver.erp_px_cat_g1v2
# already holds, so re-running this cell inserts the same rows again —
# confirm the target table is emptied before each load.
# index=False keeps the DataFrame index out of the target table.
df.to_sql(
    name='erp_px_cat_g1v2',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False
)

37