## Import libraries

In [71]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np

## Extract data from SQL Server

In [72]:
# Extract the source data from the Bronze layer in SQL Server

server = 'mohamedibrahim'
database = 'DataWarehouse'

# Build the URL from the settings above so that editing `server` or
# `database` actually changes the connection target (they were previously
# defined but never used). Trusted_Connection=yes is passed in the URL, so
# no username or password is embedded here.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Pull the whole bronze customer table into memory for inspection/cleaning.
query = "SELECT * FROM bronze.crm_cust_info"
df = pd.read_sql(query, engine)

## Data Overview

In [73]:
# Preview the first five rows of the extracted table.
df.head(n=5)

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date
0,11000.0,AW00011000,Jon,Yang,M,M,2025-10-06
1,11001.0,AW00011001,Eugene,Huang,S,M,2025-10-06
2,11002.0,AW00011002,Ruben,Torres,M,M,2025-10-06
3,11003.0,AW00011003,Christy,Zhu,S,F,2025-10-06
4,11004.0,AW00011004,Elizabeth,Johnson,S,F,2025-10-06


In [74]:
# Print the per-column dtype and non-null count summary of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18494 entries, 0 to 18493
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   cst_id              18490 non-null  float64
 1   cst_key             18494 non-null  object 
 2   cst_firstname       18486 non-null  object 
 3   cst_lastname        18487 non-null  object 
 4   cst_marital_status  18487 non-null  object 
 5   cst_gndr            13916 non-null  object 
 6   cst_create_date     18490 non-null  object 
dtypes: float64(1), object(6)
memory usage: 1011.5+ KB


In [75]:
# Count missing values per column.
df.isna().sum()

cst_id                   4
cst_key                  0
cst_firstname            8
cst_lastname             7
cst_marital_status       7
cst_gndr              4578
cst_create_date          4
dtype: int64

In [76]:
# Count rows that are exact copies (across all columns) of an earlier row.
df.duplicated(keep='first').sum()

0

In [77]:
# Report every object column that holds at least one value with leading or
# trailing whitespace, or with two consecutive spaces inside it.
for col in df.select_dtypes(include='object').columns:
    # Nulls are skipped; everything else is compared in its string form.
    text = df[col].dropna().astype(str)

    padded = text != text.str.strip()
    doubled = text.str.contains('  ', regex=False)

    if (padded | doubled).any():
        print(f"Unwanted spaces found in column: {col}")

Unwanted spaces found in column: cst_firstname
Unwanted spaces found in column: cst_lastname


## Data Inspection & Cleaning

#### Column "cst_id"

In [78]:
# Show the rows that have no customer id.
df.loc[df["cst_id"].isna()]

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date
18446,,SF566,,,,,
18482,,PO25,,,,,
18492,,13451235,,,,,
18493,,A01Ass,,,,,


In [79]:
# Check whether every cst_id value occurs only once.
# NOTE(review): this runs before the null-id rows are removed, so the repeated
# missing ids may be enough on their own to make this False - confirm.
df["cst_id"].is_unique

False

In [80]:
# Keep only the rows that carry a customer id.
df = df[df['cst_id'].notna()]

In [81]:
# Show every row whose cst_id appears more than once (all occurrences).
df.loc[df["cst_id"].duplicated(keep=False)]

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date
18433,29433.0,AW00029433,,,M,M,2026-01-25
18434,29433.0,AW00029433,Thomas,King,M,M,2026-01-27
18451,29449.0,AW00029449,,Chen,S,,2026-01-25
18452,29449.0,AW00029449,Laura,Chen,S,F,2026-01-26
18469,29466.0,AW00029466,,,,,2026-01-25
18470,29466.0,AW00029466,Lance,Jimenez,M,,2026-01-26
18471,29466.0,AW00029466,Lance,Jimenez,M,M,2026-01-27
18478,29473.0,AW00029473,Carmen,,,,2026-01-25
18479,29473.0,AW00029473,Carmen,Subram,S,,2026-01-26
18490,29483.0,AW00029483,,Navarro,,,2026-01-25


In [82]:
# Resolve duplicated customer ids by keeping the most recent record per
# cst_id (latest cst_create_date; on equal dates the later row wins).
# This replaces dropping hard-coded row labels, which targets the wrong rows
# as soon as the bronze table changes and raises KeyError when the cell is
# run a second time.
df = (
    df.sort_values("cst_create_date", kind="stable", na_position="first")
      .drop_duplicates(subset="cst_id", keep="last")
      .sort_index()  # restore the original row order
)

In [83]:
# The id was loaded as float64; convert it to an integer column.
df = df.astype({"cst_id": int})

#### Column "cst_key"

In [84]:
# List every distinct customer key.
# NOTE(review): this prints one entry per distinct key, which is a very large
# output for this table.
pd.unique(df["cst_key"]).tolist()

['AW00011000',
 'AW00011001',
 'AW00011002',
 'AW00011003',
 'AW00011004',
 'AW00011005',
 'AW00011006',
 'AW00011007',
 'AW00011008',
 'AW00011009',
 'AW00011010',
 'AW00011011',
 'AW00011012',
 'AW00011013',
 'AW00011014',
 'AW00011015',
 'AW00011016',
 'AW00011017',
 'AW00011018',
 'AW00011019',
 'AW00011020',
 'AW00011021',
 'AW00011022',
 'AW00011023',
 'AW00011024',
 'AW00011025',
 'AW00011026',
 'AW00011027',
 'AW00011028',
 'AW00011029',
 'AW00011030',
 'AW00011031',
 'AW00011032',
 'AW00011033',
 'AW00011034',
 'AW00011035',
 'AW00011036',
 'AW00011037',
 'AW00011038',
 'AW00011039',
 'AW00011040',
 'AW00011041',
 'AW00011042',
 'AW00011043',
 'AW00011044',
 'AW00011045',
 'AW00011046',
 'AW00011047',
 'AW00011048',
 'AW00011049',
 'AW00011050',
 'AW00011051',
 'AW00011052',
 'AW00011053',
 'AW00011054',
 'AW00011055',
 'AW00011056',
 'AW00011057',
 'AW00011058',
 'AW00011059',
 'AW00011060',
 'AW00011061',
 'AW00011062',
 'AW00011063',
 'AW00011064',
 'AW00011065',
 'AW000110

In [85]:
# List rows whose key is not exactly "AW" followed by eight digits.
# fullmatch anchors both ends of the pattern; str.match only anchors the
# start, so a key with extra trailing characters (e.g. "AW00011000X") would
# have passed the check unnoticed. Missing keys are reported as invalid.
df[~df["cst_key"].str.fullmatch(r"AW\d{8}", na=False)]

Unnamed: 0,cst_id,cst_key,cst_firstname,cst_lastname,cst_marital_status,cst_gndr,cst_create_date


#### Column "cst_firstname"

In [86]:
# Remove leading/trailing whitespace from first names.
df = df.assign(cst_firstname=df["cst_firstname"].str.strip())

#### Column "cst_lastname"

In [87]:
# Remove leading/trailing whitespace from last names.
df = df.assign(cst_lastname=df["cst_lastname"].str.strip())

#### Column "cst_marital_status"

In [88]:
# Inspect the distinct marital-status codes.
pd.unique(df["cst_marital_status"])

array(['M', 'S'], dtype=object)

In [89]:
# Expand the one-letter marital-status codes into readable labels.
marital_labels = {'M': 'Married', 'S': 'Single'}
df["cst_marital_status"] = df["cst_marital_status"].replace(marital_labels)

#### Column "cst_gndr"

In [90]:
# Inspect the distinct gender codes (including missing values).
pd.unique(df["cst_gndr"])

array(['M', 'F', None], dtype=object)

In [91]:
# Expand the one-letter gender codes and label missing values as 'Unknown'.
gender_labels = {'F': 'Female', 'M': 'Male'}
df["cst_gndr"] = (
    df["cst_gndr"]
    .replace(gender_labels)
    .fillna('Unknown')
)

## Load Transformed Data into SQL Server


In [92]:
# Load the transformed "cust_info" data into the silver layer

server = 'mohamedibrahim'
database = 'DataWarehouse'

# Build the URL from the settings above so that editing `server` or
# `database` actually changes the connection target (they were previously
# defined but never used). Trusted_Connection=yes is passed in the URL, so
# no username or password is embedded here.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' adds these rows to whatever the table
# already holds, so running this cell again inserts the same customers a
# second time - confirm the target table is emptied before each load.
df.to_sql(
    name='crm_cust_info',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False  # the DataFrame index is not written as a column
)

245