## Import libraries

In [29]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

## Extract data from SQL Server

In [30]:
# Extract the source data from the Bronze layer in SQL Server.
# The server and database names are interpolated into the connection URL,
# so editing either variable actually changes the connection target.

server = 'mohamedibrahim'
database = 'DataWarehouse'

# Trusted_Connection=yes is passed to the ODBC driver, so no user name or
# password is embedded in the URL.
connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)
engine = create_engine(connection_string)

# Pull the whole bronze.crm_prd_info table into a DataFrame.
query = "SELECT * FROM bronze.crm_prd_info"
df = pd.read_sql(query, engine)

## Data Overview

In [31]:
# Preview the first rows of the table read from bronze.crm_prd_info.
df.head()

Unnamed: 0,prd_id,prd_key,prd_nm,prd_cost,prd_line,prd_start_dt,prd_end_dt
0,210,CO-RF-FR-R92B-58,HL Road Frame - Black- 58,,R,2003-07-01,
1,211,CO-RF-FR-R92R-58,HL Road Frame - Red- 58,,R,2003-07-01,
2,212,AC-HE-HL-U509-R,Sport-100 Helmet- Red,12.0,S,2011-07-01,2007-12-28
3,213,AC-HE-HL-U509-R,Sport-100 Helmet- Red,14.0,S,2012-07-01,2008-12-27
4,214,AC-HE-HL-U509-R,Sport-100 Helmet- Red,13.0,S,2013-07-01,


In [32]:
# Show column dtypes and non-null counts to spot missing values and columns still stored as object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   prd_id        397 non-null    int64  
 1   prd_key       397 non-null    object 
 2   prd_nm        397 non-null    object 
 3   prd_cost      395 non-null    float64
 4   prd_line      380 non-null    object 
 5   prd_start_dt  397 non-null    object 
 6   prd_end_dt    200 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 21.8+ KB


In [33]:
# Count the missing values in each column.
df.isna().sum()

prd_id            0
prd_key           0
prd_nm            0
prd_cost          2
prd_line         17
prd_start_dt      0
prd_end_dt      197
dtype: int64

In [34]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

In [35]:
# Report every object-dtype column that holds at least one non-null value with
# leading/trailing whitespace or a run of two consecutive spaces.
text_columns = df.select_dtypes(include='object').columns
for col in text_columns:
    present_values = df[col].dropna()
    # any() stops at the first offending value, so a column is scanned only as far as needed.
    dirty = any(
        text != text.strip() or '  ' in text
        for text in map(str, present_values)
    )
    if dirty:
        print(f"Unwanted spaces found in column: {col}")

Unwanted spaces found in column: prd_line


## Data Inspection & Cleaning

#### Column "prd_id"

In [36]:
# Check that no prd_id value occurs more than once.
df["prd_id"].is_unique

True

#### Column "prd_key"

In [37]:
# The first five characters of prd_key become the category id (e.g. 'CO-RF').
df["cat_id"] = df["prd_key"].str.slice(stop=5)

In [38]:
# Keep prd_key from its seventh character onward, i.e. without the five-character
# category prefix and the separator that follows it.
# NOTE: this overwrites prd_key, so running the cell a second time strips six more characters.
df["prd_key"] = df["prd_key"].str.slice(start=6)

#### Column "prd_nm"

In [39]:
# List every distinct product name for a visual check of the values.
df["prd_nm"].unique().tolist()

['HL Road Frame - Black- 58',
 'HL Road Frame - Red- 58',
 'Sport-100 Helmet- Red',
 'Sport-100 Helmet- Black',
 'Mountain Bike Socks- M',
 'Mountain Bike Socks- L',
 'Sport-100 Helmet- Blue',
 'AWC Logo Cap',
 'Long-Sleeve Logo Jersey- S',
 'Long-Sleeve Logo Jersey- M',
 'Long-Sleeve Logo Jersey- L',
 'Long-Sleeve Logo Jersey- XL',
 'HL Road Frame - Red- 62',
 'HL Road Frame - Red- 44',
 'HL Road Frame - Red- 48',
 'HL Road Frame - Red- 52',
 'HL Road Frame - Red- 56',
 'LL Road Frame - Black- 58',
 'LL Road Frame - Black- 60',
 'LL Road Frame - Black- 62',
 'LL Road Frame - Red- 44',
 'LL Road Frame - Red- 48',
 'LL Road Frame - Red- 52',
 'LL Road Frame - Red- 58',
 'LL Road Frame - Red- 60',
 'LL Road Frame - Red- 62',
 'ML Road Frame - Red- 44',
 'ML Road Frame - Red- 48',
 'ML Road Frame - Red- 52',
 'ML Road Frame - Red- 58',
 'ML Road Frame - Red- 60',
 'LL Road Frame - Black- 44',
 'LL Road Frame - Black- 48',
 'LL Road Frame - Black- 52',
 'HL Mountain Frame - Silver- 42',
 '

#### Column "prd_cost"

In [40]:
# Show the rows that have no product cost.
df.loc[df["prd_cost"].isna()]

Unnamed: 0,prd_id,prd_key,prd_nm,prd_cost,prd_line,prd_start_dt,prd_end_dt,cat_id
0,210,FR-R92B-58,HL Road Frame - Black- 58,,R,2003-07-01,,CO-RF
1,211,FR-R92R-58,HL Road Frame - Red- 58,,R,2003-07-01,,CO-RF


In [41]:
# Replace a missing product cost with 0.
# NOTE(review): after this step a missing cost cannot be told apart from a real cost of 0 — confirm this is acceptable downstream.
df["prd_cost"] = df["prd_cost"].fillna(0)

In [42]:
# Show any rows with a negative product cost (an empty result means there are none).
df.loc[df["prd_cost"].lt(0)]

Unnamed: 0,prd_id,prd_key,prd_nm,prd_cost,prd_line,prd_start_dt,prd_end_dt,cat_id


#### Column "cat_id"

In [43]:
# Use underscores instead of hyphens in the category id (e.g. 'AC-HE' -> 'AC_HE').
# regex=False pins literal matching, so the result does not depend on the
# default value of `regex` in the installed pandas version.
df["cat_id"] = df["cat_id"].str.replace('-', '_', regex=False)

#### Column "prd_line"

In [44]:
# Remove leading and trailing whitespace from the product-line codes.
df["prd_line"] = df["prd_line"].str.strip()

In [45]:
# List the distinct product-line codes, including the missing marker if any.
df["prd_line"].unique()

array(['R', 'S', 'M', None, 'T'], dtype=object)

In [46]:
# Label rows without a product line explicitly as 'Unknown'.
df["prd_line"] = df["prd_line"].fillna('Unknown')

In [47]:
# Expand the one-letter product-line codes into readable names in a single pass.
# Values that are not a key of the mapping (e.g. 'Unknown') are left unchanged.
line_names = {
    'R': 'Road',
    'S': 'Other Sales',
    'M': 'Mountain',
    'T': 'Touring',
}
df["prd_line"] = df["prd_line"].replace(line_names)

#### Columns "prd_start_dt" & "prd_end_dt"

In [48]:
# Parse both validity-date columns into pandas datetimes
for date_col in ('prd_start_dt', 'prd_end_dt'):
    df[date_col] = pd.to_datetime(df[date_col])

In [49]:
# Show the rows whose end date falls before their start date.
df.loc[df["prd_end_dt"].lt(df["prd_start_dt"])]

Unnamed: 0,prd_id,prd_key,prd_nm,prd_cost,prd_line,prd_start_dt,prd_end_dt,cat_id
2,212,HL-U509-R,Sport-100 Helmet- Red,12.0,Other Sales,2011-07-01,2007-12-28,AC_HE
3,213,HL-U509-R,Sport-100 Helmet- Red,14.0,Other Sales,2012-07-01,2008-12-27,AC_HE
5,215,HL-U509,Sport-100 Helmet- Black,12.0,Other Sales,2011-07-01,2007-12-28,AC_HE
6,216,HL-U509,Sport-100 Helmet- Black,14.0,Other Sales,2012-07-01,2008-12-27,AC_HE
8,218,SO-B909-M,Mountain Bike Socks- M,3.0,Mountain,2011-07-01,2007-12-28,CL_SO
...,...,...,...,...,...,...,...,...
254,464,GL-H102-M,Half-Finger Gloves- M,10.0,Other Sales,2012-07-01,2008-12-27,CL_GL
256,466,GL-H102-L,Half-Finger Gloves- L,10.0,Other Sales,2012-07-01,2008-12-27,CL_GL
258,468,GL-F110-S,Full-Finger Gloves- S,16.0,Mountain,2012-07-01,2008-12-27,CL_GL
259,469,GL-F110-M,Full-Finger Gloves- M,16.0,Mountain,2012-07-01,2008-12-27,CL_GL


In [50]:
# Swapping the columns
# Reason: each end_dt is earlier than the start_dt
# NOTE(review): rename relabels the two columns for every row, so rows whose
# original prd_end_dt was null end up with a null prd_start_dt — confirm this is intended.
df = df.rename(columns={'prd_start_dt': 'prd_end_dt', 'prd_end_dt': 'prd_start_dt'})

In [51]:
from dateutil.relativedelta import relativedelta

# Print the calendar gap between prd_start_dt and prd_end_dt for every row
# where both dates are present; rows with a missing date are skipped.
dated = df.dropna(subset=['prd_start_dt', 'prd_end_dt'])
for start, end in zip(dated['prd_start_dt'], dated['prd_end_dt']):
    diff = relativedelta(end, start)
    print(f"Difference: {diff.years} years, {diff.months} months, {diff.days} days")


Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 3 years, 6 months, 3 days
Difference: 3 years, 6 months, 4 days
Difference: 

In [52]:
# Correcting illogical values: every prd_end_dt equal to 2003-07-01 becomes 2013-07-01
wrong_end = pd.to_datetime('2003-07-01')
fixed_end = pd.to_datetime('2013-07-01')
df['prd_end_dt'] = df['prd_end_dt'].replace(wrong_end, fixed_end)

In [53]:
# Helper column end_sub_start: prd_end_dt moved back by 3 years, 6 months and 5 days
shift_back = relativedelta(years=3, months=6, days=5)
df['end_sub_start'] = df['prd_end_dt'].map(lambda end_dt: end_dt - shift_back)

In [54]:
# Keep an existing prd_start_dt; where it is missing, take the value from end_sub_start.
has_start = df['prd_start_dt'].notna()
df['prd_start_dt'] = df['prd_start_dt'].where(has_start, df['end_sub_start'])

In [55]:
# Remove the end_sub_start helper column.
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['end_sub_start'], errors='ignore')

## Load Transformed Data into SQL Server

In [56]:
# Load the transformed "prd_info" data into the silver layer.
# The server and database names are interpolated into the connection URL,
# so editing either variable actually changes the connection target.

server = 'mohamedibrahim'
database = 'DataWarehouse'

connection_string = (
    f'mssql+pyodbc://{server}/{database}'
    '?driver=ODBC+Driver+17+for+SQL+Server'
    '&Trusted_Connection=yes'
)

engine = create_engine(connection_string)

# NOTE(review): if_exists='append' inserts the rows again on every execution,
# so re-running this cell writes the same rows into silver.crm_prd_info a second
# time — confirm the target table is emptied beforehand or that this is intended.
df.to_sql(
    name='crm_prd_info',
    schema='silver',
    con=engine,
    if_exists='append',
    index=False
)

135