In [99]:
# Import libraries
import pandas as pd
import sys
import os

In [100]:
# Get the current working directory
current_dir = os.getcwd()
# Move up one level from the current directory
parent_dir = os.path.dirname(current_dir)
# Change directory into data directory
data_dir = os.path.join(parent_dir, 'data')

In [101]:
sys.path.append(parent_dir)

In [102]:
import config as cfg

# 1. Extract

In [103]:
df_fulfillment = pd.read_csv(os.path.join(data_dir, 'fulfillment.csv'), index_col=None)
df_inventory = pd.read_csv(os.path.join(data_dir, 'inventory.csv'), index_col=None)
df_orders = pd.read_csv(os.path.join(data_dir, 'orders_and_shipments.csv'), index_col=None)

In [104]:
df_fulfillment.head()

Unnamed: 0,Product Name,Warehouse Order Fulfillment (days)
0,Perfect Fitness Perfect Rip Deck,8.3
1,Nike Men's Dri-FIT Victory Golf Polo,6.6
2,O'Brien Men's Neoprene Life Vest,5.5
3,Nike Men's Free 5.0+ Running Shoe,9.4
4,Under Armour Girls' Toddler Spine Surge Runni,6.3


# 2.Transform

## 2.1 Rename Columns

In [105]:
def rename_columns(df):
    # Convert from camel case to snake case
    df.columns = (
        df.columns
        .str.replace('(?<=[a-z])(?=[A-Z])', '_', regex=True)
        .str.lower()
    )
    # Replace spaces with underscores
    df.columns = df.columns.str.replace(' ', '_')
    # Replace hyphens with underscores
    df.columns = df.columns.str.replace('-', '_')
    # Replace percentage signs with 'percent'
    df.columns = df.columns.str.replace('%', 'percent')
    # Remove parentheses
    df.columns = df.columns.str.replace('[()]', '', regex=True)
    # Replace multiple underscores with a single underscore
    df.columns = df.columns.str.replace('_+', '_', regex=True)
    # Remove leading and trailing underscores
    df.columns = df.columns.str.strip('_')
    
    return df

In [106]:
df_inventory = rename_columns(df_inventory)
df_fulfillment = rename_columns(df_fulfillment)
df_orders = rename_columns(df_orders)

In [107]:
df_orders.head()

Unnamed: 0,order_id,order_item_id,order_year_month,order_year,order_month,order_day,order_time,order_quantity,product_department,product_category,...,customer_country,warehouse_country,shipment_year,shipment_month,shipment_day,shipment_mode,shipment_days_scheduled,gross_sales,discount_percent,profit
0,3535,8793,202102,2021,2,21,14:07,1,Fan Shop,Fishing,...,Mexico,Puerto Rico,2021,2,27,Standard Class,4,400,0.25,200
1,4133,10320,202103,2021,3,2,7:37,1,Fan Shop,Fishing,...,Brazil,Puerto Rico,2021,3,6,Standard Class,4,400,0.09,200
2,7396,18517,202104,2021,4,18,22:47,1,Fan Shop,Fishing,...,Mexico,Puerto Rico,2021,4,20,Standard Class,4,400,0.06,200
3,11026,27608,202106,2021,6,10,22:32,1,Fan Shop,Fishing,...,Denmark,Puerto Rico,2021,6,12,Standard Class,4,400,0.15,200
4,11026,27609,202106,2021,6,10,22:32,1,Fan Shop,Fishing,...,Denmark,Puerto Rico,2021,6,12,Standard Class,4,400,0.13,200


In [108]:
df_fulfillment.columns

Index(['product_name', 'warehouse_order_fulfillment_days'], dtype='object')

## 2.2 Check Datatypes

In [109]:
df_fulfillment.dtypes

product_name                         object
warehouse_order_fulfillment_days    float64
dtype: object

In [110]:
df_inventory.dtypes

product_name                object
year_month                   int64
warehouse_inventory          int64
inventory_cost_per_unit    float64
dtype: object

In [111]:
df_orders.dtypes

order_id                    int64
order_item_id               int64
order_year_month            int64
order_year                  int64
order_month                 int64
order_day                   int64
order_time                 object
order_quantity              int64
product_department         object
product_category           object
product_name               object
customer_id                 int64
customer_market            object
customer_region            object
customer_country           object
warehouse_country          object
shipment_year               int64
shipment_month              int64
shipment_day                int64
shipment_mode              object
shipment_days_scheduled     int64
gross_sales                 int64
discount_percent           object
profit                      int64
dtype: object

# 3. Load

In [112]:
import pyodbc
from sqlalchemy import create_engine
import sqlalchemy

In [113]:
# Test out connection
conn_str = pyodbc.connect(
    'DRIVER={SQL Server};'
    F'SERVER={cfg.SERVER_NAME};'
    F'DATABASE={cfg.DATABASE_NAME};'
    'Trusted_Connection=yes;'
)

In [114]:
def get_column_types(df):
    # Map Pandas data types to SQL Server data types
    type_map = {
        'object': 'NVARCHAR(MAX)',
        'int64': 'BIGINT',
        'float64': 'FLOAT',
        'datetime64[ns]': 'DATETIME2',
        'bool': 'BIT'
    }
    return [type_map.get(str(dt), 'NVARCHAR(MAX)') for dt in df.dtypes]

In [115]:
def load_table(df, table_name, schema_name):
    # Define the connection string
    conn_str = (
        f'DRIVER={{SQL Server}};'
        f'SERVER={cfg.SERVER_NAME};'
        f'DATABASE={cfg.DATABASE_NAME};'
        'Trusted_Connection=yes;'
    )

    # Create a pyodbc connection using the connection string
    conn = pyodbc.connect(conn_str)
    cursor = conn.cursor()

    # Drop the table if it already exists
    cursor.execute(f"IF OBJECT_ID('{schema_name}.{table_name}', 'U') IS NOT NULL DROP TABLE {schema_name}.{table_name}")
    conn.commit()

    # Create the table schema
    columns = ', '.join([f'{col} {dtype}' for col, dtype in zip(df.columns, get_column_types(df))])
    create_table_sql = f"CREATE TABLE {schema_name}.{table_name} ({columns})"
    cursor.execute(create_table_sql)
    conn.commit()

    # Insert the data into the table
    insert_sql = f"INSERT INTO {schema_name}.{table_name} VALUES ({','.join(['?'] * len(df.columns))})"
    for _, row in df.iterrows():
        cursor.execute(insert_sql, *row.tolist())
    conn.commit()

    cursor.close()
    conn.close()
    print(f"DataFrame loaded successfully into {schema_name}.{table_name}")

In [116]:
load_table(df_fulfillment, 'fulfillment', 'original')

DataFrame loaded successfully into original.fulfillment


In [117]:
load_table(df_orders, 'orders', 'original')

DataFrame loaded successfully into original.orders


In [118]:
load_table(df_inventory, 'inventory', 'original')

DataFrame loaded successfully into original.inventory


In [119]:
# Step 1: Generate the date range
date_range = pd.date_range(start='2021-01-01', end='2023-12-31')

# Step 2: Create the DataFrame
df_date = pd.DataFrame({
    'date': date_range,
    'year': date_range.year,
    'quarter': date_range.quarter,
    'month': date_range.month,
    'month_name': date_range.strftime('%B'),
    'month_abbreviation': date_range.strftime('%b'),
    'year_month': date_range.strftime('%Y%m'),  # Adding the year_month column
    'day': date_range.day,
    'day_of_week': date_range.weekday + 2,  # Monday=1, Sunday=7
    'day_name': date_range.strftime('%A'),
    'day_abbreviation': date_range.strftime('%a'),
    'week': date_range.isocalendar().week,
    'is_weekend': date_range.weekday >= 5,  # 5=Saturday, 6=Sunday
    'fiscal_year': date_range.year,  # Adjust if fiscal year differs
    'fiscal_quarter': date_range.quarter,
    'fiscal_month': date_range.month,
    'is_last_day_of_month': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='M')),
    'is_last_day_of_quarter': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='Q')),
    'is_last_day_of_year': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='A')),
})

  'is_last_day_of_month': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='M')),
  'is_last_day_of_quarter': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='Q')),
  'is_last_day_of_year': date_range.isin(pd.date_range(start='2021-01-01', end='2023-12-31', freq='A')),


In [120]:
df_date.head()

Unnamed: 0,date,year,quarter,month,month_name,month_abbreviation,year_month,day,day_of_week,day_name,day_abbreviation,week,is_weekend,fiscal_year,fiscal_quarter,fiscal_month,is_last_day_of_month,is_last_day_of_quarter,is_last_day_of_year
2021-01-01,2021-01-01,2021,1,1,January,Jan,202101,1,6,Friday,Fri,53,False,2021,1,1,False,False,False
2021-01-02,2021-01-02,2021,1,1,January,Jan,202101,2,7,Saturday,Sat,53,True,2021,1,1,False,False,False
2021-01-03,2021-01-03,2021,1,1,January,Jan,202101,3,8,Sunday,Sun,53,True,2021,1,1,False,False,False
2021-01-04,2021-01-04,2021,1,1,January,Jan,202101,4,2,Monday,Mon,1,False,2021,1,1,False,False,False
2021-01-05,2021-01-05,2021,1,1,January,Jan,202101,5,3,Tuesday,Tue,1,False,2021,1,1,False,False,False


In [123]:
load_table(df_date, 'dim_date', 'dbo')

DataFrame loaded successfully into dbo.dim_date
