In [1]:
import time
import pandas as pd
import psycopg2 as pg2
import psycopg2.extras as extras

In [2]:
def execute_query(path:str, conn:pg2.connect):
    with conn.cursor() as cursor:
        with open(path, 'r') as f:
            try:
                cursor.execute(f.read())
                conn.commit()
            except (Exception, pg2.DatabaseError) as error:
                print("Error: %s" % error)
                conn.rollback()
                return
    print('Query executed successfully')

In [3]:
def insert_into_table(table:str, data:pd.DataFrame, conn:pg2.connect):
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, ','.join([f'"{col}"' for col in data.columns]))
    with conn.cursor() as cursor:
        try:
            extras.execute_values(cursor, query, data.values)
            conn.commit()
        except (Exception, pg2.DatabaseError) as error:
            print("Error: %s" % error)
            conn.rollback()
            return
        print("Inserted {} rows into {}".format(len(data), table))

In [4]:
def select_from_table(table:str, columns:list, conn:pg2.connect):
    query = "SELECT %s FROM %s" % (','.join(columns), table)
    with conn.cursor() as cursor:
        try:
            cursor.execute(query)
            conn.commit()
            return pd.DataFrame(cursor.fetchall(), columns=columns)
        except (Exception, pg2.DatabaseError) as error:
            print("Error: %s" % error)
            conn.rollback()
            return

# Load and explore Excel table

In [5]:
data = pd.read_excel(
    'Sample - Superstore.xlsx', 
    parse_dates=['Order Date', 'Ship Date'], 
    dtype={'Postal Code': str}
)

  warn("""Cannot parse header or footer so it will be ignored""")


In [6]:
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row ID         9994 non-null   int64         
 1   Order ID       9994 non-null   object        
 2   Order Date     9994 non-null   datetime64[ns]
 3   Ship Date      9994 non-null   datetime64[ns]
 4   Ship Mode      9994 non-null   object        
 5   Customer ID    9994 non-null   object        
 6   Customer Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal Code    9983 non-null   object        
 12  Region         9994 non-null   object        
 13  Product ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15  Sub-Category   9994 n

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Person,Returned
0,1,CA-2018-152156,2018-11-08,2018-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,Cassandra Brandow,No
1,2,CA-2018-152156,2018-11-08,2018-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,Cassandra Brandow,No
2,3,CA-2018-138688,2018-06-12,2018-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,Anna Andreadi,No
3,4,US-2017-108966,2017-10-11,2017-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,Cassandra Brandow,No
4,5,US-2017-108966,2017-10-11,2017-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,Cassandra Brandow,No


In [7]:
data[data['Postal Code'].isna()][['State', 'City', 'Postal Code']].drop_duplicates()

Unnamed: 0,State,City,Postal Code
2234,Vermont,Burlington,


In [8]:
data['Postal Code'] = data['Postal Code'].fillna('05401')

# Connection to PostgreSQL and insertion data to database

In [9]:
conn = pg2.connect(host='ls-df11e1cd190cf4b7aee7a0eb1512dbda0ae0780f.clldgu4cdaby.eu-central-1.rds.amazonaws.com', 
                   dbname='dbpostgres', 
                   user='postgres',
                   password='13154200')
conn.set_session(autocommit=True)

In [10]:
execute_query('queries/create_stg.sql', conn)

Query executed successfully


In [11]:
insert_into_table("stg.orders", data, conn)

Inserted 9994 rows into stg.orders


In [12]:
data.columns = [col.lower().replace(' ', '_').replace('-', '_') for col in data.columns]

### Customers

In [13]:
execute_query('queries/create_dw.sql', conn)

Query executed successfully


In [14]:
customers = data[['customer_id', 'customer_name', 'segment']].drop_duplicates()
customers.head()

Unnamed: 0,customer_id,customer_name,segment
0,CG-12520,Claire Gute,Consumer
2,DV-13045,Darrin Van Huff,Corporate
3,SO-20335,Sean O'Donnell,Consumer
5,BH-11710,Brosina Hoffman,Consumer
12,AA-10480,Andrew Allen,Consumer


In [15]:
insert_into_table('dw.customers', customers, conn)

Inserted 793 rows into dw.customers


### Regions

In [16]:
regions = data[['region', 'person']].drop_duplicates()
regions

Unnamed: 0,region,person
0,South,Cassandra Brandow
2,West,Anna Andreadi
14,Central,Kelly Williams
23,East,Chuck Magee


In [17]:
insert_into_table('dw.regions', regions, conn)

Inserted 4 rows into dw.regions


### Geography

In [18]:
geography = data[['country', 'city', 'state', 'postal_code', 'region']].drop_duplicates().reset_index(drop=True)
geography.head()

Unnamed: 0,country,city,state,postal_code,region
0,United States,Henderson,Kentucky,42420,South
1,United States,Los Angeles,California,90036,West
2,United States,Fort Lauderdale,Florida,33311,South
3,United States,Los Angeles,California,90032,West
4,United States,Concord,North Carolina,28027,South


In [19]:
regions_uid = regions[['region']]
regions_uid['region_uid'] = range(1, regions_uid.shape[0]+1)
regions_uid

Unnamed: 0,region,region_uid
0,South,1
2,West,2
14,Central,3
23,East,4


In [20]:
geography = geography.merge(regions_uid, on='region')
geography = geography[geography.columns.drop('region')]
geography.head()

Unnamed: 0,country,city,state,postal_code,region_uid
0,United States,Henderson,Kentucky,42420,1
1,United States,Fort Lauderdale,Florida,33311,1
2,United States,Concord,North Carolina,28027,1
3,United States,Melbourne,Florida,32935,1
4,United States,Springfield,Virginia,22153,1


In [21]:
insert_into_table('dw.geography', geography, conn)

Inserted 632 rows into dw.geography


### Products

In [22]:
products = data[['product_id', 'product_name', 'category', 'sub_category']].drop_duplicates()
products.head()

Unnamed: 0,product_id,product_name,category,sub_category
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,Office Supplies,Storage


In [23]:
insert_into_table('dw.products', products, conn)

Inserted 1894 rows into dw.products


### Orders

In [24]:
orders = data[['order_id', 'order_date']].drop_duplicates()
orders.head()

Unnamed: 0,order_id,order_date
0,CA-2018-152156,2018-11-08
2,CA-2018-138688,2018-06-12
3,US-2017-108966,2017-10-11
5,CA-2016-115812,2016-06-09
12,CA-2019-114412,2019-04-15


In [25]:
date_df = select_from_table('dw.calendar', ['date_id', 'date'], conn)
date_df['date'] = pd.to_datetime(date_df['date'])

In [26]:
orders = orders.merge(date_df, left_on='order_date', right_on='date', how='left')[['order_id', 'date_id']]

In [27]:
insert_into_table('dw.orders', orders[['order_id', 'date_id']], conn)

Inserted 5009 rows into dw.orders


### Order_facts

In [30]:
ord_facts = data[['row_id', 'order_id', 'product_id', 'product_name', 'customer_id', 'returned']]
ord_facts.loc[:, 'returned'] = ord_facts['returned'].replace({'Yes': True, 'No': False}).astype(bool)
ord_facts.head()

Unnamed: 0,row_id,order_id,product_id,product_name,customer_id,returned
0,1,CA-2018-152156,FUR-BO-10001798,Bush Somerset Collection Bookcase,CG-12520,False
1,2,CA-2018-152156,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",CG-12520,False
2,3,CA-2018-138688,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,DV-13045,False
3,4,US-2017-108966,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,SO-20335,False
4,5,US-2017-108966,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,SO-20335,False


In [31]:
prod_df = select_from_table('dw.products', ['product_uid', 'product_id', 'product_name'], conn)
ord_facts = ord_facts.merge(prod_df, on=['product_id', 'product_name'], how='left')

cust_df = select_from_table('dw.customers', ['customer_uid', 'customer_id'], conn)
ord_facts = ord_facts.merge(cust_df, on='customer_id', how='left')

ord_df = select_from_table('dw.orders', ['order_uid', 'order_id', 'date_id'], conn)
ord_facts = ord_facts.merge(ord_df, on='order_id', how='left')

ord_facts.head()

Unnamed: 0,row_id,order_id,product_id,product_name,customer_id,returned,product_uid,customer_uid,order_uid,date_id
0,1,CA-2018-152156,FUR-BO-10001798,Bush Somerset Collection Bookcase,CG-12520,False,100000,10000,1000000,3234
1,2,CA-2018-152156,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",CG-12520,False,100001,10000,1000000,3234
2,3,CA-2018-138688,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,DV-13045,False,100002,10001,1000001,3085
3,4,US-2017-108966,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,SO-20335,False,100003,10002,1000002,2841
4,5,US-2017-108966,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,SO-20335,False,100004,10002,1000002,2841


In [32]:
ord_facts = ord_facts[['row_id', 'order_uid', 'product_uid', 'customer_uid', 'returned']]

In [33]:
insert_into_table('dw.order_facts', ord_facts, conn)

Inserted 9994 rows into dw.order_facts


### Shipping

In [34]:
shipping = data[['row_id', 'ship_date', 'ship_mode', 'state', 'city', 'postal_code']]
shipping.head()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,state,city,postal_code
0,1,CA-2018-152156,2018-11-08,2018-11-11,Second Class,Kentucky,Henderson,42420
1,2,CA-2018-152156,2018-11-08,2018-11-11,Second Class,Kentucky,Henderson,42420
2,3,CA-2018-138688,2018-06-12,2018-06-16,Second Class,California,Los Angeles,90036
3,4,US-2017-108966,2017-10-11,2017-10-18,Standard Class,Florida,Fort Lauderdale,33311
4,5,US-2017-108966,2017-10-11,2017-10-18,Standard Class,Florida,Fort Lauderdale,33311


In [36]:
shipping = shipping.merge(date_df, left_on='ship_date', right_on='date', how='left')

In [37]:
geo_df = select_from_table('dw.geography', ['geo_id', 'state', 'city', 'postal_code'], conn)
shipping = shipping.merge(geo_df, on=['state', 'city', 'postal_code'], how='left')[['row_id', 'date_id', 'ship_mode', 'geo_id']]
shipping.head()

Unnamed: 0,row_id,order_uid,date_id,ship_mode,geo_id
0,1,1000000,3237,Second Class,1000
1,2,1000000,3237,Second Class,1000
2,3,1000001,3089,Second Class,1136
3,4,1000002,2848,Standard Class,1001
4,5,1000002,2848,Standard Class,1001


In [38]:
insert_into_table('dw.shipping', shipping, conn)

Inserted 9994 rows into dw.shipping


In [39]:
ord_facts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   row_id        9994 non-null   int64 
 1   order_uid     9994 non-null   int64 
 2   product_uid   9994 non-null   int64 
 3   customer_uid  9994 non-null   int64 
 4   returned      9994 non-null   object
dtypes: int64(4), object(1)
memory usage: 390.5+ KB


### Metrics

In [40]:
metrics = data[['row_id', 'sales', 'quantity', 'discount', 'profit']]
metrics.head()

Rows count:  9994


Unnamed: 0,row_id,sales,quantity,discount,profit
0,1,261.96,2,0.0,41.9136
1,2,731.94,3,0.0,219.582
2,3,14.62,2,0.0,6.8714
3,4,957.5775,5,0.45,-383.031
4,5,22.368,2,0.2,2.5164


In [41]:
insert_into_table('dw.metrics', metrics, conn)

Inserted 9994 rows into dw.metrics
