In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load connection settings from an INI-style file kept outside the notebook.
# config.read() returns the list of files it parsed, which is what the cell
# displays; an empty list would mean the file was not found.
config = configparser.ConfigParser()
config.read('clustertab.config')

['clustertab.config']

In [3]:
# Connection settings all live in the [POSTGRES] section of the config file;
# look the section up once and read each key from it.
_pg_section = config['POSTGRES']
db = _pg_section['PG_DB']
user = _pg_section['PG_UNAME']
passwd = _pg_section['PG_PASS']
port = _pg_section['PG_PORT']
host = _pg_section['PG_HOST']

In [4]:
# Sanity check: show which database name was read from the config.
db

'retail_db'

In [5]:
# SQLAlchemy-style URI; the read helpers pass it to pd.read_sql as `con`.
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

# A raw psycopg2 connection is kept alongside the URI for statements that
# return no rows (DDL/DML), which queryTable runs through `cur`.
import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    # Nothing below can work without a connection, so report the real error
    # and stop here instead of failing later with a NameError on `conn`.
    print(e)
    raise

# Autocommit so each statement takes effect without an explicit commit.
conn.set_session(autocommit=True)

try:
    cur = conn.cursor()
except Exception as e:
    # The exception must be bound in this handler: the name bound by the
    # earlier handler is not available here.
    print(e)
    raise

In [6]:
# Show the connection URI with the password masked, so the secret does not
# end up stored in the notebook's saved output.
"postgresql://{}:{}@{}:{}/{}".format(user, "****", host, port, db)

'postgresql://postgres:****@172.17.0.2:5432/retail_db'

In [7]:
#Helper functions to work with the database
def schemaGen(dataframe, schemaName):
    """Return a one-line CREATE TABLE statement derived from ``dataframe``.

    The DDL produced by pandas is post-processed with plain text
    substitutions, applied in order: TEXT becomes VARCHAR(255), INTEGER
    becomes NUMERIC, and newlines and double quotes are stripped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns define the table columns.
    schemaName : str
        Name passed to pandas as the table name.

    Returns
    -------
    str
        The rewritten DDL on a single line.
    """
    ddl = pd.io.sql.get_schema(dataframe, schemaName)
    # Substitutions are literal and order-dependent; they apply to the whole
    # statement text, not only to the type keywords.
    substitutions = (
        ('TEXT', 'VARCHAR(255)'),
        ('INTEGER', 'NUMERIC'),
        ('\n', ''),
        ('"', ""),
    )
    for old, new in substitutions:
        ddl = ddl.replace(old, new)
    return ddl

#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    """Return the information_schema.columns rows for one table.

    Parameters
    ----------
    tableName : str
        Table name to look up; matched exactly against ``table_name``.
    credentials : str
        Connection URI (or other connectable) handed to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        One row per column of the table; empty if no table matches.
    """
    # Bind the table name as a driver parameter instead of formatting it into
    # the SQL text, so quotes in the name cannot alter the statement.
    # %(name)s is the psycopg2 placeholder style.
    query = "SELECT * FROM information_schema.columns where table_name=%(table_name)s"
    schema = pd.read_sql(query, con=credentials, params={"table_name": tableName})
    return schema

#pd.read_sql has issues with statements that write to the database, so those go through the psycopg2 cursor instead
def queryTable(query):
    """Execute a statement on the shared psycopg2 cursor ``cur``.

    Intended for statements whose result is not needed (DDL/DML). Any
    exception is printed rather than raised, so a failing statement does not
    stop the notebook.

    Parameters
    ----------
    query : str
        SQL text to execute.

    Returns
    -------
    None
    """
    try:
        cur.execute(query)
    except Exception as err:
        # Best-effort by design: report the error and carry on.
        print(err)
    return None
        
#This doesn't return anything

#Using the pd.read_sql for getting data from db
def queryBase(query):
    """Run a row-returning query and hand back the result as a DataFrame.

    The connection comes from the notebook-level ``credentials`` URI.

    Parameters
    ----------
    query : str
        SQL text passed to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    return pd.read_sql(query, con=credentials)

#This returns the dataframe

In [9]:
# Define orders_v as a pass-through view exposing every column of orders.
# CREATE OR REPLACE keeps the cell re-runnable.
queryTable("""CREATE OR REPLACE VIEW orders_v
                AS
            SELECT * FROM orders""")

In [10]:
# List catalog entries whose name matches 'orders'. `~` is a regular-expression
# match, so any name containing "orders" qualifies (tables and views alike).
queryBase("""SELECT * FROM information_schema.tables
            WHERE table_name ~ 'orders'""")

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action
0,retail_db,public,orders,BASE TABLE,,,,,,YES,NO,
1,retail_db,public,janorders,BASE TABLE,,,,,,YES,NO,
2,retail_db,public,orders_v,VIEW,,,,,,YES,NO,


In [15]:
# Update through the view. orders_v selects from a single table, so the
# UPDATE is applied to the underlying orders rows.
queryTable("""UPDATE orders_v 
                SET order_status = upper(order_status)""")

In [16]:
# Read back through the view to confirm the statuses are now upper-case.
queryBase("""SELECT * FROM orders_v LIMIT 5""")

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,21,2013-07-25,2711,PENDING
1,22,2013-07-25,333,COMPLETE
2,23,2013-07-25,4367,PENDING_PAYMENT
3,24,2013-07-25,11441,CLOSED
4,25,2013-07-25,9503,CLOSED


In [17]:
# Read the base table directly: it shows the same upper-case values, i.e. the
# update made through the view landed in orders.
queryBase("""SELECT * FROM orders LIMIT 5""")

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,21,2013-07-25,2711,PENDING
1,22,2013-07-25,333,COMPLETE
2,23,2013-07-25,4367,PENDING_PAYMENT
3,24,2013-07-25,11441,CLOSED
4,25,2013-07-25,9503,CLOSED


In [18]:
# Join view: each order item together with the columns of its parent order.
queryTable("""CREATE OR REPLACE VIEW order_details
                AS
                SELECT * FROM orders o
                JOIN order_items oi 
                ON o.order_id = oi.order_item_order_id""")

In [19]:
# Peek at the joined view: order columns followed by order_items columns.
queryBase("SELECT * FROM order_details LIMIT 5")

Unnamed: 0,order_id,order_date,order_customer_id,order_status,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,1,2013-07-25,11599,CLOSED,1,1,957,1,299.98,299.98
1,2,2013-07-25,256,PENDING_PAYMENT,2,2,1073,1,199.99,199.99
2,2,2013-07-25,256,PENDING_PAYMENT,3,2,502,5,250.0,50.0
3,2,2013-07-25,256,PENDING_PAYMENT,4,2,403,1,129.99,129.99
4,4,2013-07-25,8827,CLOSED,5,4,897,2,49.98,24.99


In [21]:
# The view hides the join: revenue per date and product is aggregated as if
# order_details were a single table. Results are ordered by date, then by
# revenue from highest to lowest.
queryBase("""SELECT order_date,
    order_item_product_id,
    round(sum(order_item_subtotal)::numeric, 2) AS revenue
FROM order_details 
GROUP BY order_date,
    order_item_product_id
ORDER BY order_date,
    revenue DESC
LIMIT 10""")

Unnamed: 0,order_date,order_item_product_id,revenue
0,2013-07-25,1004,10799.46
1,2013-07-25,957,9599.36
2,2013-07-25,191,8499.15
3,2013-07-25,365,7558.74
4,2013-07-25,1073,6999.65
5,2013-07-25,1014,6397.44
6,2013-07-25,403,5589.57
7,2013-07-25,502,5100.0
8,2013-07-25,627,2879.28
9,2013-07-25,226,599.99


In [23]:
# Demonstration of a failure: order_details is built on a join, so PostgreSQL
# refuses to update it. queryTable prints the error instead of raising.
queryTable("""UPDATE order_details
SET
    order_status = 'pending_payment'
WHERE order_id = 2""")

cannot update view "order_details"
DETAIL:  Views that do not select from a single table or view are not automatically updatable.
HINT:  To enable updating the view, provide an INSTEAD OF UPDATE trigger or an unconditional ON UPDATE DO INSTEAD rule.



In [24]:
# Confirm the rejected update left the data untouched.
queryBase("""SELECT * FROM orders_v LIMIT 5""")

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,21,2013-07-25,2711,PENDING
1,22,2013-07-25,333,COMPLETE
2,23,2013-07-25,4367,PENDING_PAYMENT
3,24,2013-07-25,11441,CLOSED
4,25,2013-07-25,9503,CLOSED


### Overall: CREATE VIEW provides a reusable reference point
The more complex a query is, the more it pays to put it inside a view and refer to the view instead. A view can be referenced from any query in the database; it is not restricted to the query in which it was defined.

### Using With Clause : Vanishing Views
The purpose of the WITH clause is to improve the readability of a query and reduce confusion. The name it defines is valid only within that query; once the query finishes, the temporary view vanishes.

### Subqueries

Sub queries are commonly used with queries using analytic functions to filter the data further. We will see details after going through analytic functions as part of this section.

It is mandatory to have alias for the sub query.

Sub queries can also be used in the WHERE clause with IN as well as EXISTS. As part of the sub query we can have join-like conditions between the tables in the FROM clause of the main query and those of the sub query. Such queries are called Nested Sub Queries.

You can see examples with the `IN` as well as the `EXISTS` operators.
 
### CTAS - Create Table as Select

Let us understand details related to CTAS or Create Table As Select.

CTAS is primarily used to create tables based on query results.
Following are some of the use cases for which we typically use CTAS:

* Taking backups of tables for troubleshooting and debugging performance issues.
* Reorganizing the tables for performance tuning.
* Getting query results into a table for data analysis as well as checking data quality.

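We cannot specify column data types as part of the CREATE TABLE clause in CTAS. The column names and data types are picked up from the SELECT clause (PostgreSQL optionally accepts a list of column names after the table name, but not their types).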

At times we have to create an empty table with only the structure of the source table. We can specify an always-false condition such as `1 = 2` as part of the `WHERE` clause using CTAS.

### Advanced DML Operations

* We can insert query results into a table using `INSERT` with `SELECT`.
* As long as columns specified for table in `INSERT` statement and columns projected in `SELECT` clause match, it works.
* We can also use query results for `UPDATE` as well as `DELETE`.


We need to use nested sub queries as part of the delete with `NOT EXISTS` or `NOT IN` as demonstrated below. We cannot use direct joins as part of the `DELETE`.


## Merging or Upserting Data

At times we need to merge or upsert the data (update existing records and insert new records)

* One way to achieve a merge or upsert is to develop 2 statements - one to update and the other to insert.
* The queries in both statements (update and insert) should return mutually exclusive results.
* Even though the statements can be executed in any order, updating first and then inserting performs better in most cases (as the update has to deal with fewer records with this approach).
* We can also take care of merge or upsert using `INSERT` with `ON CONFLICT (columns) DO UPDATE`.

Merging and upserting need more thorough study, so parking them for the moment. Moving on to pivoting and analytic functions.

In [26]:
# Same join as the order_details view, but named with a WITH clause: the name
# order_detailQ exists only for the duration of this one statement.
queryBase("""WITH order_detailQ AS(
                SELECT * FROM orders o
                JOIN order_items oi
                ON o.order_id = oi.order_item_order_id
                ) SELECT * FROM order_detailQ LIMIT 5""")

Unnamed: 0,order_id,order_date,order_customer_id,order_status,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,1,2013-07-25,11599,CLOSED,1,1,957,1,299.98,299.98
1,2,2013-07-25,256,PENDING_PAYMENT,2,2,1073,1,199.99,199.99
2,2,2013-07-25,256,PENDING_PAYMENT,3,2,502,5,250.0,50.0
3,2,2013-07-25,256,PENDING_PAYMENT,4,2,403,1,129.99,129.99
4,4,2013-07-25,8827,CLOSED,5,4,897,2,49.98,24.99


In [27]:
# Persist the revenue aggregation as a view. The WITH clause inside the view
# body names the join, and the outer SELECT sums item subtotals per order
# date and product, rounded to 2 decimals.
queryTable("""CREATE OR REPLACE VIEW daily_product_revenue_v
AS
WITH order_details_nq AS (
    SELECT * FROM orders o
        JOIN order_items oi
            on o.order_id = oi.order_item_order_id
) SELECT order_date,
    order_item_product_id,
    round(sum(order_item_subtotal)::numeric, 2) AS revenue
FROM order_details_nq 
GROUP BY order_date,
    order_item_product_id""")

In [29]:
# Sample the view; no ORDER BY is given, so row order is not guaranteed.
queryBase("""SELECT * FROM daily_product_revenue_v LIMIT 5""")

Unnamed: 0,order_date,order_item_product_id,revenue
0,2013-07-25,24,319.96
1,2013-07-25,37,69.98
2,2013-07-25,93,74.97
3,2013-07-25,134,100.0
4,2013-07-25,191,8499.15


In [32]:
# Nested sub query with EXISTS: keep only the order items whose parent order
# exists. The sub query is correlated to the outer row through
# oi.order_item_order_id.
queryBase("""SELECT * FROM order_items oi
            WHERE EXISTS (
            SELECT 1 FROM orders o
            WHERE o.order_id = oi.order_item_order_id
            )LIMIT 10""")

Unnamed: 0,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,1,1,957,1,299.98,299.98
1,2,2,1073,1,199.99,199.99
2,3,2,502,5,250.0,50.0
3,4,2,403,1,129.99,129.99
4,5,4,897,2,49.98,24.99
5,6,4,365,5,299.95,59.99
6,7,4,502,3,150.0,50.0
7,8,4,1014,4,199.92,49.98
8,9,5,957,1,299.98,299.98
9,10,5,365,5,299.95,59.99


In [33]:
# CTAS used as a straight backup: copy every row and column of customers.
# Plain CREATE TABLE fails if customers_backup already exists; queryTable
# will print that error on a re-run.
queryTable("""CREATE TABLE customers_backup
            AS
            SELECT * FROM customers""")

In [34]:
# Verify the backup table was populated.
queryBase("""SELECT * FROM customers_backup LIMIT 5""")

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
2,3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725
3,4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
4,5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,10 Crystal River Mall,Caguas,PR,725


In [37]:
# CTAS with derived columns: back up orders with the date split into integer
# parts. Every to_char() pattern must produce digits only, because the result
# is cast to int. 'MM' gives the month number, whereas 'Mon' would give an
# abbreviation such as 'Jul' and make the cast fail.
queryTable("""CREATE TABLE orders_backup
AS
SELECT order_id,
    to_char(order_date, 'yyyy')::int AS order_year,
    to_char(order_date, 'MM')::int AS order_month,
    to_char(order_date, 'dd')::int AS order_day_of_month,
    to_char(order_date, 'DDD')::int AS order_day_of_year,
    order_customer_id,
    order_status
FROM orders""")

relation "orders_backup" already exists



In [38]:
# Inspect the backup: the date parts appear as separate integer columns.
queryBase("""SELECT * FROM orders_backup LIMIT 5""")

Unnamed: 0,order_id,order_year,order_month,order_day_of_month,order_day_of_year,order_customer_id,order_status
0,21,2013,7,25,206,2711,PENDING
1,22,2013,7,25,206,333,COMPLETE
2,23,2013,7,25,206,4367,PENDING_PAYMENT
3,24,2013,7,25,206,11441,CLOSED
4,25,2013,7,25,206,9503,CLOSED


In [39]:
# CTAS with an always-false predicate: copies the column structure of
# order_items but no rows.
queryTable("""CREATE TABLE order_items_empty
AS
SELECT * FROM order_items WHERE 1 = 2""")

In [40]:
# Create the metrics table with explicit column types (not via CTAS).
# order_month is CHAR(7), the width of a 'yyyy-MM' string.
queryTable("""CREATE TABLE customer_order_metrics_mthly (
    customer_id INT,
    order_month CHAR(7),
    order_count INT,
    order_revenue FLOAT
)""")

In [43]:
# Composite primary key: at most one metrics row per (order_month, customer_id).
queryTable("""ALTER TABLE customer_order_metrics_mthly
                ADD PRIMARY KEY (order_month, customer_id)""")

In [44]:
# INSERT ... SELECT: seed one row per customer and month. No column list is
# given, so the SELECT columns map to the table columns by position.
# count(1) runs over the orders/order_items join, so it counts joined item
# rows. order_revenue is inserted as NULL at this stage.
queryTable("""INSERT INTO customer_order_metrics_mthly
SELECT o.order_customer_id,
    to_char(o.order_date, 'yyyy-MM') AS order_month,
    count(1) order_count,
    NULL
FROM orders o 
    JOIN order_items oi
        ON o.order_id = oi.order_item_order_id
GROUP BY o.order_customer_id,
    to_char(o.order_date, 'yyyy-MM')""")

In [46]:
# Inspect the seeded rows; order_revenue is still empty at this point.
queryBase("""SELECT * FROM customer_order_metrics_mthly LIMIT 5""")

Unnamed: 0,customer_id,order_month,order_count,order_revenue
0,7343,2014-05,5,
1,5038,2014-03,1,
2,5291,2014-07,4,
3,9201,2014-03,1,
4,10312,2014-03,4,


In [47]:
# UPDATE from a query: recompute order_count and order_revenue for the
# '2013-08' rows only. The correlated sub query aggregates the
# orders/order_items join for the customer and month of the row being updated.
# The outer WHERE EXISTS restricts the update to rows that have matching
# orders, so rows without a match are not overwritten with NULLs.
# NOTE(review): the '2013-08' filter is repeated on both o.order_date and
# comd.order_month; given the equality between them one of each pair looks
# redundant - confirm before simplifying.
queryTable("""UPDATE customer_order_metrics_mthly comd
SET 
    (order_count, order_revenue) = (
        SELECT count(1),
            round(sum(order_item_subtotal)::numeric, 2)
        FROM orders o 
            JOIN order_items oi
                ON o.order_id = oi.order_item_order_id
        WHERE o.order_customer_id = comd.customer_id
            AND to_char(o.order_date, 'yyyy-MM') = comd.order_month
            AND to_char(o.order_date, 'yyyy-MM') = '2013-08'
            AND comd.order_month = '2013-08'
        GROUP BY o.order_customer_id,
            to_char(o.order_date, 'yyyy-MM')
    )
WHERE EXISTS (
    SELECT 1 FROM orders o
    WHERE o.order_customer_id = comd.customer_id
        AND to_char(o.order_date, 'yyyy-MM') = comd.order_month
        AND to_char(o.order_date, 'yyyy-MM') = '2013-08'
) AND comd.order_month = '2013-08'""")

In [48]:
# Verify the update: the '2013-08' rows should now carry a revenue value.
queryBase("""SELECT * FROM customer_order_metrics_mthly
WHERE order_month = '2013-08'
ORDER BY order_month,
    customer_id
LIMIT 10""")

Unnamed: 0,customer_id,order_month,order_count,order_revenue
0,2,2013-08,5,769.82
1,13,2013-08,5,1065.93
2,14,2013-08,3,459.97
3,18,2013-08,1,129.99
4,20,2013-08,2,739.91
5,22,2013-08,5,769.96
6,24,2013-08,2,399.91
7,25,2013-08,1,129.99
8,33,2013-08,3,929.92
9,34,2013-08,4,789.92
