In [2]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
config = configparser.ConfigParser()
config.read('clustertab.config')

['clustertab.config']

In [6]:
db = config['POSTGRES']['PG_DB']
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

In [7]:
db

'retail_db'

In [8]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

#using psycopg2 to test connection since there are no tables
import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    print(e)
    
conn.set_session(autocommit=True)

try:
    cur = conn.cursor()
    
except:
    print(e)

In [9]:
credentials

'postgresql://postgres:1234@172.17.0.2:5432/retail_db'

In [10]:
#Helper functions to work with the database
def schemaGen(dataframe, schemaName):
    localSchema = pd.io.sql.get_schema(dataframe,schemaName)
    localSchema = localSchema.replace('TEXT','VARCHAR(255)').replace('INTEGER','NUMERIC').replace('\n','').replace('"',"")
    return "".join(localSchema)

#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),con=credentials)
    return schema

#Issue is in using pd.read_sql to write data to the database. so using psycopg2
def queryTable(query):
    try:
        schema = cur.execute(query)
        return 
    except Exception as e:
        print(e)
        
#This doesn't return anything

#Using the pd.read_sql for getting data from db
def queryBase(query):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

#This returns the dataframe

### Purpose of Pre-Defined Functions

* How to use official documentation of Postgres to get syntax and symantecs of the pre-defined functions?
* Understand different categories of functions
* How to use functions effectively using real world examples?
* How to manipulate strings and dates?
* How to deal with nulls, convert data types etc?
* Self evaluate by solving the exercises by using multiple functions in tandem.

Postgres provides robust set of pre-defined functions to come up with solutions quickly as per the business requirements. There are many functions, but we will see the most common ones here.
* Following are the categories of functions that are more commonly used.
  * String Manipulation
  * Date Manipulation
  * Numeric Functions
  * Type Conversion Functions
  * CASE and WHEN
  * and more
* One can go to the official documentation from [Postgres website]

## String Manipulation Functions

We use string manipulation functions quite extensively. Here are some of the important functions which we typically use.
* Case Conversion - `lower`, `upper`, `initcap`
* Getting size of the column value - `length`
* Extracting Data - `substr` and `split_part`
* Trimming and Padding functions - `trim`, `rtrim`, `ltrim`, `rpad` and `lpad`
* Reversing strings - `reverse`
* Concatenating multiple strings `concat` and `concat_ws`

## Date Manipulation Functions

Let us go through some of the important date manipulation functions.
* Getting Current Date and Timestamp
* Date Arithmetic using `INTERVAL` and `-` operator
* Getting beginning date or time using `date_trunc`
* Extracting information using `to_char` as well as calendar functions.
* Dealing with unix timestamp using `from_unixtime`, `to_unix_timestamp`


## Handling NULL Values

Let us understand how to handle nulls.
* By default if we try to add or concatenate null to another column or expression or literal, it will return null.
* If we want to replace null with some default value, we can use `coalesce`.
  * Replace commission_pct with 0 if it is null.
* `coalesce` returns first not null value if we pass multiple arguments to it.
* We have a function called as `nullif`. If the first argument is equal to second argument, it returns null. It is typically used when we compare against 2 columns where nulls are also involved.
* You might have seen functions like `nvl`, `nvl2` etc with respect to databases like Oracle. Postgres does not support them.

In [13]:
#There are 50 string related functions or routines
queryBase("""SELECT COUNT(1) FROM information_schema.routines 
                WHERE routine_name ~ 'str'""")

Unnamed: 0,count
0,50


In [14]:
queryBase("""SELECT substring('liquidmoon' FROM 2 for 3)""") #position and length

Unnamed: 0,substring
0,iqu


In [15]:
#Two ways
queryBase("""SELECT substring('liquidmoon',2,3)""")

Unnamed: 0,substring
0,iqu


In [17]:
#These queries are directly executing on the database server. Not creating any tables
queryBase("""SELECT lower('HellWOrlD') as lower_result,
                    upper('HellWOrlD') as upper_result,
                    initcap('HellWOrlD') as initial_cap""")

Unnamed: 0,lower_result,upper_result,initial_cap
0,hellworld,HELLWORLD,Hellworld


In [18]:
#These queries are directly executing on the database server. Not creating any tables
queryBase("""SELECT length('HellWOrlD') as wordLength""")

Unnamed: 0,wordlength
0,9


In [21]:
#The functions gets applied and they are returned in another column.
queryBase("""SELECT customer_fname AS database_fname,
                    lower(customer_fname) AS lower_custFname, 
                    upper(customer_lname) AS upper_custLname
                    FROM customers
                    LIMIT 5""")

Unnamed: 0,database_fname,lower_custfname,upper_custlname
0,Richard,richard,HERNANDEZ
1,Mary,mary,BARRETT
2,Ann,ann,SMITH
3,Mary,mary,JONES
4,Robert,robert,HUDSON


In [22]:
#Note the index is starting with 1, not 0
queryBase("""SELECT substr('2013-07-25 00:00:00.0', 1, 4) AS result""")

Unnamed: 0,result
0,2013


In [23]:
#Note the index is starting with 1, not 0
queryBase("""SELECT substring('2013-07-25 00:00:00.0', 1, 4) AS result""")

Unnamed: 0,result
0,2013


In [25]:
queryBase("""SELECT CONCAT(SUBSTRING(customer_fname, 1, 2),
                        SUBSTRING(customer_lname, 1, 2)) AS initials
          FROM customers LIMIT 2""")

Unnamed: 0,initials
0,RiHe
1,MaBa


In [26]:
queryBase("""SELECT CONCAT(SUBSTRING(customer_fname, 1, 1),
                        SUBSTRING(customer_lname, 1, 2)) AS initials
          FROM customers LIMIT 2""")

Unnamed: 0,initials
0,RHe
1,MBa


In [27]:
queryBase("""SELECT substring('2013-07-25 00:00:00.0' from 12) AS result""")
#one of the ways to access the timestamp

Unnamed: 0,result
0,00:00:00.0


In [28]:
queryBase("""SELECT left('123 456 7890', 3) AS result""")

Unnamed: 0,result
0,123


In [29]:
queryBase("""SELECT right('123 456 7890', 3) AS result""")

Unnamed: 0,result
0,890


In [33]:
queryBase("""SELECT right(customer_lname,length(customer_lname) - 4) AS result
                FROM customers LIMIT 5""")

Unnamed: 0,result
0,andez
1,ett
2,h
3,s
4,on


In [36]:
queryBase("""WITH unique_ids AS (
    SELECT '241-80-7115' AS unique_id UNION
    SELECT '694-30-6851' UNION
    SELECT '586-92-5361' UNION
    SELECT '884-65-284' UNION
    SELECT '876-99-585' UNION
    SELECT '831-59-5593' UNION
    SELECT '399-88-3617' UNION
    SELECT '733-17-4217' UNION
    SELECT '873-68-9778' UNION
    SELECT '48'
) SELECT unique_id,
    substring(unique_id FROM 1 FOR 3) AS unique_id_first3,
    substring(unique_id FROM '......$') AS unique_id_last4 
FROM unique_ids
ORDER BY unique_id""") #the ...$ option is very interesting

Unnamed: 0,unique_id,unique_id_first3,unique_id_last4
0,241-80-7115,241,0-7115
1,399-88-3617,399,8-3617
2,48,48,
3,586-92-5361,586,2-5361
4,694-30-6851,694,0-6851
5,733-17-4217,733,7-4217
6,831-59-5593,831,9-5593
7,873-68-9778,873,8-9778
8,876-99-585,876,99-585
9,884-65-284,884,65-284


In [40]:
#If in doubt can check in the below query. The ::varchar changes the type of variable4
queryBase("""SELECT order_id, substr(order_date::varchar, 5, 5) AS datepart,
            order_status
            FROM orders LIMIT 10""")

Unnamed: 0,order_id,datepart,order_status
0,1,-07-2,CLOSED
1,2,-07-2,PENDING_PAYMENT
2,3,-07-2,COMPLETE
3,4,-07-2,CLOSED
4,5,-07-2,COMPLETE
5,6,-07-2,COMPLETE
6,7,-07-2,COMPLETE
7,8,-07-2,PROCESSING
8,9,-07-2,PENDING_PAYMENT
9,10,-07-2,PENDING_PAYMENT


In [43]:
queryBase("""SELECT split_part('2013-07-26','-',3)""")#The number provides the 

Unnamed: 0,split_part
0,26


In [46]:
queryBase("""WITH addresses AS (
    SELECT '593 Fair Oaks Pass, Frankfort, Kentucky, 40618' AS address UNION
    SELECT ', Vancouver, Washington, 98687' UNION
    SELECT '83047 Glacier Hill Circle, Sacramento, California, 94237' UNION
    SELECT '935 Columbus Junction, Cincinnati, Ohio, 45213' UNION
    SELECT '03010 Nevada Crossing, El Paso, Texas, 88579' UNION
    SELECT '9 Dunning Circle, , Arizona, 85271' UNION
    SELECT '96 Fair Oaks Way, Decatur, Illinois, 62525' UNION
    SELECT '999 Caliangt Avenue, Greenville, South Carolina, 29615' UNION
    SELECT '2 Saint Paul Trail, Bridgeport, , 06673' UNION
    SELECT '3 Reindahl Center, Ogden, Utah'
) SELECT split_part(address, ', ', 1) street,
    split_part(address, ', ', 2) city,
    split_part(address, ', ', 3) state,
    split_part(address, ', ', 4) postal_code
FROM addresses
WHERE split_part(address, ',',1)=''
ORDER BY postal_code""")

Unnamed: 0,street,city,state,postal_code
0,,Vancouver,Washington,98687


In [47]:
#if is represente as case when expression end AS exprs 
queryBase("""WITH unique_ids AS (
    SELECT '241-80-7115' AS unique_id UNION
    SELECT '694-30-6851' UNION
    SELECT '586-92-5361' UNION
    SELECT '884-65-284' UNION
    SELECT '876-99-585' UNION
    SELECT '831-59-5593' UNION
    SELECT '399-88-3617' UNION
    SELECT '733-17-4217' UNION
    SELECT '873-68-9778' UNION
    SELECT '480-69-032'
) SELECT unique_id,
    substring(unique_id FROM 1 FOR 3) AS unique_id_first3,
    substring(unique_id FROM '....$') AS unique_id_last4,
    CASE WHEN length(split_part(unique_id, '-', 3)) = 4
        THEN split_part(unique_id, '-', 3)
        ELSE 'Invalid'
    END AS unique_id_last
FROM unique_ids
ORDER BY unique_id""")

Unnamed: 0,unique_id,unique_id_first3,unique_id_last4,unique_id_last
0,241-80-7115,241,7115,7115
1,399-88-3617,399,3617,3617
2,480-69-032,480,-32,Invalid
3,586-92-5361,586,5361,5361
4,694-30-6851,694,6851,6851
5,733-17-4217,733,4217,4217
6,831-59-5593,831,5593,5593
7,873-68-9778,873,9778,9778
8,876-99-585,876,-585,Invalid
9,884-65-284,884,-284,Invalid


In [48]:
queryBase("""SELECT position('@' IN 'it@versity.com'),
    position ('@' IN 'itversity.com')""")

Unnamed: 0,position,position.1
0,3,0


In [50]:
queryBase("""SELECT strpos(customer_street,' ') AS spacePos FROM customers LIMIT 5""")

Unnamed: 0,spacepos
0,5
1,5
2,5
3,5
4,3


In [52]:
 queryBase("""SELECT position(' ' IN customer_street) AS spacePos FROM customers LIMIT 5""")

Unnamed: 0,spacepos
0,5
1,5
2,5
3,5
4,3


In [57]:
queryBase("""SELECT length(rtrim('     HELLORoma     '))""")

Unnamed: 0,length
0,14


In [58]:
queryBase("""SELECT length(trim('     HELLORoma     '))""")

Unnamed: 0,length
0,9


In [62]:
#Count the number of characters, space is trimmed and the 
queryBase("""SELECT length(ltrim('     HELLORoma-----','-'))""")

Unnamed: 0,length
0,19


In [64]:
#Count the number of characters, you will realize one side of the word is trimed
queryBase("""SELECT length(ltrim('     HELLORoma'))""")

Unnamed: 0,length
0,9


In [65]:
queryBase("""SELECT 2013 AS year, 7 AS month, 25 AS myDate""")

Unnamed: 0,year,month,mydate
0,2013,7,25


In [68]:
queryBase("""SELECT lpad(7::varchar, 5, '0') AS result""")

Unnamed: 0,result
0,7


In [70]:
queryBase("""SELECT lpad(58::varchar, 20, '0') AS result""")

Unnamed: 0,result
0,58


In [71]:
queryBase("""SELECT * FROM (SELECT 2013 AS year, 7 AS month, 25 AS myDate) q""")

Unnamed: 0,year,month,mydate
0,2013,7,25


In [72]:
queryBase("""SELECT concat(year, '-', lpad(month::varchar, 2, '0'), '-',
              lpad(myDate::varchar, 2, '0')) AS order_date
FROM
    (SELECT 2013 AS year, 7 AS month, 25 AS myDate) q""")

Unnamed: 0,order_date
0,2013-07-25


In [75]:
queryBase("""SELECT replace('Halo World', 'alo', 'ello') AS result""")

Unnamed: 0,result
0,Hello World


In [77]:
queryBase("""SELECT overlay('Halo World' PLACING 'ello8z' FROM 2 for 3) AS result""")

Unnamed: 0,result
0,Hello8z World


### Starting Date Time manipulation

In [79]:
queryBase("""SELECT current_date AS current_date""")

Unnamed: 0,current_date
0,2022-11-13


In [80]:
queryBase("""SELECT current_timestamp AS current_ts""")

Unnamed: 0,current_ts
0,2022-11-13 07:26:18.707577+00:00


In [81]:
queryBase("""SELECT SUBSTRING(current_date::VARCHAR, 1, 4)""")

Unnamed: 0,substring
0,2022


In [82]:
queryBase("""SELECT current_date + interval '32 MONTHS' AS results""")

Unnamed: 0,results
0,2025-07-13


In [84]:
queryBase("""SELECT current_timestamp + interval '50 MINUTES' AS results""")

Unnamed: 0,results
0,2022-11-13 08:18:53.238883+00:00


In [85]:
queryBase("""SELECT current_timestamp + interval '3 DAYS 2 HOURS 50 MINUTES' AS results""")

Unnamed: 0,results
0,2022-11-16 10:19:11.756409+00:00


In [86]:
queryBase("""SELECT '2019-03-20'::date - '2017-12-31'::date AS subdatae""")

Unnamed: 0,subdatae
0,444


In [87]:
queryBase("""SELECT date_trunc('YEAR', current_date) AS year_beginning""")

Unnamed: 0,year_beginning
0,2022-01-01 00:00:00+00:00


In [89]:
queryBase("""SELECT date_trunc('MONTH', current_date) AS year_beginning""")

Unnamed: 0,year_beginning
0,2022-11-01 00:00:00+00:00


In [91]:
queryBase("""SELECT to_char(current_timestamp, 'yyyy') AS current_ts""")

Unnamed: 0,current_ts
0,2022


In [105]:
queryBase("""SELECT to_char((current_timestamp + interval '5 days'), 'D') AS current_ts""")

Unnamed: 0,current_ts
0,6


In [106]:
queryBase("""SELECT to_char((current_timestamp + interval '5 days'), 'HH') AS current_ts""")

Unnamed: 0,current_ts
0,7


In [107]:
queryBase("""SELECT to_char((current_timestamp + interval '5 days'), 'dayHH') AS current_ts""")

Unnamed: 0,current_ts
0,friday 07


In [110]:
queryBase("""SELECT to_char((current_timestamp + interval '5 days'), 'yyyy-dayHH') AS current_ts""")

Unnamed: 0,current_ts
0,2022-friday 07


In [111]:
queryBase("""SELECT to_char((current_timestamp + interval '5 days'), 'yyyy-day:HH:MM:SS') AS current_ts""")

Unnamed: 0,current_ts
0,2022-friday :07:11:53


In [112]:
queryBase("""SELECT EXTRACT(week FROM current_date)""")

Unnamed: 0,extract
0,45.0


In [114]:
queryBase("""SELECT EXTRACT(doy FROM current_date)""")

Unnamed: 0,extract
0,317.0


In [116]:
queryBase("""SELECT EXTRACT(minute FROM current_timestamp)""")

Unnamed: 0,extract
0,42.0


In [117]:
queryBase("""SELECT EXTRACT('epoch' FROM current_timestamp)""")

Unnamed: 0,extract
0,1668325000.0


### Numerical Functions

In [119]:
queryBase("""SELECT abs(-10.5), abs(10)""")

Unnamed: 0,abs,abs.1
0,10.5,10


In [121]:
#First Filter
queryBase("""SELECT order_item_subtotal FROM order_items
WHERE order_item_order_id = 2""")

Unnamed: 0,order_item_subtotal
0,199.99
1,250.0
2,129.99


In [120]:
#Then make functions to work on that
queryBase("""SELECT avg(order_item_subtotal) AS order_revenue_avg FROM order_items
WHERE order_item_order_id = 2""")

Unnamed: 0,order_revenue_avg
0,193.326667


In [123]:
queryBase("""
            SELECT order_item_order_id, 
                sum(order_item_subtotal) AS order_revenue_sum,
                avg(order_item_subtotal) AS order_revenue_avg
            FROM order_items
            GROUP BY order_item_order_id
            ORDER BY order_item_order_id
            LIMIT 10
""")

Unnamed: 0,order_item_order_id,order_revenue_sum,order_revenue_avg
0,1,299.98,299.98
1,2,579.98,193.326667
2,4,699.85,174.9625
3,5,1129.86,225.972
4,7,579.92,193.306667
5,8,729.84,182.46
6,9,599.96,199.986667
7,10,651.92,130.384
8,11,919.79,183.958
9,12,1299.87,259.974


In [130]:
queryBase("""
SELECT
    round(10.48,1) rnd,
    floor(10.48) flr,
    ceil(100.478) ciel""")

Unnamed: 0,rnd,flr,ciel
0,10.5,10.0,101.0


In [133]:
#When using the expression inside the function the the results have to be type casted
queryBase("""SELECT order_item_order_id, 
    round(sum(order_item_subtotal)::numeric, 2) AS order_revenue_avg 
FROM order_items
GROUP BY order_item_order_id
LIMIT 5""")

Unnamed: 0,order_item_order_id,order_revenue_avg
0,1,299.98
1,2,579.98
2,4,699.85
3,5,1129.86
4,7,579.92


In [134]:
queryBase("""SELECT order_item_order_id, 
    round(sum(order_item_subtotal)::numeric, 2) AS order_revenue_sum,
    min(order_item_subtotal) AS order_item_subtotal_min,
    max(order_item_subtotal) AS order_item_subtotal_max 
FROM order_items
GROUP BY order_item_order_id
LIMIT 10""")

Unnamed: 0,order_item_order_id,order_revenue_sum,order_item_subtotal_min,order_item_subtotal_max
0,1,299.98,299.98,299.98
1,2,579.98,129.99,250.0
2,4,699.85,49.98,299.95
3,5,1129.86,99.96,299.98
4,7,579.92,79.95,299.98
5,8,729.84,50.0,299.95
6,9,599.96,199.98,199.99
7,10,651.92,21.99,199.99
8,11,919.79,49.98,399.96
9,12,1299.87,100.0,499.95


In [135]:
queryBase("""SELECT (random() * 100)::int + 1""")

Unnamed: 0,?column?
0,41


In [137]:
queryBase("""
SELECT '09'::int AS result
""")

Unnamed: 0,result
0,9


In [138]:
queryBase("""
SELECT '09'::int AS result
""")

Unnamed: 0,result
0,9


In [139]:
queryBase("""
SELECT cast('0.04000' AS FLOAT) AS result
""")

Unnamed: 0,result
0,0.04


In [140]:
queryBase("""
SELECT to_char('2020-09-30'::date, 'MM') AS month
""")

Unnamed: 0,month
0,9


In [139]:
queryBase("""
SELECT cast('0.04000' AS FLOAT) AS result
""")

Unnamed: 0,result
0,0.04


In [141]:
queryBase("""
SELECT split_part('2020-09-30', '-', 2) AS month
""")

Unnamed: 0,month
0,9


In [143]:
queryBase("""
SELECT to_char('2020-09-30', 'DAY') AS month
""")

ProgrammingError: (psycopg2.errors.AmbiguousFunction) function to_char(unknown, unknown) is not unique
LINE 2: SELECT to_char('2020-09-30', 'DAY') AS month
               ^
HINT:  Could not choose a best candidate function. You might need to add explicit type casts.

[SQL: 
SELECT to_char('2020-09-30', 'DAY') AS month
]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [144]:
queryBase("""
SELECT to_char('2020-09-30'::date, 'DAY') AS month
""")

Unnamed: 0,month
0,WEDNESDAY


### Null handlers

In [146]:
queryTable("""
CREATE TABLE IF NOT EXISTS sales(
    sales_person_id INT,
    sales_amount FLOAT,
    commission_pct INT
)""")

In [147]:
queryBase("""INSERT INTO sales VALUES
    (1, 1000, 10),
    (2, 1500, 8),
    (3, 500, NULL),
    (4, 800, 5),
    (5, 250, NULL)""")

ResourceClosedError: This result object does not return rows. It has been closed automatically.

In [148]:
queryBase("""SELECT * FROM sales""")

Unnamed: 0,sales_person_id,sales_amount,commission_pct
0,1,1000.0,10.0
1,2,1500.0,8.0
2,3,500.0,
3,4,800.0,5.0
4,5,250.0,


In [149]:
queryBase("""SELECT s.*, 
    round((sales_amount * commission_pct / 100)::numeric, 2) AS incorrect_commission_amount
FROM sales AS s""")

Unnamed: 0,sales_person_id,sales_amount,commission_pct,incorrect_commission_amount
0,1,1000.0,10.0,100.0
1,2,1500.0,8.0,120.0
2,3,500.0,,
3,4,800.0,5.0,40.0
4,5,250.0,,


In [150]:
queryBase("""SELECT s.*, 
    round((sales_amount * coalesce(commission_pct, 0) / 100)::numeric, 2) AS commission_amount
FROM sales AS s""")

Unnamed: 0,sales_person_id,sales_amount,commission_pct,commission_amount
0,1,1000.0,10.0,100.0
1,2,1500.0,8.0,120.0
2,3,500.0,,0.0
3,4,800.0,5.0,40.0
4,5,250.0,,0.0


In [151]:
queryBase("""SELECT nullif(1, 1)""")

Unnamed: 0,nullif
0,


In [152]:
queryBase("""SELECT nullif(1, 0)""")

Unnamed: 0,nullif
0,1
