In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
config = configparser.ConfigParser()
config.read('clustertab.config')

['clustertab.config']

In [8]:
db = 'partition_training'
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

In [9]:
db

'partition_training'

In [10]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

#using psycopg2 to test connection since there are no tables
import psycopg2
try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    print(e)
    
conn.set_session(autocommit=True)

try:
    cur = conn.cursor()
    
except:
    print(e)

In [11]:
credentials

'postgresql://postgres:1234@172.17.0.2:5432/partition_training'

In [12]:
#Helper functions to work with the database
def schemaGen(dataframe, schemaName):
    localSchema = pd.io.sql.get_schema(dataframe,schemaName)
    localSchema = localSchema.replace('TEXT','VARCHAR(255)').replace('INTEGER','NUMERIC').replace('\n','').replace('"',"")
    return "".join(localSchema)

#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),con=credentials)
    return schema

#Issue is in using pd.read_sql to write data to the database. so using psycopg2
def queryTable(query):
    try:
        schema = cur.execute(query)
        return 
    except Exception as e:
        print(e)
        
#This doesn't return anything

#Using the pd.read_sql for getting data from db
def queryBase(query):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

#This returns the dataframe

### Purpose of Partitioning

- Main purpose is to get performance benefits by segregating the data into partitions

- Managing the partitions individually is much easier when deleting or moving the partitions

- When reporting are sending the data for analysis down the pipe-line, it is simpler to refer to a particular partition. (In the exercises there is a scenario to find the order details in Jan month. In such scenarios, the partitioning by Range will help) 

These partitions are created during the Table creation itsels. So this is a Data Definition Task.

Before the exercise is started, a seperate Database is created. Then all the activities are done in that database to avoid confusion

We will be working on all three types of List, Range and Hash Partitions here. The flow is always the creation, management and manipulating data for each of the partitions. 

This activity provides the insight into how glue and later parquet files work. 

In [14]:
queryTable("""CREATE TABLE users_part (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', --U and A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (user_role, user_id)
) PARTITION BY LIST(user_role)""")

In [15]:
#Why create this?
queryTable("""CREATE INDEX users_part_email_id_idx 
    ON users_part(user_email_id)""")

In [16]:
# Fail as the data contains No partition information
queryTable("""INSERT INTO users_part (user_first_name, user_last_name, user_email_id)
VALUES 
    ('Scott', 'Tiger', 'scott@tiger.com'),
    ('Donald', 'Duck', 'donald@duck.com'),
    ('Mickey', 'Mouse', 'mickey@mouse.com')""")

no partition of relation "users_part" found for row
DETAIL:  Partition key of the failing row contains (user_role) = (U).



In [17]:
queryTable("""CREATE TABLE users_part_default 
                PARTITION OF users_part DEFAULT""")

Once the partition is initiated, the same can be listed using \d+ in the server prompt.

Also the number of partitions start increasing under the table

In [18]:
queryTable("""INSERT INTO users_part (user_first_name, user_last_name, user_email_id, user_role)
VALUES 
    ('Scott', 'Tiger', 'scott@tiger.com', 'U'),
    ('Donald', 'Duck', 'donald@duck.com', 'U'),
    ('Mickey', 'Mouse', 'mickey@mouse.com', 'U')""")

In [19]:
queryBase("""SELECT * FROM users_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
1,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
2,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391


In [20]:
#At this moment there seems to be no difference between the tables. It will soon change
queryBase("""SELECT * FROM users_part_default""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
1,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
2,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391


In [21]:
queryTable("""CREATE TABLE users_part_a 
                PARTITION OF users_part  
                    FOR VALUES IN ('A')""")

In [22]:
queryTable("""UPDATE users_part
                SET user_role = 'A'
                    WHERE user_email_id = 'scott@tiger.com'""")

In [23]:
queryBase("""SELECT * FROM users_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,A,False,2022-11-13,2022-11-13 02:05:47.377391
1,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
2,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391


In [24]:
#The data point update has created the below data insertion into users_part_a
queryBase("""SELECT * FROM users_part_a""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,A,False,2022-11-13,2022-11-13 02:05:47.377391


In [25]:
#The data from users_part_default has moved directly to users_part_a. Only the data point was updated
queryBase("""SELECT * FROM users_part_default""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
1,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391


In [26]:
#The "U" values are currently considered default
queryTable("""CREATE TABLE users_part_u 
                    PARTITION OF users_part  
                        FOR VALUES IN ('U')""")

updated partition constraint for default partition "users_part_default" would be violated by some row



In [27]:
#detaching partitions
queryTable("""ALTER TABLE users_part
                    DETACH PARTITION users_part_default""")

In [28]:
#The "U" values can now be moved into new partition
queryTable("""CREATE TABLE users_part_u 
                    PARTITION OF users_part  
                        FOR VALUES IN ('U')""")

In [29]:
#The data from users_part_default is not moved into users_part_u
queryBase("""SELECT * FROM users_part_u""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts


In [30]:
#The data from users_part_default is manually inserted into 
queryTable("""INSERT INTO users_part_u
                SELECT * FROM users_part_default""")

In [31]:
queryBase("""SELECT * FROM users_part_u""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
1,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391


In [32]:
#The data from users_part_default is manually inserted into 
queryTable("""DROP TABLE users_part_default""")

In [34]:
queryTable("""CREATE TABLE users_part_default
                    PARTITION OF users_part DEFAULT""")

In [36]:
queryTable("""INSERT INTO users_part (user_first_name, user_last_name, user_email_id)
VALUES 
    ('tosca', 'Major', 'tosca@gmail.com'),
    ('Aki', 'Kimbo', 'kimbo@acimbo.com')""")

In [38]:
#There is default setting in DDL of the users_part, so the above data points have 'U'
queryBase("""SELECT * FROM users_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,A,False,2022-11-13,2022-11-13 02:05:47.377391
1,3,Donald,Duck,donald@duck.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
2,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
3,5,tosca,Major,tosca@gmail.com,False,,U,False,2022-11-13,2022-11-13 02:24:55.698886
4,6,Aki,Kimbo,kimbo@acimbo.com,False,,U,False,2022-11-13,2022-11-13 02:24:55.698886


In [40]:
queryTable("""INSERT INTO users_part (user_first_name, user_last_name, user_email_id,user_role)
VALUES 
    ('kili', 'manjaror', 'kimi@kiliman.com','N')""")

In [41]:
queryBase("""SELECT * FROM users_part_default""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,7,kili,manjaror,kimi@kiliman.com,False,,N,False,2022-11-13,2022-11-13 02:30:09.045313


In [42]:
queryTable("""DELETE FROM users_part WHERE user_email_id = 'donald@duck.com'""")

In [43]:
queryBase("""SELECT * FROM users_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Scott,Tiger,scott@tiger.com,False,,A,False,2022-11-13,2022-11-13 02:05:47.377391
1,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
2,5,tosca,Major,tosca@gmail.com,False,,U,False,2022-11-13,2022-11-13 02:24:55.698886
3,6,Aki,Kimbo,kimbo@acimbo.com,False,,U,False,2022-11-13,2022-11-13 02:24:55.698886
4,7,kili,manjaror,kimi@kiliman.com,False,,N,False,2022-11-13,2022-11-13 02:30:09.045313


In [46]:
queryTable("""DELETE FROM users_part WHERE user_email_id ~ 'tosca'""")

In [47]:
queryBase("""SELECT * FROM users_part_u""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,4,Mickey,Mouse,mickey@mouse.com,False,,U,False,2022-11-13,2022-11-13 02:05:47.377391
1,6,Aki,Kimbo,kimbo@acimbo.com,False,,U,False,2022-11-13,2022-11-13 02:24:55.698886


### Starting the range partition exercise

  * Create table using `PARTITION BY RANGE`
  * Add default and range specific partitions
  * Validate by inserting data into the table

How the data will be provided in reality. As some CSV or User input. When it is provided as CSV file, the files are split based on partitions into individual files or even into 
seperate folders. 

If I need to compare the effort of Glue with manual data engineering, the process of moving the data from multiple CSV files from different folders has to be understood

In [48]:
queryTable("""DROP TABLE IF EXISTS users_part""")

In [49]:
queryTable("""CREATE TABLE users_range_part (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', --U and A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (created_dt, user_id)
) PARTITION BY RANGE(created_dt)""")

* We can add partition to existing partitioned table using `CREATE TABLE partition_name PARTITION OF table_name`.
* We can have a partition for default values so that all the data that does not satisfy the partition condition can be added to it.
* We can have a partition for specific range of values using `FOR VALUES FROM (from_value) TO (to_value)` as part of `CREATE TABLE partition_name PARTITION OF table_name`.
* Once partitions are added, we can insert data into the partitioned table.


In [50]:
queryTable("""CREATE TABLE users_range_part_flt
                PARTITION OF users_range_part DEFAULT""")

In [52]:
queryTable("""CREATE TABLE users_range_part_2016
                    PARTITION OF users_range_part
                        FOR VALUES FROM ('2016-01-01') TO ('2016-12-31');
            CREATE TABLE users_range_part_2017
                PARTITION OF users_range_part
                    FOR VALUES FROM ('2017-01-01') TO ('2017-12-31');
            CREATE TABLE users_range_part_2018
                PARTITION OF users_range_part
                    FOR VALUES FROM ('2018-01-01') TO ('2018-12-31');
            CREATE TABLE users_range_part_2019
                PARTITION OF users_range_part
                    FOR VALUES FROM ('2019-01-01') TO ('2019-12-31');
""")

In [53]:
queryTable("""INSERT INTO users_range_part 
    (user_first_name, user_last_name, user_email_id, created_dt)
VALUES 
    ('Scott', 'Tiger', 'scott@tiger.com', '2018-10-01'),
    ('Donald', 'Duck', 'donald@duck.com', '2019-02-10'),
    ('Mickey', 'Mouse', 'mickey@mouse.com', '2017-06-22')""")

In [55]:
queryBase("""SELECT * FROM users_range_part_flt""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts


In [56]:
queryBase("""SELECT * FROM users_range_part_2017""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Mickey,Mouse,mickey@mouse.com,False,,U,False,2017-06-22,2022-11-13 03:16:58.713794


In [57]:
queryBase("""SELECT * FROM users_range_part_2016""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts


In [58]:
queryBase("""SELECT * FROM users_range_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Mickey,Mouse,mickey@mouse.com,False,,U,False,2017-06-22,2022-11-13 03:16:58.713794
1,1,Scott,Tiger,scott@tiger.com,False,,U,False,2018-10-01,2022-11-13 03:16:58.713794
2,2,Donald,Duck,donald@duck.com,False,,U,False,2019-02-10,2022-11-13 03:16:58.713794


In [60]:
queryTable("""ALTER TABLE users_range_part
    DETACH PARTITION users_range_part_2016;
    ALTER TABLE users_range_part
    DETACH PARTITION users_range_part_2017;
    ALTER TABLE users_range_part
    DETACH PARTITION users_range_part_2018;
    ALTER TABLE users_range_part
    DETACH PARTITION users_range_part_2019""")

In [83]:
import pandas as pd
from pandas.tseries.offsets import MonthBegin, MonthEnd

months = pd.date_range(start='1/1/2016', end='3/31/2022', freq='1M')

for month in months:
    begin_date = month - MonthBegin(1)
    end_date = month + MonthEnd(0)
    print(str(month)[:7].replace('-', ''), end=':')
    print(str(begin_date).split(' ')[0], end=':')
    print(str(end_date).split(' ')[0])

201601:2016-01-01:2016-01-31
201602:2016-02-01:2016-02-29
201603:2016-03-01:2016-03-31
201604:2016-04-01:2016-04-30
201605:2016-05-01:2016-05-31
201606:2016-06-01:2016-06-30
201607:2016-07-01:2016-07-31
201608:2016-08-01:2016-08-31
201609:2016-09-01:2016-09-30
201610:2016-10-01:2016-10-31
201611:2016-11-01:2016-11-30
201612:2016-12-01:2016-12-31
201701:2017-01-01:2017-01-31
201702:2017-02-01:2017-02-28
201703:2017-03-01:2017-03-31
201704:2017-04-01:2017-04-30
201705:2017-05-01:2017-05-31
201706:2017-06-01:2017-06-30
201707:2017-07-01:2017-07-31
201708:2017-08-01:2017-08-31
201709:2017-09-01:2017-09-30
201710:2017-10-01:2017-10-31
201711:2017-11-01:2017-11-30
201712:2017-12-01:2017-12-31
201801:2018-01-01:2018-01-31
201802:2018-02-01:2018-02-28
201803:2018-03-01:2018-03-31
201804:2018-04-01:2018-04-30
201805:2018-05-01:2018-05-31
201806:2018-06-01:2018-06-30
201807:2018-07-01:2018-07-31
201808:2018-08-01:2018-08-31
201809:2018-09-01:2018-09-30
201810:2018-10-01:2018-10-31
201811:2018-11

In [63]:
MonthBegin(1)

<MonthBegin>

In [65]:
MonthBegin(10)

<10 * MonthBegins>

In [69]:
MonthEnd(10) + months[0]

Timestamp('2016-11-30 00:00:00', freq='M')

In [62]:
months

DatetimeIndex(['2016-01-31', '2016-02-29', '2016-03-31'], dtype='datetime64[ns]', freq='M')

In [75]:
for month in months:
    begin_date = month - MonthBegin(1)
    end_date = month + MonthEnd(0)
    print(f'Adding partition for {begin_date} and {end_date}')
    yyyymm=str(month)[:7].replace('-', '')
    begin_date=str(begin_date).split(' ')[0]
    end_date=str(end_date).split(' ')[0]
    query = f'''
            CREATE TABLE users_range_part_{yyyymm}
            PARTITION OF users_range_part
            FOR VALUES FROM ('{begin_date}') TO ('{end_date}')
        ''' 

    print(query)

Adding partition for 2016-01-01 00:00:00 and 2016-01-31 00:00:00

            CREATE TABLE users_range_part_201601
            PARTITION OF users_range_part
            FOR VALUES FROM ('2016-01-01') TO ('2016-01-31')
        
Adding partition for 2016-02-01 00:00:00 and 2016-02-29 00:00:00

            CREATE TABLE users_range_part_201602
            PARTITION OF users_range_part
            FOR VALUES FROM ('2016-02-01') TO ('2016-02-29')
        
Adding partition for 2016-03-01 00:00:00 and 2016-03-31 00:00:00

            CREATE TABLE users_range_part_201603
            PARTITION OF users_range_part
            FOR VALUES FROM ('2016-03-01') TO ('2016-03-31')
        


In [87]:
queryTable("""ALTER TABLE users_range_part
                DETACH PARTITION users_range_part_flt""")

In [88]:
for month in months:
    begin_date = month - MonthBegin(1)
    end_date = month + MonthEnd(0)
    print(f'Adding partition for {begin_date} and {end_date}')
    yyyymm=str(month)[:7].replace('-', '')
    begin_date=str(begin_date).split(' ')[0]
    end_date=str(end_date).split(' ')[0]
    query = f'''
            CREATE TABLE users_range_part_{yyyymm}
            PARTITION OF users_range_part
            FOR VALUES FROM ('{begin_date}') TO ('{end_date}')
        ''' 
    queryTable(f"""DROP TABLE IF EXISTS users_range_part_{yyyymm}""")
    queryTable(query)

Adding partition for 2016-01-01 00:00:00 and 2016-01-31 00:00:00
Adding partition for 2016-02-01 00:00:00 and 2016-02-29 00:00:00
Adding partition for 2016-03-01 00:00:00 and 2016-03-31 00:00:00
Adding partition for 2016-04-01 00:00:00 and 2016-04-30 00:00:00
Adding partition for 2016-05-01 00:00:00 and 2016-05-31 00:00:00
Adding partition for 2016-06-01 00:00:00 and 2016-06-30 00:00:00
Adding partition for 2016-07-01 00:00:00 and 2016-07-31 00:00:00
Adding partition for 2016-08-01 00:00:00 and 2016-08-31 00:00:00
Adding partition for 2016-09-01 00:00:00 and 2016-09-30 00:00:00
Adding partition for 2016-10-01 00:00:00 and 2016-10-31 00:00:00
Adding partition for 2016-11-01 00:00:00 and 2016-11-30 00:00:00
Adding partition for 2016-12-01 00:00:00 and 2016-12-31 00:00:00
Adding partition for 2017-01-01 00:00:00 and 2017-01-31 00:00:00
Adding partition for 2017-02-01 00:00:00 and 2017-02-28 00:00:00
Adding partition for 2017-03-01 00:00:00 and 2017-03-31 00:00:00
Adding partition for 2017

In [89]:
queryTable("""INSERT INTO users_range_part
SELECT * FROM users_range_part_2016;
INSERT INTO users_range_part
SELECT * FROM users_range_part_2017;
INSERT INTO users_range_part
SELECT * FROM users_range_part_2018;
INSERT INTO users_range_part
SELECT * FROM users_range_part_2019
""")

In [90]:
queryBase("""SELECT * FROM users_range_part_201706""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Mickey,Mouse,mickey@mouse.com,False,,U,False,2017-06-22,2022-11-13 03:16:58.713794


In [91]:
queryBase("""SELECT table_catalog, 
    table_schema, 
    table_name FROM information_schema.tables
WHERE table_name ~ 'users_range_part_'
ORDER BY table_name""")

Unnamed: 0,table_catalog,table_schema,table_name
0,partition_training,public,users_range_part_2016
1,partition_training,public,users_range_part_201601
2,partition_training,public,users_range_part_201602
3,partition_training,public,users_range_part_201603
4,partition_training,public,users_range_part_201604
...,...,...,...
75,partition_training,public,users_range_part_202112
76,partition_training,public,users_range_part_202201
77,partition_training,public,users_range_part_202202
78,partition_training,public,users_range_part_202203


Here are the steps involved in creating table using hash partitioning strategy.

* Create table using PARTITION BY HASH
* Add default and remainder specific partitions based up on modulus.
* Validate by inserting data into the table

We can detach as well as drop the partitions from the table.

Hash partitioning is typically done on sparse columns such as user_id

In [92]:
queryTable("""CREATE TABLE users_hash_part (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', --U and A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (user_id)
) PARTITION BY HASH(user_id)""")

In [93]:
queryTable("""CREATE TABLE users_hash_part_default
PARTITION OF users_hash_part DEFAULT""")

a hash-partitioned table may not have a default partition



In [94]:
queryTable("""CREATE TABLE users_hash_part_0_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 0);
            CREATE TABLE users_hash_part_1_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 1);
            CREATE TABLE users_hash_part_2_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 2);
            CREATE TABLE users_hash_part_3_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 3);
            CREATE TABLE users_hash_part_4_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 4);
            CREATE TABLE users_hash_part_5_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 5);
            CREATE TABLE users_hash_part_6_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 6);
            CREATE TABLE users_hash_part_7_of_8
                    PARTITION OF users_hash_part
                        FOR VALUES WITH (modulus 8, remainder 7);""")

In [95]:
#When you insert there is no need to care about the partitions, the table takes care
queryTable("""INSERT INTO users_hash_part
    (user_first_name, user_last_name, user_email_id, created_dt)
VALUES 
    ('Scott', 'Tiger', 'scott@tiger.com', '2018-10-01'),
    ('Donald', 'Duck', 'donald@duck.com', '2019-02-10'),
    ('Mickey', 'Mouse', 'mickey@mouse.com', '2017-06-22')""")

In [96]:
queryBase("""SELECT * FROM users_hash_part""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,1,Scott,Tiger,scott@tiger.com,False,,U,False,2018-10-01,2022-11-13 03:59:10.361660
1,3,Mickey,Mouse,mickey@mouse.com,False,,U,False,2017-06-22,2022-11-13 03:59:10.361660
2,2,Donald,Duck,donald@duck.com,False,,U,False,2019-02-10,2022-11-13 03:59:10.361660


In [97]:
queryBase("""SELECT * FROM users_hash_part_0_of_8""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,1,Scott,Tiger,scott@tiger.com,False,,U,False,2018-10-01,2022-11-13 03:59:10.361660


In [98]:
queryBase("""SELECT * FROM users_hash_part_1_of_8""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,3,Mickey,Mouse,mickey@mouse.com,False,,U,False,2017-06-22,2022-11-13 03:59:10.361660


In [99]:
queryBase("""SELECT * FROM users_hash_part_2_of_8""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts
0,2,Donald,Duck,donald@duck.com,False,,U,False,2019-02-10,2022-11-13 03:59:10.361660


In [100]:
queryBase("""SELECT * FROM users_hash_part_3_of_8""")

Unnamed: 0,user_id,user_first_name,user_last_name,user_email_id,user_email_validated,user_password,user_role,is_active,created_dt,last_updated_ts


## Sub Partitioning

We can have sub partitions created with different permutations and combinations. Sub Partitioning is also known as nested partitioning.
* List - List
* List - Range
and others.

In [101]:
queryTable("""CREATE TABLE users_qtly (
    user_id SERIAL,
    user_first_name VARCHAR(30) NOT NULL,
    user_last_name VARCHAR(30) NOT NULL,
    user_email_id VARCHAR(50) NOT NULL,
    user_email_validated BOOLEAN DEFAULT FALSE,
    user_password VARCHAR(200),
    user_role VARCHAR(1) NOT NULL DEFAULT 'U', --U and A
    is_active BOOLEAN DEFAULT FALSE,
    created_dt DATE DEFAULT CURRENT_DATE,
    created_year INT,
    created_mnth INT,
    last_updated_ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (created_year, created_mnth, user_id)
) PARTITION BY LIST(created_year)""")

In [102]:
#The first partition will be partitioned again
queryTable("""CREATE TABLE users_qtly_2016
                PARTITION OF users_qtly
                    FOR VALUES IN (2016)
                        PARTITION BY LIST (created_mnth)""")

In [102]:
#The first partition will be partitioned again
queryTable("""CREATE TABLE users_qtly_2016
                PARTITION OF users_qtly
                    FOR VALUES IN (2016)
                        PARTITION BY LIST (created_mnth)""")

In [103]:
#The first partition will be partitioned again, based on the month number. This will be used 
#in the exercise
queryTable("""CREATE TABLE users_qtly_2016q1
                    PARTITION OF users_qtly_2016
                        FOR VALUES IN (1, 2, 3)""")

### List - Range Partitioning

Let us understand how we can create table using list - Range sub partitioning using same example as before (partitioning by year and then by quarter).
* Create table with `PARTITION BY LIST` with `created_year`.
* Create tables for yearly partitions with `PARTITION BY RANGE` with `created_month`.
* Create tables for quarterly partitions with the range of values using `FOR VALUES FROM (lower_bound) TO (upper_bound)`.

In [104]:
#The first partition will be partitioned again, based on the month number. This will be used 
#in the exercise
queryTable("""CREATE TABLE users_qtly_2016
                    PARTITION OF users_qtly
                        FOR VALUES IN (2016)
                            PARTITION BY RANGE (created_mnth)""")

relation "users_qtly_2016" already exists



In [105]:
#The first partition will be partitioned again, based on the month number. This will be used 
#in the exercise
queryTable("""CREATE TABLE users_qtly_2016q1
                    PARTITION OF users_qtly_2016
                            FOR VALUES FROM (1) TO (3)""")

relation "users_qtly_2016q1" already exists

