# Demo : Database Denormalization

This notebook uses pandas (optional) only to display query results as tables.  
If needed, follow the [pandas installation instructions](https://pandas.pydata.org/docs/getting_started/install.html).

In [None]:
import pandas as pd

Import the PostgreSQL client library and open a connection.

In [None]:
import os

import psycopg2

# Connection settings for the demo database. Each value can be overridden with
# the standard libpq environment variables (PGHOST, PGDATABASE, PGUSER,
# PGPASSWORD) so credentials do not have to be edited into the notebook.
# NOTE(review): the fallbacks are the original demo credentials; remove them
# (and rotate the password) before sharing this notebook outside the course.
db_settings = {
    "host": os.environ.get("PGHOST", "34.101.229.192"),
    "dbname": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "CourseDE888"),
}

try:
    conn = psycopg2.connect(**db_settings)
    # Autocommit: each statement executed below is committed immediately, so
    # the DDL/INSERT cells need no explicit conn.commit().
    conn.set_session(autocommit=True)

    cur = conn.cursor()
except Exception as e:
    print("Error: cannot connect to the database or open a cursor for SQL interaction")
    print(e)
    # Re-raise so the notebook stops here; otherwise `conn`/`cur` stay
    # undefined and every later cell fails with an unrelated NameError.
    raise

### What if we want several reports?

**1st Report** is team details, including:
  - team name
  - team origin
  - is hero
  - headquarter name
  - headquarter location
  
----  
  
**2nd Report** is member details, including:
  - team name
  - team origin
  - is hero
  - member name
  - member real name
  - member alias
  - member homeland name
  - member superpower

This is the 1st report, queried from the normalized tables.

In [None]:
# 1st report from the normalized schema: each team joined to its headquarters.
sql = """
    SELECT t.team_name, t.origin, t.is_hero, hq.name, hq.location
    FROM teams t    
    INNER JOIN headquarters hq ON
        t.team_id = hq.team_id
    """

cur.execute(sql)
team_rows = cur.fetchall()

# Plain tuple output, one line per result row.
for team_row in team_rows:
    print(team_row)

This is the 2nd report, queried from the normalized tables.

In [None]:
# 2nd report from the normalized schema: four joins are needed to attach the
# team, superpower and homeland to every member row.
sql = """
        SELECT t.team_name, t.origin, t.is_hero, m.name, m.real_name, m.alias, h.name, s.name
        FROM teams t
        INNER JOIN members m ON
            m.team_id = t.team_id
        INNER JOIN member_superpowers ms ON
            m.member_id = ms.member_id
        INNER JOIN superpowers s ON
            s.superpower_id = ms.superpower_id
        INNER JOIN homelands h ON
            h.homeland_id = m.homeland_id
        ORDER BY t.team_name, m.name
    """

cur.execute(sql)
member_rows = cur.fetchall()

# Plain tuple output, one line per result row.
for member_row in member_rows:
    print(member_row)

### Denormalize table for 1st report

<img src="img/postgresql-denormalization-diagram-01.png" align="left"/>

Create table

In [None]:
# Rebuild the single-table report from scratch: drop any previous copy, then
# create the table and a secondary index on team_name.
drop_team_reports = "DROP TABLE IF EXISTS team_reports"

create_team_reports = """
    CREATE TABLE IF NOT EXISTS team_reports(
        team_id integer NOT NULL,
        team_name varchar,
        team_origin varchar,
        is_hero boolean,
        headquarter_name varchar,
        headquarter_location varchar
    );
"""

index_team_reports_by_name = """
    CREATE INDEX IF NOT EXISTS team_reports_team_name_idx 
        ON team_reports (team_name)
"""

# Order matters: the table must exist before the index can be created on it.
for ddl_statement in (drop_team_reports, create_team_reports, index_team_reports_by_name):
    cur.execute(ddl_statement)

Insert data into table

In [None]:
# Empty the report table first so that re-running this cell does not append a
# second copy of every row (team_reports has no unique constraint to stop it).
cur.execute("TRUNCATE TABLE team_reports")

# Flatten teams + headquarters into one report row per joined pair.
cur.execute("""
    INSERT INTO team_reports (
        team_id, team_name, team_origin, is_hero, headquarter_name, headquarter_location) 
    (
        SELECT t.team_id, t.team_name, t.origin,
               t.is_hero, hq.name, hq.location
        FROM teams t
        INNER JOIN headquarters hq ON t.team_id = hq.team_id
    );
""")

### Denormalize table for 2nd report

<img src="img/postgresql-denormalization-diagram-02.png" align="left"/>

Create table

In [None]:
# Rebuild the two-table member report from scratch. The detail table is
# dropped first because it holds a foreign key to member_team_reports.
drop_member_detail_reports = "DROP TABLE IF EXISTS member_detail_reports CASCADE"
drop_member_team_reports = "DROP TABLE IF EXISTS member_team_reports CASCADE"

# Parent table: one row per team.
create_member_team_reports = """
    CREATE TABLE IF NOT EXISTS member_team_reports(
        team_id integer PRIMARY KEY,
        team_name varchar,
        team_origin varchar,
        is_hero boolean
    );
"""

# Child table: member attributes, linked back to the team row by team_id.
create_member_detail_reports = """
    CREATE TABLE IF NOT EXISTS member_detail_reports(
        member_id integer NOT NULL,
        team_id integer REFERENCES member_team_reports(team_id),
        member_name varchar,
        member_real_name varchar,
        member_alias varchar,
        member_homeland varchar,
        member_superpower varchar
    );
"""

index_member_team_reports_by_name = """
    CREATE INDEX IF NOT EXISTS member_team_reports_team_name_idx 
        ON member_team_reports (team_name)
"""

index_member_detail_reports_by_name = """
    CREATE INDEX IF NOT EXISTS member_detail_reports_member_name_idx 
        ON member_detail_reports (member_name)
"""

# Same execution order as listed: drops, parent table, child table, indexes.
member_report_ddl = (
    drop_member_detail_reports,
    drop_member_team_reports,
    create_member_team_reports,
    create_member_detail_reports,
    index_member_team_reports_by_name,
    index_member_detail_reports_by_name,
)

for ddl_statement in member_report_ddl:
    cur.execute(ddl_statement)

Insert data into table

In [None]:
# Empty both report tables first so this cell can be re-run: without this, a
# second run fails on the member_team_reports primary key and/or duplicates
# every row in member_detail_reports. Both tables are truncated in a single
# statement because member_detail_reports references member_team_reports.
cur.execute("TRUNCATE TABLE member_detail_reports, member_team_reports")

# Parent rows first: the detail rows reference them through team_id.
cur.execute("""
    INSERT INTO member_team_reports (
        team_id, team_name, team_origin, is_hero) 
        (
            SELECT t.team_id, t.team_name, t.origin, t.is_hero
            FROM teams t
        );
""")

# One detail row per (member, superpower) pair, with the homeland and
# superpower names copied in so the report needs no lookup joins.
cur.execute("""
    INSERT INTO member_detail_reports (
        member_id, team_id, member_name, member_real_name, 
        member_alias, member_homeland, member_superpower) 
        (
            SELECT m.member_id, m.team_id, m.name, 
                   m.real_name, m.alias, h.name, s.name
            FROM members m
            INNER JOIN member_superpowers ms ON
                m.member_id = ms.member_id
            INNER JOIN superpowers s ON
                s.superpower_id = ms.superpower_id
            INNER JOIN homelands h ON
                h.homeland_id = m.homeland_id
        );
""")

### Now we can select from the denormalized tables with minimal joins.

1st report (from one table). pandas is used only for a neater display; it is not mandatory.

In [None]:
# 1st report: a single-table read, no joins.
cur.execute("SELECT * FROM team_reports")

report_rows = cur.fetchall()
# The first field of each cursor.description entry is the column name.
report_columns = [column_meta[0] for column_meta in cur.description]

pd.DataFrame(report_rows, columns=report_columns)

2nd report (from two tables).

In [None]:
# 2nd report: one join between the two denormalized member tables.
member_report_query = """
    SELECT mtr.*, mdr.member_id, mdr.member_name, mdr.member_real_name, 
        mdr.member_alias, mdr.member_homeland, mdr.member_superpower 
      FROM member_team_reports mtr 
      INNER JOIN member_detail_reports mdr ON mtr.team_id = mdr.team_id 
"""
cur.execute(member_report_query)

report_rows = cur.fetchall()
# The first field of each cursor.description entry is the column name.
report_columns = [column_meta[0] for column_meta in cur.description]

pd.DataFrame(report_rows, columns=report_columns)

Fewer tables make aggregation easier.

In [None]:
# Distinct members per team origin. DISTINCT is needed because a member_id can
# appear on several member_detail_reports rows.
members_per_origin_query = """
    SELECT mtr.team_origin, count(DISTINCT mdr.member_id) members_count
      FROM member_team_reports mtr 
      INNER JOIN member_detail_reports mdr ON mtr.team_id = mdr.team_id 
      GROUP BY mtr.team_origin 
      ORDER BY members_count DESC, team_origin
"""
cur.execute(members_per_origin_query)

aggregate_rows = cur.fetchall()
# The first field of each cursor.description entry is the column name.
aggregate_columns = [column_meta[0] for column_meta in cur.description]

pd.DataFrame(aggregate_rows, columns=aggregate_columns)

Another aggregation example.

In [None]:
# Per team, count the distinct members having each of two selected
# superpowers; the superpower name is matched case-insensitively via lower().
superpower_count_query = """
    SELECT mdr.member_superpower, mtr.team_name, mtr.team_origin, 
           count(DISTINCT mdr.member_id) superpower_count
      FROM member_team_reports mtr 
      INNER JOIN member_detail_reports mdr ON mtr.team_id = mdr.team_id
      WHERE lower(member_superpower) in ('flight', 'super strength')
      GROUP BY mtr.team_name, mtr.team_origin, mdr.member_superpower
      ORDER by superpower_count desc, team_name
"""
cur.execute(superpower_count_query)

aggregate_rows = cur.fetchall()
# The first field of each cursor.description entry is the column name.
aggregate_columns = [column_meta[0] for column_meta in cur.description]

pd.DataFrame(aggregate_rows, columns=aggregate_columns)

Duplicate `member_id` values are expected in the denormalized table: a member with several superpowers appears once per superpower.

In [None]:
# Show the raw detail rows ordered by member_id, so rows sharing a member_id
# sit next to each other.
cur.execute("SELECT * FROM member_detail_reports ORDER BY member_id")

detail_rows = cur.fetchall()
# The first field of each cursor.description entry is the column name.
detail_columns = [column_meta[0] for column_meta in cur.description]

pd.DataFrame(detail_rows, columns=detail_columns)