# 1.1_data_modeling_postgres_normalization
<img src="https://upload.wikimedia.org/wikipedia/commons/2/29/Postgresql_elephant.svg" width="100" height="100">

The goal of the demo is to maintain data integrity and reduce data redundancy. The exercises don't have tables with composite primary keys, so there are no true partial dependencies (2NF violations); the sample data also contains no true transitive dependencies (3NF violations).

In [1]:
# Import libraries
import psycopg2
from dotenv import load_dotenv
import os

# Load environment variables from the .env file.
# The path is passed explicitly so the intended file is the one that gets
# loaded; previously `dotenv_path` was assigned but never used.
dotenv_path = "../.env"
if not load_dotenv(dotenv_path):
    # Nothing was loaded from the explicit path: fall back to python-dotenv's
    # default search so setups that relied on it keep working.
    load_dotenv()


# Retrieve database connection details from the .env file
user = os.getenv("postgres_username")
password = os.getenv("postgres_password")

In [2]:
# Connect to database
# Connection parameters are passed as keyword arguments instead of being
# interpolated into a DSN string, so credentials containing spaces or quotes
# cannot corrupt the connection string.
try:
    conn = psycopg2.connect(
        host="127.0.0.1",
        dbname="studentdb",
        user=user,
        password=password,
    )
except psycopg2.Error as e:
    print("Error: Could not make connection to the database")
    print(e)
    # Without a connection every later statement fails with a confusing
    # NameError, so surface the real cause here.
    raise

# Get a cursor
try:
    cur = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get cursor to the database")
    print(e)
    raise

# Set automatic commit to be true
conn.set_session(autocommit=True)

# Create a new db, with this table, insert the data, and validate it
<img src="images/1.1_data_modeling_postgres_normalization_1.jpg" width="30%">

In [3]:
# CREATE DATABASE
# NOTE(review): the connection opened earlier targets `studentdb` and is not
# switched afterwards, so the table below appears to be created there rather
# than in `music` -- confirm whether reconnecting to `music` is intended.
try:
    cur.execute("CREATE DATABASE music")
except psycopg2.Error as e:
    print(e)

# CREATE TABLE
# Deliberately un-normalized: `albums_purchased` holds a list of albums.
try:
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS music_store (
            transaction_id int,
            customer_name varchar,
            cashier_name varchar,
            year int,
            albums_purchased text[]
        );
        """
    )
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

# INSERT INTO
music_store_insert = """
    INSERT INTO music_store
        (transaction_id, customer_name, cashier_name, year, albums_purchased)
    VALUES (%s, %s, %s, %s, %s)
"""
music_store_rows = [
    (1, "Amanda", "Sam", 2000, ["Rubber Soul", "Let it Be"]),
    (2, "Toby", "Sam", 2000, ["My Generation"]),
    (3, "Max", "Bob", 2018, ["Meet the Beatles", "Help!"]),
]
# Each row is inserted in its own try block so one failing row is reported
# without preventing the remaining rows from being inserted.
for record in music_store_rows:
    try:
        cur.execute(music_store_insert, record)
    except psycopg2.Error as e:
        print("Error: Inserting Rows")
        print(e)

# SELECT
try:
    cur.execute("SELECT * FROM music_store;")
except psycopg2.Error as e:
    print("Error: select *")
    print(e)
else:
    # Only read results when the query succeeded; fetching after a failed
    # execute would raise a second, uncaught error.
    for row in cur:
        print(row)

database "music" already exists

(1, 'Amanda', 'Sam', 2000, ['Rubber Soul', 'Let it Be'])
(2, 'Toby', 'Sam', 2000, ['My Generation'])
(3, 'Max', 'Bob', 2018, ['Meet the Beatles', 'Help!'])


# Moving to 1st Normal Form (1NF)
This data has not been normalized.  
To get this data into 1NF, remove any collections or lists of data and break up the list of albums into individual rows.

<img src="images/1.1_data_modeling_postgres_normalization_2.jpg" width="30%">

In [4]:
# CREATE TABLE
# 1NF version of the table: `albums_purchased` holds a single album per row.
try:
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS music_store2 (
            transaction_id int,
            customer_name varchar,
            cashier_name varchar,
            year int,
            albums_purchased text
        );
        """
    )
except psycopg2.Error as e:
    print("Error: Issue creating table")
    print(e)

# INSERT INTO
music_store2_insert = """
    INSERT INTO music_store2
        (transaction_id, customer_name, cashier_name, year, albums_purchased)
    VALUES (%s, %s, %s, %s, %s)
"""
# One row per purchased album; transactions with several albums repeat the
# transaction, customer, cashier and year values.
music_store2_rows = [
    (1, "Amanda", "Sam", 2000, "Rubber Soul"),
    (1, "Amanda", "Sam", 2000, "Let it Be"),
    (2, "Toby", "Sam", 2000, "My Generation"),
    (3, "Max", "Bob", 2018, "Help!"),
    (3, "Max", "Bob", 2018, "Meet the Beatles"),
]
# Each row is inserted in its own try block so one failing row is reported
# without preventing the remaining rows from being inserted.
for record in music_store2_rows:
    try:
        cur.execute(music_store2_insert, record)
    except psycopg2.Error as e:
        print("Error: Inserting Rows")
        print(e)

# SELECT
try:
    cur.execute("SELECT * FROM music_store2;")
except psycopg2.Error as e:
    print("Error: select *")
    print(e)
else:
    # Only read results when the query succeeded; fetching after a failed
    # execute would raise a second, uncaught error.
    for row in cur:
        print(row)

(1, 'Amanda', 'Sam', 2000, 'Rubber Soul')
(1, 'Amanda', 'Sam', 2000, 'Let it Be')
(2, 'Toby', 'Sam', 2000, 'My Generation')
(3, 'Max', 'Bob', 2018, 'Help!')
(3, 'Max', 'Bob', 2018, 'Meet the Beatles')


# Moving to 2nd Normal Form (2NF)
- 1st step: Moving the data into 1NF. Done. Every record in the table is now unique.

<img src="images/1.1_data_modeling_postgres_normalization_2.jpg" width="30%">

- 2NF violation: partial dependencies = when a column depends on part of a composite primary key instead of the whole key. 
- This exercise can't teach partial dependence, because it has a simple primary key, so we're only removing a column with redundant info: `albums purchased` get their own table.

<img src="images/1.1_data_modeling_postgres_normalization_3.jpg" width="20%">

In [5]:
# CREATE TABLE
# The album list is split out of the transaction record: `transactions` keeps
# one row per transaction and `albums_sold` one row per album, linked by
# `transaction_id`.
create_statements = (
    """
    CREATE TABLE IF NOT EXISTS transactions (
        transaction_id int,
        customer_name varchar,
        cashier_name varchar,
        year int
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS albums_sold (
        album_id int,
        transaction_id int,
        album_name varchar
    );
    """,
)
for statement in create_statements:
    try:
        cur.execute(statement)
    except psycopg2.Error as e:
        print("Error: Issue creating table")
        print(e)

# INSERT INTO
transactions_insert = """
    INSERT INTO transactions (transaction_id, customer_name, cashier_name, year)
    VALUES (%s, %s, %s, %s)
"""
transactions_rows = [
    (1, "Amanda", "Sam", 2000),
    (2, "Toby", "Sam", 2000),
    (3, "Max", "Bob", 2018),
]

albums_sold_insert = """
    INSERT INTO albums_sold (album_id, transaction_id, album_name)
    VALUES (%s, %s, %s)
"""
albums_sold_rows = [
    (1, 1, "Rubber Soul"),
    (2, 1, "Let it Be"),
    (3, 2, "My Generation"),
    (4, 3, "Meet the Beatles"),
    (5, 3, "Help!"),
]

# Each row is inserted in its own try block so one failing row is reported
# without preventing the remaining rows from being inserted.
for statement, records in (
    (transactions_insert, transactions_rows),
    (albums_sold_insert, albums_sold_rows),
):
    for record in records:
        try:
            cur.execute(statement, record)
        except psycopg2.Error as e:
            print("Error: Inserting Rows")
            print(e)

# SELECT
for header, query in (
    ("Table: transactions\n", "SELECT * FROM transactions;"),
    ("\nTable: albums_sold\n", "SELECT * FROM albums_sold;"),
):
    print(header)
    try:
        cur.execute(query)
    except psycopg2.Error as e:
        print("Error: select *")
        print(e)
    else:
        # Only read results when the query succeeded; fetching after a
        # failed execute would raise a second, uncaught error.
        for row in cur:
            print(row)

Table: transactions

(1, 'Amanda', 'Sam', 2000)
(2, 'Toby', 'Sam', 2000)
(3, 'Max', 'Bob', 2018)

Table: albums_sold

(1, 1, 'Rubber Soul')
(2, 1, 'Let it Be')
(3, 2, 'My Generation')
(4, 3, 'Meet the Beatles')
(5, 3, 'Help!')


`JOIN` the `transactions` and `albums_sold` tables to get all the information as in the original first table `music_store`.

In [6]:
# Join the two tables on `transaction_id` to get the transaction details
# alongside each album sold.
try:
    cur.execute(
        "SELECT * FROM transactions JOIN albums_sold "
        "ON transactions.transaction_id = albums_sold.transaction_id ;"
    )
except psycopg2.Error as e:
    print("Error: select *")
    print(e)
else:
    # Only read results when the query succeeded; fetching after a failed
    # execute would raise a second, uncaught error.
    for row in cur:
        print(row)

(1, 'Amanda', 'Sam', 2000, 1, 1, 'Rubber Soul')
(1, 'Amanda', 'Sam', 2000, 2, 1, 'Let it Be')
(2, 'Toby', 'Sam', 2000, 3, 2, 'My Generation')
(3, 'Max', 'Bob', 2018, 4, 3, 'Meet the Beatles')
(3, 'Max', 'Bob', 2018, 5, 3, 'Help!')


# Moving to 3rd Normal Form (3NF)
- 1NF. Done. Every record in the table is unique.
- 2NF. Done. There are no partial dependencies. 

<img src="images/1.1_data_modeling_postgres_normalization_3.jpg" width="20%">

- 3NF: eliminate transitive dependency = when a non-key column doesn’t directly depend on the primary key but on another non-key column that does.
- This exercise can't really demo a transitive dependency: all the non-key columns directly depend on the primary key.  
So we're only removing a column with redundant info, meaning `cashiers/employees` get their own table.

<img src="images/1.1_data_modeling_postgres_normalization_4.jpg" width="20%">

In [7]:
# CREATE TABLE
# Cashier names move to their own `employees` table; `transactions2` refers
# to a cashier through `cashier_id` instead of repeating the name.
create_statements = (
    """
    CREATE TABLE IF NOT EXISTS transactions2 (
        transaction_id int,
        customer_name varchar,
        cashier_id int,
        year int
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS employees (
        employee_id int,
        employee_name varchar
    );
    """,
)
for statement in create_statements:
    try:
        cur.execute(statement)
    except psycopg2.Error as e:
        print("Error: Issue creating table")
        print(e)

# INSERT INTO
transactions2_insert = """
    INSERT INTO transactions2 (transaction_id, customer_name, cashier_id, year)
    VALUES (%s, %s, %s, %s)
"""
transactions2_rows = [
    (1, "Amanda", 1, 2000),
    (2, "Toby", 1, 2000),
    (3, "Max", 2, 2018),
]

employees_insert = """
    INSERT INTO employees (employee_id, employee_name)
    VALUES (%s, %s)
"""
employees_rows = [
    (1, "Sam"),
    (2, "Bob"),
]

# Each row is inserted in its own try block so one failing row is reported
# without preventing the remaining rows from being inserted.
for statement, records in (
    (transactions2_insert, transactions2_rows),
    (employees_insert, employees_rows),
):
    for record in records:
        try:
            cur.execute(statement, record)
        except psycopg2.Error as e:
            print("Error: Inserting Rows")
            print(e)

# SELECT
# `albums_sold` is not created in this cell; it is queried here so all three
# tables are printed together.
for header, query in (
    ("Table: transactions2\n", "SELECT * FROM transactions2;"),
    ("\nTable: albums_sold\n", "SELECT * FROM albums_sold;"),
    ("\nTable: employees\n", "SELECT * FROM employees;"),
):
    print(header)
    try:
        cur.execute(query)
    except psycopg2.Error as e:
        print("Error: select *")
        print(e)
    else:
        # Only read results when the query succeeded; fetching after a
        # failed execute would raise a second, uncaught error.
        for row in cur:
            print(row)

Table: transactions2

(1, 'Amanda', 1, 2000)
(2, 'Toby', 1, 2000)
(3, 'Max', 2, 2018)

Table: albums_sold

(1, 1, 'Rubber Soul')
(2, 1, 'Let it Be')
(3, 2, 'My Generation')
(4, 3, 'Meet the Beatles')
(5, 3, 'Help!')

Table: employees

(1, 'Sam')
(2, 'Bob')


`JOIN` the `transactions2, albums_sold, employees` tables to get all the information as in the original first table `music_store`.

In [8]:
# Join all three tables: transactions to their albums on `transaction_id`,
# then to the cashier's employee record on `cashier_id` = `employee_id`.
try:
    cur.execute(
        """
        SELECT *
        FROM (transactions2
              JOIN albums_sold
                ON transactions2.transaction_id = albums_sold.transaction_id)
        JOIN employees
          ON transactions2.cashier_id = employees.employee_id;
        """
    )
except psycopg2.Error as e:
    print("Error: select *")
    print(e)
else:
    # Only read results when the query succeeded; fetching after a failed
    # execute would raise a second, uncaught error.
    for row in cur:
        print(row)

(1, 'Amanda', 1, 2000, 1, 1, 'Rubber Soul', 1, 'Sam')
(1, 'Amanda', 1, 2000, 2, 1, 'Let it Be', 1, 'Sam')
(2, 'Toby', 1, 2000, 3, 2, 'My Generation', 1, 'Sam')
(3, 'Max', 2, 2018, 4, 3, 'Meet the Beatles', 2, 'Bob')
(3, 'Max', 2, 2018, 5, 3, 'Help!', 2, 'Bob')


If the output for the above cell is:

`1, 'Amanda', 1, 2000, 1, 1, 'Rubber Soul', 1, 'Sam'`  
`1, 'Amanda', 1, 2000, 2, 1, 'Let it Be', 1, 'Sam'`  
`2, 'Toby', 1, 2000, 3, 2, 'My Generation', 1, 'Sam'`  
`3, 'Max', 2, 2018, 4, 3, 'Meet the Beatles', 2, 'Bob'`  
`3, 'Max', 2, 2018, 5, 3, 'Help!', 2, 'Bob'`

the dataset is now normalized. Drop the tables, close the cursor and connection.

In [9]:
# DROP TABLE
# Every table created by this notebook is dropped in its own try block, so a
# failure on one table is reported and the remaining drops still run.
for table_name in (
    "music_store",
    "music_store2",
    "albums_sold",
    "employees",
    "transactions",
    "transactions2",
):
    try:
        cur.execute(f"DROP table {table_name}")
    except psycopg2.Error as e:
        print("Error: Dropping table")
        print(e)

In [10]:
# cursor, connection close
# The cursor is closed before the connection it belongs to; a psycopg2 error
# from either close is printed rather than propagated.
try:
    for handle in (cur, conn):
        handle.close()
except psycopg2.Error as e:
    print(e)