In [58]:
# Demo for connecting to a PostgreSQL database in python
# DB Browser, like the colab notebook can also be used to exploring and demonstration
# Step 1 - have the library psycopg2 installed
# Locally: pipenv install psycopg2_binary
!pip install psycopg2-binary # Want it to be already built




In [2]:
import psycopg2

In [59]:
dir(psycopg2)

['BINARY',
 'Binary',
 'DATETIME',
 'DataError',
 'DatabaseError',
 'Date',
 'DateFromTicks',
 'Error',
 'IntegrityError',
 'InterfaceError',
 'InternalError',
 'NUMBER',
 'NotSupportedError',
 'OperationalError',
 'ProgrammingError',
 'ROWID',
 'STRING',
 'Time',
 'TimeFromTicks',
 'Timestamp',
 'TimestampFromTicks',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__libpq_version__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_connect',
 '_ext',
 '_json',
 '_psycopg',
 '_range',
 'apilevel',
 'compat',
 'connect',
 'errors',
 'extensions',
 'paramstyle',
 'threadsafety',
 'tz']

In [60]:
help(psycopg2.connect)

Help on function connect in module psycopg2:

connect(dsn=None, connection_factory=None, cursor_factory=None, **kwargs)
    Create a new database connection.
    
    The connection parameters can be specified as a string:
    
        conn = psycopg2.connect("dbname=test user=postgres password=secret")
    
    or using a set of keyword arguments:
    
        conn = psycopg2.connect(database="test", user="postgres", password="secret")
    
    Or as a mix of both. The basic connection parameters are:
    
    - *dbname*: the database name
    - *database*: the database name (only as keyword argument)
    - *user*: user name used to authenticate
    - *password*: password used to authenticate
    - *host*: database host address (defaults to UNIX socket if not provided)
    - *port*: connection port number (defaults to 5432 if not provided)
    
    Using the *connection_factory* parameter a different class or connections
    factory can be specified. It should be a callable object tak

In [61]:
# Connection to postgreSQL needs more auth/host information to connect
# Note - sensitive and confidential information, should not be checked 
# into Git, more on how to handle them through the .env files
dbname ="omekiiqr"
user = "omekiiqr"  # ElephantSQL chooses to reuse dbname and username
password = "iWYlHjsvGl4EoiyazC1T_12vsKQUEAuL" # Don't share/commit - very sensitive
host = "ruby.db.elephantsql.com"  #  Port is 5432 by default

In [62]:
# If we make too many connections, the database complains
# Be sure to close cursors and connections
pg_connect = psycopg2.connect( dbname=dbname, user = user, password = password, host = host)

In [63]:
pg_connect

<connection object at 0x7f0367a3eb40; dsn: 'user=omekiiqr password=xxx dbname=omekiiqr host=ruby.db.elephantsql.com', closed: 0>

In [64]:
dir(pg_connect)

['DataError',
 'DatabaseError',
 'Error',
 'IntegrityError',
 'InterfaceError',
 'InternalError',
 'NotSupportedError',
 'OperationalError',
 'ProgrammingError',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'async',
 'async_',
 'autocommit',
 'binary_types',
 'cancel',
 'close',
 'closed',
 'commit',
 'cursor',
 'cursor_factory',
 'deferrable',
 'dsn',
 'encoding',
 'fileno',
 'get_backend_pid',
 'get_dsn_parameters',
 'get_native_connection',
 'get_parameter_status',
 'get_transaction_status',
 'info',
 'isexecuting',
 'isolation_level',
 'lobject',
 'notices',
 'notifies',
 'pgconn_ptr',
 'poll',
 'protocol_version',
 'readonly',
 'reset',
 'rollback',
 'server_version',
 'set_cli

In [65]:
pg_cursor = pg_connect.cursor() # Works in the same way with sqlite


In [66]:
help(pg_cursor.execute)

Help on built-in function execute:

execute(...) method of psycopg2.extensions.cursor instance
    execute(query, vars=None) -- Execute query with bound vars.



In [67]:
help(pg_cursor.executemany) # And more functionality

Help on built-in function executemany:

executemany(...) method of psycopg2.extensions.cursor instance
    executemany(query, vars_list) -- Execute many queries with bound vars.



In [68]:
# Connected but the db is empty
# We going to run an example to populate the database
create_table_statement = """
CREATE TABLE test_table (
  id SERIAL PRIMARY KEY,
  name varchar(40) NOT NULL,
  data JSONB
);
"""
# These types are PostgreSQL specific and will not work with SQLite

pg_cursor.execute(create_table_statement)
pg_connect.commit() # Save the changes by commiting

DuplicateTable: ignored

In [69]:
insert_statement = """
INSERT INTO test_table (name, data) VALUES
(
  'A row name',
  null
),
(
  'Another row, with JSON this time',
  '{ "a": 1, "b": ["dog", "cat", 42], "c": true }'::JSONB
)
"""
pg_cursor.execute(insert_statement)
pg_connect.commit()

InFailedSqlTransaction: ignored

In [70]:
query = "SELECT * FROM test_table"
pg_cursor.execute(query)

InFailedSqlTransaction: ignored

In [10]:
pg_cursor

<cursor object at 0x7f0367a3d9f8; closed: 0>

In [11]:
pg_cursor.fetchall()

[(1, 'A row name', None),
 (2,
  'Another row, with JSON this time',
  {'a': 1, 'b': ['dog', 'cat', 42], 'c': True}),
 (3, 'A row name', None),
 (4,
  'Another row, with JSON this time',
  {'a': 1, 'b': ['dog', 'cat', 42], 'c': True})]

In [41]:
pg_cursor.close()

In [42]:
pg_cursor = pg_connect.cursor()
pg_cursor.execute("INSERT INTO test_table (name, data) VALUES (null, null);")

InFailedSqlTransaction: ignored

In [None]:
# Database contraints are enforced and effected, this ensures good data quality
# Use the .close() to avoid database errors

ETL

Extract – Transform – Load

Extract - get the data fromt eh source
Load - insert data into the required destination
We would like to extract data from rpg_data sqlite 3 and load it into PostgreSQL

If we change so that it fits or change format (calculate/summarize) – that would be the transform step

We are making our first cloud "ETL"


In [12]:
# Getting the data
!wget https://github.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/blob/master/module1-introduction-to-sql/rpg_db.sqlite3?raw=true

--2020-08-11 19:54:22--  https://github.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/blob/master/module1-introduction-to-sql/rpg_db.sqlite3?raw=true
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/raw/master/module1-introduction-to-sql/rpg_db.sqlite3 [following]
--2020-08-11 19:54:23--  https://github.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/raw/master/module1-introduction-to-sql/rpg_db.sqlite3
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/master/module1-introduction-to-sql/rpg_db.sqlite3 [following]
--2020-08-11 19:54:23--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-3-Sprint-2-SQL-and-Databases/master/module1-in

In [13]:
# Renaming the database
!mv 'rpg_db.sqlite3?raw=true' rpg_db.sqlite3

In [14]:
!ls

rpg_db.sqlite3	sample_data


In [15]:
# Step 1 - Extract data from sqlite3

import os
import sqlite3

# construction of  a path to wherever the database exists
# DB_FILEPATH = "rpg_db.sqlite3.db"
# DB_FILEPATH = os.path.join(os.path.dirname(__file__), "rpg_db.sqlite3")

s1_connection = sqlite3.connect("rpg_db.sqlite3")
# print("CONNECTION:", connection)

s1_cursor = s1_connection.cursor()

In [16]:
# Our goal is to copy the charactercreator_character table
get_characters = "SELECT * FROM Charactercreator_character;"

In [17]:
characters = s1_cursor.execute(get_characters).fetchall()

In [18]:
len(characters)

302

In [19]:
characters[:5]

[(1, 'Aliquid iste optio reiciendi', 0, 0, 10, 1, 1, 1, 1),
 (2, 'Optio dolorem ex a', 0, 0, 10, 1, 1, 1, 1),
 (3, 'Minus c', 0, 0, 10, 1, 1, 1, 1),
 (4, 'Sit ut repr', 0, 0, 10, 1, 1, 1, 1),
 (5, 'At id recusandae expl', 0, 0, 10, 1, 1, 1, 1)]

In [None]:
# Step 1 is complete , we have a list of tuples with all our character data
# Note that this is not a pandas dataframe
# We do not know types so far, so we need to figure out that in the transform
# step

In [20]:
# Step 2 - Transform
# Our goal is to make a schema to define a table that fits this data in PostgreSQL
# Can we recheck old schema?
s1_cursor.execute("PRAGMA table_info(Charactercreator_character);").fetchall()

[(0, 'character_id', 'integer', 1, None, 1),
 (1, 'name', 'varchar(30)', 1, None, 0),
 (2, 'level', 'integer', 1, None, 0),
 (3, 'exp', 'integer', 1, None, 0),
 (4, 'hp', 'integer', 1, None, 0),
 (5, 'strength', 'integer', 1, None, 0),
 (6, 'intelligence', 'integer', 1, None, 0),
 (7, 'dexterity', 'integer', 1, None, 0),
 (8, 'wisdom', 'integer', 1, None, 0)]

In [21]:
# Need to make a create statement in PostgreSQL that captures the above types
create_character_table = """
CREATE TABLE Charactercreator_character(
character_id SERIAL PRIMARY KEY, 
name VARCHAR(30),
level INTEGER,
exp INTEGER,
hp INTEGER,
strength INTEGER,
intelligence INTEGER,
dexterity INTEGER,
wisdom INTEGER
);
"""

In [43]:
# Defining a function to refresh connection and cursor
def refresh_connection_and_cursor(connect, cursor):
  cursor.close()
  connect.close()
  pg_connect = psycopg2.connect(dbname=dbname, user=user,
                             password=password, host=host)
  pg_cursor = pg_connect.cursor()
  return pg_connect, pg_cursor

In [44]:
pg_connect, pg_cursor = refresh_connection_and_cursor(pg_connect, pg_cursor)

In [45]:
# Execute the create table
pg_cursor.execute(create_character_table)
pg_connect.commit()

DuplicateTable: ignored

In [46]:
# We can query postgre database to see what data it has
# This is a clever optional step which shows postgre internals
show_tables = """
SELECT 
* 
FROM 
  pg_catalog.pg_tables
WHERE
  schemaname !=  'pg_catalog'
AND
  schemaname !=  'information_schema';
"""
pg_cursor.execute(show_tables)
pg_cursor.fetchall()

InFailedSqlTransaction: ignored

In [58]:
# We now have a place to insert our characters without the need to transform as much
# Step 3  - LOAD
characters[0]


(1, 'Aliquid iste optio reiciendi', 0, 0, 10, 1, 1, 1, 1)

In [59]:
characters[0][1:]

('Aliquid iste optio reiciendi', 0, 0, 10, 1, 1, 1, 1)

In [47]:
# If we ran that, we'd insert the first character
# But we want them all - loops!
for character in characters:
  insert_character = """
    INSERT INTO charactercreator_character
    (name, level, exp, hp, strength, intelligence, dexterity, wisdom)
    VALUES """ + str(character[1:]) + ";"
  pg_cursor.execute(insert_character)

InFailedSqlTransaction: ignored

In [None]:
# Note - we're executing each character one at a time
# That works, and is simple, but inefficient (lots of roundtrips to database)
# Stretch/afternoon goal - see if you can combine into a single
# insert that does them all at once

In [48]:
pg_connect.commit()

In [49]:
# Postgre cursor needs to fetch in a separate step unlike sqlite
pg_cursor.execute("SELECT * FROM charactercreator_character LIMIT 5;")
pg_cursor.fetchall()

[]

In [50]:
pg_cursor

<cursor object at 0x7f0367a3dce0; closed: 0>

In [51]:
# To make sure other connections and cursors know about our insertion
# we need to commit
pg_connect.commit()

In [52]:
# We have done a basic ETL
# How can we verify that?
len(characters)

302

In [None]:
# Ids are different (on first run, now fixed)!
# That's because we had an aborted run
# Let's fix this by deleting the data and DROPping the table
# Other tables are fine, but we'll dump the data *and* schema to rerun
# pg_curs.execute('DROP TABLE charactercreator_character;')
# pg_conn.commit()

In [None]:
# Now we need to rerun the above... scrolling up and down, because notebooks
# Specifically rerunning character table create statement and data inserts

In [54]:
# Now the data looks the same! But let's check it systematically
pg_cursor.execute('SELECT * FROM charactercreator_character;')
pg_characters = pg_cursor.fetchall()

In [55]:
# We could do more spot checks, but let's loop and check them all
# TODO/afternoon task - consider making this a more formal test
for character, pg_character in zip(characters, pg_characters):
  assert character == pg_character

In [57]:
# No complaints - which means they're all the same!
# Closing out cursor/connection to wrap up
pg_cursor.close()
pg_connect.close()
s1_cursor.close()
s1_connect.close()

NameError: ignored