In [5]:
import os

import pandas as pd
import psycopg2
import sqlalchemy as sq

In [6]:
pip install ipython-sql

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Load the IPython extension named "sql" (installed above as ipython-sql); the
# %sql and %%sql magics used in the following cells come from it.
%load_ext sql

In [13]:
# Connection settings for the local Postgres instance.
# Host, port, user and password can be overridden through the standard libpq
# environment variables (PGHOST, PGPORT, PGUSER, PGPASSWORD) so real credentials
# never have to be written into the notebook; the literals below are only the
# local-development fallbacks.
DB_ENDPOINT = os.environ.get("PGHOST", "localhost")
DEFAULT_DB = 'postgres'  # database used for the initial psycopg2 connection
DB = 'csv_etl'           # database targeted by the %sql and SQLAlchemy URLs
DB_USER = os.environ.get("PGUSER", 'postgres')
DB_PASSWORD = os.environ.get("PGPASSWORD", '1234')
DB_PORT = os.environ.get("PGPORT", '5432')

In [14]:
# URL used by the %sql magic. It targets the project database (DB) and does not
# depend on the psycopg2 connection below, so it is built before connecting.
conn_string = "postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, DB_ENDPOINT, DB_PORT, DB)

try:
    # Direct psycopg2 connection to the default database (DEFAULT_DB).
    conn = psycopg2.connect(
        host=DB_ENDPOINT,
        port=DB_PORT,
        dbname=DEFAULT_DB,
        user=DB_USER,
        password=DB_PASSWORD)
    print("Connection established")
    # Each statement is committed immediately instead of inside an open transaction.
    conn.set_session(autocommit=True)
    print("Connection is in Auto Commit")
except psycopg2.Error as e:
    print("Error: Could not make connection to the Default Postgres database")
    print(e)
    # Stop here: without a connection the remaining cells cannot succeed, and
    # continuing would only surface a less helpful error further down.
    raise

Connection established
Connection is in Auto Commit


In [15]:
# Hand the connection URL held in conn_string to the %sql magic; the %%sql cells
# below run against that connection.
%sql $conn_string

In [None]:
# Get min and max for our Calendar Table
# NOTE(review): the statements below are disabled by wrapping them in a string
# literal, so this cell only evaluates to that string. data_frame_payments and
# data_frame_trips are not defined anywhere in the visible part of this notebook;
# confirm where they come from before re-enabling, or delete the cell.
"""
print (data_frame_payments.date.min())
print (data_frame_payments.date.max())
print (data_frame_trips.start_at.min())
print (data_frame_trips.start_at.max())

"""

In [11]:
# Function that produces the calendar (date/time) dimension table
def dimension_datetime_frame(start='1969-01-01', end='2050-12-31'):
    """Return a calendar dimension frame with one row per second.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds handed to ``pd.date_range``; both ends are included.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``DateTime``, ``second``, ``minute``, ``hour``,
        ``day``, ``dayofweek`` (Monday is 0), ``is_weekend`` (True for
        Saturday and Sunday), ``month``, ``Quarter`` and ``Year``.

    Notes
    -----
    The default bounds cover roughly 82 years, which at one-second resolution
    is about 2.6 billion rows. Pass narrower bounds (for example the min/max
    timestamps of the source data) unless the full range is really needed.
    """
    # Lowercase "s" is the seconds alias accepted by both older and current
    # pandas; the uppercase "S" spelling is deprecated since pandas 2.2.
    df = pd.DataFrame({"DateTime": pd.date_range(start=start, end=end, freq="s")})
    stamp = df["DateTime"].dt
    df["second"] = stamp.second
    df["minute"] = stamp.minute
    df["hour"] = stamp.hour
    df["day"] = stamp.day
    df["dayofweek"] = stamp.dayofweek
    # dayofweek counts from Monday=0, so values above 4 are Saturday/Sunday.
    df["is_weekend"] = stamp.dayofweek > 4
    df["month"] = stamp.month
    df["Quarter"] = stamp.quarter
    df["Year"] = stamp.year
    return df


# Build the calendar frame and load it into the project database as "dimDate".
engine = sq.create_engine(f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_ENDPOINT}:{DB_PORT}/{DB}")
calendar_df = dimension_datetime_frame()
rows_written = calendar_df.to_sql('dimDate', engine)

# to_sql does not create a primary key or unique constraint, but the fact tables
# declare foreign keys against "dimDate"("DateTime"). PostgreSQL only accepts a
# foreign key whose target column is a primary key or unique, so add the key
# right after the load.
with engine.begin() as db_connection:
    db_connection.execute(sq.text('ALTER TABLE "dimDate" ADD PRIMARY KEY ("DateTime")'))

# Keep showing the value returned by to_sql as the cell output.
rows_written


481

In [14]:
%%sql
CREATE TABLE IF NOT EXISTS dimRiders (
    -- Rider dimension with one row per rider_key
    rider_key           INTEGER       PRIMARY KEY,
    -- Mandatory descriptive attributes
    first_name          VARCHAR(50)   NOT NULL,
    last_name           VARCHAR(50)   NOT NULL,
    address             VARCHAR(100)  NOT NULL,
    birthday            DATE          NOT NULL,
    account_start_date  DATE          NOT NULL,
    -- Nullable attributes
    account_end_date    DATE,
    is_member           BOOLEAN
);

 * postgresql://postgres:***@localhost:5432/csv_etl
Done.


[]

In [15]:
%%sql
CREATE TABLE IF NOT EXISTS dimStation (
    -- Station dimension keyed by a textual station_key
    station_key   VARCHAR(50)  PRIMARY KEY,
    station_name  VARCHAR(75)  NOT NULL,
    -- Coordinates are mandatory
    latitude      FLOAT        NOT NULL,
    longitude     FLOAT        NOT NULL
);

 * postgresql://postgres:***@localhost:5432/csv_etl
Done.


[]

In [24]:
%%sql
CREATE TABLE IF NOT EXISTS FactTrip (
    trip_key          VARCHAR(50)  PRIMARY KEY,
    rideable_type     VARCHAR(75)  NOT NULL,
    -- Start and end timestamps, each referencing the calendar dimension
    start_date_id     TIMESTAMP    NOT NULL REFERENCES "dimDate" ("DateTime"),
    ended_date_id     TIMESTAMP    NOT NULL REFERENCES "dimDate" ("DateTime"),
    -- Start and end stations, each referencing the station dimension
    start_station_id  VARCHAR(50)  NOT NULL REFERENCES dimStation (station_key),
    end_station_id    VARCHAR(50)  NOT NULL REFERENCES dimStation (station_key),
    -- Optional reference to the rider dimension
    rider_id          INTEGER      REFERENCES dimRiders (rider_key),
    -- Measures stored as intervals
    age               INTERVAL,
    trip_duration     INTERVAL
);

 * postgresql://postgres:***@localhost:5432/csv_etl
Done.


[]

In [25]:
%%sql
CREATE TABLE IF NOT EXISTS FactPayment (
    payment_id  INTEGER    PRIMARY KEY,
    -- Payment timestamp, referencing the calendar dimension
    date_id     TIMESTAMP  NOT NULL REFERENCES "dimDate" ("DateTime"),
    -- Optional reference to the rider dimension
    rider_id    INTEGER    REFERENCES dimRiders (rider_key),
    amount      MONEY
);

 * postgresql://postgres:***@localhost:5432/csv_etl
Done.


[]

In [39]:
%%sql
INSERT INTO dimRiders (
    rider_key,
    first_name,
    last_name,
    address,
    birthday,
    account_start_date,
    account_end_date,
    is_member
)
SELECT
    -- Source columns appear in the same order as the target columns above
    src.rider_id,
    src.first,
    src.last,
    src.address,
    src.birthday,
    src.account_start_date,
    src.account_end_date,
    src.is_member
FROM riders AS src;

 * postgresql://postgres:***@localhost:5432/csv_etl
75000 rows affected.


[]

In [40]:
%%sql
INSERT INTO dimStation (
    station_key,
    station_name,
    latitude,
    longitude
)
SELECT
    -- Source columns appear in the same order as the target columns above
    src.station_id,
    src.name,
    src.latitude,
    src.longitude
FROM station AS src;

 * postgresql://postgres:***@localhost:5432/csv_etl
838 rows affected.


[]

In [29]:
%%sql
INSERT INTO FactPayment (payment_id, date_id, rider_id, amount)
SELECT
    pay.payment_id,
    -- The inner join keeps only payments whose timestamp exists in the calendar dimension
    cal."DateTime",
    pay.rider_id,
    pay.amount
FROM payment AS pay
INNER JOIN "dimDate" AS cal
    ON cal."DateTime" = CAST(pay.date AS TIMESTAMP);

 * postgresql://postgres:***@localhost:5432/csv_etl
1946607 rows affected.


[]

In [36]:
%%sql
INSERT INTO FactTrip (trip_key, rideable_type, start_date_id, ended_date_id, start_station_id, end_station_id, rider_id, age, trip_duration)
SELECT t.trip_id                                            AS trip_key,
       t.rideable_type                                      AS rideable_type,
       start."DateTime"                                     AS start_date_id,
       stop."DateTime"                                      AS ended_date_id,
       t.start_station_id                                   AS start_station_id,
       t.end_station_id                                     AS end_station_id,
       t.rider_id                                           AS rider_id,
       -- Rider age at the moment the trip started
       AGE(t.start_at, CAST(r.birthday AS TIMESTAMP))       AS age,
       -- Elapsed time between trip start and trip end
       AGE(t.ended_at, t.start_at)                          AS trip_duration
FROM trip t
-- Inner joins keep only trips whose start and end timestamps exist in the
-- calendar dimension and whose rider exists in the rider dimension
JOIN "dimDate" start ON ( start."DateTime" = t.start_at )
JOIN "dimDate" stop ON ( stop."DateTime" = t.ended_at )
JOIN dimRiders AS r ON ( r.rider_key = t.rider_id )
-- Trips that start and end at the same instant are skipped
WHERE t.start_at != t.ended_at;

 * postgresql://postgres:***@localhost:5432/csv_etl
1211 rows affected.


[]