# Import Libraries

In [69]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import os
import logging
from datetime import datetime

# Create Functions

In [94]:
def generate_time_dimension(start, end):
    """
    generate_time_dimension
    Creates one row for every second between 'start' and 'end' (both inclusive),
    e.g. '2019-01-01 00:00:00' to '2019-01-01 23:59:59' for a full day.
    Then adds additional columns of hour, minute, and second as zero-padded strings.

    Parameters:
        start: first timestamp of the range (anything pandas.date_range accepts)
        end:   last timestamp of the range (inclusive)

    Returns a new dataframe with the columns
    full_time (datetime.time objects), hour, minute and second
    """
    # Lowercase 's' is the alias for seconds; the uppercase 'S' spelling is
    # deprecated in recent pandas releases and emits a FutureWarning.
    seconds = pd.date_range(start, end, freq='s')
    df = pd.DataFrame({"full_time": seconds})
    df["hour"] = df.full_time.dt.strftime("%H")
    df["minute"] = df.full_time.dt.strftime("%M")
    df["second"] = df.full_time.dt.strftime("%S")
    # Keep only the time-of-day part; the calendar date of the range is dropped.
    df["full_time"] = df["full_time"].dt.time
    return df

In [70]:
def generate_date_dimension(start, end):
    """
    generate_date_dimension
    Builds a calendar with one row per day from 'start' through 'end'.
    Every row carries the date itself plus day, week, month, quarter and year attributes.
    All attributes are strftime-formatted strings except 'quarter', which is numeric.
    Format codes: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
    Returns a new dataframe
    """
    dates_df = pd.DataFrame({"full_date": pd.date_range(start, end)})
    days = dates_df["full_date"].dt
    # Keyword order below is the column order of the resulting dataframe.
    return dates_df.assign(
        weekday_name=days.strftime("%A"),    # full weekday name
        day_of_week=days.strftime("%w"),     # weekday number, Sunday is 0
        month_name=days.strftime("%B"),      # full month name
        day_of_month=days.strftime("%d"),    # zero-padded day of the month
        month_of_year=days.strftime("%m"),   # zero-padded month number
        quarter=days.quarter,
        year=days.strftime("%Y"),            # four-digit year
    )

In [72]:
def create_bigquery_client(logging, key_path='keys/new-cis4400-381214-f4f2229d6853.json'): # replace with your own SA keys
    """
    create_bigquery_client
    Creates a BigQuery client using the path to the service account key file
    for credentials.

    Parameters:
        logging:  logger (or the logging module) used to report success or failure
        key_path: path to the service account JSON key file

    Returns the BigQuery client object.
    If the client cannot be created the error is logged and the process is
    terminated immediately with os._exit(-1); nothing is returned in that case.
    """
    try:
        bqclient = bigquery.Client.from_service_account_json(key_path)
    except Exception as err:
        # The message needs a %s placeholder for err; without one the logging
        # call fails to format the record and the real cause is lost.
        logging.error("Failed to create BigQuery Client. %s", err)
        os._exit(-1)
    else:
        logging.info("Created BigQuery Client: %s",bqclient)
        return bqclient

In [73]:
def upload_bigquery_table(logging, bqclient, table_path, write_disposition, df):
    """
    upload_bigquery_table
    Loads the contents of a dataframe into the BigQuery table at 'table_path'.
    The write disposition decides what happens to data already in the table:
    write_disposition="WRITE_TRUNCATE"  Erase the target data and load all new data.
    write_disposition="WRITE_APPEND"    Append to the existing table
    Any failure is logged as an error and is not re-raised.
    Returns nothing
    """
    try:
        logging.info("Creating BigQuery Job configuration with write_disposition=%s", write_disposition)
        load_config = bigquery.LoadJobConfig(write_disposition=write_disposition)
        logging.info("Submitting the BigQuery job")
        load_job = bqclient.load_table_from_dataframe(df, table_path, job_config=load_config)
        # result() is called before logging so a failed load ends up in the except branch
        outcome = load_job.result()
        logging.info("Job  results: %s", outcome)
    except Exception as err:
        logging.error("Failed to load BigQuery Table. %s", err)

In [74]:
def bigquery_table_exists(bqclient, table_path):
    """
    bigquery_table_exists
    Looks up the BigQuery table at 'table_path' through the client.
    Returns True when the lookup succeeds and False when it raises NotFound.
    Any other error from the lookup is passed on to the caller.
    """
    try:
        # The lookup is an API request; only its success or failure matters here.
        bqclient.get_table(table_path)
    except NotFound:
        exists = False
    else:
        exists = True
    return exists

In [75]:
def query_bigquery_table(logging, table_path, bqclient, surrogate_key):
    """
    query_bigquery_table
    Accepts a path to a BigQuery table and the name of the surrogate key
    Queries the BigQuery table but leaves out the update_timestamp and surrogate key columns

    Returns the query result as a dataframe.
    If the query fails the error is logged at error level and re-raised.
    A placeholder is deliberately not returned: a caller comparing new records
    against the result would otherwise treat every record as new.
    """
    # table_path and surrogate_key are spliced into the SQL text as identifiers,
    # so they must come from trusted configuration, never from user input.
    sql_query = 'SELECT * EXCEPT ( update_timestamp, ' + surrogate_key  +') FROM `' + table_path + '`'
    logging.info("Running query: %s", sql_query)
    try:
        return bqclient.query(sql_query).to_dataframe()
    except Exception as err:
        logging.error("Error querying the table. %s", err)
        raise

In [76]:
def add_surrogate_key(df, dimension_name='customers', offset=1):
    """
    add_surrogate_key
    Numbers the rows of a data frame consecutively, starting at 'offset',
    and stores that number in a new first column named '<dimension_name>_dim_id'.
    The data frame is changed in place (its index is reset as well) and is also returned
    """
    # Renumber the index from zero so that index + offset gives consecutive keys.
    df.reset_index(drop=True, inplace=True)
    key_column = f"{dimension_name}_dim_id"
    key_values = df.index + offset
    df.insert(loc=0, column=key_column, value=key_values)
    return df

In [77]:
def add_update_date(df, current_date):
    """
    add_update_date
    Stamps every row of a data frame with 'current_date', parsed by pandas,
    in a column named 'update_date'.
    The data frame is changed in place and is also returned
    """
    parsed_date = pd.to_datetime(current_date)
    df['update_date'] = parsed_date
    return df

In [78]:
def add_update_timestamp(df):
    """
    add_update_timestamp
    Stamps every row of a data frame with the current UTC time,
    with the microseconds set to zero, in a column named 'update_timestamp'.
    The data frame is changed in place and is also returned
    """
    now_utc = pd.Timestamp('now', tz='utc')
    # A single value is computed once, so every row receives the same timestamp.
    df['update_timestamp'] = now_utc.replace(microsecond=0)
    return df

In [79]:
def build_new_table(logging, bqclient, dimension_table_path, dimension_name, df):
    """
    build_new_table
    Creates the dimensional table at 'dimension_table_path' from a data frame.
    The data frame first receives a surrogate key starting at 1 and an update timestamp,
    then it is uploaded with WRITE_TRUNCATE so the table holds exactly these records.
    Returns nothing
    """
    logging.info("Target dimension table %s does not exit", dimension_table_path)
    keyed_df = add_surrogate_key(df, dimension_name=dimension_name, offset=1)
    stamped_df = add_update_timestamp(keyed_df)
    upload_bigquery_table(
        logging,
        bqclient,
        dimension_table_path,
        "WRITE_TRUNCATE",
        stamped_df,
    )

In [80]:
def insert_existing_table(logging, bqclient, dimension_table_path, dimension_name, surrogate_key, df):
    """
    insert_existing_table
    Accepts a path to a dimensional table, the dimension name, the name of the
    surrogate key column and a data frame
    Compares the new data to the existing data in the table.
    Inserts the new/modified records to the existing table (WRITE_APPEND)
    The data frame passed in is not modified.
    Returns nothing
    """
    logging.info("Target dimension table %s exits. Checking for differences.", dimension_table_path)
    bq_df = query_bigquery_table(logging, dimension_table_path, bqclient, surrogate_key)
    # A record is new when its complete row, taken as a tuple, is not in the table yet.
    existing_rows = bq_df.apply(tuple,1)
    is_new = ~df.apply(tuple,1).isin(existing_rows)
    # Copy the selection: the helpers below modify their argument in place, and
    # doing that on a slice of df triggers pandas' chained-assignment warning.
    new_records_df = df[is_new].copy()
    logging.info("Found %d new records.", new_records_df.shape[0])
    if new_records_df.shape[0] == 0:
        return
    # NOTE(review): continuing at row count + 1 assumes the existing keys run
    # from 1 to n without gaps - confirm no rows are ever deleted from the table.
    new_surrogate_key_value = bq_df.shape[0]+1
    new_records_df = add_surrogate_key(new_records_df, dimension_name, new_surrogate_key_value)
    new_records_df = add_update_timestamp(new_records_df)
    upload_bigquery_table(logging, bqclient, dimension_table_path, "WRITE_APPEND", new_records_df)

# Generate Date and Time Dimensions

## For 311 Complaints Data

In [2]:
# Names derived from the dimension being loaded
dimension_name = 'date'
surrogate_key = f"{dimension_name}_dim_id"
business_key = f"{dimension_name}_id"

gcp_project = 'cis4400-381214' # replace to your own project id
bq_dataset = '311_complaints_dataset' # replace to your own dataset name
table_name = f"{dimension_name}_dimension"

# Table path in the form <project>.<dataset>.<table>
dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

In [3]:
# Set up logging
# The log file is named after the dimension and today's date: etl_<dimension>_<YYYYMMDD>.log
current_date = datetime.today().strftime('%Y%m%d')
log_filename = "_".join(["etl",dimension_name,current_date]) + ".log"
# force=True replaces any handlers already attached to the root logger. Without it
# basicConfig silently does nothing once logging has been configured (for example
# when this cell is run a second time), and messages would not reach log_filename.
logging.basicConfig(filename=log_filename, encoding='utf-8', format='%(asctime)s %(message)s', level=logging.DEBUG, force=True)
logging.info("=========================================================================")
logging.info("Starting ETL Run for dimension %s on date %s", dimension_name, current_date)

In [14]:
# date dimension
# Builds the date dimension table for the configured dataset unless it already exists.
if __name__ == "__main__":
    df = generate_date_dimension(start='2019-01-01', end='2023-12-31')
    bqclient = create_bigquery_client(logging)
    target_table_exists = bigquery_table_exists(bqclient, dimension_table_path)
    if target_table_exists:
        # An existing table is left untouched.
        print("Date dimension already exists. Will not overwrite it")
    else:
        build_new_table(logging, bqclient, dimension_table_path, dimension_name, df)
    logging.shutdown()

## For Motor Vehicle Collision Data 

In [81]:
gcp_project = 'cis4400-381214' # replace to your own project id
bq_dataset = 'motor_vehicle_collision_dataset' # replace to your own dataset name

# Names derived from the dimension being loaded
dimension_name = 'date'
surrogate_key = f"{dimension_name}_dim_id"
business_key = f"{dimension_name}_id"
table_name = f"{dimension_name}_dimension"
# Table path in the form <project>.<dataset>.<table>
dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

In [None]:
# Set up logging
# The log file is named after the dimension and today's date: etl_<dimension>_<YYYYMMDD>.log
current_date = datetime.today().strftime('%Y%m%d')
log_filename = "_".join(["etl",dimension_name,current_date]) + ".log"
# force=True replaces any handlers already attached to the root logger. Without it
# basicConfig silently does nothing when logging was configured earlier in the
# session, so this call would have no effect.
logging.basicConfig(filename=log_filename, encoding='utf-8', format='%(asctime)s %(message)s', level=logging.DEBUG, force=True)
logging.info("=========================================================================")
logging.info("Starting ETL Run for dimension %s on date %s", dimension_name, current_date)

In [82]:
# date dimension
# Builds the date dimension table for the configured dataset unless it already exists.
if __name__ == "__main__":
    df = generate_date_dimension(start='2019-01-01', end='2023-12-31')
    bqclient = create_bigquery_client(logging)
    target_table_exists = bigquery_table_exists(bqclient, dimension_table_path)
    if target_table_exists:
        # An existing table is left untouched.
        print("Date dimension already exists. Will not overwrite it")
    else:
        build_new_table(logging, bqclient, dimension_table_path, dimension_name, df)
    logging.shutdown()

In [95]:
# Names derived from the dimension being loaded; project and dataset are the
# values already assigned to gcp_project and bq_dataset.
dimension_name = 'time'
surrogate_key = f"{dimension_name}_dim_id"
business_key = f"{dimension_name}_id"
table_name = f"{dimension_name}_dimension"
# Table path in the form <project>.<dataset>.<table>
dimension_table_path = f"{gcp_project}.{bq_dataset}.{table_name}"

In [96]:
# time dimension
# Builds the time dimension table (one row per second of a day) unless it already exists.
if __name__ == "__main__":
    # The calendar date is arbitrary; only the time of day is kept by generate_time_dimension.
    df = generate_time_dimension(start='2019-01-01 00:00:00', end='2019-01-01 23:59:59')
    bqclient = create_bigquery_client(logging)
    target_table_exists = bigquery_table_exists(bqclient, dimension_table_path)
    if target_table_exists:
        # An existing table is left untouched.
        print("Time dimension already exists. Will not overwrite it")
    else:
        build_new_table(logging, bqclient, dimension_table_path, dimension_name, df)
    logging.shutdown()