In [1]:
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
import os
import logging
from datetime import datetime

In [12]:
# Source columns that make up each dimension, keyed by dimension name.
# Work in progress: intended to drive a loop that loads every dimension.
dim_dict = dict(
    agency=['agency', 'agency_name'],
    location=[
        'incident_zip',
        'intersection_street_1',
        'intersection_street_2',
        'borough',
        'city',
        'state',
    ],
)

In [18]:
# Scratch check: name of the second dimension (dicts iterate in insertion order).
list(dim_dict)[1]

'location'

In [2]:
# --- Target dimension --------------------------------------------------------
dimension_name = 'agency'  # replace with the dimension to load
table_name = dimension_name + "_dimension"
surrogate_key = dimension_name + "_dim_id"  # name of the surrogate-key column
business_key = dimension_name + "_id"

# --- BigQuery destination ----------------------------------------------------
gcp_project = 'cis4400-381214'  # replace with your own project id
bq_dataset = '311_complaints_dataset'  # replace with your own dataset name
# Fully qualified table path: <project>.<dataset>.<table>
dimension_table_path = ".".join([gcp_project, bq_dataset, table_name])

# --- Local input -------------------------------------------------------------
file_source_path = 'data'  # directory the source CSV file is read from

In [3]:
# Configure file logging for this ETL run. The log file name combines the
# dimension name and today's date, e.g. etl_agency_20230401.log.
current_date = datetime.today().strftime('%Y%m%d')
log_filename = "_".join(["etl", dimension_name, current_date]) + ".log"
# force=True removes AND closes any handlers already attached to the root
# logger. Removing them by hand (without close()) leaked an open log-file
# handle every time this cell was re-run.
logging.basicConfig(
    filename=log_filename,
    encoding='utf-8',
    format='%(asctime)s %(message)s',
    level=logging.DEBUG,
    force=True,
)
logging.info("=========================================================================")
logging.info(f"Starting ETL Run for dimension {dimension_name} on date {current_date}")

In [4]:
def load_csv_data_file(logging: logging.Logger, file_source_path: str, file_name: str, df: pd.DataFrame) -> pd.DataFrame:
    """Read a CSV file into a DataFrame whose column names are lower-cased.

    Args:
        logging: Logger-like object exposing ``info`` and ``error`` (the
            ``logging`` module itself is what the notebook passes).
        file_source_path: Directory that contains the file.
        file_name: Name of the CSV file inside ``file_source_path``.
        df: Fallback value returned unchanged when the file cannot be read.

    Returns:
        The loaded DataFrame, or the ``df`` argument if reading failed.
    """
    file_source = os.path.join(file_source_path, file_name)
    logging.info(f"Reading source data file: {file_source}")
    try:
        loaded = pd.read_csv(file_source)
        loaded = loaded.rename(columns=str.lower)
        logging.info(f"Read {len(loaded)} records from source data file: {file_source}")
        return loaded
    except Exception as err:
        # Catch Exception instead of using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate, and log the cause
        # so a failed load can be diagnosed from the log file.
        logging.error(f"Failed to read file: {file_source}. {err}")
    return df

In [5]:
def transform_data(logging: logging.Logger, df: pd.DataFrame, column_list=None):
    """Project the source frame onto a dimension's columns and de-duplicate.

    Args:
        logging: Logger-like object exposing ``info``.
        df: Source DataFrame; it is not modified.
        column_list: Iterable of column names that make up the dimension.
            Defaults to ``['agency', 'agency_name']`` so existing callers
            keep their behavior; pass another list to build a different
            dimension without editing this function.

    Returns:
        A new DataFrame holding only ``column_list`` with duplicate rows
        removed. The original row index labels are kept.

    Raises:
        KeyError: If any requested column is missing from ``df``.
    """
    logging.info("Transforming dataframe.")
    if column_list is None:
        column_list = ['agency', 'agency_name']
    # list(...) so a tuple is treated as several columns, not one tuple key.
    return df[list(column_list)].drop_duplicates()

In [6]:
def create_bigquery_client(logging, service_account_json='keys/new-cis4400-381214-f4f2229d6853.json'):
    """Create a BigQuery client from a service-account key file.

    Args:
        logging: Logger-like object exposing ``info`` and ``error``.
        service_account_json: Path to the service-account JSON key. The
            default is the path that was previously hard-coded; replace it
            with your own key file.

    Returns:
        The client returned by ``bigquery.Client.from_service_account_json``,
        or ``None`` if creating it raised an exception (the error is logged).
    """
    try:
        bqclient = bigquery.Client.from_service_account_json(service_account_json)
    except Exception as err:
        # The message needs a %s placeholder for ``err``; without one the
        # logging module reports a formatting error instead of the cause.
        logging.error("Failed to create BigQuery Client. %s", err)
        # Return None explicitly: ``bqclient`` was never bound on this path,
        # so returning it would raise UnboundLocalError.
        return None
    logging.info("Created BigQuery Client: %s", bqclient)
    return bqclient

In [7]:
def upload_bigquery_table(logging, bqclient, table_path, write_disposition, df):
    """Load ``df`` into the BigQuery table at ``table_path`` with a load job.

    Builds a ``LoadJobConfig`` with the given ``write_disposition``, submits
    the frame through ``bqclient.load_table_from_dataframe`` and logs the
    value returned by the job's ``result()``.

    Any exception raised along the way is logged at error level and not
    re-raised. The function always returns ``None``.
    """
    try:
        logging.info("Creating BigQuery Job configuration with write_disposition=%s", write_disposition)
        load_config = bigquery.LoadJobConfig(write_disposition=write_disposition)

        logging.info("Submitting the BigQuery job")
        load_job = bqclient.load_table_from_dataframe(df, table_path, job_config=load_config)

        outcome = load_job.result()
        logging.info("Job  results: %s", outcome)
    except Exception as err:
        logging.error("Failed to load BigQuery Table. %s", err)

In [8]:
def bigquery_table_exists(bqclient, table_path):
    """Report whether ``bqclient.get_table(table_path)`` succeeds.

    Returns ``False`` when the lookup raises ``NotFound`` and ``True`` when
    it returns normally. Any other exception propagates to the caller.
    """
    try:
        bqclient.get_table(table_path)
    except NotFound:
        return False
    else:
        return True

In [9]:
def query_bigquery_table(logging, table_path, bqclient, surrogate_key):
    """Fetch the business columns of an existing dimension table.

    Runs ``SELECT * EXCEPT (update_timestamp, <surrogate_key>)`` against
    ``table_path`` so the result can be compared with freshly transformed
    source rows.

    Args:
        logging: Logger-like object exposing ``info`` and ``error``.
        table_path: Table identifier placed between backticks in the query.
        bqclient: Object whose ``query(sql).to_dataframe()`` runs the query.
        surrogate_key: Name of the surrogate-key column to exclude.

    Returns:
        The DataFrame produced by ``to_dataframe()``.

    Raises:
        Exception: Whatever the query raised, after it has been logged.
            Previously the failure was logged at info level and the
            ``pd.DataFrame`` *class* was returned, which crashed the caller
            later with an unrelated TypeError. A failed read must not look
            like an empty table, because the caller treats every row missing
            from this result as new.
    """
    sql_query = f"SELECT * EXCEPT ( update_timestamp, {surrogate_key}) FROM `{table_path}`"
    logging.info("Running query: %s", sql_query)
    try:
        return bqclient.query(sql_query).to_dataframe()
    except Exception as err:
        logging.error("Error querying the table. %s", err)
        raise

In [10]:
def add_surrogate_key(df, dimension_name='agency', offset=1):
    """Prepend a sequential ``<dimension_name>_dim_id`` column to ``df``.

    The frame is modified in place: its index is reset to 0..n-1 and the new
    first column holds ``index + offset``. The same object is returned.
    """
    key_column = dimension_name + '_dim_id'
    df.reset_index(drop=True, inplace=True)
    key_values = df.index + offset
    df.insert(loc=0, column=key_column, value=key_values)
    return df

In [11]:
def add_update_date(df, current_date):
    """Set an ``update_date`` column on ``df`` to the parsed ``current_date``.

    ``current_date`` is converted with ``pd.to_datetime`` and the single
    resulting value is assigned to every row. ``df`` is modified in place
    and returned.
    """
    update_date = pd.to_datetime(current_date)
    df['update_date'] = update_date
    return df

In [12]:
def add_update_timestamp(df):
    """Set an ``update_timestamp`` column on ``df`` to the current UTC time.

    The timestamp is taken once, its microsecond field is zeroed, and the
    same value is assigned to every row. ``df`` is modified in place and
    returned.
    """
    now_utc = pd.to_datetime('now', utc=True)
    df['update_timestamp'] = now_utc.replace(microsecond=0)
    return df

In [13]:
def build_new_table(logging, bqclient, dimension_table_path, dimension_name, df):
    """Create the dimension table from scratch with the rows in ``df``.

    Adds a surrogate key starting at 1 and an update timestamp, then uploads
    the result with write disposition ``WRITE_TRUNCATE``.

    Args:
        logging: Logger-like object exposing ``info`` and ``error``.
        bqclient: BigQuery client passed through to the upload helper.
        dimension_table_path: Destination table path.
        dimension_name: Dimension name used to build the key column name.
        df: Transformed, de-duplicated dimension rows. It is not modified.

    Returns:
        None.
    """
    logging.info("Target dimension table %s does not exist", dimension_table_path)
    # Work on a copy: the key/timestamp helpers mutate their argument, and
    # changing the caller's frame would make a second call on the same frame
    # fail because the key column is already present.
    table_df = add_surrogate_key(df.copy(), dimension_name, 1)
    table_df = add_update_timestamp(table_df)
    upload_bigquery_table(logging, bqclient, dimension_table_path, "WRITE_TRUNCATE", table_df)

In [14]:
def insert_existing_table(logging, bqclient, dimension_table_path, dimension_name, surrogate_key, df):
    """Append to an existing dimension table the rows of ``df`` it lacks.

    Args:
        logging: Logger-like object exposing ``info`` and ``error``.
        bqclient: BigQuery client passed through to the query/upload helpers.
        dimension_table_path: Path of the existing dimension table.
        dimension_name: Dimension name used to build the key column name.
        surrogate_key: Surrogate-key column excluded when reading the table.
        df: Transformed dimension rows from the source file. Not modified.

    Returns:
        None. When new rows are found they are uploaded with write
        disposition ``WRITE_APPEND``.
    """
    logging.info("Target dimension table %s exists. Checking for differences.", dimension_table_path)
    bq_df = query_bigquery_table(logging, dimension_table_path, bqclient, surrogate_key)
    if not isinstance(bq_df, pd.DataFrame):
        # Without the existing rows there is nothing to diff against, and
        # appending blindly would duplicate the whole dimension.
        logging.error("Could not read existing rows from %s; skipping append.", dimension_table_path)
        return

    # Anti-join: keep rows of df that are absent from the table. bq_df is
    # concatenated twice so every existing row occurs at least twice and is
    # always discarded by keep=False. A single concat yields the symmetric
    # difference, which re-appended rows that exist only in BigQuery.
    # df is de-duplicated first so a new row repeated in the source is kept once.
    new_records_df = pd.concat([df.drop_duplicates(), bq_df, bq_df]).drop_duplicates(keep=False)
    new_count = new_records_df.shape[0]
    logging.info("Found %d new records.", new_count)
    if new_count == 0:
        return

    # NOTE(review): the next key is derived from the current row count, which
    # assumes existing keys are exactly 1..N with no gaps — confirm.
    new_surrogate_key_value = bq_df.shape[0] + 1
    new_records_df = add_surrogate_key(new_records_df, dimension_name, new_surrogate_key_value)
    new_records_df = add_update_timestamp(new_records_df)
    upload_bigquery_table(logging, bqclient, dimension_table_path, "WRITE_APPEND", new_records_df)

In [15]:
if __name__ == "__main__":
    # Driver: load the source CSV, reduce it to the dimension's columns, then
    # either create the dimension table or append the rows it is missing.
    try:
        # Pass None as the fallback so a failed read (the loader returns its
        # fallback argument) can be told apart from a successfully loaded
        # frame. The previous fallback was the pd.DataFrame class itself,
        # which crashed inside transform_data with an unrelated error.
        source_df = load_csv_data_file(logging, file_source_path, "311_traffic_signal_complaints_2023.csv", None)
        if source_df is None:
            logging.error("Source file could not be loaded; skipping ETL for dimension %s.", dimension_name)
        else:
            df = transform_data(logging, source_df)
            bqclient = create_bigquery_client(logging)
            if bqclient is None:
                logging.error("No BigQuery client available; skipping ETL for dimension %s.", dimension_name)
            elif bigquery_table_exists(bqclient, dimension_table_path):
                insert_existing_table(logging, bqclient, dimension_table_path, dimension_name, surrogate_key, df)
            else:
                build_new_table(logging, bqclient, dimension_table_path, dimension_name, df)
    finally:
        # Flush and close the log handlers even if a step above raised.
        logging.shutdown()