# AlloyDB Data Prep Notebook

This notebook downloads the dataset ["Financial Transactions Dataset: Analytics"](https://www.kaggle.com/datasets/computingvictor/transactions-fraud-datasets) from Kaggle and runs various transformations to import the data into an AlloyDB for PostgreSQL database. 

See the following documentation for more details on the steps performed in the notebook: [Import a CSV File to AlloyDB](https://cloud.google.com/alloydb/docs/import-csv-file)

## Basic Setup

### Define Notebook Variables

Update the variables below to match your environment.

In [None]:
import getpass

# Environment-specific settings; edit these before running the notebook.
project_id = "my-project"
region = "my-region"
alloydb_cluster = "my-alloydb-cluster"
alloydb_instance = "my-alloydb-instance"
alloydb_database = "finance"
# getpass hides the typed value, so the password is not echoed into the
# notebook's saved cell output the way input() would echo it.
alloydb_password = getpass.getpass("Please provide a password to be used for 'postgres' database user: ")
# Bucket and derived gs:// locations are built from the project id.
gcs_bucket_name = f"project-files-{project_id}"
export_staging_directory = f"gs://{gcs_bucket_name}/staging"
export_output_directory = f"gs://{gcs_bucket_name}/output"
vpc = "demo-vpc"

### Connect to your Google Cloud Project

In [None]:
# Configure gcloud.
# Sets the active project for the gcloud CLI; the notebook substitutes the
# Python variable `project_id` into the shell command before running it.
!gcloud config set project {project_id}

### Configure Logging

In [None]:
import logging
import sys

# Route INFO-and-above records from the root logger to stdout.
# force=True replaces any handlers that were already attached to the root logger.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='%(asctime)s[%(levelname)5s][%(name)14s] - %(message)s',
    datefmt='%H:%M:%S',
    force=True,
)

### Install Dependencies

In [None]:
# Install pinned versions of the client libraries this notebook relies on.
# --quiet suppresses most of pip's progress output.
! pip install --quiet google-cloud-storage==2.19.0 \
                      google-cloud-aiplatform==1.74.0 \
                      asyncpg==0.30.0 \
                      google.cloud.alloydb.connector==1.9.0 \
                      google-genai==1.4.0 \
                      kagglehub==0.3.12

### Define Helper Functions

#### rest_api_helper()

In [None]:
import requests
import google.auth
import json

# The transport submodule must be imported explicitly: `import google.auth`
# alone does not guarantee that `google.auth.transport.requests` is loaded.
import google.auth.transport.requests

# Build an authorized HTTP session from the current user's default credentials.
creds, _ = google.auth.default()
authed_session = google.auth.transport.requests.AuthorizedSession(creds)

# Credentials carry no token until they have been refreshed, so refresh once
# here; otherwise `access_token` would simply be None.
if not creds.valid:
  creds.refresh(google.auth.transport.requests.Request())
access_token = creds.token

if project_id:
  authed_session.headers.update({"x-goog-user-project": project_id}) # Required to workaround a project quota bug

def rest_api_helper(
    session: requests.Session,
    url: str,
    http_verb: str,
    request_body: dict = None,
    params: dict = None
  ) -> dict:
  """Calls a REST API using a pre-authenticated requests Session.

  Args:
    session: Session used to send the request (expected to already carry
      authentication).
    url: Full URL to call.
    http_verb: One of GET, POST, PUT, PATCH or DELETE (case-insensitive).
    request_body: JSON-serializable body, sent only for POST, PUT and PATCH.
    params: Optional query-string parameters.

  Returns:
    The decoded JSON response, or an empty dict when the response has no body
    (for example 204 No Content).

  Raises:
    ValueError: If http_verb is not one of the supported verbs.
    RuntimeError: If the request fails, the server answers with a 4xx/5xx
      status, or the response body is not valid JSON.
  """

  headers = {"Content-Type": "application/json"}
  # Header names whose values must never be echoed into notebook output.
  sensitive_headers = ("authorization", "proxy-authorization", "cookie")

  verb = http_verb.upper()
  if verb not in ("GET", "POST", "PUT", "PATCH", "DELETE"):
    raise ValueError(f"Unknown HTTP verb: {http_verb}")

  request_kwargs = {"headers": headers, "params": params}
  if verb in ("POST", "PUT", "PATCH"):
    # GET and DELETE are sent without a body, exactly as before.
    request_kwargs["json"] = request_body

  try:
    response = getattr(session, verb.lower())(url, **request_kwargs)

    # Raise an exception for bad status codes (4xx or 5xx)
    response.raise_for_status()

  except requests.exceptions.RequestException as e:
    # Network errors, timeouts and bad status codes all arrive here.
    print(f"Request failed: {e}")
    if e.response is None:
      raise RuntimeError(f"API call failed: {e}") from e

    if e.request is not None:
      # Redact credentials (the bearer token lives in the Authorization
      # header) before printing the request for debugging.
      printable_headers = {
          name: ("<redacted>" if name.lower() in sensitive_headers else value)
          for name, value in e.request.headers.items()
      }
      print(f"Request URL: {e.request.url}")
      print(f"Request Headers: {printable_headers}")
      print(f"Request Body: {e.request.body}")
    print(f"Response Status: {e.response.status_code}")
    print(f"Response Text: {e.response.text}")
    raise RuntimeError(f"API call failed with status {e.response.status_code}: {e.response.text}") from e

  # Return empty dict for empty responses (like 204 No Content)
  if not response.content:
    return {}

  try:
    return response.json()
  except ValueError as e:
    # JSON decoding errors are ValueError subclasses. Handling them here, in
    # their own try block, keeps them out of the RequestException handler
    # above, so the caller gets the "Invalid JSON" message.
    print(f"Failed to decode JSON response: {e}")
    print(f"Response Text: {response.text}")
    raise RuntimeError(f"Invalid JSON received from API: {response.text}") from e



#### run_query()

In [None]:
# Create AlloyDB Query Helper Function
import sqlalchemy
from sqlalchemy import text, exc
import pandas as pd

async def run_query(pool, sql: str, params = None, output_as_df: bool = True):
    """Run one SQL statement on a connection taken from `pool`.

    The statement is classified by its first keyword (case-insensitive,
    leading whitespace ignored):

    - SELECT / WITH: rows are fetched and returned as a pandas DataFrame when
      `output_as_df` is true, otherwise the raw result object is returned.
      No commit is issued.
    - EXPLAIN (including EXPLAIN ANALYZE): the first column of every plan row
      is joined with newlines and returned as a single string. `output_as_df`
      is ignored. No commit is issued.
    - Anything else (INSERT, UPDATE, DELETE, CREATE, ALTER, ...): the
      statement is executed, the connection is committed, a short summary is
      printed and the result object is returned.

    Args:
      pool: Asynchronous SQLAlchemy engine/pool exposing `connect()`.
      sql: SQL text; named bind parameters use the `:name` form.
      params: Optional bind parameters.
        - None (or any falsy value): the SQL is executed as-is.
        - dict or tuple: parameters for a single execution.
        - list of dicts/tuples: one parameter set per execution (executemany).
      output_as_df (bool): Request a DataFrame for SELECT/WITH statements.

    Returns:
      pandas.DataFrame | str | sqlalchemy.engine.Result | None:
        - DataFrame for SELECT/WITH when `output_as_df` is true. If building
          the DataFrame fails, the raw result is returned instead.
        - str for EXPLAIN (None if the plan could not be fetched/formatted).
        - The result object for every other statement, and for SELECT/WITH
          when `output_as_df` is false.
        - None when a `sqlalchemy.exc.ProgrammingError` is raised; the error
          is logged rather than propagated.

    Raises:
        Exception: Any error other than `ProgrammingError` is logged and
                   re-raised.

    Example Execution:
      Single SELECT:
        df_result = await run_query(pool, "SELECT ticker, company_name from investments LIMIT 5")

      Parameterized INSERT:
        sql_insert = "INSERT INTO investments (ticker, company_name) VALUES (:ticker, :name)"
        insert_result = await run_query(pool, sql_insert, {"ticker": "NEW", "name": "New Company"})

      Bulk UPDATE (one dict per row):
        update_sql = "UPDATE products SET price = :price WHERE id = :product_id"
        rows = [{"price": 9.99, "product_id": 1}, {"price": 19.99, "product_id": 2}]
        bulk_result = await run_query(pool, update_sql, rows)
    """
    normalized_sql = sql.strip().lower()
    is_explain = normalized_sql.startswith('explain')
    is_select_with = normalized_sql.startswith(('select', 'with'))

    # A DataFrame is only ever produced for SELECT/WITH statements.
    wants_dataframe = output_as_df and is_select_with

    # Only used to choose the summary message printed after a write.
    is_bulk_operation = (
        isinstance(params, (list, tuple))
        and len(params) > 0
        and isinstance(params[0], (dict, tuple, list))
    )

    async with pool.connect() as conn:
        try:
            # Bind parameters are passed only when some were supplied.
            if params:
                result = await conn.execute(text(sql), params)
            else:
                result = await conn.execute(text(sql))

            # --- EXPLAIN: return the plan as one multi-line string ---
            if is_explain:
                try:
                    plan_rows = result.fetchall()
                    # The plan text is taken from the first column of each row.
                    return "\n".join([str(row[0]) for row in plan_rows])
                except Exception as e:
                    logging.error(f"Error fetching/formatting EXPLAIN result: {e}")
                    return None

            # --- SELECT / WITH: DataFrame or raw result, never committed ---
            if is_select_with:
                if not wants_dataframe:
                    return result
                try:
                    fetched_rows = result.fetchall()
                    return pd.DataFrame(fetched_rows, columns=result.keys())
                except Exception as e:
                    logging.error(f"Error converting SELECT result to DataFrame: {e}")
                    logging.info(f"Returning raw ResultProxy for SELECT/WITH due to DataFrame conversion error for: {sql[:100]}...")
                    return result  # Fall back to the raw result

            # --- Everything else: commit, summarize, return the result ---
            await conn.commit()
            operation_type = sql.strip().split()[0].upper()
            row_count = result.rowcount  # Note: rowcount behavior varies

            if is_bulk_operation:
                print(f"Bulk {operation_type} executed for {len(params)} items. Result rowcount: {row_count}")
            elif operation_type in ['INSERT', 'UPDATE', 'DELETE']:
                print(f"{operation_type} statement executed successfully. {row_count} row(s) affected.")
            else:
                # CREATE, ALTER, etc.
                print(f"{operation_type} statement executed successfully. Result rowcount: {row_count}")
            return result

        except exc.ProgrammingError as e:
            # Programming errors are logged with context and turned into None.
            logging.error(f"SQL Programming Error executing query:\nSQL: {sql[:500]}...\nParams (sample): {str(params)[:500]}...\nError: {e}")
            return None
        except Exception as e:
            # Anything else is logged and handed back to the caller.
            logging.error(f"An unexpected error occurred during query execution:\nSQL: {sql[:500]}...\nError: {e}")
            raise



## Prepare Dataset

### Download Kaggle Dataset

In [None]:
import kagglehub
import os
import shutil

# Kaggle handle of the dataset the files are pulled from
dataset = "computingvictor/transactions-fraud-datasets"

# Individual files of that dataset used by this notebook
dataset_files = [
    "transactions_data.csv",
    "cards_data.csv",
    "users_data.csv",
    "mcc_codes.json",
    "train_fraud_labels.json",
]

# All files end up in ../data/
os.makedirs('../data/', exist_ok=True)

for file_to_download in dataset_files:
    # Fetch just this one file; kagglehub stores it in its own cache
    # directory and returns the path of the downloaded file.
    downloaded_path = kagglehub.dataset_download(dataset, path=file_to_download)

    # Move the file out of the cache into ../data/ under its original name,
    # replacing any file already present there.
    local_file_path = '../data/' + file_to_download
    shutil.move(downloaded_path, local_file_path)

    print(f"File '{file_to_download}' downloaded to: {os.path.abspath(local_file_path)}")

### Reformat JSON files as CSV

#### Inspect File Structure

In [None]:
import json
import os

# --- 1. JSON files downloaded earlier whose layout we want to look at ---
files = ['../data/train_fraud_labels.json', '../data/mcc_codes.json']

# --- 2. Load each file and print a summary of its structure ---
for file_path in files:
    try:
        # Parse the whole file into a Python object
        with open(file_path, 'r') as f:
            data = json.load(f)

        print("Successfully loaded JSON file into memory.")
        print(f"\n--- Inspecting Object Structure for {file_path} ---")
        print(f"1. Type of the loaded object: {type(data)}")

        if not isinstance(data, dict):
            print("The JSON file does not contain a top-level dictionary as expected.")
        else:
            top_level_keys = list(data.keys())
            print(f"2. Top-level keys: {top_level_keys}")

            # When a 'target' key is present, look one level deeper
            if 'target' in top_level_keys:
                target_object = data['target']
                print("\n3. Inspecting the nested object under the 'target' key:")
                print(f"  - Type of nested object: {type(target_object)}")

                if isinstance(target_object, dict):
                    num_items = len(target_object)
                    print(f"  - Number of items (transaction ID/label pairs): {num_items}")

                    # Print only the first five pairs, not the whole dictionary
                    print("  - Example items (first 5):")
                    for position, (key, value) in enumerate(target_object.items(), start=1):
                        print(f"    '{key}': '{value}'")
                        if position >= 5:
                            break

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except json.JSONDecodeError:
        print(f"Error: The file '{file_path}' is not a valid JSON file.")
    finally:
        # Printed once per file, whether or not it could be loaded
        print("\n--- Inspection Complete ---")

#### Transform Files

In [None]:
import json
import csv
import os

def convert_json_to_csv(json_path, csv_path, processor_func):
    """
    Convert a JSON file to a CSV file using a structure-specific processor.

    Both files are handled as UTF-8, which matches how the later
    header-removal step opens the CSV files.

    Args:
        json_path: Path of the JSON file to read.
        csv_path: Path of the CSV file to create (overwritten if it exists).
        processor_func: Callable that takes the parsed JSON object and returns
            a `(header, rows)` pair: the header row and an iterable of data rows.

    Returns:
        None. Progress and errors are reported with print(); exceptions are
        caught and not propagated.
    """
    try:
        print(f"Processing {json_path}...")

        # Step 1: Read the entire JSON file into memory.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Step 2: Let the processor turn the parsed object into header + rows.
        header, rows = processor_func(data)

        # Step 3: Write the CSV. newline='' lets the csv module control line
        # endings itself.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(header)
            writer.writerows(rows)

        print(f"Successfully created {csv_path}")

    except FileNotFoundError:
        print(f"ERROR: File not found at {json_path}")
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"An error occurred while processing {json_path}: {e}")


def process_fraud_labels(data):
    """Build CSV header and rows from the fraud-labels JSON object.

    The labels are read from the mapping stored under the 'target' key; a
    missing key yields no rows. Each row is a (key, value) pair of that mapping.
    """
    labels_by_transaction = data.get('target', {})
    return ['transaction_id', 'is_fraud'], [*labels_by_transaction.items()]


def process_mcc_codes(data):
    """Build CSV header and rows from the MCC-codes JSON object.

    Every top-level (key, value) pair of the mapping becomes one row.
    """
    code_rows = [(code, description) for code, description in data.items()]
    return ['mcc', 'description'], code_rows


# --- Main execution ---
def main():
    """Convert both downloaded JSON files in ../data to CSV files."""
    data_directory = '../data'

    # (source JSON, destination CSV, processor) for each conversion, in order
    conversions = [
        ('train_fraud_labels.json', 'fraud_labels.csv', process_fraud_labels),
        ('mcc_codes.json', 'mcc_codes.csv', process_mcc_codes),
    ]

    for position, (json_name, csv_name, processor) in enumerate(conversions):
        if position:
            print() # Blank line between conversions for readability
        convert_json_to_csv(
            os.path.join(data_directory, json_name),
            os.path.join(data_directory, csv_name),
            processor,
        )

    print("\n--- Conversion complete. ---")

main()

#### Inspect CSV Files

In [None]:
# Show the first five lines of the generated fraud-labels CSV.
!head -n 5 ../data/fraud_labels.csv

In [None]:
# Show the first five lines of the generated MCC-codes CSV.
!head -n 5 ../data/mcc_codes.csv

#### Delete JSON Files

In [None]:
# The JSON originals are no longer needed once the CSV versions look good
files_to_remove = [
    '../data/train_fraud_labels.json',
    '../data/mcc_codes.json',
]
for file in files_to_remove:
    os.remove(file)

### Remove Header Rows

The AlloyDB CSV import used later in this notebook expects files that contain only data rows, so the header row is stripped from every CSV file in `../data/`.

In [None]:
import os
from tempfile import NamedTemporaryFile
from shutil import move

def remove_first_line(filepath):
    """
    Drop the first line of a text file.

    Every line after the first is written to a temporary '.tmp' file created
    next to the original, and that file is then moved over the original.
    Problems are reported with print() rather than raised; an empty file is
    left untouched.
    """
    # The scratch file is created in the same directory as the target so the
    # final move stays on one filesystem.
    file_dir = os.path.dirname(filepath)
    temp_path = None

    def _discard_temp():
        # Delete the scratch file if it was created and is still on disk.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

    try:
        with open(filepath, 'r', encoding='utf-8') as source:
            with NamedTemporaryFile(mode='w', encoding='utf-8', delete=False, dir=file_dir, suffix='.tmp') as scratch:
                temp_path = scratch.name

                # Consume (and thereby skip) the first line ...
                next(source)

                # ... then copy everything that follows.
                scratch.writelines(source)

        # Swap the shortened copy into place.
        move(temp_path, filepath)
        print(f"Successfully removed the first line from: {filepath}")

    except StopIteration:
        # next() found nothing to skip: the file has no lines at all.
        print(f"File is empty, no lines to remove: {filepath}")
        _discard_temp()

    except Exception as e:
        print(f"An error occurred while processing {filepath}: {e}")
        _discard_temp()

def process_all_csv_in_directory(directory_path):
    """
    Remove the first line from every '.csv' file directly inside a directory.

    The extension check is case-insensitive and subdirectories are not
    descended into. A missing directory is reported with print() and nothing
    else happens.
    """
    if os.path.isdir(directory_path):
        print(f"Scanning for CSV files in '{directory_path}'...")

        # Take the directory listing once, then keep only the CSV entries.
        csv_names = (
            entry for entry in os.listdir(directory_path)
            if entry.lower().endswith('.csv')
        )
        for csv_name in csv_names:
            remove_first_line(os.path.join(directory_path, csv_name))
    else:
        print(f"Error: Directory not found at '{directory_path}'")

# --- Main execution ---
def main():
    """Strip the first line from every CSV file in ../data/."""
    process_all_csv_in_directory('../data/')

main()

### Clean Transactions and Cards Data

Some of the formatting in the transactions and cards data causes errors on import to AlloyDB. This section cleans up the data so that it can be imported successfully.

In [None]:
import pandas as pd

# --- Shared cleaning helpers ---

def strip_dollar_sign(series):
    """Return `series` converted to floats with every literal '$' removed."""
    # regex=False makes '$' a plain character on every pandas version
    # (as a regular expression it would be the end-of-string anchor).
    return series.astype(str).str.replace('$', '', regex=False).astype(float)


def clean_transactions_frame(df):
    """Clean the 'amount' and 'zip' columns of a transactions DataFrame in place.

    - amount: '$' removed, converted to float.
    - zip: float values such as 58523.0 become the string '58523'; missing
      values stay missing.

    Returns the same DataFrame for convenience.
    """
    # Whole-column assignment replaces the column (and its dtype) outright,
    # instead of writing values of a different dtype into the existing column
    # through .loc, which pandas has deprecated.
    df['amount'] = strip_dollar_sign(df['amount'])
    # Rows dropped by dropna() are re-aligned on the index and come back as NaN.
    df['zip'] = df['zip'].dropna().astype(int).astype(str)
    return df


def clean_cards_frame(df):
    """Clean the 'acct_open_date' and 'credit_limit' columns of a cards DataFrame in place.

    - acct_open_date: 'MM/YYYY' becomes 'YYYY-MM-01'.
    - credit_limit: '$' removed, converted to float.

    Returns the same DataFrame for convenience.
    """
    df['acct_open_date'] = pd.to_datetime(df['acct_open_date'], format='%m/%Y').dt.strftime('%Y-%m-01')
    df['credit_limit'] = strip_dollar_sign(df['credit_limit'])
    return df


# --- Step 1: Clean the transactions data ---

print("Cleaning transactions data...")

# Column names, in file order (the header row was removed earlier)
transaction_cols = [
    'id', 'date', 'client_id', 'card_id', 'amount', 'use_chip',
    'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors'
]

# The cleaned data overwrites the file it was read from
cleaned_transactions_file = '../data/transactions_data.csv'

try:
    df_trans = pd.read_csv(cleaned_transactions_file, header=None, names=transaction_cols)

    clean_transactions_frame(df_trans)

    # Save without headers or index so the file stays import-ready
    df_trans.to_csv(cleaned_transactions_file, index=False, header=False)

    print(f"Successfully cleaned transactions data and saved to '{cleaned_transactions_file}'")
    print("\nPreview of cleaned transactions data:")
    print(df_trans.head().to_string())

except FileNotFoundError:
    print(f"Error: '{cleaned_transactions_file}' not found. Please ensure the file is in the correct directory.")
except Exception as e:
    print(f"An error occurred while processing transactions: {e}")


# --- Step 2: Clean the cards data ---

print("\n" + "="*50 + "\n")
print("Cleaning cards data...")

# Column names, in file order (the header row was removed earlier)
card_cols = [
    'id', 'client_id', 'card_brand', 'card_type', 'card_number', 'expires',
    'cvv', 'has_chip', 'num_cards_issued', 'credit_limit', 'acct_open_date',
    'year_pin_last_changed', 'card_on_dark_web'
]

# The cleaned data overwrites the file it was read from
cleaned_cards_file = '../data/cards_data.csv'

try:
    df_cards = pd.read_csv(cleaned_cards_file, header=None, names=card_cols)

    clean_cards_frame(df_cards)

    # Save without headers or index so the file stays import-ready
    df_cards.to_csv(cleaned_cards_file, index=False, header=False)

    print(f"Successfully cleaned cards data and saved to '{cleaned_cards_file}'")
    print("\nPreview of cleaned cards data:")
    print(df_cards.head().to_string())

except FileNotFoundError:
    print(f"Error: '{cleaned_cards_file}' not found. Please ensure the file is in the correct directory.")
except Exception as e:
    print(f"An error occurred while processing cards: {e}")

In [None]:
# Show the first five lines of the cleaned transactions CSV.
!head -n 5 ../data/transactions_data.csv

In [None]:
# Show the first five lines of the cleaned cards CSV.
!head -n 5 ../data/cards_data.csv

### Upload Files to GCS

In [None]:
import os
from google.cloud import storage

def upload_directory_to_gcs(bucket_name, source_directory, destination_blob_prefix):
    """Upload every file under a local directory (recursively) to a GCS bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_directory: Local directory to walk.
        destination_blob_prefix: Prefix joined in front of each file's path
            relative to `source_directory` to form the object name.

    Returns:
        None. If `source_directory` is not a directory, an error is printed
        and nothing is uploaded.
    """
    # Bail out early when there is nothing to walk
    if not os.path.isdir(source_directory):
        print(f"Error: The source directory '{source_directory}' does not exist.")
        return

    bucket = storage.Client().bucket(bucket_name)

    print(f"Uploading files from '{source_directory}' to 'gs://{bucket_name}/{destination_blob_prefix}'...")

    for current_dir, _subdirs, names in os.walk(source_directory):
        for name in names:
            local_file_path = os.path.join(current_dir, name)

            # Object name = prefix + path of the file relative to the source root
            destination_blob_name = os.path.join(
                destination_blob_prefix,
                os.path.relpath(local_file_path, source_directory),
            )

            bucket.blob(destination_blob_name).upload_from_filename(local_file_path)

            print(f"  Uploaded {local_file_path} to gs://{bucket_name}/{destination_blob_name}")

    print("Upload complete.")

def main():
    """Upload the local ../data/ directory to the data/ prefix of the project bucket."""
    upload_directory_to_gcs(
        bucket_name=gcs_bucket_name,
        source_directory="../data/",
        destination_blob_prefix="data/",
    )

main()

## Setup Database

### Connect to the AlloyDB Cluster

This function will create a connection pool to your AlloyDB instance using the AlloyDB Python connector. The AlloyDB Python connector will automatically create secure connections to your AlloyDB instance using mTLS.

In [None]:
import asyncpg

import sqlalchemy
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine

from google.cloud.alloydb.connector import AsyncConnector, IPTypes

async def init_connection_pool(connector: AsyncConnector, db_name: str = alloydb_database, pool_size: int = 5) -> AsyncEngine:
    """Create an async SQLAlchemy engine whose connections come from `connector`.

    Args:
        connector: AlloyDB connector used to open each asyncpg connection.
        db_name: Database to connect to.
        pool_size: Number of pooled connections; overflow is disabled.

    Returns:
        An AsyncEngine configured with AUTOCOMMIT isolation. Connections log
        in as the 'postgres' user with the password stored in `alloydb_password`.
    """
    # Full resource path of the AlloyDB instance to connect to
    instance_uri = f"projects/{project_id}/locations/{region}/clusters/{alloydb_cluster}/instances/{alloydb_instance}"

    async def _open_connection() -> asyncpg.Connection:
        # Called by the engine each time it needs a new connection.
        return await connector.connect(
            instance_uri,
            "asyncpg",
            user="postgres",
            password=alloydb_password,
            db=db_name,
            ip_type=IPTypes.PRIVATE, # Optionally use IPTypes.PUBLIC
        )

    return create_async_engine(
        "postgresql+asyncpg://",
        async_creator=_open_connection,
        pool_size=pool_size,
        max_overflow=0,
        isolation_level='AUTOCOMMIT'
    )

connector = AsyncConnector()

postgres_db_pool = await init_connection_pool(connector, "postgres") # Need to connect to postgres database first to create the application database
finance_db_pool = await init_connection_pool(connector, f"{alloydb_database}")

### Create the Finance Database

In [None]:
# Close existing connections to the database
# Runs through the "postgres" pool and terminates every backend attached to
# the application database except the session issuing this query.
sql = f"""SELECT pg_terminate_backend(pg_stat_activity.pid)
FROM pg_stat_activity
WHERE pg_stat_activity.datname = '{alloydb_database}'
  AND pid <> pg_backend_pid();"""
await run_query(postgres_db_pool, sql)

# Uncomment this section and run this cell to drop the existing database if you want to replace it.
# Drop the database
#sql = f"DROP DATABASE {alloydb_database};"
#await run_query(postgres_db_pool, sql)

# Reinitiate the connection pool
# (a fresh pool replaces the one whose connections were just terminated)
finance_db_pool = await init_connection_pool(connector, f"{alloydb_database}")

In [None]:
# Use postgres_db_pool to create the database
# NOTE(review): run_query returns None (after logging) on a ProgrammingError,
# which is presumably what happens when the database already exists; verify.
sql = f"CREATE DATABASE {alloydb_database};"
result = await run_query(postgres_db_pool, sql)

### Create Tables and Extensions

In [None]:
# Use finance_db_pool to create the rest of the schema objects

sql_array = []

sql_array.append("CREATE EXTENSION IF NOT EXISTS vector;")

sql_array.append("CREATE EXTENSION IF NOT EXISTS google_ml_integration;")

sql_array.append("""CREATE TABLE transactions (
    id BIGINT PRIMARY KEY,
    date TIMESTAMP WITHOUT TIME ZONE,
    client_id INTEGER,
    card_id INTEGER,
    amount NUMERIC(10, 2),
    use_chip VARCHAR(255),
    merchant_id INTEGER,
    merchant_city VARCHAR(255),
    merchant_state VARCHAR(50),
    zip VARCHAR(10),
    mcc INTEGER,
    errors TEXT
);""")

sql_array.append("""CREATE TABLE cards (
    id BIGINT PRIMARY KEY,
    client_id INTEGER,
    card_brand VARCHAR(50),
    card_type VARCHAR(50),
    card_number VARCHAR(20),
    expires VARCHAR(7),
    cvv VARCHAR(4),
    has_chip BOOLEAN,
    num_cards_issued INTEGER,
    credit_limit MONEY,
    acct_open_date DATE,
    year_pin_last_changed INTEGER,
    card_on_dark_web BOOLEAN
);""")

sql_array.append("""CREATE TABLE users (
    id BIGINT PRIMARY KEY,
    current_age INTEGER,
    retirement_age INTEGER,
    birth_year INTEGER,
    birth_month INTEGER,
    gender VARCHAR(10),
    address VARCHAR(255),
    latitude NUMERIC(10, 8),
    longitude NUMERIC(11, 8),
    per_capita_income MONEY,
    yearly_income MONEY,
    total_debt MONEY,
    credit_score INTEGER,
    num_credit_cards INTEGER
);""")

sql_array.append("""CREATE TABLE mcc_codes (
    mcc SMALLINT PRIMARY KEY,
    description VARCHAR(255) NOT NULL
);""")

sql_array.append("""CREATE TABLE fraud_labels (
    transaction_id BIGINT PRIMARY KEY,
    is_fraud BOOLEAN NOT NULL
);""")

for sql in sql_array:
  await run_query(finance_db_pool, sql)

### Import Data

#### Run the Import

In [None]:
# Reference: https://cloud.google.com/alloydb/docs/import-csv-file#rest-v1
#            https://cloud.google.com/alloydb/docs/reference/rest/v1/projects.locations.operations/get

import time

# Tables to load and the CSV file(s) feeding each one.
# Shape: [ [ table, [list-of-files] ], ... ]
import_array = [
    ['users',        ['users_data.csv']],
    ['mcc_codes',    ['mcc_codes.csv']],
    ['transactions', ['transactions_data.csv']],
    ['fraud_labels', ['fraud_labels.csv']],
    ['cards',        ['cards_data.csv']],
]

def import_csv_to_alloydb(import_array):
    """Import CSV files from Cloud Storage into AlloyDB tables and wait for them.

    One import request is submitted per file to the cluster's ``:import``
    endpoint. Every returned long-running operation is then polled every
    5 seconds until it reports ``done``.

    Reads the notebook globals ``project_id``, ``region``, ``alloydb_cluster``,
    ``alloydb_database``, ``gcs_bucket_name``, ``authed_session`` and
    ``rest_api_helper``.

    Args:
        import_array: List of ``[table_name, [csv_file, ...]]`` entries. Each
            file is read from ``gs://{gcs_bucket_name}/data/{csv_file}``.

    Returns:
        The string ``"Import operation complete."`` when every operation
        finished without an error.

    Raises:
        RuntimeError: If one or more operations finished with an ``error``
            payload. Raised only after all operations have been polled to
            completion, so the per-table results are printed first.
    """
    import_url = f"https://alloydb.googleapis.com/v1/projects/{project_id}/locations/{region}/clusters/{alloydb_cluster}:import"

    # Phase 1: submit every import and remember its operation name.
    operations = []
    for table, files in import_array:
        for f in files:
            request_body = {
                "gcsUri": f"gs://{gcs_bucket_name}/data/{f}",
                "database": f"{alloydb_database}",
                "user": "postgres",
                "csvImportOptions": {
                    "table": f"{table}",
                    # Optional csvImportOptions fields: "columns",
                    # "fieldDelimiter", "quoteCharacter", "escapeCharacter".
                },
            }
            response = rest_api_helper(authed_session, import_url, 'POST', request_body, {})
            operations.append([table, response['name']])
            print(response)

    # Phase 2: poll each operation until it is done.
    failed_tables = []
    for table, operation_name in operations:
        url = f"https://alloydb.googleapis.com/v1/{operation_name}"
        while True:
            # A status GET needs no payload, so send an empty body rather than
            # re-sending the last import request.
            # NOTE(review): assumes rest_api_helper accepts an empty dict here.
            response = rest_api_helper(authed_session, url, 'GET', {}, {})
            # Treat a missing "done" field as "still running" instead of
            # failing with KeyError.
            if response.get('done', False):
                break
            print(f"Operation for table {table} still running: {operation_name}")
            time.sleep(5)

        # A finished operation holds either a result or an error; only the
        # former means the data was actually imported.
        if response.get('error'):
            failed_tables.append(table)
            print(f"Import FAILED for table: {table}. \nResult: {response}")
        else:
            print(f"Import complete for table: {table}. \nResult: {response}")

    if failed_tables:
        raise RuntimeError(f"Import failed for table(s): {', '.join(failed_tables)}")

    return "Import operation complete."

# Submit the imports for every table in import_array and block until each
# operation reports done; the returned status string is the cell's output.
import_csv_to_alloydb(import_array)

#### Validate Row Counts

In [None]:
# Build one result set with a row per table: the table name, the number of
# rows currently in it, and a hard-coded target row count to compare against.
# NOTE(review): the target counts are literals - presumably the row counts of
# the source CSV files; re-verify them if the dataset is refreshed.
sql = """
SELECT 'users' AS table_name, (SELECT COUNT(*) FROM users) AS imported_count, 2000 AS target_row_count
UNION ALL
SELECT 'mcc_codes', (SELECT COUNT(*) FROM mcc_codes), 109
UNION ALL
SELECT 'transactions', (SELECT COUNT(*) FROM transactions), 13305915
UNION ALL
SELECT 'fraud_labels', (SELECT COUNT(*) FROM fraud_labels), 8914963
UNION ALL
SELECT 'cards', (SELECT COUNT(*) FROM cards), 6146;
"""

# Run the comparison query; as the last expression, its result is the cell output.
await run_query(finance_db_pool, sql)

### Setup Referential Integrity

#### Create Foreign Keys

In [None]:
sql_array = []

sql_array.append("""-- In the 'transactions' table, link 'client_id' to the 'users' table.
ALTER TABLE transactions
ADD CONSTRAINT fk_transactions_client
FOREIGN KEY (client_id) REFERENCES users(id);
""")

sql_array.append("""-- In the 'transactions' table, link 'card_id' to the 'cards' table.
ALTER TABLE transactions
ADD CONSTRAINT fk_transactions_card
FOREIGN KEY (card_id) REFERENCES cards(id);
""")

sql_array.append("""-- In the 'transactions' table, link 'mcc' to the 'mcc_codes' table.
ALTER TABLE transactions
ADD CONSTRAINT fk_transactions_mcc
FOREIGN KEY (mcc) REFERENCES mcc_codes(mcc);
""")

sql_array.append("""-- In the 'cards' table, link 'client_id' to the 'users' table.
ALTER TABLE cards
ADD CONSTRAINT fk_cards_client
FOREIGN KEY (client_id) REFERENCES users(id);
""")

sql_array.append("""-- In the 'fraud_labels' table, link 'transaction_id' to the 'transactions' table.
ALTER TABLE fraud_labels
ADD CONSTRAINT fk_fraud_labels_transaction
FOREIGN KEY (transaction_id) REFERENCES transactions(id);
""")

for sql in sql_array:
  await run_query(finance_db_pool, sql)

#### Validate Foreign Keys

In [None]:
sql_array = []

# Validate FK constraints
sql_array.append("ALTER TABLE transactions VALIDATE CONSTRAINT fk_transactions_client;")
sql_array.append("ALTER TABLE transactions VALIDATE CONSTRAINT fk_transactions_card;")
sql_array.append("ALTER TABLE transactions VALIDATE CONSTRAINT fk_transactions_mcc;")
sql_array.append("ALTER TABLE cards VALIDATE CONSTRAINT fk_cards_client;")
sql_array.append("ALTER TABLE fraud_labels VALIDATE CONSTRAINT fk_fraud_labels_transaction;")

for sql in sql_array:
  response = await run_query(finance_db_pool, sql)

### Create Indexes

In [None]:
sql_array = []

# FOREIGN KEY INDEXES

sql_array.append("""-- Index on transactions.client_id to speed up joins to the users table
CREATE INDEX idx_transactions_client_id ON transactions (client_id);
""")

sql_array.append("""-- Index on transactions.card_id to speed up joins to the cards table
CREATE INDEX idx_transactions_card_id ON transactions (card_id);
""")

sql_array.append("""-- Index on transactions.mcc to speed up joins to the mcc_codes table
CREATE INDEX idx_transactions_mcc ON transactions (mcc);
""")

sql_array.append("""-- Index on cards.client_id to speed up finding all cards for a given user
CREATE INDEX idx_cards_client_id ON cards (client_id);
""")

# INDEXES FOR COMMON QUERIES 

sql_array.append("""-- Index on the transaction date for time-series analysis
CREATE INDEX idx_transactions_date ON transactions (date);
""")

sql_array.append("""-- Index on merchant_id for quickly finding transactions for a specific merchant
CREATE INDEX idx_transactions_merchant_id ON transactions (merchant_id);
""")

sql_array.append("""-- Composite index on merchant location for geographic queries
CREATE INDEX idx_transactions_merchant_location ON transactions (merchant_state, merchant_city);
""")

sql_array.append("""-- Unique index on card_number for direct card lookups
CREATE UNIQUE INDEX idx_cards_card_number_unique ON cards (card_number);
""")

sql_array.append("""-- Index on credit_score in the users table for analytical queries
CREATE INDEX idx_users_credit_score ON users (credit_score);
""")

# SPECIALIZED PARTIAL INDEX

sql_array.append("""-- Highly efficient index that ONLY includes fraudulent transactions
CREATE INDEX idx_fraud_labels_is_fraud_true ON fraud_labels (transaction_id) WHERE is_fraud = TRUE;
""")

for sql in sql_array:
  response = await run_query(finance_db_pool, sql)

## Export to SQL File

You can now export the database to a .sql file to speed up and simplify future data loads for this dataset.

In [None]:
# Export the five imported tables (schema and data) from the database to a
# single .sql file in Cloud Storage, then wait for the operation to finish.
url = f"https://alloydb.googleapis.com/v1/projects/{project_id}/locations/{region}/clusters/{alloydb_cluster}:export"
request_body = {
    "gcs_destination": {
        "uri": f"gs://{gcs_bucket_name}/export/{alloydb_database}.sql"
    },
    "database": f"{alloydb_database}",
    "sql_export_options": {
        # These options are booleans in the API schema, so send real JSON
        # booleans instead of the strings "false".
        "schema_only": False,
        "tables": [
            'users',
            'mcc_codes',
            'transactions',
            'fraud_labels',
            'cards'
        ],
        "clean_target_objects": False,
        "if_exist_target_objects": False
    }
}

result = rest_api_helper(authed_session, url, 'POST', request_body, {})
print(f"Kicked off export: {result}")

operation_id = result['name']
url = f"https://alloydb.googleapis.com/v1/{operation_id}"

# Poll the long-running operation every 5 seconds until it reports done.
operation_complete = False
while not operation_complete:
    # A status GET needs no payload, so send an empty body rather than
    # re-sending the export request.
    # NOTE(review): assumes rest_api_helper accepts an empty dict here.
    response = rest_api_helper(authed_session, url, 'GET', {}, {})
    # Treat a missing "done" field as "still running" instead of failing
    # with KeyError.
    operation_complete = response.get('done', False)
    if not operation_complete:
        print(f"Export still running: {operation_id}")
        time.sleep(5)

# A finished operation holds either a result or an error; surface the error
# instead of reporting a failed export as complete.
if response.get('error'):
    raise RuntimeError(f"Export failed. \nResult: {response}")

print(f"Operation complete. \nResult: {response}")