In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pulp import *
import pandas as pd
import os, glob
import seaborn as sns
from scipy.stats import kruskal
import scikit_posthocs as sp
from scipy.stats import mannwhitneyu
from dotenv import load_dotenv

load_dotenv('./Credentials.env',override=True)

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] =str(os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
os.environ['GCLOUD_PROJECT'] = str(os.getenv("GCLOUD_PROJECT"))

%load_ext google.cloud.bigquery

from google.cloud import bigquery
client=bigquery.Client()

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [17]:
from google.cloud import bigquery

def all_available_tables(dataset_name='shc_core_2024', project_id='som-nero-phi-jonc101'):
    """
    Lists the table ids contained in a BigQuery dataset.

    Args:
        dataset_name: Dataset whose tables are listed.
        project_id: Project that owns the dataset.

    Returns:
        A list of table id strings, or None when the lookup fails (the error
        is printed rather than raised).
    """
    try:
        client = bigquery.Client()

        # Client.dataset() is deprecated; list_tables accepts a
        # "project.dataset" string directly.
        tables = client.list_tables(f"{project_id}.{dataset_name}")

        return [table.table_id for table in tables]

    except Exception as e:
        print(f"Error fetching tables: {e}")
        return None

In [20]:
# Project and dataset whose tables are inventoried.
project_id, dataset_name = 'som-nero-phi-jonc101', 'shc_core_2024'
table_names = all_available_tables(dataset_name=dataset_name, project_id=project_id)
table_names

['adt',
 'alert',
 'alert_history',
 'alerts_orders',
 'allergy',
 'clinical_doc_meta',
 'culture_sensitivity',
 'demographic',
 'dep_map',
 'diagnosis',
 'drg_code',
 'encounter',
 'f_ip_hsp_admission',
 'family_hx',
 'flowsheet',
 'geolocation_from_omop',
 'ib_messages',
 'lab_result',
 'lda',
 'mapped_meds',
 'med_orderset',
 'myc_mesg',
 'ndc_code',
 'new_pats',
 'order_comment',
 'order_med',
 'order_proc',
 'pharmacy_mar',
 'proc_orderset',
 'procedure',
 'prov_map',
 'social_hx',
 'treatment_team',
 'zip']

In [21]:
from google.cloud import bigquery

def backup_tables(table_names, project_id, dataset_name, new_dataset_name='fatemeh_copy_shc_core_2023'):
    """
    Copies tables from one dataset to another within the same BigQuery project.

    Each table is copied independently: a failure on one table is reported and
    the remaining tables are still attempted.

    Args:
        table_names: Iterable of table ids to copy.
        project_id: Project that holds both datasets.
        dataset_name: Source dataset.
        new_dataset_name: Destination dataset.

    Returns:
        None. Progress and errors are printed.
    """
    # Materialise once so generators work and None/empty input is handled explicitly.
    table_names = list(table_names) if table_names is not None else []
    if not table_names:
        print(" Error copying tables: no table names were provided")
        return

    try:
        client = bigquery.Client()
    except Exception as e:
        print(f" Error copying tables: {e}")
        return

    failed = []
    for table in table_names:
        source_table_id = f"{project_id}.{dataset_name}.{table}"
        destination_table_id = f"{project_id}.{new_dataset_name}.{table}"

        print(f"Copying table: {source_table_id} → {destination_table_id}")

        try:
            job = client.copy_table(source_table_id, destination_table_id)
            job.result()  # Waits for the copy job to complete
        except Exception as e:
            # Keep going: one bad table must not leave the rest without a backup.
            failed.append(table)
            print(f" Error copying table {source_table_id}: {e}")
            continue

        print(f"Successfully copied {table}")

    if failed:
        print(f" {len(failed)} of {len(table_names)} tables were not copied: {failed}")

In [23]:
# Copy every listed table into the 'copy_shc_core_2024' dataset as a backup.
project_id, dataset_name = 'som-nero-phi-jonc101', 'shc_core_2024'
backup_tables(
    table_names=table_names,
    project_id=project_id,
    dataset_name=dataset_name,
    new_dataset_name='copy_shc_core_2024',
)

Copying table: som-nero-phi-jonc101.shc_core_2024.adt → som-nero-phi-jonc101.fatemeh_db_2024.adt
✔ Successfully copied adt
Copying table: som-nero-phi-jonc101.shc_core_2024.alert → som-nero-phi-jonc101.fatemeh_db_2024.alert
✔ Successfully copied alert
Copying table: som-nero-phi-jonc101.shc_core_2024.alert_history → som-nero-phi-jonc101.fatemeh_db_2024.alert_history
✔ Successfully copied alert_history
Copying table: som-nero-phi-jonc101.shc_core_2024.alerts_orders → som-nero-phi-jonc101.fatemeh_db_2024.alerts_orders
✔ Successfully copied alerts_orders
Copying table: som-nero-phi-jonc101.shc_core_2024.allergy → som-nero-phi-jonc101.fatemeh_db_2024.allergy
✔ Successfully copied allergy
Copying table: som-nero-phi-jonc101.shc_core_2024.clinical_doc_meta → som-nero-phi-jonc101.fatemeh_db_2024.clinical_doc_meta
✔ Successfully copied clinical_doc_meta
Copying table: som-nero-phi-jonc101.shc_core_2024.culture_sensitivity → som-nero-phi-jonc101.fatemeh_db_2024.culture_sensitivity
✔ Successfull

In [25]:
def all_timestamp_columns(table_name, dataset_name='shc_core_2023', project_id='som-nero-phi-jonc101'):
    """
    Retrieves all columns of type TIMESTAMP or DATETIME from a BigQuery table.

    Args:
        table_name: Table whose schema is inspected.
        dataset_name: Dataset containing the table.
        project_id: Project containing the dataset.

    Returns:
        A list of column names (possibly empty), or None when the schema
        cannot be fetched (the error is printed rather than raised).
    """
    # Built before the try block so the error message can always reference it,
    # even when creating the client is what fails.
    full_table_name = f"{project_id}.{dataset_name}.{table_name}"

    try:
        client = bigquery.Client()
        table = client.get_table(full_table_name)

        return [
            field.name
            for field in table.schema
            if field.field_type in ('TIMESTAMP', 'DATETIME')
        ]

    except Exception as e:
        print(f"Error fetching schema for {full_table_name}: {e}")
        return None

In [41]:
from google.cloud import bigquery

def convert_string_to_datetime(table_name, dataset_name, project_id):
    """
    Identifies STRING columns in a BigQuery table that can be converted to DATETIME or DATE.

    A column qualifies when at least one of its values parses with
    '%Y-%m-%d %H:%M:%S' (DATETIME) or '%Y-%m-%d' (DATE). One query is issued
    per STRING column.

    Args:
        table_name: Table to inspect.
        dataset_name: Dataset containing the table.
        project_id: Project containing the dataset.

    Returns:
        (string_datetime_cols, string_date_cols) as two lists of column names,
        or (None, None) when anything fails (the error is printed).
    """
    # Built before the try block so the error message can always reference it,
    # even when creating the client is what fails.
    full_table_name = f"{project_id}.{dataset_name}.{table_name}"

    try:
        client = bigquery.Client()
        table = client.get_table(full_table_name)

        # Identify STRING columns
        string_columns = [field.name for field in table.schema if field.field_type == "STRING"]

        string_datetime_cols = []
        string_date_cols = []

        for column in string_columns:
            # Backticks keep column names that collide with SQL keywords valid.
            query = f"""
                SELECT 
                    COUNTIF(SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', `{column}`) IS NOT NULL) AS datetime_count,
                    COUNTIF(SAFE.PARSE_DATE('%Y-%m-%d', `{column}`) IS NOT NULL) AS date_count
                FROM `{full_table_name}`;
            """

            results = client.query(query).result()

            for row in results:
                if row.datetime_count > 0:
                    string_datetime_cols.append(column)
                if row.date_count > 0:
                    string_date_cols.append(column)

        return string_datetime_cols, string_date_cols

    except Exception as e:
        print(f"❌ Error processing table {full_table_name}: {e}")
        return None, None

In [42]:
def fix_datetime(datetime_columns, date_columns, table_name, dataset_name, project_id):
    """
    Converts specified STRING columns into DATETIME or DATE format and updates the BigQuery table.

    The table is rewritten in place with CREATE OR REPLACE TABLE. Converted
    columns are emitted after the untouched ones, and values that do not match
    the expected format become NULL because SAFE.PARSE_* is used.

    Args:
        datetime_columns: Columns parsed with '%Y-%m-%d %H:%M:%S'. May be None or empty.
        date_columns: Columns parsed with '%Y-%m-%d'. May be None or empty.
        table_name: Table to rewrite.
        dataset_name: Dataset containing the table.
        project_id: Project containing the dataset.

    Returns:
        None. Progress and errors are printed.
    """
    # Normalise inputs: accept None, drop repeated names, and let DATETIME win
    # when a column is listed in both groups (a column can only be selected once).
    datetime_columns = list(dict.fromkeys(datetime_columns or []))
    date_columns = [
        col for col in dict.fromkeys(date_columns or [])
        if col not in datetime_columns
    ]
    all_columns = datetime_columns + date_columns

    if not all_columns:
        # An empty EXCEPT() list is invalid SQL, and there is nothing to convert anyway.
        print("No columns to convert; table left unchanged.")
        return

    try:
        client = bigquery.Client()

        # Source and destination are the same table: the rewrite happens in place.
        full_table_name = f"{project_id}.{dataset_name}.{table_name}"
        new_table_name = full_table_name

        # Backticks keep column names that collide with SQL keywords valid.
        formatted_expressions = [
            f"SAFE.PARSE_DATETIME('%Y-%m-%d %H:%M:%S', `{col}`) AS `{col}`"
            for col in datetime_columns
        ] + [
            f"SAFE.PARSE_DATE('%Y-%m-%d', `{col}`) AS `{col}`"
            for col in date_columns
        ]
        excluded = ', '.join(f"`{col}`" for col in all_columns)

        query = f"""
            CREATE OR REPLACE TABLE `{new_table_name}` AS
            SELECT * EXCEPT({excluded}), {', '.join(formatted_expressions)}
            FROM `{full_table_name}`;
        """

        print("Executing query:\n", query)

        # Run the query
        job = client.query(query)
        job.result()  # Wait for completion

        print(f"✔ Successfully updated table `{new_table_name}` with corrected datetime formats.")

    except Exception as e:
        print(f"❌ Error converting datetime columns: {e}")
In [44]:
from google.cloud import bigquery

def convert_la_to_utc(columns, table_name, dataset_name='shc_core_2023', project_id='som-nero-phi-jonc101'):
    """
    Adds a `<column>_utc` TIMESTAMP column for each given column, interpreting
    the source values as America/Los_Angeles (PST/PDT) local time.

    The table is rewritten in place with CREATE OR REPLACE TABLE; the original
    columns are kept and the new ones are appended.

    NOTE(review): the function does not check whether a `<column>_utc` column
    already exists, so calling it twice with the same columns selects the same
    output name twice — confirm how BigQuery handles that before re-running.
    NOTE(review): all_timestamp_columns in this notebook returns TIMESTAMP as
    well as DATETIME columns; confirm TIMESTAMP(expr, tz) does what is intended
    for inputs that are already TIMESTAMP.

    Args:
        columns: Column names to convert. May be None or empty.
        table_name: Table to rewrite.
        dataset_name: Dataset containing the table.
        project_id: Project containing the dataset.

    Returns:
        None. Progress and errors are printed.
    """
    columns = list(columns or [])
    if not columns:
        # Without this guard the table would be rewritten without gaining any column.
        print("No columns to convert; table left unchanged.")
        return

    try:
        client = bigquery.Client()

        # Source and destination are the same table: the rewrite happens in place.
        full_table_name = f"{project_id}.{dataset_name}.{table_name}"
        new_table_name = full_table_name

        # Backticks keep column names that collide with SQL keywords valid.
        utc_columns = [
            f"TIMESTAMP(`{col}`, 'America/Los_Angeles') AS `{col}_utc`"
            for col in columns
        ]

        query = f"""
            CREATE OR REPLACE TABLE `{new_table_name}` AS
            SELECT *, {', '.join(utc_columns)}
            FROM `{full_table_name}`;
        """

        print("Executing query:\n", query)

        # Run the query
        job = client.query(query)
        job.result()  # Wait for completion

        print(f"✔ Successfully updated table `{new_table_name}` with UTC timestamps.")

    except Exception as e:
        print(f"❌ Error converting timestamps: {e}")


In [46]:
# Add *_utc columns to every table of `dataset_name`. The tables are rewritten
# in place.
for table in table_names:
    print(table)

    datetimecolumns=all_timestamp_columns(table_name=table,dataset_name=dataset_name,project_id=project_id)
    print(datetimecolumns)

    # all_timestamp_columns returns None when the schema lookup fails and []
    # when the table has no TIMESTAMP/DATETIME columns; skip both.
    if not datetimecolumns:
        continue

    # The generated *_utc columns are TIMESTAMP typed, so a re-run lists them
    # too. Skip them, and skip sources that already have a *_utc counterpart,
    # so running this cell again does not try to convert anything twice.
    pending_columns = [
        col for col in datetimecolumns
        if not col.endswith('_utc') and f"{col}_utc" not in datetimecolumns
    ]
    if pending_columns:
        convert_la_to_utc(columns=pending_columns,table_name=table,dataset_name=dataset_name,project_id=project_id)

adt
['effective_time_jittered', 'event_time_jittered']
Executing query:
 
            CREATE OR REPLACE TABLE `som-nero-phi-jonc101.shc_core_2024.adt` AS
            SELECT *, TIMESTAMP(effective_time_jittered, 'America/Los_Angeles') AS effective_time_jittered_utc, TIMESTAMP(event_time_jittered, 'America/Los_Angeles') AS event_time_jittered_utc
            FROM `som-nero-phi-jonc101.shc_core_2024.adt`;
        
✔ Successfully updated table `som-nero-phi-jonc101.shc_core_2024.adt` with UTC timestamps.
****************************************************
alert
['update_date_jittered']
Executing query:
 
            CREATE OR REPLACE TABLE `som-nero-phi-jonc101.shc_core_2024.alert` AS
            SELECT *, TIMESTAMP(update_date_jittered, 'America/Los_Angeles') AS update_date_jittered_utc
            FROM `som-nero-phi-jonc101.shc_core_2024.alert`;
        
✔ Successfully updated table `som-nero-phi-jonc101.shc_core_2024.alert` with UTC timestamps.
*****************************************