In [None]:
from datetime import datetime, timedelta
import time

import os
import shutil
import subprocess

import paramiko

import re
import shlex
import glob

import sqlparse

from jinja2 import Environment, FileSystemLoader

import json
import pandas as pd

from google.cloud import storage
from google.cloud import bigquery_migration_v2
from google.cloud import bigquery

import logging

In [None]:
def read_contents_file(file_path, encoding_scheme='cp1252', read_lines=False):
    """Read a text file and return its contents.

    Args:
        file_path: Path of the file to read.
        encoding_scheme: Encoding used to decode the file (default ``'cp1252'``).
        read_lines: When True, return the result of ``readlines()`` (a list of
            lines with their line endings kept); otherwise return the whole
            file as a single string.

    Returns:
        ``list[str]`` when ``read_lines`` is True, else ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content cannot be decoded with
            ``encoding_scheme``.
    """
    # The context manager closes the handle even when decoding raises; the
    # previous implementation leaked the open file object.
    with open(file_path, 'r', encoding=encoding_scheme) as inputfile:
        if read_lines:
            return inputfile.readlines()
        return inputfile.read()

In [None]:
#Write file to local directory
def write_file_local(path,file_data,is_list=False):
    """Write text to ``path``, replacing any existing file.

    Args:
        path: Destination file path.
        file_data: Either a string, or (when ``is_list`` is true) an iterable
            of strings.
        is_list: When true, the items of ``file_data`` are joined with a
            newline character before being written.
    """
    with open(path, 'w') as out_handle:
        payload = '\n'.join(file_data) if is_list else file_data
        out_handle.write(payload)

In [None]:
def sftp_download_bteqs(hostname, username, password, remotedirectory, localdirectory, srcsysname, file_prefix, add_source_dir_prefix):
    """Download ``.sql``, ``.prm`` and ``.sh`` files from a remote directory over SFTP.

    Args:
        hostname: Host to connect to.
        username: SSH user name.
        password: SSH password.
        remotedirectory: Remote directory whose entries are listed.
        localdirectory: Local directory that receives the files.
        srcsysname: Prefix prepended (followed by ``_``) to each local file
            name when ``add_source_dir_prefix`` is ``"Yes"``.
        file_prefix: When not blank, only remote files whose name starts with
            this prefix are downloaded.
        add_source_dir_prefix: ``"Yes"`` to prefix local names with
            ``srcsysname``; any other value keeps the remote name.

    Connection-level errors are printed, not raised. A failure on a single
    file is printed and the remaining files are still attempted.
    """
    import posixpath  # remote SFTP paths use '/' regardless of the local OS

    ssh_client = None
    sftp_client = None
    try:
        ssh_client = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification; confirm this is acceptable for the target hosts.
        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh_client.connect(hostname=hostname, username=username, password=password)
        sftp_client = ssh_client.open_sftp()

        for remote_file in sftp_client.listdir(path=remotedirectory):
            if not remote_file.endswith(('.sql', '.prm', '.sh')):
                continue
            # A blank prefix means "no filtering".
            if file_prefix.strip() != "" and not remote_file.startswith(file_prefix):
                continue

            file_name = os.path.basename(remote_file)
            if add_source_dir_prefix == "Yes":
                local_file_path = os.path.join(localdirectory, srcsysname + '_' + file_name)
            else:
                local_file_path = os.path.join(localdirectory, file_name)

            # os.path.join would insert a backslash on Windows when the
            # directory has no trailing '/', producing an invalid remote path.
            remote_file_path = posixpath.join(remotedirectory, file_name)
            try:
                sftp_client.get(remote_file_path, local_file_path)
            except Exception as download_error:
                # Best effort per file, but report why it failed.
                print("Error downloading file {}: {}".format(remote_file_path, download_error))

    except paramiko.AuthenticationException:
        print("Authentication failed. Please check your credentials.")
    except paramiko.SSHException:
        print("An error occurred while establishing an SSH connection.")
    except Exception as e:
        print("Error: {}".format(str(e)))
    finally:
        # Release the SFTP channel and the SSH transport on every path.
        if sftp_client is not None:
            sftp_client.close()
        if ssh_client is not None:
            ssh_client.close()

In [None]:
def sftp_download_bteqs_multiple_sources(hostname, username, password, unix_etl_folder, unix_lob, localdirectory, lob_sub_folders, file_prefix="", add_source_dir_prefix="No", parmfolder_before_lob_subfolder="No"):
    """Recreate ``localdirectory`` and download parameter files for several source folders.

    The remote base folder is ``unix_etl_folder`` immediately followed by
    ``unix_lob``. For each entry of ``lob_sub_folders`` the remote directory is:

    * ``<base>/ParmFiles/`` when the entry is literally ``'ParmFiles'``
      (files are then prefixed with ``unix_lob`` if prefixing is enabled);
    * ``<base>/ParmFiles/<entry>/`` when ``parmfolder_before_lob_subfolder``
      is ``'Yes'``;
    * ``<base>/<entry>/ParmFiles/`` otherwise.

    Every download is delegated to ``sftp_download_bteqs``.
    """
    # Always start from an empty local folder.
    if os.path.exists(localdirectory):
        shutil.rmtree(localdirectory, ignore_errors=True)
    os.makedirs(localdirectory)
    print("Created folder : {}".format(localdirectory))

    lobbasefolder = f"{unix_etl_folder}{unix_lob}"
    print(lobbasefolder)

    for srcfolder in lob_sub_folders:
        if srcfolder == 'ParmFiles':
            remote_dir = f"{lobbasefolder}/{srcfolder}/"
            name_prefix = unix_lob
        elif parmfolder_before_lob_subfolder == 'Yes':
            remote_dir = f"{lobbasefolder}/ParmFiles/{srcfolder}/"
            name_prefix = srcfolder
        else:
            remote_dir = f"{lobbasefolder}/{srcfolder}/ParmFiles/"
            name_prefix = srcfolder

        sftp_download_bteqs(hostname, username, password, remote_dir, localdirectory, name_prefix, file_prefix, add_source_dir_prefix)

    print("Copied Shell Files") 

In [None]:
def capture_variables_and_gen_td_sql_files(parmfile_path, conversion_source_path_folder, sub_folder_list):
    """Parse every shell parameter file in ``parmfile_path`` and write out the SQL it defines.

    For each file the shell variable assignments are collected with ``shlex``
    (comments ignored, ``export `` removed). ``$JOBNAME`` and the pairs in
    ``pre_process_find_replace_list`` are substituted case-insensitively into
    the expected/actual SQL, the job is classified, and the SQL text is
    written to the expected and/or actual conversion folders.

    Reads these module-level names: ``core_views_list``,
    ``source_dataset_list``, ``staging_dataset_list``, ``job_var``,
    ``tol_percent_var``, ``act_sql_var``, ``exp_sql_var``,
    ``pre_process_find_replace_list``, ``conversion_source_exp_path_folder``,
    ``conversion_source_act_path_folder`` and ``output_file_extension``.

    Args:
        parmfile_path: Directory containing the parameter files.
        conversion_source_path_folder: Parent directory under which each
            entry of ``sub_folder_list`` is recreated empty.
        sub_folder_list: Sub-folder names to recreate.

    Returns:
        A list of dicts with keys ``job_name``, ``filename``,
        ``tolerance_percent``, ``process``, ``status``, ``staging_table`` and
        ``col_count`` (always ``'0'`` here), or ``None`` if any error
        occurred (the error and the current file name are printed).
    """
    # Bound up front so the error handler can always report it, even when the
    # failure happens before the first file is read.
    filename = None
    try:
        print("Read Shell Files") 

        control_rec_list = []

        # Raw-string fragments avoid the invalid "\." escape warnings.
        regexp_edw = re.compile(r"(edw)", re.IGNORECASE)
        regexp_core = re.compile(r"(" + '|'.join(core_views_list) + r")\.", re.IGNORECASE)
        regexp_staging = re.compile(r"(" + '|'.join(source_dataset_list) + r")\.", re.IGNORECASE)

        def _apply_substitutions(sql_text, job_name):
            # $JOBNAME first, then the configured pairs, all case-insensitive.
            for findstring, replacestring in [("$JOBNAME", job_name)] + list(pre_process_find_replace_list):
                pattern = re.compile(re.escape(findstring), re.IGNORECASE)
                sql_text = pattern.sub(replacestring, sql_text)
            return sql_text

        def _normalise(sql_text):
            # Turn CR/LF/TAB into spaces, collapse space runs, lower-case, trim.
            return re.sub(" +", " ", re.sub("[\r]|[\n]|[\t]", " ", sql_text)).lower().strip()

        for folder in sub_folder_list:
            sub_folder = f"{conversion_source_path_folder}\\{folder}"
            if os.path.exists(sub_folder):
                shutil.rmtree(sub_folder, ignore_errors=True)
            os.makedirs(sub_folder)
            print("Created folder : {}".format(sub_folder))

        ########################## CAPTURE EXPECTED & ACTUAL VALIDATION SQLs ##########################

        for filename in os.listdir(parmfile_path):

            file_relative_path = os.path.join(parmfile_path, filename)
            validation_sql_contents = read_contents_file(file_relative_path)

            # Output name: lower-cased stem with a trailing "_prm" removed.
            filename = filename.lower().split('.')[0]
            if filename.endswith("_prm"):
                filename = filename[:-len("_prm")]

            # Remove empty lines
            validation_sql_contents = os.linesep.join([s for s in validation_sql_contents.splitlines() if s])

            # Remove export keywords as they won't be parsed by shlex
            processed = validation_sql_contents.replace('export ', '')

            # Parse the shell script: each token is split at its first '='
            # into variable name and value; comments are ignored.
            shell_vars = dict()
            for line in shlex.split(processed, comments=True):
                var, _, var_value = line.partition('=')
                shell_vars[var] = var_value

            job_name = shell_vars.get(job_var, "")
            tolerance_percent = shell_vars.get(tol_percent_var, '0')

            for sql_var in (act_sql_var, exp_sql_var):
                if sql_var in shell_vars:
                    shell_vars[sql_var] = _apply_substitutions(shell_vars[sql_var], job_name)

            # Identify process type and capture normalised SQLs
            source_sql = ""
            target_sql = ""

            if exp_sql_var in shell_vars and act_sql_var in shell_vars:
                source_sql = _normalise(shell_vars[exp_sql_var])
                target_sql = _normalise(shell_vars[act_sql_var])

                if regexp_core.search(target_sql):
                    process = "Integration"
                else:
                    process = "Source Table Ingestion"
            else:
                process = "Source File Ingestion"
                if act_sql_var in shell_vars:
                    target_sql = _normalise(shell_vars[act_sql_var])

            # Create SQL files with the Teradata SQL script for conversion
            staging_table = ""
            if regexp_staging.search(source_sql) and regexp_edw.search(target_sql):
                status = "Create Template"
                conversion_source_exp_file_path = f"{conversion_source_exp_path_folder}\\{filename}{output_file_extension}"
                write_file_local(conversion_source_exp_file_path, shell_vars[exp_sql_var])

                # First "<staging dataset>.<table>" reference in the expected SQL.
                regexp_subquery = re.compile(r"(?:" + '|'.join(staging_dataset_list) + r")\.([a-z|0-9|_]+)\s*", re.IGNORECASE | re.DOTALL)
                staging_table_find = regexp_subquery.search(shell_vars[exp_sql_var])
                if staging_table_find is not None:
                    staging_table = staging_table_find.groups()[0].lower()

                conversion_source_act_file_path = f"{conversion_source_act_path_folder}\\{filename}{output_file_extension}"
                write_file_local(conversion_source_act_file_path, shell_vars[act_sql_var])
            # A compiled pattern's search() takes (string, pos, endpos). The
            # old call passed re.IGNORECASE as pos and skipped the first two
            # characters; the flag is already part of regexp_edw.
            elif regexp_edw.search(target_sql):
                status = "Convert Target"
                conversion_source_act_file_path = f"{conversion_source_act_path_folder}\\{filename}{output_file_extension}"
                write_file_local(conversion_source_act_file_path, shell_vars[act_sql_var])
            else:
                status = "Ignore All"

            control_rec = {'job_name':job_name,'filename':filename,'tolerance_percent':tolerance_percent,'process':process,'status':status,'staging_table':staging_table,'col_count':'0'}
            control_rec_list.append(control_rec)

        print("Created Expected and Actual Files") 

        return control_rec_list

    except Exception as e1:
        # Best-effort behaviour kept: report and return None.
        print(e1)
        print("File Name : {}".format(filename))
        return None

In [None]:
def scan_file_for_shellvariables(file_path, variable_set):
    """Collect ``$NAME``-style tokens from a file into ``variable_set``.

    Lines whose first non-blank characters are ``--`` are skipped. On every
    other line, each ``$`` followed by a letter or underscore and then any
    letters, digits or underscores is added to ``variable_set`` (mutated in
    place). Any exception is printed, not raised.
    """
    shell_var_regex = r'\$[a-zA-Z_][a-zA-Z0-9_]*'
    try:
        with open(file_path, 'r') as sql_file:
            for raw_line in sql_file:
                if raw_line.strip().startswith('--'):
                    continue
                variable_set.update(re.findall(shell_var_regex, raw_line))
    except Exception as e:
        print(f"Error reading file '{file_path}': {str(e)}")

In [None]:
def preprocess_source_string(formatted_text):
    """Rewrite a ``SELECT ... AS SOURCE_STRING`` concatenation into a simpler column list.

    Runs of spaces are collapsed first. If the text contains
    ``SELECT <expr> AS SOURCE_STRING`` on a single line (case-insensitive),
    ``<expr>`` is split on ``||``, the ``','`` separator literals are dropped,
    the first remaining element is discarded, and
    ``CAST(x AS VARCHAR(n))`` wrappers are reduced to ``x``. The clause is
    then replaced according to the module-level ``template_version``:

    * ``validation_script_template_v2.j2``:
      ``SELECT CONCAT(c1, c2, ...) AS SOURCE_STRING``
    * ``validation_script_template_v1.j2``: ``SELECT c1, c2, ...``

    Args:
        formatted_text: SQL text to process.

    Returns:
        Tuple ``(text, col_count)`` where ``col_count`` is the number of
        column expressions found (0 when the clause is absent).

    Raises:
        ValueError: If the clause is present and ``template_version`` is not
            one of the two supported template names.
    """
    regexp_source_string = re.compile(r"SELECT (.*) AS SOURCE_STRING", re.IGNORECASE)
    regexp_cast_varchar = re.compile(r"(.*)CAST[ ]*\([ ]*(.*)[ ]+AS VARCHAR[ ]*\([0-9]*\)[ ]*\)(.*)", re.IGNORECASE)

    formatted_text = re.sub(r"[ ]+", " ", formatted_text)

    source_string_cols = regexp_source_string.search(formatted_text)
    if source_string_cols is None:
        return formatted_text, 0

    column_expression_split = [
        value for value in source_string_cols.group(1).split("||")
        if value.strip() != "','"
    ]
    # Drop the first element (the job). Slicing, unlike pop(0), is safe when
    # nothing is left after the separators were removed.
    column_expression_split = column_expression_split[1:]
    col_count = len(column_expression_split)

    column_expr_list = []
    for sql_expr in column_expression_split:
        cast_match = regexp_cast_varchar.match(sql_expr)
        if cast_match is not None:
            # Keep the text around the CAST and the cast operand itself.
            sql_expr = ''.join(cast_match.groups())
        column_expr_list.append(sql_expr.strip())

    if template_version == "validation_script_template_v2.j2":
        source_string_parsed = "SELECT CONCAT(" + ', '.join(column_expr_list) + ") AS SOURCE_STRING"
    elif template_version == "validation_script_template_v1.j2":
        source_string_parsed = "SELECT " + ', '.join(column_expr_list)
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError(f"Unsupported template_version: {template_version!r}")

    formatted_text_min = formatted_text.replace(source_string_cols.group().strip(), source_string_parsed)

    return formatted_text_min, col_count

In [None]:
def preprocess_sql_file(source_file_path, copy_file_path, pre_process_find_replace_list):
    """Clean one Teradata SQL file, reformat it and write the result.

    Per line: apply the case-insensitive literal find/replace pairs, reduce
    ``CAST(x AS VARCHAR(n))`` to ``x``, and turn lines that contain ``EOF``,
    start with ``#`` or start with ``locking`` into ``--`` comments. The
    combined text is formatted with ``sqlparse``, passed through
    ``preprocess_source_string`` and written to ``copy_file_path``.

    Args:
        source_file_path: File to read.
        copy_file_path: File to write.
        pre_process_find_replace_list: Iterable of ``(find, replace)`` pairs.

    Returns:
        The column count reported by ``preprocess_source_string``.

    Raises:
        UnicodeDecodeError: If the file cannot be decoded with any of the
            preferred encodings.
    """
    preferred_encodings = ["cp1252"]
    lines = None
    last_decode_error = None
    for encoding in preferred_encodings:
        try:
            lines = read_contents_file(source_file_path, encoding, True)
            break  # stop at the first encoding that decodes cleanly
        except UnicodeDecodeError as e:
            print(f"UnicodeDecodeError with {encoding}: {e}")
            last_decode_error = e
    if lines is None:
        # Previously this fell through to an unbound `lines` further down.
        raise last_decode_error

    # Compile once instead of once per line.
    compiled_replacements = [
        (re.compile(re.escape(findstring), re.IGNORECASE), replacestring)
        for findstring, replacestring in pre_process_find_replace_list
    ]
    regexp_cast_varchar = re.compile(r"(.*)CAST[ ]*\([ ]*(.*)[ ]+AS VARCHAR[ ]*\([0-9]*\)[ ]*\)(.*)", re.IGNORECASE)

    processed_lines = []
    for line in lines:
        for pattern, replacestring in compiled_replacements:
            line = pattern.sub(replacestring, line)

        cast_match = regexp_cast_varchar.match(line)
        if cast_match is not None:
            # '.' never matches the newline, so the groups exclude the line
            # terminator; put it back so the next line is not glued on
            # (e.g. "... AS y" followed by "FROM t" became "yFROM t").
            line_ending = "\n" if line.endswith("\n") else ""
            line = ''.join(cast_match.groups()) + line_ending

        if "EOF" in line \
                or line.strip().startswith('#') \
                or line.lower().strip().startswith('locking'):
            line = f"--{line}"

        processed_lines.append(line)

    sqltext = ''.join(processed_lines)

    formattedtext = sqlparse.format(sqltext, reindent=True, keyword_case='upper', strip_comments=False)

    formattedtext, col_count = preprocess_source_string(formattedtext)

    write_file_local(copy_file_path, formattedtext)

    return col_count

In [None]:
def preprocess_sqls(conversion_source_path_folder, preprocessed_tdsqlfiles_path, sub_folder_list, control_rec_list):
    """Stage metadata and pre-process every extracted SQL file.

    Steps:
      1. Recreate each sub-folder under ``preprocessed_tdsqlfiles_path``.
      2. Copy the metadata zip files (``metadatazip_file_list``) and every
         ``*.yaml`` file from ``metadata_folder`` into
         ``preprocessed_tdsqlfiles_path``.
      3. For each ``.sql`` file in each source sub-folder, collect shell
         variables, run ``preprocess_sql_file`` and store the returned column
         count on the matching control record.

    Reads ``metadatazip_file_list``, ``metadata_folder`` and
    ``pre_process_find_replace_list`` from module scope.

    Args:
        conversion_source_path_folder: Parent of the source sub-folders.
        preprocessed_tdsqlfiles_path: Parent of the output sub-folders.
        sub_folder_list: Sub-folder names to process.
        control_rec_list: List of control-record dicts; ``col_count`` is
            updated in place for files that have a matching ``filename``.

    Returns:
        The same ``control_rec_list`` object.

    Raises:
        FileNotFoundError: If a listed metadata zip file does not exist.
    """
    print("Pre Processing has begun") 

    for folder in sub_folder_list:
        sub_folder = f"{preprocessed_tdsqlfiles_path}\\{folder}"
        if os.path.exists(sub_folder):
            shutil.rmtree(sub_folder, ignore_errors=True)
        os.makedirs(sub_folder)
        print("Created folder : {}".format(sub_folder))

    for zip_file in metadatazip_file_list:
        metadatazip_file = os.path.join(metadata_folder, zip_file)

        if not os.path.isfile(metadatazip_file):
            raise FileNotFoundError(f"Metadata Zip file '{metadatazip_file}' not found.")
        shutil.copy2(metadatazip_file, preprocessed_tdsqlfiles_path)
        print("Copied metadatazip_file {}".format(metadatazip_file))

    yaml_file_list = glob.glob(os.path.join(metadata_folder  , '*.yaml' ))

    for yaml_file_path in yaml_file_list:
        filename = os.path.basename(yaml_file_path)
        destination_path = os.path.join(preprocessed_tdsqlfiles_path, filename)
        shutil.copy2(yaml_file_path, destination_path)
        print("Copied yaml file {}".format(filename))

    print("Pre Processing is completed") 

    # Set holding every shell variable seen in the SQL files.
    shell_variables = set()

    for folder in sub_folder_list:
        conversion_source_sub_path_folder = f"{conversion_source_path_folder}\\{folder}"
        for file in os.listdir(conversion_source_sub_path_folder):
            if not file.endswith('.sql'):
                continue
            conversion_source_file_path = os.path.join(conversion_source_sub_path_folder, file)
            scan_file_for_shellvariables(conversion_source_file_path, shell_variables)
            preprocess_file_path = os.path.join(preprocessed_tdsqlfiles_path, folder, file).lower()
            col_count = preprocess_sql_file(conversion_source_file_path, preprocess_file_path, pre_process_find_replace_list)

            file_stem = file.split(".")[0].lower()
            idx = next((idx for idx, item in enumerate(control_rec_list) if item["filename"].lower() == file_stem), None)
            if idx is None:
                # Indexing with None used to raise TypeError here.
                print(f"No control record found for {file}; column count not recorded")
            else:
                control_rec_list[idx]['col_count'] = str(col_count)

    print("Printing any shell variables found in sqls")
    for variable in shell_variables:
        print(variable)

    return control_rec_list

In [None]:
def bqms_translator(bqms_in_folder, convertedsql_path, sub_folder_list, bucket_name, gcsfolder, run_time, v_mode):
    """Translate Teradata SQL to BigQuery SQL with the BigQuery Migration Service.

    Uploads ``bqms_in_folder`` to
    ``gs://<bucket_name>/<gcsfolder>/<run_time>/input`` with ``gsutil``,
    creates a translation workflow, polls it every 5 seconds until it is
    COMPLETED, then downloads the ``*.sql`` results of each sub-folder into
    ``convertedsql_path``.

    Reads ``translation_project_id``, ``project_id`` and ``lob_datasets``
    from module scope.

    Args:
        bqms_in_folder: Local folder with the pre-processed input files.
        convertedsql_path: Local parent folder for the translated files; each
            sub-folder is recreated empty.
        sub_folder_list: Sub-folder names to download.
        bucket_name: GCS bucket used for staging.
        gcsfolder: Folder inside the bucket.
        run_time: Run identifier used in the GCS paths and workflow name.
        v_mode: ``'BTEQ'`` or ``'SQL'``.

    Raises:
        Exception: If ``v_mode`` is neither ``'BTEQ'`` nor ``'SQL'``.

    gsutil failures are printed, not raised. The polling loop has no timeout.
    """
    # NOTE(review): neither object is used afterwards; both are kept so that
    # credential and bucket-name problems still surface at this point.
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    gcs_input_path = gcsfolder + '/' + run_time + '/input' 
    gcs_converted_path = gcsfolder + '/' +   run_time + '/output'
    translation_display_name = 'Validation_Conversion_' + run_time

    print("Running {} BQMS Translation {}".format(v_mode, translation_display_name))

    print("Uploading Preprocessed files")

    command = 'gsutil -m cp -r ' + bqms_in_folder + '\\ gs://' + bucket_name  + '/' +  gcs_input_path 

    try:
        completed_process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        print(completed_process.stdout)
    except subprocess.CalledProcessError as e:
        print("Error:", e)
        print(e.output)
        # stderr is captured separately and was previously discarded.
        print(e.stderr)

    print("Begin: All Objects Uploaded, Now Create a Migration Workflow") 

    parent = f"projects/{translation_project_id}/locations/us"

    client = bigquery_migration_v2.MigrationServiceClient()
    source_dialect = bigquery_migration_v2.Dialect()

    if v_mode == 'BTEQ':
        source_dialect.teradata_dialect = bigquery_migration_v2.TeradataDialect(
            mode=bigquery_migration_v2.TeradataDialect.Mode.BTEQ
        )
    elif v_mode == 'SQL':
        source_dialect.teradata_dialect = bigquery_migration_v2.TeradataDialect(
            mode=bigquery_migration_v2.TeradataDialect.Mode.SQL
        )
    else:
        raise Exception(f"Unknown Mode {v_mode}.")
    target_dialect = bigquery_migration_v2.Dialect()
    target_dialect.bigquery_dialect = bigquery_migration_v2.BigQueryDialect()

    translation_config = bigquery_migration_v2.TranslationConfigDetails(
        gcs_source_path="gs://" + bucket_name + '/' + gcs_input_path,
        gcs_target_path="gs://" + bucket_name + '/' + gcs_converted_path,
        source_dialect=source_dialect,
        target_dialect=target_dialect,
        source_env=bigquery_migration_v2.SourceEnv(default_database=project_id, schema_search_path=lob_datasets)
    )

    migration_task = bigquery_migration_v2.MigrationTask(
        type_="Translation_Teradata2BQ", translation_config_details=translation_config
    )

    workflow = bigquery_migration_v2.MigrationWorkflow(
        display_name=translation_display_name
    )

    workflow.tasks["translation-task"] = migration_task  

    request = bigquery_migration_v2.CreateMigrationWorkflowRequest(
        parent=parent,
        migration_workflow=workflow,
    )
    response = client.create_migration_workflow(request=request)
    print("State : " + str(response.state))

    # Compare against the enum member. str() of the state is "4" on some
    # Python versions and "State.COMPLETED" on others, so the old string
    # comparison depended on the interpreter.
    completed_state = bigquery_migration_v2.MigrationWorkflow.State.COMPLETED
    while response.state != completed_state:
        time.sleep(5)
        response = client.get_migration_workflow(name=response.name)  

    print("Migration Workflow is completed")

    for folder in sub_folder_list:
        sub_folder = f"{convertedsql_path}\\{folder}"
        if os.path.exists(sub_folder):
            shutil.rmtree(sub_folder, ignore_errors=True)
        os.makedirs(sub_folder)
        print("Created folder : {}".format(sub_folder))

        command = f'gsutil -m cp gs://{bucket_name}/{gcs_converted_path}/{folder}/*.sql {convertedsql_path}\\{folder}'

        print("Downloaded Translated files from {}".format(f"{bucket_name}/{gcs_converted_path}/{folder}"))

        try:
            completed_process = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            print(completed_process.stdout)
        except subprocess.CalledProcessError as e:
            print("Error:", e)
            print(e.output)
            print(e.stderr)

In [None]:
def postprocess_bqsqls(convertedsql_path, postprocess_sql_path, sub_folder_list):
    """Apply post-translation fixes to every converted ``.sql`` file.

    For each sub-folder the output folder is recreated, then each file is
    read, run through the literal rules (``post_process_find_replace_list``)
    and the case-insensitive regex rules
    (``post_process_regex_find_replace_list``), formatted with ``sqlparse``,
    run through both rule sets again, stripped of leading/trailing spaces and
    semicolons, and written under ``postprocess_sql_path`` with a
    lower-cased path.
    """

    def _run_rules(sql_text):
        # Literal replacements first, then the regex replacements.
        for literal_rule in post_process_find_replace_list:
            sql_text = sql_text.replace(literal_rule["search"], literal_rule['replace'])
        for regex_rule in post_process_regex_find_replace_list:
            compiled = re.compile(regex_rule["search"], re.IGNORECASE)
            sql_text = re.sub(compiled, regex_rule["replace"], sql_text)
        return sql_text

    print("Post Processing has begun") 

    for folder in sub_folder_list:
        sub_folder = f"{postprocess_sql_path}\\{folder}"
        if os.path.exists(sub_folder):
            shutil.rmtree(sub_folder, ignore_errors=True)
        os.makedirs(sub_folder)
        print("Created folder : {}".format(sub_folder))

        sql_names = [name for name in os.listdir(f"{convertedsql_path}\\{folder}") if name.endswith(".sql")]
        for filename in sql_names:
            source_file_path = os.path.join(convertedsql_path, folder, filename).lower()

            converted_sql = _run_rules(read_contents_file(source_file_path))

            formattedsql = sqlparse.format(converted_sql.strip(), reindent=True, keyword_case='upper')

            # Second pass over the re-formatted text.
            formattedsql = _run_rules(formattedsql).strip(' ;')

            copy_file_path = os.path.join(postprocess_sql_path, folder, filename).lower()
            write_file_local(copy_file_path,formattedsql)

    print("Post Processing is completed") 

In [None]:
def execute_sql(postprocess_sql_path, sub_folder_list):
    """Run every ``.sql`` file of each sub-folder as a BigQuery query.

    Before execution, the literal rules in
    ``execution_time_find_replace_list`` are applied to the SQL text. Each
    outcome is both logged and printed; a failing file does not stop the run.
    Uses the module-level ``project_id``.
    """
    print(f"executing SQL files in  {postprocess_sql_path}" )
    bq_client = bigquery.Client(project=project_id)

    for folder in sub_folder_list:
        sub_folder = f"{postprocess_sql_path}\\{folder}"
        for file_name in os.listdir(sub_folder):
            if not file_name.endswith('.sql'):
                continue

            sql_query = read_contents_file(os.path.join(sub_folder, file_name))
            for rule in execution_time_find_replace_list:
                sql_query = sql_query.replace(rule["search"], rule['replace'])

            try:
                # .result() blocks until the query job finishes.
                bq_client.query(sql_query, project=project_id, location='US').result()
                success_message = f"SQL file {file_name} executed successfully."
                logging.info(success_message)
                print(success_message)
            except Exception as e:
                failure_message = f"Error executing SQL file {file_name}: {e}"
                logging.error(failure_message)
                print(failure_message)

In [None]:
def print_execution_results(logfilename):
    """Summarise an execution log: print failing lines and overall counts.

    Only lines containing ``"SQL file"`` are counted. A line is a success if
    it contains ``"executed successfully"``; every other counted line is a
    failure and is printed. Failures containing ``"Not found"`` are tallied
    as missing database objects, and those containing ``"Duplicates"`` or
    ``"at most one source row"`` as duplicate-data issues. The totals are
    printed and also written with ``logging.info``.
    """
    counts = {"total": 0, "success": 0, "failure": 0, "missing": 0, "dups": 0}

    try:
        with open(logfilename, "r") as log_handle:
            print("=========Errors================")
            for entry in log_handle:
                if "SQL file" not in entry:
                    continue
                counts["total"] += 1

                if "executed successfully" in entry:
                    counts["success"] += 1
                    continue

                # Everything else is a failure; sub-classify where possible.
                if "Not found" in entry:
                    counts["missing"] += 1
                elif "Duplicates" in entry or "at most one source row" in entry:
                    counts["dups"] += 1
                counts["failure"] += 1
                print(entry.strip())

        total_lines = counts["total"]
        success_lines = counts["success"]
        failure_lines = counts["failure"]
        missing_dbobjects = counts["missing"]
        dup_data_issues = counts["dups"]

        print("=========execution_results================")
        print("Total SQL files executed:", total_lines)
        print("SQL files executed successfully:", success_lines)
        print("SQL files failed with errors:", failure_lines)
        print(" --SQL files with Missing db objects:", missing_dbobjects)
        print(" --SQL files with Dup Data Issues:", dup_data_issues)

        logging.info("=========execution_results================")
        logging.info(f"Total SQL files executed: {total_lines}")
        logging.info(f"SQL files executed successfully: {success_lines}")
        logging.info(f"SQL files failed with errors: {failure_lines}")
        logging.info(f" --SQL files with Missing db objects: {missing_dbobjects}")
        logging.info(f" --SQL files with Dup Data Issues: {dup_data_issues}")

    except FileNotFoundError:
        print(f"Error: File '{logfilename}' not found.")
    except Exception as e:
        print("An error occurred:", str(e))

In [None]:
def productionize_sqls(postprocessqls, prodready_bqsqls):
    """Produce the production-ready copy of every post-processed ``.sql`` file.

    Iterates the module-level ``sub_folder_list``. For each folder the output
    folder is recreated; each file then receives the regex rules of
    ``production_regex_find_replace_list`` whose ``query_type`` equals the
    folder name (case-insensitive), followed by the literal rules of
    ``production_parms_find_replace_list``. The result is formatted with
    ``sqlparse`` and written under ``prodready_bqsqls`` with a lower-cased path.
    """
    for folder in sub_folder_list:
        sub_folder = f"{prodready_bqsqls}\\{folder}"
        if os.path.exists(sub_folder):
            shutil.rmtree(sub_folder, ignore_errors=True)
        os.makedirs(sub_folder)
        print("Created folder : {}".format(sub_folder))

        for file in os.listdir(f"{postprocessqls}\\{folder}"):
            if not file.endswith('.sql'):
                continue

            post_process_sql = read_contents_file(os.path.join(postprocessqls, folder, file))

            # Regex rules apply only to the query type named after this folder.
            for regex_rule in production_regex_find_replace_list:
                if regex_rule["query_type"].lower() != folder.lower():
                    continue
                compiled = re.compile(regex_rule["search"], re.IGNORECASE | re.DOTALL)
                post_process_sql = re.sub(compiled, regex_rule["replace"], post_process_sql)

            for parm_rule in production_parms_find_replace_list:
                post_process_sql = post_process_sql.replace(parm_rule["search"], parm_rule["replace"])

            # Two formatting passes, as before.
            first_pass = sqlparse.format(post_process_sql.strip(), reindent=True, keyword_case='upper')
            second_pass = sqlparse.format(first_pass, reindent=True, keyword_case='upper', strip_comments=False)

            copy_file_path = os.path.join(prodready_bqsqls, folder, file).lower()
            write_file_local(copy_file_path, second_pass)
                # print("Replaced hardcoded schema names with Dag Parms  {}".format(os.path.basename(copy_file_path)))


In [None]:
def upload_to_dags(output_path_folder):
    """Copy the contents of ``output_path_folder`` to the DAG bucket with ``gsutil``.

    The destination is ``gs://`` followed by the module-level
    ``dag_bucket_path``. The command's stdout is printed on success; a
    non-zero exit status is printed, not raised.
    """
    command = ''.join(['gsutil -m cp -r ', output_path_folder, '\\*\\ gs://', dag_bucket_path])

    try:
        upload_result = subprocess.run(
            command,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
        print(upload_result.stdout)
        print("Uploaded Dags to {}".format(dag_bucket_path))
    except subprocess.CalledProcessError as e:
        print("Error:", e)
        print(e.output)

In [None]:
def gen_validation_sqls_from_template(control_rec_list):
    """Render the final validation SQL for every record whose status is ``"Create Template"``.

    The output folder is recreated, the Jinja2 template named by
    ``template_version`` is loaded from ``jinja_template_path``, and for each
    qualifying record the production-ready expected and actual SQL files are
    read, re-formatted with comments stripped, and rendered into
    ``<output_path_folder>\\<source>\\<filename><output_file_extension>``.
    ``<source>`` comes from ``config\\job_source_map.csv`` (column ``source``
    of the first row whose ``job_name`` equals the record's ``filename``; an
    empty string when there is no such row).

    Reads ``output_path_folder``, ``jinja_template_path``,
    ``template_version``, ``lob_abbr_lower``, ``prodready_exp_path_folder``,
    ``prodready_act_path_folder`` and ``output_file_extension`` from module
    scope.

    Args:
        control_rec_list: Iterable of control-record dicts with keys
            ``filename``, ``tolerance_percent``, ``process``, ``status``,
            ``staging_table`` and ``col_count``.
    """
    ############################ POST CONVERSION + TEMPLATE CREATIONS ############################

    if os.path.exists(output_path_folder):
        shutil.rmtree(output_path_folder, ignore_errors=True)
    os.makedirs(output_path_folder)
    print("Created folder : {}".format(output_path_folder))

    # Jinja2 Environment
    env = Environment(loader = FileSystemLoader(jinja_template_path), trim_blocks=True, lstrip_blocks=True)
    template = env.get_template(template_version)

    variable_prefix_by_process = {
        'Integration': "VALIDATION_CNTRLID_",
        'Source Table Ingestion': "INGEST_CNTRLID_",
    }

    # Loaded on first use and then reused; previously the CSV was re-read
    # from disk for every control record.
    df_job_source = None

    for rec in control_rec_list:
        filename = rec['filename']
        tolerance_percent = rec['tolerance_percent']
        process = rec['process']
        status = rec['status']
        staging_table = rec['staging_table']
        col_count = rec['col_count']

        if df_job_source is None:
            df_job_source = pd.read_csv("config\\job_source_map.csv", index_col=None)
        df_job_source_match = df_job_source[df_job_source["job_name"]==filename]
        if df_job_source_match.empty:
            source_per_job_name = ""
        else:
            source_per_job_name = df_job_source_match["source"].iloc[0]

        if status != "Create Template":
            continue

        # The dataset placeholders are emitted verbatim for later expansion.
        template_params = {
            "stage_dataset": '{{ params.param_' + lob_abbr_lower + '_stage_dataset_name }}',
            "target_dataset": '{{ params.param_' + lob_abbr_lower + '_core_dataset_name }}',
            "audit_dataset": '{{ params.param_' + lob_abbr_lower + '_audit_dataset_name }}',
            "tol_percent": tolerance_percent,
            "source_per_job_name": source_per_job_name,
            "staging_table_name": staging_table,
        }

        conversion_target_exp_file_path = f"{prodready_exp_path_folder}\\{filename}{output_file_extension}"
        conv_source_sql = read_contents_file(conversion_target_exp_file_path)
        template_params["src_query"] = sqlparse.format(conv_source_sql, reindent=True, keyword_case='upper', strip_comments=True)

        conversion_target_act_file_path = f"{prodready_act_path_folder}\\{filename}{output_file_extension}"
        conv_target_sql = read_contents_file(conversion_target_act_file_path)
        template_params["tgt_query"] = sqlparse.format(conv_target_sql, reindent=True, keyword_case='upper', strip_comments=True)

        if template_version == "validation_script_template_v2.j2":
            # Other process values get no variable_prefix, as before.
            if process in variable_prefix_by_process:
                template_params["variable_prefix"] = variable_prefix_by_process[process]
        elif template_version == "validation_script_template_v1.j2":
            # One control id per source column, numbered from 1.
            template_params["control_ids"] = [{"id": count_id} for count_id in range(1, int(col_count) + 1)]

        os.makedirs(f"{output_path_folder}\\{source_per_job_name}", exist_ok=True)
        output_filename_path = f"{output_path_folder}\\{source_per_job_name}\\{filename}{output_file_extension}"
        write_file_local(output_filename_path, template.render(template_params) + "\n")

In [None]:
def gen_control_file(control_rec_list):
    """Write the control records to the control file, one '^'-delimited line per record.

    Args:
        control_rec_list: iterable of dicts; the values of each dict are joined
            with '^' in the dict's own order. The values must already be
            strings, because ``str.join`` raises ``TypeError`` otherwise.

    Side effects:
        Creates ``output_controlpath_folder`` when it does not exist yet and
        writes ``output_control_file_path`` (both are module-level settings).
    """
    folder_missing = not os.path.exists(output_controlpath_folder)
    if folder_missing:
        os.makedirs(output_controlpath_folder)
        print("Created folder {}".format(output_controlpath_folder))

    # Collapse every record into a single caret-separated line.
    flattened_lines = []
    for control_rec in control_rec_list:
        flattened_lines.append('^'.join(control_rec.values()))

    # is_list=True makes write_file_local join the lines with newlines.
    write_file_local(output_control_file_path, flattened_lines, True)
    print("Wrote Control File {}".format(output_control_file_path))

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: loads config/lob_config.json and derives every folder,
# file name and setting the rest of the notebook reads as module-level names.
# Path separators are written as an escaped backslash ("\\"); a bare "\{"
# inside an f-string is an invalid escape sequence (SyntaxWarning on
# Python 3.12+), although it produces the same text.
# ---------------------------------------------------------------------------

# Start timestamp; gen_bq_validation_sql() reports elapsed time relative to it.
dt1 = datetime.now()
# run_time = (dt1).strftime('%Y%m%d_%H%M')
# NOTE(review): run_time is pinned to a fixed stamp, presumably so the
# run-stamped folders below point at an earlier run -- restore the line above
# to stamp a fresh run.
run_time = "20240906_0352"

with open('config/lob_config.json') as json_lob_config:
    config = json.load(json_lob_config)

lob = config['lob']
lob_abbr = config['lob_abbr'] # lobname in BQMS Script

# Normalised (trimmed) upper/lower-case variants used to build names and paths.
lob_lower = lob.strip().lower()
lob_upper = lob.strip().upper()
lob_abbr_lower = lob_abbr.strip().lower()
lob_abbr_upper = lob_abbr.strip().upper()

parmfiles_folder = config['parmfiles_path']
parmfiles_path_folder = f"{parmfiles_folder}\\{lob_abbr_lower}"

output_folder = config['output_folder']
output_path_folder = f"{output_folder}\\{lob_abbr_lower}\\{run_time}"

output_control_folder = config['output_control_folder']
output_controlpath_folder = f"{output_control_folder}\\{lob_abbr_lower}"
output_control_file_path = f"{output_controlpath_folder}\\{lob_lower}_{run_time}.csv"

log_folder = config['log_folder']
log_path_folder = f"{log_folder}\\{lob_abbr_lower}"

# Make sure the log folder exists before logging opens a file inside it.
os.makedirs(log_path_folder, exist_ok=True)

logfilename = lob_abbr_lower + '_bqsqlexecution_' + run_time + '.log'
logfilenamepath = os.path.join(log_path_folder, logfilename)
logging.getLogger().setLevel(logging.INFO)
logging.basicConfig(filename=logfilenamepath, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

expected_folder = config['expected_folder']
actual_folder = config['actual_folder']
sub_folder_list = config['sub_folder_list']

conversion_source_folder = config['conversion_source_folder']
conversion_source_path_folder = f"{conversion_source_folder}\\{lob_abbr_lower}\\{run_time}"
conversion_source_exp_path_folder = f"{conversion_source_path_folder}\\{expected_folder}"
conversion_source_act_path_folder = f"{conversion_source_path_folder}\\{actual_folder}"

conversion_target_folder = config['conversion_target_folder']
conversion_target_path_folder = f"{conversion_target_folder}\\{lob_abbr_lower}\\{run_time}"

preprocessed_tdsqlfiles = config['preprocessed_tdsqlfiles']
preprocessed_tdsqlfiles_path = f"{preprocessed_tdsqlfiles}\\{lob_abbr_lower}\\{run_time}"

postprocessed_bqsqlsfiles = config['postprocessqls']
postprocessed_bqsqlsfiles_path = f"{postprocessed_bqsqlsfiles}\\{lob_abbr_lower}\\{run_time}"

prodready_bqsqls = config['prodready_bqsqls']
prodready_bqsqls_path = f"{prodready_bqsqls}\\{lob_abbr_lower}\\{run_time}"
prodready_exp_path_folder = f"{prodready_bqsqls_path}\\{expected_folder}"
prodready_act_path_folder = f"{prodready_bqsqls_path}\\{actual_folder}"

output_file_extension = config['output_file_extension']

jinja_template_path = config['jinja_template_path']
template_version = config['template_version']

exp_sql_var = config['exp_sql_var']
act_sql_var = config['act_sql_var']
job_var = config['job_var']
tol_percent_var = config['tol_percent_var']
tol_amount_var = config['tol_amount_var']

project_id = config['project_id']
translation_project_id = config['translation_project_id']
# The functions project is the same as the main project.
fns_project_id = project_id
fns_dataset = config['fns_dataset']
bucket_name = config['bucket_name']
gcsfolder = config['gcsfolder']
dag_bucket_path = config['dag_bucket_path']

source_dataset_list = config['source_dataset_list']
staging_dataset_list = config['staging_dataset_list']
core_dataset_list = config['core_dataset_list']
core_views_list = config['core_views_list']

lob_datasets = config['lob_datasets']

# NOTE(review): the server credentials are read from the JSON config file --
# confirm that file is kept out of version control.
unix_server = config['unix_server']
username = config['username']
password = config['password']

unix_etl_folder = config['unix_etl_folder']
unix_lob = config['unix_lob']
lob_sub_folders = config['lob_sub_folders']
parmfolder_before_lob_subfolder = config['parmfolder_before_lob_subfolder']
file_prefix = config['file_prefix']
add_source_dir_prefix = config['add_source_dir_prefix']

metadata_folder = config['metadata_folder']
metadatazip_file_list = config['metadatazip_file_list']

In [None]:
    # # PBS Replacements

    # pre_process_find_replace_list = [
    #     ('$NCR_TGT_SCHEMA','edwpbs'),
    #     ('$NCR_STG_SCHEMA','edwpbs_staging'),
    #     ('$NCR_STG_TGT','edwpbs_staging')  ,
    #     ('$NCR_BASE_VIEWS','edwpbs_base_views')  ,
    #     ('$NCR_TGT','edwpbs') , 
    #     ('!=','<>') , 
    #     ("','AS", "',' AS"),
    # ]

    # post_process_find_replace_list = [
    #     {'search':'syslib.length','replace':'length'},
    #     {'search':'syslib.isnumeric','replace':'`hca-hin-dev-cur-parallon`.bqutil_fns.isnumeric'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwfs_base_views.','replace':'`hca-hin-dev-cur-parallon`.auth_base_views.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwfs.','replace':'`hca-hin-dev-cur-parallon`.auth_base_views.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_base_views.fact_rcom_pars_discrepancy','replace':'`hca-hin-dev-cur-parallon`.edwpbs_base_views.fact_rcom_pars_discrepancy'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_base_views.','replace':'`hca-hin-dev-cur-parallon`.auth_base_views.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_views.','replace':'`hca-hin-dev-cur-parallon`.auth_base_views.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf.','replace':'`hca-hin-dev-cur-parallon`.auth_base_views.'},
    # ]

    # post_process_regex_find_replace_list = [
    #     {'search':r"bqutil\.fn\.cw_td_strtok\([ ]*([.|a-z|_|0-9]+)[ ]*,[ ]*'(.)'[ ]*,[ ]*([0-9]+)\)",
    #     'replace':r"SPLIT(\1, '\2')[ORDINAL(\3)]"},
    #     {'search':r"bqutil\.fn\.", 'replace':r"`hca-hin-dev-cur-parallon`.bqutil_fns."}
    # ]

    # execution_time_find_replace_list = [
    # #    {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'`hca-hin-dev-cur-parallon`.edw_pub_views.'},
    # ]

    # production_parms_find_replace_list = [
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpbs_staging.','replace':'{{ params.param_pbs_stage_dataset_name }}.'},        
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpbs.','replace':'{{ params.param_pbs_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpbs_base_views.','replace':'{{ params.param_pbs_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpbs_views.','replace':'{{ params.param_pbs_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.bqutil_fns.','replace':'{{ params.param_pbs_bqutil_fns_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_staging.','replace':'{{ params.param_pf_stage_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.auth_base_views.','replace':'{{ params.param_auth_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf.','replace':'{{ params.param_pf_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_base_views.','replace':'{{ params.param_pf_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwpf_views.','replace':'{{ params.param_pf_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwra_staging.','replace':'{{ params.param_ra_stage_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwra_base_views.','replace':'{{ params.param_ra_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwra_views.','replace':'{{ params.param_ra_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edw_dim_base_views.','replace':'{{ params.param_dim_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'{{ params.param_pub_views_dataset_name }}.'},      
    #     {'search':'`hca-hin-dev-cur-parallon`.edwfs.','replace':'{{ params.param_fs_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwfs_base_views.','replace':'{{ params.param_fs_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-parallon`.edwcm_views.','replace':'{{ params.param_cm_views_dataset_name }}.'},
    # ]

    # production_regex_find_replace_list = [
    #     {'search':r"((?:"+'|'.join(core_dataset_list)+")\.[a-z|0-9|_]+[ ]*[AS]*[ ]*(?!ON)[a-z|0-9|_]*)",
    #     'replace':r"\1 FOR SYSTEM_TIME AS OF TIMESTAMP(tableload_start_time,'US/Central')",
    #     'query_type':"EXP"},
    #     {'search':r"DATE\(([a-z|0-9|_|.]+)\)[ ]*=[ ]*current_date\('US\/Central'\)[ ]*", 
    #     'replace':r"\1 >= tableload_start_time  - INTERVAL 1 MINUTE ",
    #     'query_type':"ACT"}
    # ]

In [None]:
    # # MHB Replacements

    # pre_process_find_replace_list = [
    #     ('$NCR_TGT_SCHEMA','edwci'),
    #     ('$NCR_STG_SCHEMA','edwci_staging'),
    #     ('$NCR_STG_TGT','edwci_staging')  ,
    #     ('$NCR_BASE_VIEWS','edwci_base_views')  ,
    #     ('$NCR_TGT','edwci') , 
    #     ('!=','<>') , 
    #     ("','AS", "',' AS"),
    # ]

    # post_process_find_replace_list = [
    #     {'search':'syslib.length','replace':'length'},
    #     {'search':'syslib.isnumeric','replace':'`hca-hin-dev-cur-clinical`.bqutil_fns.isnumeric'},
    #     # {'search':'`hca-hin-dev-cur-clinical`.edwfs_base_views.','replace':'`hca-hin-dev-cur-clinical`.auth_base_views.'},
    # ]

    # post_process_regex_find_replace_list = [
    #     {'search':r"bqutil\.fn\.cw_td_strtok\([ ]*([.|a-z|_|0-9]+)[ ]*,[ ]*'(.)'[ ]*,[ ]*([0-9]+)\)",
    #     'replace':r"SPLIT(\1, '\2')[ORDINAL(\3)]"},
    #     {'search':r"bqutil\.fn\.", 'replace':r"`hca-hin-dev-cur-clinical`.bqutil_fns."}
    # ]

    # execution_time_find_replace_list = [
    # #    {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'`hca-hin-dev-cur-parallon`.edw_pub_views.'},
    # ]

    # production_parms_find_replace_list = [
    #     {'search':'`hca-hin-dev-cur-clinical`.edwci_staging.','replace':'{{ params.param_clinical_ci_stage_dataset_name }}.'},        
    #     {'search':'`hca-hin-dev-cur-clinical`.edwci.','replace':'{{ params.param_clinical_ci_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.edwci_base_views.','replace':'{{ params.param_clinical_ci_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.bqutil_fns.','replace':'{{ params.param_clinical_bqutil_fns_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.auth_base_views.','replace':'{{ params.param_clinical_cdm_auth_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcl_base_views.','replace':'{{ params.param_clinical_cl_base_views_dataset_name }}.'}
    # ]

    # production_regex_find_replace_list = [
    #     {'search':r"((?:"+'|'.join(core_dataset_list)+")\.[a-z|0-9|_]+[ ]*[AS]*[ ]*(?!ON)[a-z|0-9|_]*)",
    #     'replace':r"\1 FOR SYSTEM_TIME AS OF TIMESTAMP(tableload_start_time,'US/Central')",
    #     'query_type':"EXP"},
    #     {'search':r"DATE\(([a-z|0-9|_|.]+)\)[ ]*=[ ]*current_date\('US\/Central'\)[ ]*", 
    #     'replace':r"\1 >= tableload_start_time  - INTERVAL 1 MINUTE ",
    #     'query_type':"ACT"}
    # ]

In [None]:
    # # CA Replacements

    # pre_process_find_replace_list = [
    #     ('$NCR_TGT_SCHEMA','edwcdm'),
    #     ('$NCR_STG_SCHEMA','edwcdm_staging'),
    #     ('$NCR_STG_TGT','edwcdm_staging')  ,
    #     ('$NCR_BASE_VIEWS','edwcdm_base_views')  ,
    #     ('$NCR_TGT','edwcdm') , 
    #     ('!=','<>') , 
    #     ("','AS", "',' AS"),
    # ]

    # post_process_find_replace_list = [
    #     {'search':'syslib.length','replace':'length'},
    #     {'search':'syslib.isnumeric','replace':'`hca-hin-dev-cur-clinical`.bqutil_fns.isnumeric'},
    # ]

    # post_process_regex_find_replace_list = [
    #     {'search':r"bqutil\.fn\.cw_td_strtok\([ ]*([.|a-z|_|0-9]+)[ ]*,[ ]*'(.)'[ ]*,[ ]*([0-9]+)\)",
    #     'replace':r"SPLIT(\1, '\2')[ORDINAL(\3)]"},
    #     {'search':r"bqutil\.fn\.", 'replace':r"`hca-hin-dev-cur-clinical`.bqutil_fns."}
    # ]

    # execution_time_find_replace_list = [
    # #    {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'`hca-hin-dev-cur-parallon`.edw_pub_views.'},
    # ]

    # production_parms_find_replace_list = [
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcdm_staging.','replace':'{{ params.param_clinical_cdm_stage_dataset_name }}.'},        
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcdm.','replace':'{{ params.param_clinical_cdm_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcdm_base_views.','replace':'{{ params.param_clinical_cdm_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.bqutil_fns.','replace':'{{ params.param_clinical_bqutil_fns_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.auth_base_views.','replace':'{{ params.param_clinical_cdm_auth_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcl_base_views.','replace':'{{ params.param_clinical_cl_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-clinical`.edwcdm_views.','replace':'{{ params.param_clinical_cdm_views_dataset_name }}.'}
    # ]

    # production_regex_find_replace_list = [
    #     {'search':r"((?:"+'|'.join(core_dataset_list)+")\.[a-z|0-9|_]+[ ]*[AS]*[ ]*(?!ON)[a-z|0-9|_]*)",
    #     'replace':r"\1 FOR SYSTEM_TIME AS OF TIMESTAMP(tableload_start_time,'US/Central')",
    #     'query_type':"EXP"},
    #     {'search':r"DATE\(([a-z|0-9|_|.]+)\)[ ]*=[ ]*current_date\('US\/Central'\)[ ]*", 
    #     'replace':r"\1 >= tableload_start_time  - INTERVAL 1 MINUTE ",
    #     'query_type':"ACT"}
    # ]

In [None]:
    # CR Replacements

    # pre_process_find_replace_list = [
    #     ('$NCR_TGT_SCHEMA','edwcr'),
    #     ('${NCR_TGT_SCHEMA}','edwcr') ,
    #     ('$NCR_STG_SCHEMA','edwcr_staging'),
    #     ('${NCR_STG_SCHEMA}','edwcr_staging'),
    #     ('$NCR_STG_TGT','edwcr_staging')  ,
    #     ('$NCR_BASE_VIEWS','edwcr_base_views')  ,
    #     ('$EDWCR_BASE_VIEWS','edwcr_base_views')  ,
    #     ('$NCR_TGT','edwcr') ,
    #     ('$NCR_AC_VIEW','edwcr_dmx_ac_base_views') ,
    #     ('${NCR_AC_VIEW}', 'edwcr_dmx_ac_base_views') ,
    #     ('$NCR_AC_SCHEMA', 'edwcr_dmx_ac') ,
    #     ('${NCR_AC_SCHEMA}', 'edwcr_dmx_ac') ,
    #     ('$EDW_PUB_VIEWS', 'edw_pub_views') ,
    #     ('!=','<>') , 
    #     ("','AS", "',' AS"),
    # ]

    # post_process_find_replace_list = [
    #     {'search':'syslib.length','replace':'length'},
    #     {'search':'syslib.isnumeric','replace':'`hca-hin-dev-cur-ops`.bqutil_fns.isnumeric'},
    # ]

    # post_process_regex_find_replace_list = [
    #     {'search':r"bqutil\.fn\.cw_td_strtok\([ ]*([.|a-z|_|0-9]+)[ ]*,[ ]*'(.)'[ ]*,[ ]*([0-9]+)\)",
    #     'replace':r"SPLIT(\1, '\2')[ORDINAL(\3)]"},
    #     {'search':r"bqutil\.fn\.", 'replace':r"`hca-hin-dev-cur-ops`.bqutil_fns."}
    # ]

    # execution_time_find_replace_list = [
    # #    {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'`hca-hin-dev-cur-parallon`.edw_pub_views.'},
    # ]

    # production_parms_find_replace_list = [
    #     {'search':'`hca-hin-dev-cur-ops`.edwcr_staging.','replace':'{{ params.param_cr_stage_dataset_name }}.'},        
    #     {'search':'`hca-hin-dev-cur-ops`.edwcr.','replace':'{{ params.param_cr_core_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-ops`.edwcr_base_views.','replace':'{{ params.param_cr_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-ops`.bqutil_fns.','replace':'{{ params.param_cr_bqutil_fns_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-ops`.auth_base_views.','replace':'{{ params.param_cr_auth_base_views_dataset_name }}.'},
    #     {'search':'`hca-hin-dev-cur-ops`.edwcr_views.','replace':'{{ params.param_cr_views_dataset_name }}.'}
    # ]

    # production_regex_find_replace_list = [
    #     {'search':r"((?:"+'|'.join(core_dataset_list)+")\.[a-z|0-9|_]+\s*[AS]*\s*(?!ON)(?!WHERE)(?!INNER)(?!JOIN)[a-z|0-9|_]*)",
    #     'replace':r"\1 FOR SYSTEM_TIME AS OF TIMESTAMP(tableload_start_time,'US/Central')",
    #     'query_type':"EXP"},
    #     {'search':r"DATE\(([a-z|0-9|_|.]+)\)\s*=\s*current_date\('US\/Central'\)\s*", 
    #     'replace':r"\1 >= tableload_start_time - INTERVAL 1 MINUTE ",
    #     'query_type':"ACT"},
    #     {'search':r"([a-z|0-9|_]+.dw_last_update_date_time)\s*[<>=]\s*\(\s*SELECT\s*MAX\(etl_job_run\.job_start_date_time\).*FROM.*etl_job_run.*WHERE.*=.*'\s*\)", 
    #     'replace':r"\1 = current_date('US/Central') ",
    #     'query_type':"EXP"},
    #     {'search':r"\(\s*SELECT\s*MAX\(etl_job_run\.job_start_date_time\).*FROM.*etl_job_run.*WHERE.*=.*'\s*\)", 
    #     'replace':r" tableload_start_time - INTERVAL 1 MINUTE ",
    #     'query_type':"ACT"}
    # ]

In [None]:
    # IM Replacements
    #
    # Active find/replace rule sets; the PBS, MHB, CA and CR variants in the
    # cells above are commented out.
    # NOTE(review): the functions that consume these lists are not visible in
    # this cell -- which pipeline stage each list feeds is inferred from its
    # name; confirm against the callers.

    # (search, replacement) tuples: map $NCR_* / ${NCR_*} style placeholders to
    # edwim dataset names, rewrite '!=' as '<>' and insert a space in "','AS".
    # Longer placeholders come before their prefixes ($NCR_TGT_SCHEMA before
    # $NCR_TGT); keep that order if the rules are applied sequentially.
    pre_process_find_replace_list = [
        ('$NCR_TGT_SCHEMA','edwim'),
        ('${NCR_TGT_SCHEMA}','edwim'),
        ('$NCR_STG_SCHEMA','edwim_staging'),
        ('${NCR_STG_SCHEMA}','edwim_staging'),
        ('$NCR_STG_TGT','edwim_staging'),
        ('$NCR_BASE_VIEWS','edwim_base_views'),
        ('$EDWCR_BASE_VIEWS','edwim_base_views'),
        ('$NCR_TGT','edwim'),
        ('$NCR_AC_VIEW','edwim_dmx_ac_base_views'),
        ('${NCR_AC_VIEW}', 'edwim_dmx_ac_base_views'),
        ('$NCR_AC_SCHEMA', 'edwim_dmx_ac'),
        ('${NCR_AC_SCHEMA}', 'edwim_dmx_ac'),
        ('$EDW_PUB_VIEWS', 'edw_pub_views'),
        ('!=','<>'),
        ("','AS", "',' AS"),
    ]

    # Literal search/replace pairs for syslib.* function references.
    # (This list used to be indented one level deeper than the rest of the
    # cell, which made the whole cell fail with an IndentationError.)
    post_process_find_replace_list = [
        {'search':'syslib.length','replace':'length'},
        {'search':'syslib.isnumeric','replace':'`hca-hin-dev-cur-pub`.bqutil_fns.isnumeric'},
    ]

    # Regex search/replace pairs: rewrite bqutil.fn.cw_td_strtok(...) calls as
    # SPLIT(...)[ORDINAL(n)], repoint the remaining bqutil.fn.* references, and
    # redirect specific views to the auth_base_views dataset.
    post_process_regex_find_replace_list = [
        {'search':r"bqutil\.fn\.cw_td_strtok\([ ]*([.|a-z|_|0-9]+)[ ]*,[ ]*'(.)'[ ]*,[ ]*([0-9]+)\)",
        'replace':r"SPLIT(\1, '\2')[ORDINAL(\3)]"},
        {'search':r"bqutil\.fn\.", 'replace':r"`hca-hin-dev-cur-pub`.bqutil_fns."},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.deficiency_audit", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.deficiency_audit"},
        {'search':r"`hca-hin-dev-cur-comp`\.edw_pub_views\.clinical_facility", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.clinical_facility"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.encnt_to_role", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.encnt_to_role"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.clinical_health_care_provider", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.clinical_health_care_provider"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwcdm_base_views\.prctnr_role_idfn", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.prctnr_role_idfn"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.clinical_user_patient_audit", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.clinical_user_patient_audit"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.pk_encounter", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.pk_encounter"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.junc_pk_user_access_level", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.junc_pk_user_access_level"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.ref_pk_data_base_instance", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.ref_pk_data_base_instance"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.document_work_flow_instance", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.document_work_flow_instance"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwim_base_views\.pk_login_information", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.pk_login_information"},
        {'search':r"`hca-hin-dev-cur-comp`\.edwdw_base_views\.document_work_flow_instance", 'replace':r"`hca-hin-dev-cur-comp`.auth_base_views.document_work_flow_instance"}
    ]

    # Intentionally empty; the commented entry shows the expected dict shape.
    execution_time_find_replace_list = [
    #    {'search':'`hca-hin-dev-cur-parallon`.edw_pub_views.','replace':'`hca-hin-dev-cur-parallon`.edw_pub_views.'},
    ]

    # Literal pairs that swap fully qualified dev dataset prefixes for
    # '{{ params.* }}' template placeholders.
    production_parms_find_replace_list = [
        {'search':'`hca-hin-dev-cur-comp`.edwim_staging.','replace':'{{ params.param_im_stage_dataset_name }}.'},
        {'search':'`hca-hin-dev-cur-comp`.edwim.','replace':'{{ params.param_im_core_dataset_name }}.'},
        {'search':'`hca-hin-dev-cur-comp`.edwim_base_views.','replace':'{{ params.param_im_base_views_dataset_name }}.'},
        {'search':'`hca-hin-dev-cur-comp`.edwim_views.','replace':'{{ params.param_im_views_dataset_name }}.'},
        {'search':'`hca-hin-dev-cur-comp`.auth_base_views.','replace':'{{ params.param_im_auth_base_views_dataset_name }}.'},
        {'search':'`hca-hin-dev-cur-pub`.bqutil_fns.','replace':'{{ params.param_im_bqutil_fns_dataset_name }}.'}
    ]

    # Regex rules tagged with 'query_type' "EXP" or "ACT" (presumably the
    # expected vs. actual query -- confirm against the consumer).
    # The first pattern is assembled from core_dataset_list (loaded in the
    # configuration cell); both of its literals must be raw strings so that
    # '\.' and '\s' reach the regex engine without invalid-escape warnings.
    # NOTE(review): in the third pattern the '.' before dw_last_update_date_time
    # is unescaped, so it matches any character; left as-is.
    production_regex_find_replace_list = [
        {'search':r"((?:"+'|'.join(core_dataset_list)+r")\.[a-z|0-9|_]+\s*[AS]*\s*(?!ON)(?!WHERE)(?!INNER)(?!JOIN)[a-z|0-9|_]*)",
        'replace':r"\1 FOR SYSTEM_TIME AS OF TIMESTAMP(tableload_start_time,'US/Central')",
        'query_type':"EXP"},
        {'search':r"DATE\(([a-z|0-9|_|.]+)\)\s*=\s*current_date\('US\/Central'\)\s*",
        'replace':r"\1 >= tableload_start_time - INTERVAL 1 MINUTE ",
        'query_type':"ACT"},
        {'search':r"([a-z|0-9|_]+.dw_last_update_date_time)\s*[<>=]\s*\(\s*SELECT\s*MAX\(etl_job_run\.job_start_date_time\).*FROM.*etl_job_run.*WHERE.*=.*'\s*\)",
        'replace':r"\1 = current_date('US/Central') ",
        'query_type':"EXP"},
        {'search':r"\(\s*SELECT\s*MAX\(etl_job_run\.job_start_date_time\).*FROM.*etl_job_run.*WHERE.*=.*'\s*\)",
        'replace':r" tableload_start_time - INTERVAL 1 MINUTE ",
        'query_type':"ACT"}
    ]

In [None]:
# Pipeline driver: runs the enabled stages of the validation-SQL generation flow
def gen_bq_validation_sql():
    """Run the enabled pipeline stages in order and print the elapsed time.

    Enabled stages: capture variables / generate the raw Teradata SQL files,
    preprocess them, upload the output folder to the DAG location and write
    the control file. The remaining stages are kept as commented-out calls so
    they can be switched back on individually. All paths and settings are read
    from the module-level names set in the configuration cell.

    The printed duration is measured from ``dt1``, i.e. from the moment the
    configuration cell ran, not from the start of this call.
    """
    # -- Download the ParmFiles from required source system folder under the LOB (disabled)
    # sftp_download_bteqs_multiple_sources(unix_server, username, password, unix_etl_folder, unix_lob, parmfiles_path_folder, lob_sub_folders, file_prefix, add_source_dir_prefix, parmfolder_before_lob_subfolder)

    # -- Create Raw SQL Files for BQMS Conversions
    control_records = capture_variables_and_gen_td_sql_files(
        parmfiles_path_folder,
        conversion_source_path_folder,
        sub_folder_list,
    )
    # print(control_records)

    # -- Preprocess SQL Files to remove undesired code and format SQL
    control_records = preprocess_sqls(
        conversion_source_path_folder,
        preprocessed_tdsqlfiles_path,
        sub_folder_list,
        control_records,
    )
    # print(control_records)

    ######################################## RUN BQMS HERE ########################################
    # -- Run BQMS Process Here (disabled)
    # bqms_translator(preprocessed_tdsqlfiles_path, conversion_target_path_folder, sub_folder_list, bucket_name, gcsfolder, run_time, 'BTEQ')

    # -- Postprocess SQL Files (disabled)
    # postprocess_bqsqls(conversion_target_path_folder, postprocessed_bqsqlsfiles_path, sub_folder_list)

    # -- Execute Postprocessed SQL Files (disabled)
    # execute_sql(postprocessed_bqsqlsfiles_path, sub_folder_list)

    # -- Print Execution Results (disabled)
    # print_execution_results(logfilenamepath)

    # -- Parameterize the sqls (disabled)
    # productionize_sqls(postprocessed_bqsqlsfiles_path, prodready_bqsqls_path)

    # -- Create Validation SQLs Files from Template (disabled)
    # gen_validation_sqls_from_template(control_records)

    # -- Upload the Templated SQLs to DAG
    upload_to_dags(output_path_folder)

    # -- Create Control File
    gen_control_file(control_records)

    # Elapsed time since the configuration cell stamped dt1.
    elapsed = datetime.now() - dt1
    print(elapsed)

In [None]:
# Notebook entry point: run the pipeline once, bracketed by progress messages.
print("Begin of Processing")

gen_bq_validation_sql()

print("End of Processing")