In [37]:
# !py -3 -m pip install pandas

In [38]:
from datetime import datetime, timedelta
import os
import shutil
import pandas as pd
import numpy as np
import math
import json
from jinja2 import Environment, FileSystemLoader
import logging
import re

In [39]:
def write_file_local(path, file_data, encoding="utf-8"):
    """Write text to a file on the local filesystem, replacing any existing file.

    Args:
        path: Destination file path. The parent directory must already exist.
        file_data: Text to write.
        encoding: Text encoding for the output file. Defaults to UTF-8 so the
            bytes written do not depend on the platform's locale encoding.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    with open(path, 'w', encoding=encoding) as file:
        file.write(file_data)

In [40]:
# Load the line-of-business configuration that drives the rest of the notebook.
with open('config/lob_config.json', encoding='utf-8') as json_lob_config:
    config = json.load(json_lob_config)

lob = config['lob']
lob_lower = lob.lower().strip()
domain_abbr = config['domain_abbr']
core_dataset_list = config['core_dataset_list']

# Matches "<core dataset>.<table>" (case-insensitive) and captures the table name.
# Dataset names are escaped so they are matched literally, and the table-name
# character class is limited to letters, digits and underscores (the previous
# class "[a-z|0-9|_]" also accepted a literal "|").
# NOTE(review): assumes core_dataset_list holds plain dataset names, not regex
# fragments - confirm against the config file.
regexp_subquery = re.compile(
    r"(?:" + "|".join(re.escape(dataset) for dataset in core_dataset_list) + r")\.([a-z0-9_]+)",
    re.IGNORECASE | re.DOTALL,
)

jinja_template_path = config['jinja_template_path']
jinja_extension = config['jinja_extension']
yaml_extension = config['yaml_extension']
output_parent_folder = config['output_parent_folder']
integrate_meta_folder = config['integrate_meta_folder']
integrate_meta_file_name = config['integrate_meta_file_name']
dag_trigger_map_file_name = config['dag_trigger_map_file_name']
dag_id_timings_file_name = config['dag_id_timings_file_name']
dag_schedule_file_name = config['dag_schedule_file_name']

domain_abbr_lower = domain_abbr.lower().strip()

# Build paths with os.path.join instead of a hard-coded "\" separator: the old
# f"...\{...}" form relied on an invalid escape sequence and only worked on Windows.
input_path = os.path.join(integrate_meta_folder, integrate_meta_file_name)

# The separate trigger-map and DAG-timings CSVs are not loaded in this cell;
# only the DAG schedule file is read.
df_dag_schedule_info = pd.read_csv(os.path.join(integrate_meta_folder, dag_schedule_file_name), index_col=None)

In [41]:
def _build_dag_schedule(schedule_row):
    """Build a five-field cron string from one row of the DAG schedule table.

    Args:
        schedule_row: Row (pandas Series) providing "Time", "Frequency",
            "Day_of_Month" and "Day_of_Week".

    Returns:
        "<minute> <hour> <day-of-month> * <day-of-week>". "Time" is read as
        "HH:MM" (or a bare hour, or "*"). The day-of-month is only filled for
        "monthly" rows and the day-of-week only for "weekly" rows; any field
        that does not apply, or whose value is missing, is rendered as "*".

    Raises:
        ValueError: If the hour or minute is neither "*" nor an integer.
    """
    dag_time = str(schedule_row["Time"]).strip().lower()
    dag_frequency = str(schedule_row["Frequency"]).strip().lower()

    time_parts = dag_time.split(":")
    hour_of_day = time_parts[0]
    minute_of_day = time_parts[1] if len(time_parts) > 1 else "*"
    # int() normalises zero-padded values such as "09" to 9.
    if hour_of_day != "*":
        hour_of_day = int(hour_of_day)
    if minute_of_day != "*":
        minute_of_day = int(minute_of_day)

    # A missing day used to be rendered as the literal text "nan", which is not
    # valid cron syntax; fall back to the "*" wildcard instead.
    day_of_month = "*"
    if dag_frequency == "monthly" and not pd.isna(schedule_row["Day_of_Month"]):
        day_of_month = int(schedule_row["Day_of_Month"])

    day_of_week = "*"
    if dag_frequency == "weekly" and not pd.isna(schedule_row["Day_of_Week"]):
        day_of_week = int(schedule_row["Day_of_Week"])

    return f"{minute_of_day} {hour_of_day} {day_of_month} * {day_of_week}"


def gen_integrate_yaml():
    """Generate one "<source>_integrate_dependency" YAML file per source system.

    Reads the integrate metadata CSV at ``input_path`` and walks it row by row.
    Whenever the source system or the DAG suffix changes, a DAG header is
    rendered from the ``integrate_header`` template, using trigger and schedule
    details looked up in ``df_dag_schedule_info``. Rows whose table name does
    not match ``regexp_subquery`` only queue their integration script; a
    matching row renders the dependency, SQL (all queued scripts) and
    validation templates. The accumulated text is written with
    ``write_file_local`` each time the source system changes and once more
    after the last row.

    The output folder ``<output_parent_folder>/<lob>`` is deleted and recreated
    on every call. Rows that raise are logged and skipped; a skipped row leaves
    the accumulated state untouched.

    NOTE(review): rows are assumed to be grouped by source system; a source
    that reappears later would overwrite its earlier file - confirm the input
    ordering.
    """
    dt1 = datetime.now()

    # os.path.join replaces the hard-coded "\" separator (an invalid escape
    # sequence inside the old f-strings, and Windows-only).
    integrate_yaml_path = os.path.join(output_parent_folder, lob.strip())
    if os.path.exists(integrate_yaml_path):
        shutil.rmtree(integrate_yaml_path, ignore_errors=True)
    # exist_ok: rmtree ignores errors, so the folder may still be there.
    os.makedirs(integrate_yaml_path, exist_ok=True)

    df = pd.read_csv(input_path, index_col=None)

    db_type_param = "sqlserver"
    env = Environment(loader=FileSystemLoader(jinja_template_path), trim_blocks=True, lstrip_blocks=True)

    def _write_source_file(source_name, contents):
        # Write the accumulated YAML text for one source system.
        output_file_path = os.path.join(
            integrate_yaml_path, f"{source_name}_integrate_dependency{yaml_extension}"
        )
        print(output_file_path)
        write_file_local(output_file_path, contents)

    # None means "no row processed yet", so the first row always starts a new file.
    prev_source_system = None
    prev_dag_suffix_id = None
    file_contents = "integrate:\n"
    integrate_scripts = []
    skipped_rows = 0

    cur_year = dt1.year
    cur_month = dt1.month
    cur_day = dt1.day

    for index, row in df.iterrows():
        try:
            integration_script = str(row['Integration_Script']).strip().lower()
            source_system = str(row['Source']).strip().lower()
            frequency = str(row['Frequency']).strip().lower()
            dag_suffix_id = str(row['Dag_Suffix_ID']).strip().lower()
            validation_audit_sql_script_name = str(row['Validation_Audit_SQL_Script_Name']).strip().lower()
            table_name = str(row['Table_Name']).strip().lower()
            validation_script_present = str(row['Validation_Script_Present']).strip().lower()

            source_changed = prev_source_system != source_system
            dag_changed = source_changed or prev_dag_suffix_id != dag_suffix_id

            # The previous source system is complete: persist its file.
            if source_changed and prev_source_system is not None:
                _write_source_file(prev_source_system, file_contents)

            # Work on a local copy and commit it only once the whole row has
            # rendered, so a failing row cannot leave half-written content.
            # A new source always starts a fresh document; previously the reset
            # also required the DAG suffix to change, which let the previous
            # source's content leak into the next file.
            row_contents = "integrate:\n" if source_changed else file_contents

            if dag_changed:
                dag_id = f"dag_integrate_{source_system}_{db_type_param}_{frequency}_{str(dag_suffix_id).zfill(2)}"

                # Element-wise AND of boolean Series needs "&" with each
                # comparison parenthesised ("||" is not Python).
                is_this_dag = df_dag_schedule_info["Dag_ID"] == dag_id

                # NOTE(review): "Triggerred" is kept exactly as spelled in the
                # original lookup - confirm it matches the schedule CSV values.
                triggered_rows = df_dag_schedule_info[
                    is_this_dag & (df_dag_schedule_info["Execution_Type"] == "Triggerred")
                ]
                trigger_dag_param = triggered_rows["Triggered_Dag_Id"].tolist()

                scheduled_rows = df_dag_schedule_info[
                    is_this_dag & (df_dag_schedule_info["Execution_Type"] == "Scheduled")
                ]
                if scheduled_rows.empty:
                    dag_schedule = "None"
                else:
                    dag_schedule = _build_dag_schedule(scheduled_rows.iloc[0])

                replace_params = {
                    "source_system_param": f"{source_system}",
                    "frequency_param": f"{frequency}",
                    "dag_id_param": f"{dag_suffix_id}",
                    "year_param": f"{cur_year}",
                    "month_param": f"{cur_month}",
                    "day_param": f"{cur_day}",
                    "schedule_param": f"{dag_schedule}",
                    "db_type_param": f"{db_type_param}",
                    "trigger_dag_param": f"{trigger_dag_param}"
                }
                template = env.get_template(f"integrate_header{jinja_extension}")
                row_contents = row_contents + template.render(replace_params) + "\n"

            # Every row contributes its integration script; the queue is
            # rendered and emptied when a core-dataset table is reached.
            # NOTE(review): scripts still queued when the source system changes
            # carry over into the next source's first SQL block, as before -
            # confirm whether that is intended.
            pending_scripts = integrate_scripts + [{"script_name": integration_script}]

            core_table_find = regexp_subquery.search(table_name)
            if core_table_find is not None:
                primary_table_name = core_table_find.groups()[0].lower()
                template = env.get_template(f"integrate_dependency{jinja_extension}")
                row_contents = row_contents + template.render({"table_param": f"{primary_table_name}"}) + "\n"

                template = env.get_template(f"integrate_sql{jinja_extension}")
                row_contents = row_contents + template.render({"script_file_list": pending_scripts})
                pending_scripts = []

                if validation_script_present == 'yes':
                    validation_rec = {"script_name": validation_audit_sql_script_name}
                else:
                    validation_rec = {"script_name": 'NONE'}
                template = env.get_template(f"integrate_validation{jinja_extension}")
                row_contents = row_contents + template.render({"validation_file_list": [validation_rec]})

            # Commit the row.
            file_contents = row_contents
            integrate_scripts = pending_scripts
            prev_source_system = source_system
            prev_dag_suffix_id = dag_suffix_id

        except Exception:
            # Best effort: keep processing the remaining rows, but record which
            # row failed and why (with traceback) instead of a bare message.
            skipped_rows += 1
            logging.exception("Skipping row %s of %s", index, input_path)

    # Write the last source system after the loop so that a failure on the
    # final row cannot drop the file.
    if prev_source_system is not None:
        try:
            _write_source_file(prev_source_system, file_contents)
        except Exception:
            logging.exception("Could not write file for source system %s", prev_source_system)

    dt2 = datetime.now()
    print(dt2-dt1)

    print("Record Count : ", len(df))
    if skipped_rows:
        print("Skipped Rows : ", skipped_rows)

In [42]:
# Driver cell: print a start marker, generate the integrate YAML files,
# then print an end marker once generation has returned.
print("Begin of Processing")
gen_integrate_yaml()
print("End of Processing")

Begin of Processing
LOB\CR\navadhoc_integrate_dependency.yaml
LOB\CR\varianedw_integrate_dependency.yaml
LOB\CR\metriq_integrate_dependency.yaml
0:00:00.222020
Record Count :  150
End of Processing
