<h1 style="color:pink;">📊 Excel Data ETL Script</h1>
<h3 style="color:purple;">This script involves: </h3>

        🔍 Detecting & Handling Excel File Formats
        🔐 Decrypting Encrypted Files
        📐 Analyzing Workbook Structures
        🔄 Consolidating Data into DataFrames

=========================================================================================================================

    Read SQL procedure files and list the relationships between them

In [None]:
import os
import re
from collections import defaultdict

# Root folder that is walked recursively for *.sql files (fill in before running)
base_dir = r""

# One bucket per procedure family; categorize_proc() appends to these.
# Each name gets its own list object.
analysis_procs, output_procs, validation_procs = [], [], []

# Procedure name -> names referenced inside that procedure's file
relationships = defaultdict(list)

# Function to sort a procedure name into its family bucket
def categorize_proc(proc_name):
    """Append ``proc_name`` to the module-level list matching its prefix.

    The first matching prefix wins; names matching no prefix are ignored.
    Nothing is returned - the module-level lists are mutated in place.
    """
    prefix_buckets = (
        ("ANALYSIS.TFM_DRV_", analysis_procs),
        ("OUTPUT.RPT_EXP_", output_procs),
        ("VALIDATION.GEN_VAL_", validation_procs),
    )
    for prefix, bucket in prefix_buckets:
        if proc_name.startswith(prefix):
            bucket.append(proc_name)
            return

# Function to find references in SQL files (e.g., table or procedure names)
def find_references(file_path):
    """Return every schema-qualified name referenced in a SQL file.

    A reference is any ``ANALYSIS.<word>``, ``OUTPUT.<word>`` or
    ``VALIDATION.<word>`` token. Matches are returned in file order and
    duplicates are kept.

    The file is read as bytes and decoded explicitly, so the result does not
    depend on the platform's default encoding:

    * a UTF-16 byte-order mark selects UTF-16;
    * otherwise UTF-8 is tried (a leading UTF-8 BOM is dropped);
    * otherwise Latin-1 is used, which accepts any byte sequence.

    Args:
        file_path: Path of the SQL file to scan.

    Returns:
        list[str]: The matched names; empty when there are none.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'rb') as file:
        raw = file.read()

    if raw.startswith((b'\xff\xfe', b'\xfe\xff')):
        # UTF-16 with BOM; 'replace' keeps a truncated trailing byte from raising.
        content = raw.decode('utf-16', errors='replace')
    else:
        try:
            content = raw.decode('utf-8-sig')
        except UnicodeDecodeError:
            # Legacy single-byte file: Latin-1 maps every byte to a character,
            # so the ASCII schema prefixes are still found.
            content = raw.decode('latin-1')

    # Regular expression to find references (e.g., other procedures or tables)
    return re.findall(r'\b(?:ANALYSIS|OUTPUT|VALIDATION)\.\w+', content)

# Parse files and categorize by type
for root, dirs, files in os.walk(base_dir):
    for file in files:
        # splitext removes only the final extension, whereas
        # str.replace(".sql", "") would also strip ".sql" from the middle of a
        # name such as "proc.sql.backup.sql".
        proc_name, extension = os.path.splitext(file)

        # Accept ".SQL" / ".Sql" as well as ".sql".
        if extension.lower() != ".sql":
            continue

        categorize_proc(proc_name)

        # Find references within the file
        file_path = os.path.join(root, file)
        references = find_references(file_path)

        # Add relationships to the dictionary; procedures without references
        # get no entry, so the debug listing only shows non-empty results.
        if references:
            relationships[proc_name].extend(references)

# Debugging: show what landed in each procedure family
print("Analysis Procedures:", analysis_procs)
print("Output Procedures:", output_procs)
print("Validation Procedures:", validation_procs)

# Debugging: show the references collected for every procedure that has any
for proc in relationships:
    print(f"{proc} references: {relationships[proc]}")

# Function to derive likely data flows from the collected references
def get_flow_relationships():
    """Build ``(source, "->", destination)`` triples from the module-level data.

    Two kinds of flow are reported, in this order:

    1. analysis -> output, when the output procedure's name appears in the
       analysis procedure's reference list;
    2. analysis/output -> validation, when that procedure's name appears in
       the validation procedure's reference list.

    Returns:
        list[tuple[str, str, str]]: The flows; empty when none were found.
    """
    analysis_to_output = [
        (producer, "->", consumer)
        for producer in analysis_procs
        for consumer in output_procs
        if consumer in relationships[producer]
    ]

    upstream_procs = analysis_procs + output_procs
    into_validation = [
        (checked, "->", checker)
        for checker in validation_procs
        for checked in upstream_procs
        if checked in relationships[checker]
    ]

    return analysis_to_output + into_validation

# Compute the flows once, then report them
flow_relationships = get_flow_relationships()

# Debugging: one "source -> destination" line per flow, or a notice if none
if not flow_relationships:
    print("No relationships found based on the naming and reference patterns.")
else:
    for source, arrow, destination in flow_relationships:
        print(" ".join((source, arrow, destination)))


    Read stored procedures from a SQL Server database

In [None]:
import pyodbc
from datetime import datetime
from sqlalchemy import create_engine

# Connection target (fill in before running)
server, database = '', ''

# Windows-authenticated pyodbc URL for the ODBC Driver 17
connection_string = (
    f"mssql+pyodbc://@{server}/{database}"
    "?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"
)

# SQLAlchemy engine with pyodbc's fast_executemany enabled
engine = create_engine(connection_string, fast_executemany=True)



In [None]:
import pyodbc
from sqlalchemy import create_engine, text
import re
import networkx as nx
import matplotlib.pyplot as plt

# Server and database configuration
server = ''
database = ''
connection_string = f"mssql+pyodbc://@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"

# Create SQL Alchemy engine
engine = create_engine(connection_string, fast_executemany=True)

# Create connection
connection = engine.connect()

# Retrieve stored procedures.
# INFORMATION_SCHEMA.ROUTINES.ROUTINE_DEFINITION only holds the first 4000
# characters of a procedure, so longer procedures would be analysed from a
# truncated body. OBJECT_DEFINITION returns the complete text. The result keeps
# the same two columns (name, definition); the definition can still be NULL,
# e.g. for encrypted procedures.
stored_procs_query = text("""
SELECT 
    ROUTINE_NAME,
    OBJECT_DEFINITION(OBJECT_ID(QUOTENAME(ROUTINE_SCHEMA) + '.' + QUOTENAME(ROUTINE_NAME))) AS ROUTINE_DEFINITION
FROM 
    INFORMATION_SCHEMA.ROUTINES
WHERE 
    ROUTINE_TYPE = 'PROCEDURE';
""")

# fetchall() materialises every row, so later cells work on plain in-memory rows.
stored_procs = connection.execute(stored_procs_query).fetchall()

# List of tables to focus on
relevant_tables = []

# One identifier part: either [bracket quoted] or a bare word.
_SQL_IDENT = r'(?:\[[^\]]+\]|\w+)'
# A possibly multi-part name: Orders, dbo.Orders, [dbo].[Order Details], ...
_SQL_NAME = '(' + _SQL_IDENT + r'(?:\s*\.\s*' + _SQL_IDENT + ')*)'
# Splits a captured name into its parts, dropping the brackets.
_SQL_NAME_PART = re.compile(r'\[([^\]]+)\]|(\w+)')

# Statements that write to a table. The SELECT ... INTO pattern is lazy and
# uses DOTALL so it also matches when the statement spans several lines.
_TARGET_PATTERNS = (
    re.compile(r'\bINSERT\s+INTO\s+' + _SQL_NAME, re.IGNORECASE),
    re.compile(r'\bSELECT\s+.*?\bINTO\s+' + _SQL_NAME, re.IGNORECASE | re.DOTALL),
    re.compile(r'\bUPDATE\s+' + _SQL_NAME, re.IGNORECASE),
)
# Clauses that read from a table.
_SOURCE_PATTERNS = (
    re.compile(r'\bFROM\s+' + _SQL_NAME, re.IGNORECASE),
    re.compile(r'\bJOIN\s+' + _SQL_NAME, re.IGNORECASE),
)


def _resolve_relevant_table(raw_name, relevant):
    """Return the ``relevant`` entry matching a name as written in SQL, or None."""
    parts = [quoted or bare for quoted, bare in _SQL_NAME_PART.findall(raw_name)]
    # Prefer the fully qualified spelling, then fall back to the bare table name.
    for candidate in (".".join(parts), parts[-1]):
        if candidate in relevant:
            return candidate
    return None


def _find_relevant_tables(patterns, sql_code, relevant):
    """Collect the relevant tables captured by any of ``patterns`` in ``sql_code``."""
    found = set()
    for pattern in patterns:
        for raw_name in pattern.findall(sql_code):
            resolved = _resolve_relevant_table(raw_name, relevant)
            if resolved is not None:
                found.add(resolved)
    return found


# Function to extract source and target table names from SQL
def extract_table_relations(sql_code):
    """Return ``(source_table, target_table)`` pairs found in one SQL text.

    Targets are tables named after ``INSERT INTO``, ``SELECT ... INTO`` or
    ``UPDATE``; sources are tables named after ``FROM`` or ``JOIN``. Every
    relevant source is paired with every relevant target of the same text.

    Names may be schema-qualified and/or bracket-quoted. A name counts as
    relevant when its dotted form with brackets removed, or its last part
    alone, is listed in the module-level ``relevant_tables``. The spelling
    from ``relevant_tables`` is what gets reported.

    Matching is purely textual: comments, string literals, table aliases,
    temp tables (``#t``) and table variables (``@t``) are not interpreted.

    Args:
        sql_code: SQL text to scan. ``None`` or an empty string yields no pairs.

    Returns:
        list[tuple[str, str]]: De-duplicated pairs, sorted so the output is
        deterministic.
    """
    if not sql_code:
        return []

    relevant = set(relevant_tables)
    if not relevant:
        return []

    target_tables = _find_relevant_tables(_TARGET_PATTERNS, sql_code, relevant)
    source_tables = _find_relevant_tables(_SOURCE_PATTERNS, sql_code, relevant)

    return sorted(
        (source_table, target_table)
        for target_table in target_tables
        for source_table in source_tables
    )

# Analyze stored procedures and create table relationships
table_relationships = []

for proc in stored_procs:
    # Each row is (procedure name, procedure definition).
    proc_name, proc_definition = proc

    # The definition comes back as NULL/None for procedures whose text is not
    # available (e.g. encrypted ones); there is nothing to scan in that case.
    if not proc_definition:
        continue

    table_relationships.extend(extract_table_relations(proc_definition))

# Debugging: dump every (source, target) pair that was extracted
print("Extracted Table Relationships:")
for pair in table_relationships:
    print(pair)

# Keep only the pairs whose two ends are both in relevant_tables
filtered_relationships = []
for source, target in table_relationships:
    if source in relevant_tables and target in relevant_tables:
        filtered_relationships.append((source, target))

# Debugging: dump the pairs that survived the filter
print("\nFiltered Table Relationships:")
for pair in filtered_relationships:
    print(pair)

# Build a directed graph with one edge per (source, target) pair;
# nodes are created implicitly by the edges.
G = nx.DiGraph()
G.add_edges_from(filtered_relationships)

# Debugging: confirm what ended up in the graph
print("\nGraph Nodes and Edges:")
print(G.nodes())
print(G.edges())

# Render the diagram only when there is at least one edge
if G.number_of_edges() == 0:
    print("No table relationships to display in the diagram.")
else:
    plt.figure(figsize=(12, 8))
    # Fixed seed so the layout is reproducible between runs
    pos = nx.spring_layout(G, seed=42)
    nx.draw(
        G,
        pos,
        with_labels=True,
        node_size=3000,
        node_color="skyblue",
        font_size=10,
        font_weight="bold",
        arrows=True,
    )
    plt.title('Table Generation Flow Diagram')
    plt.show()

# Release the database connection
connection.close()


    The end