In [9]:
import os
import pandas as pd

# Define the folder containing the files
folder_path = "Financial_Statements_16_Companies"

# Get all Excel files in the folder
excel_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

# Function to clean the "Fiscal Quarter" column
def clean_fiscal_quarter(value):
    value = str(value)
    if "Current" in value:
        return " ".join(value.split()[-2:])  # Keep "Current YYYY"
    elif "Q" in value:
        parts = value.split()
        return f"{parts[-2]} {parts[-1]}"  # Keep "QX YYYY"
    return value

# Loop through each file
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    
    # Explicitly specify 'openpyxl' for .xlsx files
    try:
        with pd.ExcelFile(file_path, engine="openpyxl") as xls:
            sheet_names = xls.sheet_names
            processed_sheets = {}

            for sheet in sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet, engine="openpyxl")
                
                if "Fiscal Quarter" in df.columns:
                    df["Fiscal Quarter"] = df["Fiscal Quarter"].apply(clean_fiscal_quarter)

                # Store the cleaned DataFrame
                processed_sheets[sheet] = df

        # Save the cleaned data to a new file
        output_path = os.path.join(folder_path, f"cleaned_{file}")
        with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
            for sheet, df in processed_sheets.items():
                df.to_excel(writer, sheet_name=sheet, index=False)

        print(f"Processed and saved: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Processing complete.")


Processed and saved: financial_statements_goog.xlsx
Processed and saved: financial_statements_vz.xlsx
Processed and saved: financial_statements_ko.xlsx
Processed and saved: financial_statements_mcd.xlsx
Processed and saved: cleaned_financial_statements_amd.xlsx
Processed and saved: financial_statements_meta.xlsx
Processed and saved: financial_statements_shel.xlsx
Processed and saved: financial_statements_t.xlsx
Processed and saved: financial_statements_jpm.xlsx
Processed and saved: financial_statements_pep.xlsx
Processed and saved: financial_statements_amd.xlsx
Processed and saved: financial_statements_amzn.xlsx
Processed and saved: financial_statements_ma.xlsx
Processed and saved: financial_statements_tsla.xlsx
Processed and saved: financial_statements_spgi.xlsx
Processed and saved: financial_statements_nflx.xlsx
Processed and saved: financial_statements_hsbc.xlsx
Processing complete.


In [10]:
import re

# Define the folder containing cleaned files
folder_path = "Financial_Statements_16_Companies/cleaned_data/clean_names/"

# Get all cleaned Excel files in the folder
excel_files = [f for f in os.listdir(folder_path) if f.startswith("cleaned_financial_statements_") and f.endswith(".xlsx")]

# Rename files
for file in excel_files:
    # Extract company name using regex
    match = re.search(r"cleaned_financial_statements_(.+)\.xlsx", file)
    if match:
        company_name = match.group(1)  # Extract company name
        new_file_name = f"{company_name}.xlsx"  # New filename
        
        # Define old and new file paths
        old_path = os.path.join(folder_path, file)
        new_path = os.path.join(folder_path, new_file_name)

        # Rename the file
        os.rename(old_path, new_path)
        print(f"Renamed: {file} -> {new_file_name}")

print("Renaming complete.")


Renamed: cleaned_financial_statements_jpm.xlsx -> jpm.xlsx
Renamed: cleaned_financial_statements_meta.xlsx -> meta.xlsx
Renamed: cleaned_financial_statements_t.xlsx -> t.xlsx
Renamed: cleaned_financial_statements_shel.xlsx -> shel.xlsx
Renamed: cleaned_financial_statements_amd.xlsx -> amd.xlsx
Renamed: cleaned_financial_statements_pep.xlsx -> pep.xlsx
Renamed: cleaned_financial_statements_goog.xlsx -> goog.xlsx
Renamed: cleaned_financial_statements_ma.xlsx -> ma.xlsx
Renamed: cleaned_financial_statements_ko.xlsx -> ko.xlsx
Renamed: cleaned_financial_statements_hsbc.xlsx -> hsbc.xlsx
Renamed: cleaned_financial_statements_nflx.xlsx -> nflx.xlsx
Renamed: cleaned_financial_statements_spgi.xlsx -> spgi.xlsx
Renamed: cleaned_financial_statements_amzn.xlsx -> amzn.xlsx
Renamed: cleaned_financial_statements_mcd.xlsx -> mcd.xlsx
Renamed: cleaned_financial_statements_vz.xlsx -> vz.xlsx
Renamed: cleaned_financial_statements_tsla.xlsx -> tsla.xlsx
Renaming complete.


In [11]:
import os
import pandas as pd

# Define the folder containing the cleaned Excel files
folder_path = "Financial_Statements_16_companies/cleaned_data/clean_names/"

# Get all Excel files in the folder
excel_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

# Process each file
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    
    try:
        # Load the Excel file
        with pd.ExcelFile(file_path, engine="openpyxl") as xls:
            sheet_names = xls.sheet_names  # Get all sheet names
            processed_sheets = {}

            for i, sheet in enumerate(sheet_names):
                df = pd.read_excel(xls, sheet_name=sheet, engine="openpyxl")

                # Keep only first 8 rows except for the last sheet, which gets 9 rows
                if i == len(sheet_names) - 1:
                    df = df.iloc[:10]  # Last sheet -> 9 rows
                else:
                    df = df.iloc[:9]  # Other sheets -> 8 rows

                processed_sheets[sheet] = df  # Store the processed DataFrame

        # Save the modified data back to the same file
        with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
            for sheet, df in processed_sheets.items():
                df.to_excel(writer, sheet_name=sheet, index=False)

        print(f"Processed and saved: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Processing complete.")


Processed and saved: meta.xlsx
Processed and saved: shel.xlsx
Processed and saved: mcd.xlsx
Processed and saved: ko.xlsx
Processed and saved: goog.xlsx
Processed and saved: vz.xlsx
Processed and saved: jpm.xlsx
Processed and saved: amzn.xlsx
Processed and saved: tsla.xlsx
Processed and saved: nflx.xlsx
Processed and saved: spgi.xlsx
Processed and saved: hsbc.xlsx
Processed and saved: ma.xlsx
Processed and saved: amd.xlsx
Processed and saved: pep.xlsx
Processed and saved: t.xlsx
Processing complete.


In [18]:
import os
import pandas as pd

# Define the folder containing the Excel files
folder_path = "Financial_Statements_16_companies/cleaned_data/clean_names/transposed/"

# Get all Excel files in the folder
excel_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

# Process each file
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    
    try:
        # Load the Excel file
        with pd.ExcelFile(file_path, engine="openpyxl") as xls:
            sheet_names = xls.sheet_names  # Get all sheet names
            transposed_sheets = {}

            for sheet in sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet, engine="openpyxl")

                # Transpose the DataFrame but keep original column names in the first column
                df_transposed = df.set_index(df.columns[0]).T.reset_index()

                # Rename the first column to "Metrics"
                df_transposed.rename(columns={df_transposed.columns[0]: "Metrics"}, inplace=True)

                transposed_sheets[sheet] = df_transposed  # Store the transposed DataFrame

        # Overwrite the original file with transposed data
        with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
            for sheet, df in transposed_sheets.items():
                df.to_excel(writer, sheet_name=sheet, index=False)

        print(f"Transposed and overwritten: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Transposing complete.")


Transposed and overwritten: meta.xlsx
Transposed and overwritten: shel.xlsx
Transposed and overwritten: mcd.xlsx
Transposed and overwritten: ko.xlsx
Transposed and overwritten: goog.xlsx
Transposed and overwritten: vz.xlsx
Transposed and overwritten: jpm.xlsx
Transposed and overwritten: amzn.xlsx
Transposed and overwritten: tsla.xlsx
Transposed and overwritten: nflx.xlsx
Transposed and overwritten: spgi.xlsx
Transposed and overwritten: hsbc.xlsx
Transposed and overwritten: ma.xlsx
Transposed and overwritten: amd.xlsx
Transposed and overwritten: pep.xlsx
Transposed and overwritten: t.xlsx
Transposing complete.


In [23]:
import os
import pandas as pd
import re

# Define input and output folders
input_folder = "Financial_Statements_16_Companies/cleaned_data/clean_names/transposed/"
output_folder = "/Users/vexy/Documents/aip/code/scripts/data/cleaned_scraped_data/"
# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

# Get all Excel files in the folder
excel_files = [f for f in os.listdir(input_folder) if f.endswith(".xlsx")]

# Function to clean column names
def clean_column_name(col_name):
    col_name = re.sub(r"[()]", "_", col_name)  # Replace brackets with underscores
    col_name = col_name.replace("&", "and")  # Replace '&' with 'and'
    col_name = re.sub(r"\s+", "_", col_name)  # Replace spaces with underscores
    return col_name

# Function to clean numerical values
def clean_numeric_value(value):
    if pd.isna(value) or value == "-":  # Replace missing values and "-"
        return "N/A"
    value = str(value).strip()
    if value.endswith("%"):  # Convert percentages to decimals
        try:
            return float(value.replace("%", "")) / 100
        except ValueError:
            return "N/A"
    try:
        return float(value)  # Convert normal numbers to float
    except ValueError:
        return "N/A"

# Process each file
for file in excel_files:
    file_path = os.path.join(input_folder, file)
    output_path = os.path.join(output_folder, file)  # Save in the output folder
    
    try:
        # Load the Excel file
        with pd.ExcelFile(file_path, engine="openpyxl") as xls:
            sheet_names = xls.sheet_names  # Get all sheet names
            cleaned_sheets = {}

            for sheet in sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet, engine="openpyxl")

                # Clean column names
                df.columns = [clean_column_name(col) for col in df.columns]

                # Clean numerical values (except first column)
                for col in df.columns[1:]:  # Skip first column
                    df[col] = df[col].apply(clean_numeric_value)

                cleaned_sheets[sheet] = df  # Store the cleaned DataFrame

        # Save cleaned data
        with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
            for sheet, df in cleaned_sheets.items():
                df.to_excel(writer, sheet_name=sheet, index=False)

        print(f"Cleaned and saved: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Cleaning complete. Files saved to:", output_folder)

Cleaned and saved: meta.xlsx
Cleaned and saved: shel.xlsx
Cleaned and saved: mcd.xlsx
Cleaned and saved: ko.xlsx
Cleaned and saved: goog.xlsx
Cleaned and saved: vz.xlsx
Cleaned and saved: jpm.xlsx
Cleaned and saved: amzn.xlsx
Cleaned and saved: tsla.xlsx
Cleaned and saved: nflx.xlsx
Cleaned and saved: spgi.xlsx
Cleaned and saved: hsbc.xlsx
Cleaned and saved: ma.xlsx
Cleaned and saved: amd.xlsx
Cleaned and saved: pep.xlsx
Cleaned and saved: t.xlsx
Cleaning complete. Files saved to: /Users/vexy/Documents/aip/code/scripts/data/cleaned_scraped_data/


In [52]:
import os
import pandas as pd
import oracledb

# Oracle DB connection details
user = "ADMIN"
password = "Passwordtestdb@1"
dsn = "testdb_medium"
wallet_location = r"/Users/vexy/Documents/Wallet_testdb/"

# Connect to Oracle DB
conn = oracledb.connect(
    user=user,
    password=password,
    dsn=dsn,
    config_dir=wallet_location,
    wallet_location=wallet_location,
    wallet_password=password
)
cursor = conn.cursor()

# Ensure the session is using the ADMIN schema
cursor.execute("ALTER SESSION SET CURRENT_SCHEMA = ADMIN")

# Define the folder containing the Excel files
folder_path = "cleaned_scraped_data"

# Get all Excel files in the folder
excel_files = [f for f in os.listdir(folder_path) if f.endswith(".xlsx")]

# Function to clean and ensure unique column names
def clean_column_names(columns):
    seen = set()
    new_columns = []
    for i, col in enumerate(columns):
        col = col.strip().lower().replace(" ", "_").replace("&", "and").replace("(", "_").replace(")", "_")
        if col == "":
            col = f"col_{i}"  # Ensure no empty column names
        original_col = col
        count = 1
        while col in seen:  # Ensure uniqueness
            col = f"{original_col}_{count}"
            count += 1
        seen.add(col)
        new_columns.append(col)
    return new_columns

# Function to format table name
def format_table_name(company, sheet_name):
    sheet_name = sheet_name.lower().replace(" ", "_")
    return f"{company}_{sheet_name}".upper()  # Convert to uppercase for Oracle

import time
import oracledb

import time
import oracledb

def drop_table_forcefully(cursor, conn, table_name, max_retries=5, wait_time=2):
    """Forcefully drops a table, retrying if it's locked."""
    for attempt in range(max_retries):
        try:
            print(f"Attempting to drop table {table_name} (Attempt {attempt + 1}/{max_retries})")
            
            # Set DDL lock timeout to wait before erroring out
            cursor.execute("ALTER SESSION SET DDL_LOCK_TIMEOUT = 10")  
            
            # Try dropping the table
            cursor.execute(f'DROP TABLE "{table_name}" CASCADE CONSTRAINTS PURGE')
            conn.commit()
            
            print(f"✅ Successfully dropped table {table_name}")
            return
        except oracledb.DatabaseError as e:
            error_code = e.args[0].code
            
            if error_code == 54:  # ORA-00054: resource busy
                print(f"⚠️ Table {table_name} is locked. Checking for active locks...")
                
                # Get session holding the lock
                cursor.execute(f"""
                    SELECT SID, SERIAL# FROM V$LOCK 
                    WHERE TYPE = 'TM' AND ID1 IN (
                        SELECT OBJECT_ID FROM DBA_OBJECTS WHERE OBJECT_NAME = '{table_name}'
                    )
                """)
                
                lock_info = cursor.fetchone()
                
                if lock_info:
                    sid, serial = lock_info
                    print(f"🔴 Lock detected on {table_name}: SID={sid}, SERIAL#={serial}")
                    
                    try:
                        cursor.execute(f"ALTER SYSTEM KILL SESSION '{sid},{serial}' IMMEDIATE")
                        conn.commit()
                        print(f"✅ Killed session {sid},{serial} to release lock.")
                    except oracledb.DatabaseError as kill_error:
                        print(f"⚠️ Could not kill session: {kill_error}")

                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                raise  # Raise other errors if they're not lock-related

    print(f"❌ Failed to drop table {table_name} after {max_retries} attempts.")

def table_exists(cursor, table_name):
    """Checks if a table exists in the current schema."""
    cursor.execute(f"SELECT COUNT(*) FROM ALL_TABLES WHERE TABLE_NAME = '{table_name}'")
    return cursor.fetchone()[0] > 0

def create_table(cursor, conn, table_name, df):
    """Creates a table, dropping it first if it already exists."""
    df.columns = clean_column_names(df.columns)
    table_name = table_name.upper()

    print(f"Checking if table {table_name} exists...")

    # Drop table if it already exists
    if table_exists(cursor, table_name):
        drop_table_forcefully(cursor, conn, table_name)

    print(f"Creating table {table_name} with columns: {df.columns}")

    # Define columns
    columns = ['"METRICS" VARCHAR2(255)']
    for col in df.columns[1:]:
        if df[col].dropna().apply(lambda x: isinstance(x, (int, float))).all():
            columns.append(f'"{col.upper()}" FLOAT')
        else:
            columns.append(f'"{col.upper()}" VARCHAR2(255)')

    create_sql = f'CREATE TABLE "{table_name}" ({", ".join(columns)})'


    # Create new table
    cursor.execute(create_sql)
    conn.commit()  # Ensure table creation is saved
    print(f"Created table: {table_name}")



def insert_data(cursor, conn, table_name, df):
    """Inserts cleaned data into the table, ensuring NaN is properly converted to NULL."""
    
    table_name = table_name.upper()  # Ensure table name matches Oracle’s case

    # Fetch column names from Oracle
    cursor.execute(f"SELECT COLUMN_NAME FROM ALL_TAB_COLUMNS WHERE TABLE_NAME = '{table_name}' ORDER BY COLUMN_ID")
    db_columns = [row[0] for row in cursor.fetchall()]

    if not db_columns:
        print(f"Skipping {table_name} - Table does not exist.")
        return

    # Ensure column names match Oracle's uppercase format
    df = df.rename(columns=lambda x: x.upper())

    # Convert 'N/A' and empty strings to NaN (which will become NULL in Oracle)
    df.replace(["N/A", ""], np.nan, inplace=True)

    # Convert numeric columns to FLOAT (force invalid strings to NaN)
    for col in df.columns:
        if col in db_columns and col != "METRICS":  # Exclude the 'METRICS' column
            df[col] = pd.to_numeric(df[col], errors="coerce")  # Convert invalid strings to NaN

    # 🔹 Ensure all NaN values are explicitly converted to None (for Oracle NULL)
    df = df.astype(object).where(pd.notna(df), None)

    print(f"Inserting into {table_name} with columns: {db_columns}")  # Debugging

    # Prepare SQL statement
    cols = '", "'.join(db_columns)
    placeholders = ", ".join([f":{i+1}" for i in range(len(db_columns))])
    insert_sql = f'INSERT INTO "{table_name}" ("{cols}") VALUES ({placeholders})'

    for _, row in df.iterrows():
        values = []
        for val in row:
            if isinstance(val, (int, float)):  # If it's numeric, ensure it's float
                values.append(float(val))
            elif val is None:  # Ensure None is correctly inserted as NULL
                values.append(None)
            else:
                values.append(str(val))  # Ensure text columns remain strings

        # Ensure number of values matches columns
        if len(values) == len(db_columns):
            try:
                cursor.execute(insert_sql, values)
            except oracledb.DatabaseError as e:
                print(f"❌ Error inserting row: {values}")
                print(f"Oracle Error: {e}")
                continue  # Instead of skipping, continue inserting valid rows

    conn.commit()
    print(f"✅ Inserted data into: {table_name}")


# Process each file
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    company_name = file.replace(".xlsx", "").lower()

    with pd.ExcelFile(file_path, engine="openpyxl") as xls:
        sheet_names = xls.sheet_names

        for sheet in sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet, engine="openpyxl")

            table_name = format_table_name(company_name, sheet)

            df.rename(columns={df.columns[0]: "Metrics"}, inplace=True)

            create_table(cursor, conn, table_name, df)

            insert_data(cursor, conn, table_name, df)

# Close the connection
cursor.close()
conn.close()

print("All files and sheets uploaded successfully.")


Checking if table META_INCOME_QUARTERLY exists...
Attempting to drop table META_INCOME_QUARTERLY (Attempt 1/5)
✅ Successfully dropped table META_INCOME_QUARTERLY
Creating table META_INCOME_QUARTERLY with columns: Index(['metrics', 'q3_2024', 'q2_2024', 'q1_2024', 'q4_2023', 'q3_2023',
       'q2_2023', 'q1_2023', 'q4_2022', 'q3_2022'],
      dtype='object')
Created table: META_INCOME_QUARTERLY
Inserting into META_INCOME_QUARTERLY with columns: ['METRICS', 'Q3_2024', 'Q2_2024', 'Q1_2024', 'Q4_2023', 'Q3_2023', 'Q2_2023', 'Q1_2023', 'Q4_2022', 'Q3_2022']
✅ Inserted data into: META_INCOME_QUARTERLY
Checking if table META_BALANCE_SHEET_QUARTERLY exists...
Attempting to drop table META_BALANCE_SHEET_QUARTERLY (Attempt 1/5)
✅ Successfully dropped table META_BALANCE_SHEET_QUARTERLY
Creating table META_BALANCE_SHEET_QUARTERLY with columns: Index(['metrics', 'q3_2024', 'q2_2024', 'q1_2024', 'q4_2023', 'q3_2023',
       'q2_2023', 'q1_2023', 'q4_2022', 'q3_2022'],
      dtype='object')
Created t

In [53]:
import os
import oracledb

# Oracle DB connection details
user = "ADMIN"
password = "Passwordtestdb@1"
dsn = "testdb_medium"
wallet_location = r"/Users/vexy/Documents/Wallet_testdb/"

# Connect to Oracle DB
conn = oracledb.connect(
    user=user,
    password=password,
    dsn=dsn,
    config_dir=wallet_location,
    wallet_location=wallet_location,
    wallet_password=password
)
cursor = conn.cursor()

# Output directory for DDL files
output_dir = "/Users/vexy/Documents/Oracle_DDLs"
os.makedirs(output_dir, exist_ok=True)

# Extract all table DDLs
cursor.execute("SELECT TABLE_NAME, DBMS_METADATA.GET_DDL('TABLE', TABLE_NAME) FROM USER_TABLES")
ddl_data = cursor.fetchall()

# Dictionary to store DDLs by company
company_ddls = {}

# Save all DDLs in a single file
all_ddl_file = os.path.join(output_dir, "all_tables.sql")
with open(all_ddl_file, "w", encoding="utf-8") as all_file:
    for table_name, ddl in ddl_data:
        ddl_text = ddl.read()  # Convert CLOB to string
        company_name = table_name.split("_")[0]  # Extract first word

        # Save DDL into the dictionary for respective company
        if company_name not in company_ddls:
            company_ddls[company_name] = []
        company_ddls[company_name].append(ddl_text)

        # Write to the all_tables.sql file
        all_file.write(f"-- DDL for {table_name}\n")
        all_file.write(ddl_text + "\n\n")

print(f"Saved all table DDLs to: {all_ddl_file}")

# Create individual .sql files for each company
for company, ddls in company_ddls.items():
    company_file = os.path.join(output_dir, f"{company}_ddl.sql")
    with open(company_file, "w", encoding="utf-8") as company_sql:
        for ddl in ddls:
            company_sql.write(ddl + "\n\n")
    print(f"Saved {company} DDLs to: {company_file}")

# Close the connection
cursor.close()
conn.close()

print("DDL extraction and file generation complete.")


Saved all table DDLs to: /Users/vexy/Documents/Oracle_DDLs/all_tables.sql
Saved jpm DDLs to: /Users/vexy/Documents/Oracle_DDLs/jpm_ddl.sql
Saved vz DDLs to: /Users/vexy/Documents/Oracle_DDLs/vz_ddl.sql
Saved amzn DDLs to: /Users/vexy/Documents/Oracle_DDLs/amzn_ddl.sql
Saved meta DDLs to: /Users/vexy/Documents/Oracle_DDLs/meta_ddl.sql
Saved shel DDLs to: /Users/vexy/Documents/Oracle_DDLs/shel_ddl.sql
Saved mcd DDLs to: /Users/vexy/Documents/Oracle_DDLs/mcd_ddl.sql
Saved ko DDLs to: /Users/vexy/Documents/Oracle_DDLs/ko_ddl.sql
Saved goog DDLs to: /Users/vexy/Documents/Oracle_DDLs/goog_ddl.sql
Saved tsla DDLs to: /Users/vexy/Documents/Oracle_DDLs/tsla_ddl.sql
Saved nflx DDLs to: /Users/vexy/Documents/Oracle_DDLs/nflx_ddl.sql
Saved spgi DDLs to: /Users/vexy/Documents/Oracle_DDLs/spgi_ddl.sql
Saved hsbc DDLs to: /Users/vexy/Documents/Oracle_DDLs/hsbc_ddl.sql
Saved ma DDLs to: /Users/vexy/Documents/Oracle_DDLs/ma_ddl.sql
Saved amd DDLs to: /Users/vexy/Documents/Oracle_DDLs/amd_ddl.sql
Saved 

In [None]:
import os
import re

# Define file paths
ddl_folder = "/path/to/ddl/files"  # Update with the correct path
metrics_file_path = "/path/to/metrics_values.txt"  # Update with the correct path

# Read the metrics file
with open(metrics_file_path, "r", encoding="utf-8") as f:
    metrics_content = f.read()

# Extract metric values grouped by table names
metrics_data = {}
matches = re.findall(r"-- Metrics for (\S+)\n([\s\S]+?)(?=\n-- Metrics for |\Z)", metrics_content)

for table_name, values in matches:
    table_name = table_name.strip().upper()  # Convert to uppercase for matching
    metrics_data[table_name] = [line.strip() for line in values.split("\n") if line.strip()]

# Process each DDL file
for ddl_file in os.listdir(ddl_folder):
    if ddl_file.endswith(".sql"):
        file_path = os.path.join(ddl_folder, ddl_file)

        # Read DDL content
        with open(file_path, "r", encoding="utf-8") as f:
            ddl_lines = f.readlines()

        cleaned_ddl = []
        in_insert_block = False

        for line in ddl_lines:
            # Remove old INSERT INTO statements
            if re.search(r'INSERT INTO "ADMIN"\."(\S+)"', line, re.IGNORECASE):
                in_insert_block = True
                continue  # Skip old insert statements
            if in_insert_block and line.strip() == "":  # End of insert block
                in_insert_block = False
                continue
            if not in_insert_block:
                cleaned_ddl.append(line)  # Keep everything else

        updated_ddl = []
        current_table = None

        for line in cleaned_ddl:
            updated_ddl.append(line)

            # Detect CREATE TABLE statements and extract table name
            match = re.search(r'CREATE TABLE "ADMIN"\."(\S+)"', line, re.IGNORECASE)
            if match:
                current_table = match.group(1).upper()  # Convert table name to uppercase for matching

                # Ensure CREATE TABLE uses uppercase table names
                updated_ddl[-1] = re.sub(r'CREATE TABLE "ADMIN"\."(\S+)"', 
                                         f'CREATE TABLE "ADMIN"."{current_table}"', 
                                         updated_ddl[-1], flags=re.IGNORECASE)

                if current_table in metrics_data:
                    # Construct a single INSERT statement with all values inside ""
                    values_str = ",\n".join([f'("{value}")' for value in metrics_data[current_table]])

                    insert_stmt = f'\n-- Insert values into the Metrics column for {current_table}\n'
                    insert_stmt += f'INSERT INTO "ADMIN"."{current_table}" ("Metrics") VALUES\n{values_str};\n'

                    updated_ddl.append(insert_stmt)  # Insert statement directly after CREATE TABLE

        # Save the cleaned and updated DDL file
        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(updated_ddl)

        print(f"✅ Successfully updated {ddl_file} with corrected INSERT statements.")

print("🎯 All DDLs processed successfully.")


✅ Successfully updated meta_ddl.sql with corrected INSERT statements.
✅ Successfully updated hsbc_ddl.sql with corrected INSERT statements.
✅ Successfully updated t_ddl.sql with corrected INSERT statements.
✅ Successfully updated ko_ddl.sql with corrected INSERT statements.
✅ Successfully updated amzn_ddl.sql with corrected INSERT statements.
✅ Successfully updated shel_ddl.sql with corrected INSERT statements.
✅ Successfully updated mcd_ddl.sql with corrected INSERT statements.
✅ Successfully updated goog_ddl.sql with corrected INSERT statements.
✅ Successfully updated spgi_ddl.sql with corrected INSERT statements.
✅ Successfully updated jpm_ddl.sql with corrected INSERT statements.
✅ Successfully updated all_tables.sql with corrected INSERT statements.
✅ Successfully updated DBTOOLS$EXECUTION_ddl.sql with corrected INSERT statements.
✅ Successfully updated nflx_ddl.sql with corrected INSERT statements.
✅ Successfully updated vz_ddl.sql with corrected INSERT statements.
✅ Successfully

In [76]:
cursor.close()
conn.close()

InterfaceError: DPY-1006: cursor is not open