<h1 style="color:pink;">📊 Flatfile Data ETL Script</h1>
<h3 style="color:purple;">This script involves: </h3>

        🔍 Detecting & Handling Excel File Formats
        🔐 Decrypting Encrypted Files
        📐 Analyzing Workbook Structures
        🔄 Consolidating Data into DataFrames

=========================================================================================================================

# Code Viewing Settings

In [None]:
import pandas as pd
import csv
import sas7bdat

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Optional display tweaks, disabled by default. Uncomment as needed.
# The np.set_printoptions line additionally requires `import numpy as np`,
# which this cell does not do.
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.width', None)
#np.set_printoptions(threshold=np.inf)
#pd.set_option('display.float_format', lambda x: '%.5f' % x)

=========================================================================================================================

#    Read delimited flat files

In [None]:
from pathlib import Path
import csv

def read_and_combine_csv(master_path, pattern='*.csv', encoding='utf-8-sig'):
    """Recursively read delimited files and stack those that share a header layout.

    Every file below ``master_path`` whose name matches ``pattern`` is read as
    comma-delimited text with all values kept as strings. Each file's frame
    gets a ``DTT_ID`` column (1..n, restarting per file) inserted first and a
    ``DTT_FILENAME`` column (file name without directory) appended last.
    Frames whose full column tuple is identical are concatenated row-wise.

    Parameters
    ----------
    master_path : str or pathlib.Path
        Root folder to search recursively.
    pattern : str, default '*.csv'
        Glob pattern passed to ``Path.rglob``.
    encoding : str, default 'utf-8-sig'
        Text encoding. ``utf-8-sig`` reads plain UTF-8 and also strips a
        leading byte-order mark, so a BOM no longer ends up inside the first
        column name and splits otherwise identical files into separate groups.

    Returns
    -------
    dict[tuple[str, ...], pandas.DataFrame]
        Maps each distinct column tuple (including the two added columns) to
        the combined DataFrame for that layout. Empty when no file matches.
    """
    frames_by_columns = {}

    # rglob yields files in arbitrary order; sorting makes the row order and
    # the order of the returned groups reproducible between runs.
    for csv_file in sorted(Path(master_path).rglob(pattern)):
        df = pd.read_csv(csv_file, encoding=encoding, index_col=False, header=0,
                         dtype=str, delimiter=',', quoting=csv.QUOTE_MINIMAL)

        # Add ID column at the beginning
        df.insert(0, 'DTT_ID', range(1, len(df) + 1))

        # Add Filename column at the end
        df['DTT_FILENAME'] = csv_file.name

        # Group frames by their exact column layout
        frames_by_columns.setdefault(tuple(df.columns), []).append(df)

    # Concatenate once per layout instead of once per file, which avoids
    # repeatedly copying the growing frame.
    return {
        columns: frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)
        for columns, frames in frames_by_columns.items()
    }

# Usage
# Path(r'') is a placeholder: an empty path resolves to the current working
# directory, so the recursive search starts there until a real folder is set.
master_folder_path = Path(r'')
# Dict keyed by column-name tuple; one combined DataFrame per distinct layout.
combined_dataframes = read_and_combine_csv(master_folder_path)

# Displaying combined dataframes
# For each layout group: a two-row preview followed by its total row count.
for i, df in enumerate(combined_dataframes.values()):
    print(f"DataFrame {i+1} Preview:")
    display(df.head(2))
    print(f"Total rows in DataFrame {i+1}: {len(df)}\n")

In [None]:
# Materialize the grouped DataFrames as a list so they can be picked by position.
dataframes_list = list(combined_dataframes.values())

# Access the second DataFrame (remember, Python uses 0-based indexing)
#second_dataframe = dataframes_list[1]

# Preview the first DataFrame (index 0). Guarded so an empty load reports the
# problem instead of failing with IndexError.
if dataframes_list:
    print("DataFrame Preview:")
    display(dataframes_list[0].head(2))
    print(f"Total rows in first DataFrame: {len(dataframes_list[0])}\n")
else:
    print("No DataFrames were loaded; check master_folder_path.")

    Remove double quotes and trim leading/trailing whitespace

In [None]:
def double_quotes_and_trim_dataframes(all_dfs):
    """Return cleaned copies of the given DataFrames.

    In every object-dtype column, each string value has all double-quote
    characters removed and surrounding whitespace stripped. Values that are
    not ``str`` instances are passed through unchanged. The input frames are
    not modified; a new list of copies is returned in the same order.
    """
    def _scrub(value):
        # Only real strings are cleaned; anything else is returned untouched.
        if isinstance(value, str):
            return value.replace('"', '').strip()
        return value

    cleaned_frames = []
    for frame in all_dfs:
        # Work on a copy so the caller's DataFrame stays as it was.
        result = frame.copy()
        for name in result.select_dtypes(['object']).columns:
            result.loc[:, name] = result[name].apply(_scrub)
        cleaned_frames.append(result)

    return cleaned_frames

# Apply the cleaning process to all DataFrames
# The result is a new list of cleaned copies; dataframes_list itself is left unchanged.
trimmed_all_dfs = double_quotes_and_trim_dataframes(dataframes_list)

In [None]:
# First layout group, taken from dataframes_list.
# NOTE(review): this is the un-cleaned frame; the quote-stripped copies are in
# trimmed_all_dfs. Confirm which one is meant to feed the combine step.
df1=dataframes_list[0]

In [None]:
# Second layout group; raises IndexError when the load produced only one distinct column layout.
df2=dataframes_list[1]

In [None]:
# %%time

# cols=df1.select_dtypes(['object']).columns
# df1[cols]=df1[cols].apply(lambda x: x.str.replace('"','').str.strip())

In [None]:
import pandas as pd

# DataFrames to align and stack (one per distinct source layout)
dfs = [df1, df2]

# Define the exact target column order and names
target_columns = [
    'DTT_ID', 'Profit Centre', 'Engagement Partner', 'Engagement Manager',
       'Client Code', 'Client Name', 'WBS Level 2', 'Engagement Name',
       'Engagement Type', 'Contract Type', 'Total Hours',
       'Gross Services Revenue', 'Gross Revenue ADM',
       'Engagement Service Revenue', 'Event / Success Achieved Revenue',
       'Expenses Over / Under Recovery Revenue',
       'Unplanned Revenue Adjustments', 'Net Services Revenue', 'ADM Revenue',
       'Product Revenue', 'Asset Revenue', 'Other Revenue', 'Net Revenue',
       'Total Cost', 'Client Margin', 'Client Margin %', 'Net Rev/Hour',
       'Standard Recovery %', 'Adjusted Net Revenue',
       'Adjusted Client Margin %', 'Financial Year', 'Month', 'Period Type',
       'DTT_FILENAME'
]

# Standardize column names and align order
for i in range(len(dfs)):
    df = dfs[i]

    # Rename 'Year' to 'Financial Year' if present. Skipped when the frame
    # already has 'Financial Year', because renaming would then create two
    # columns with the same label.
    if 'Year' in df.columns and 'Financial Year' not in df.columns:
        df = df.rename(columns={'Year': 'Financial Year'})

    # Keep only target columns, in target order (other columns are dropped)
    df = df[[col for col in target_columns if col in df.columns]]

    dfs[i] = df

# Combine all aligned DataFrames
df_combined_all = pd.concat(dfs, ignore_index=True)

# Report target columns that no input provided, rather than failing with a
# KeyError on the final column selection.
missing_columns = [col for col in target_columns if col not in df_combined_all.columns]
if missing_columns:
    print(f"Warning: target columns missing from every input (filled with NaN): {missing_columns}")

# Final alignment to ensure column sequence; reindex adds any missing target
# column as all-NaN so the output always has the full target layout.
df_combined_all = df_combined_all.reindex(columns=target_columns)

# Optional: quick preview
display(df_combined_all.head(2))
print(f"Total combined rows: {len(df_combined_all)}")

    Read spotlight configuration files

In [None]:
# import re
# import os
# import glob

# log_directory = r''

# log_file_pattern = os.path.join(log_directory, '*.splog')

# def calculate_row_count(log_file_path):
#     total_row_count = 0
#     row_count_pattern = re.compile(r'Returned (\d+) rows from query JOURNALS')

#     with open(log_file_path, 'r') as log_file:
#         for line in log_file:
#             match = row_count_pattern.search(line)
#             if match:
#                 row_count = int(match.group(1))
#                 total_row_count += row_count

#     return total_row_count

# row_counts_per_file = {}

# log_files = glob.glob(log_file_pattern)

# for log_file_path in log_files:
#     row_count = calculate_row_count(log_file_path)
#     row_counts_per_file[log_file_path] = row_count

# total_row_count_all_files = sum(row_counts_per_file.values())

# print('Row count per log file:')
# for log_file_path, row_count in row_counts_per_file.items():
#     print(f'{log_file_path}: {row_count}')

# print(f'\nTotal row count across all log files: {total_row_count_all_files}')

In [None]:
# import xml.etree.ElementTree as ET
# import html

# # Define the file path
# file_path = r''

# # Parse the XML file
# tree = ET.parse(file_path)
# root = tree.getroot()

# # Function to decode and parse the embedded query XML
# def decode_and_print_query_xml(encoded_query_xml):
#     # Decode the HTML entities in the XML string
#     decoded_xml = html.unescape(encoded_query_xml)
    
#     # Parse the decoded XML string
#     query_tree = ET.ElementTree(ET.fromstring(decoded_xml))
#     query_root = query_tree.getroot()
    
#     # Print the parsed query details
#     for child in query_root:
#         print(f"{child.tag}: {child.text}")
#         if len(child):
#             for subchild in child:
#                 print(f"  {subchild.tag}: {subchild.text}")
#                 if len(subchild):
#                     for subsubchild in subchild:
#                         print(f"    {subsubchild.tag}: {subsubchild.text}")
#     print("---")

# # Find the 'Query' elements and process the 'msprop:AdvancedQueryXML' attribute
# for query in root.findall('.//xs:element[@name="Query"]', namespaces={'xs': 'http://www.w3.org/2001/XMLSchema'}):
#     advanced_query_xml = query.get('{urn:schemas-microsoft-com:xml-msprop}AdvancedQueryXML')
#     if advanced_query_xml:
#         decode_and_print_query_xml(advanced_query_xml)

In [None]:
# import xml.etree.ElementTree as ET

# # Define the file path
# file_path = r''

# # Parse the XML file
# tree = ET.parse(file_path)
# root = tree.getroot()

# # Function to print the content of a query element in a readable format
# def print_query(query):
#     name = query.find('Name').text if query.find('Name') is not None else 'N/A'
#     field_name = query.find('Field_x0020_Name').text if query.find('Field_x0020_Name') is not None else 'N/A'
#     description = query.find('Description').text if query.find('Description') is not None else 'N/A'
    
#     print(f"Name: {name}")
#     print(f"Field Name: {field_name}")
#     print(f"Description: {description}")
#     print("---")

# # Iterate through the XML tree and print each Query element
# for query in root.findall('Query'):
#     print_query(query)

    Read txt/csv in one go

In [None]:
# from pathlib import Path
# import pandas as pd
# import csv

# def read_and_combine_files(master_path):
#     # Find both .txt and .csv files recursively
#     all_data_files = list(master_path.rglob('*.txt')) + list(master_path.rglob('Hierachy JAN.csv'))
#     combined_dfs = {}

#     for file in all_data_files:
#         try:
#             # Determine delimiter based on file extension
#             delimiter = '\t' if file.suffix.lower() == '.txt' else ','
            
#             df = pd.read_csv(file,
#                              encoding='utf-8',
#                              index_col=False,
#                              header=0,
#                              dtype=str,
#                              delimiter=delimiter,
#                              quoting=csv.QUOTE_MINIMAL)
            
#             # Insert ID and filename
#             if 'DTT_ID' not in df.columns:
#                 df.insert(0, 'DTT_ID', range(1, len(df) + 1))

#             df['DTT_FILENAME'] = file.name
            
#             columns_tuple = tuple(df.columns)
#             if columns_tuple in combined_dfs:
#                 combined_dfs[columns_tuple] = pd.concat([combined_dfs[columns_tuple], df], ignore_index=True)
#             else:
#                 combined_dfs[columns_tuple] = df
#         except Exception as e:
#             print(f"Error processing file {file.name}: {e}")

#     return combined_dfs

# # Usage
# master_folder_path = Path(r'C:\Users\ele\OneDrive - Deloitte (O365D)\Projects\IB_Asset\Operate Revenue Reporting\12.MAY\PC')
# combined_dataframes = read_and_combine_files(master_folder_path)

# # Display previews
# for i, df in enumerate(combined_dataframes.values()):
#     print(f"DataFrame {i+1} Preview:")
#     display(df.head(2))
#     print(f"Total rows in DataFrame {i+1}: {len(df)}\n")

    Read SAS format

In [None]:
# from pathlib import Path
# import pyreadstat
# import pandas as pd

# def read_and_combine_csv(master_path):
#     all_sas_files = master_path.rglob('*.sas7bdat')  # Recursively find all SAS files
#     combined_dfs = {}  # Dictionary to store combined DataFrames

#     for sas_file in all_sas_files:
#         # Read the SAS file with pyreadstat
#         df, meta = pyreadstat.read_sas7bdat(sas_file)

#         # Convert all values to strings in a way that ensures no floating point transformation
#         df = df.applymap(lambda x: '{:.0f}'.format(x) if isinstance(x, float) and x.is_integer() else str(x))

#         # Add ID column at the beginning
#         df.insert(0, 'DTT_ID', range(1, len(df) + 1))

#         # Add Filename column at the end
#         df['DTT_FILENAME'] = sas_file.name

#         columns_tuple = tuple(df.columns)  # Tuple of column names for comparison

#         if columns_tuple in combined_dfs:
#             # Append to the existing DataFrame with the same columns
#             combined_dfs[columns_tuple] = pd.concat([combined_dfs[columns_tuple], df], ignore_index=True)
#         else:
#             # Create a new entry in the dictionary for these columns
#             combined_dfs[columns_tuple] = df

#     return combined_dfs

# # Usage
# master_folder_path = Path(r'C:\Users\ele\Downloads\20241016 Data request - Service Item Audit (1)')
# combined_dataframes = read_and_combine_csv(master_folder_path)

# # Displaying combined dataframes
# for i, df in enumerate(combined_dataframes.values()):
#     print(f"DataFrame {i+1} Preview:")
#     display(df.head(5))
#     print(f"Total rows in DataFrame {i+1}: {len(df)}\n")

    Read parquet format

In [None]:
# from pathlib import Path
# import pandas as pd

# def read_and_combine_parquet(master_path):
#     all_parquet_files = master_path.rglob('*.parquet')  # Recursively find all Parquet files
#     combined_dfs = {}  # Dictionary to store combined DataFrames

#     for parquet_file in all_parquet_files:
#         df = pd.read_parquet(parquet_file, engine='pyarrow')
#         df = df.astype(str)
        
#         # Add ID column at the beginning
#         df.insert(0, 'DTT_ID', range(1, len(df) + 1))

#         # Add Filename column at the end
#         df['DTT_FILENAME'] = parquet_file.name

#         columns_tuple = tuple(df.columns)  # Tuple of column names for comparison

#         if columns_tuple in combined_dfs:
#             # Append to the existing DataFrame with the same columns
#             combined_dfs[columns_tuple] = pd.concat([combined_dfs[columns_tuple], df], ignore_index=True)
#         else:
#             # Create a new entry in the dictionary for these columns
#             combined_dfs[columns_tuple] = df

#     return combined_dfs

# # Usage
# master_folder_path = Path(r'C:\Users\ele\Downloads')
# combined_dataframes = read_and_combine_parquet(master_folder_path)

# # Displaying combined dataframes
# for i, df in enumerate(combined_dataframes.values()):
#     print(f"DataFrame {i+1} Preview:")
#     display(df.head(5))
#     print(f"Total rows in DataFrame {i+1}: {len(df)}\n")

    Read fixed-width files

In [None]:
# import pandas as pd
# from pathlib import Path

# def read_and_combine_fwf(master_path, colspecs): #, colnames
#     all_fwf_files = master_path.rglob('*.csv')  # Replace with the correct extension for your files
#     combined_dfs = {}  # Dictionary to store combined DataFrames

#     for fwf_file in all_fwf_files:
#         df = pd.read_fwf(fwf_file, colspecs=colspecs, header=0, dtype=str) #, names=colnames

#         # Add ID column at the beginning
#         df.insert(0, 'DTT_ID', range(1, len(df) + 1))

#         # Add Filename column at the end
#         df['DTT_FILENAME'] = str(fwf_file.absolute())

#         columns_tuple = tuple(df.columns)  # Tuple of column names for comparison

#         if columns_tuple in combined_dfs:
#             # Append to the existing DataFrame with the same columns
#             combined_dfs[columns_tuple] = pd.concat([combined_dfs[columns_tuple], df], ignore_index=True)
#         else:
#             # Create a new entry in the dictionary for these columns
#             combined_dfs[columns_tuple] = df

#     return combined_dfs

# # Column specifications for your fixed-width files
# colspecs = [(0, 16), (16, 56), (56, 82), (82, 122), (122, 162), (162, 173), (173, 187), (187, 200), (200, 213), (213, 224), (224, 245), (245, 285), (285, 297), (297, 337), (337, 377), (377, 386), (386, 426), (426, 439), (439, 452), (452, 463), (463, 476),(476,1000)]
# #colnames = ['JournalEntryID', 'JounalLineNumber', 'JournalEntryDescription', 'FiscalPeriod', 'FiscalYear', 'PostedDate', 'EffectiveDate', 'UserID', 'StdIndicator', 'GLAcct', 'GLAcctName', 'TransactionAmount', 'CDIndicator', 'DebitAmount', 'CreditAmount', 'Currency', 'LocalAmount', 'SourceSystem', 'DocumentType', 'EntryDate', 'DocumentDate', 'ProfitCenter']

# # Usage
# master_folder_path = Path(r'')
# combined_dataframes = read_and_combine_fwf(master_folder_path, colspecs) #, colnames

# # Displaying combined dataframes
# for i, df in enumerate(combined_dataframes.values()):
#     print(f"DataFrame {i+1} Preview:")
#     display(df.head(5))
#     print(f"Total rows in DataFrame {i+1}: {len(df)}\n")

    Convert the dictionary values to a list

In [None]:
# dataframes_list = list(combined_dataframes.values())

# # Access the second DataFrame (remember, Python uses 0-based indexing)
# #second_dataframe = dataframes_list[1]

# # Now you can work with second_dataframe
# print("DataFrame Preview:")
# display(dataframes_list[0].head(2))
# print(f"Total rows in Second DataFrame: {len(dataframes_list[0])}\n")

In [None]:
# df2=dataframes_list[0]

    Replace the column headers using one of the data rows

In [None]:
# def replace_headers_with_second_row(df):
#     # Ensure DataFrame has at least two rows
#     if len(df) < 2:
#         raise ValueError("DataFrame must have at least two rows to use the second row as headers")

#     # Set the second row as the new header for middle columns
#     middle_columns = df.columns[1:-1]  # Select all columns except the first and last
#     new_header = df.iloc[1, 1:-1]  # Grab the second row for the middle columns header
#     df.iloc[1:, 1:-1].columns = new_header  # Set the new headers for the middle columns

#     # Keep the first and last column headers unchanged
#     df.columns = [df.columns[0]] + list(new_header) + [df.columns[-1]]

#     # Remove the first two rows (original header and the row used as new header)
#     df = df[2:]

#     # Reset the index of the DataFrame
#     df.reset_index(drop=True, inplace=True)
    
#     return df

In [None]:
# dataframes_list[1]=replace_headers_with_second_row(dataframes_list[1])

# print(dataframes_list[1].shape)

# dataframes_list[1].head(2)

In [None]:
# trimmed_all_dfs[0]['DTT_FILENAME'].unique()

In [None]:
# trimmed_all_dfs[0] = trimmed_all_dfs[0].rename(columns={
#     'ï»¿DataID': 'DataID'
# })

In [None]:
# trimmed_all_dfs[0]=trimmed_all_dfs[0][trimmed_all_dfs[0]['DataID'].notna()]
# print(trimmed_all_dfs[0].shape)
# trimmed_all_dfs[0].head(2)

In [None]:
# union_df = pd.concat([df_combined, df_2], ignore_index=True)

# print(union_df.shape)
# union_df.head(2)

In [None]:
# trimmed_all_dfs[0]=union_df

    Change column name

In [None]:
# rename_dict = {
#     'ï»¿Profit Centre': 'Profit Centre',
# #    'OldName2': 'NewName2',
#     # Add more columns as needed
# }

# for df in trimmed_all_dfs:
#     df.rename(columns=rename_dict, inplace=True)

# trimmed_all_dfs[0].head(2)

    If the column names follow a certain pattern, the following code can be used

In [None]:
# import re

# def rename_columns_by_pattern(df, pattern, replacement):
#     new_column_names = {col: re.sub(pattern, replacement, col) for col in df.columns}
#     df.rename(columns=new_column_names, inplace=True)

# pattern = r'Old'  # Regex pattern to match in the column names
# replacement = 'New'  # Replacement string

# for df in trimmed_all_dfs:
#     rename_columns_by_pattern(df, pattern, replacement)

# Upload data

In [None]:
# import pyodbc
# from datetime import datetime
# from sqlalchemy import create_engine

# # Server and database configuration
# server = ''
# database = ''
# connection_string = f"mssql+pyodbc://@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"

# # Create SQL Alchemy engine
# engine = create_engine(connection_string, fast_executemany=True)

# def upload_dataframe_to_sql(df, table_name, initials):
#     # Define the full table name with date and initials
#     today = datetime.now().strftime('%Y%m%d')
#     full_table_name = f'{table_name}_{today}_{initials}'

#     # Upload the DataFrame to SQL Server
#     df.to_sql(full_table_name, engine, if_exists='replace', index=False, chunksize=10000)

#     # Return the number of rows uploaded and the full table name
#     return len(df), full_table_name


# # Table name mapping
# table_name_mapping = {
#     0: 'QA_TEST_CASE',

#     # Add mappings as needed
#     # i: 'TableName' i is index of each dataframe
# }

# initials = "EL"  # Replace with the desired initials

# # Specify the index from which to start uploading
# start_from_index = 0  # Change this to start from a different DataFrame
# end_from_index = 1   # Change this to end at a different DataFrame

# # Loop through trimmed_all_dfs and upload each DataFrame within the specified range
# for i, df in enumerate(trimmed_all_dfs[start_from_index:end_from_index], start=start_from_index):
#     base_table_name = table_name_mapping.get(i, f'{i}')  # Default name if not in mapping
#     row_count, full_table_name = upload_dataframe_to_sql(df, base_table_name, initials)
#     print(f"Uploaded {row_count} rows to table {full_table_name}")


In [None]:
import re
import pyodbc
from datetime import datetime
from sqlalchemy import create_engine

# Server and database configuration
# Both values are blank placeholders and must be filled in before this cell can connect.
server = ''
database = ''
# SQLAlchemy URL for SQL Server via pyodbc: no user/password in the URL,
# ODBC Driver 17, and trusted_connection=yes.
connection_string = f"mssql+pyodbc://@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server&trusted_connection=yes"

# Create SQL Alchemy engine
# This module-level engine is the one the helper functions below use;
# fast_executemany=True is passed through to create_engine.
engine = create_engine(connection_string, fast_executemany=True)

def _validate_schema_name(schema: str) -> str:
    if schema is None:
        return None
    if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", schema):
        return schema
    raise ValueError("Invalid schema name. Use letters, numbers, and underscores; cannot start with a number.")

def ensure_schema_exists(schema: str):
    """Create the given SQL Server schema when it is not already present.

    Does nothing for a falsy ``schema`` (``None`` or empty string). Otherwise
    the name is validated with ``_validate_schema_name`` (which raises
    ``ValueError`` for an invalid name) and a conditional ``CREATE SCHEMA``
    statement is run in a transaction on the module-level ``engine``.
    """
    if schema:
        # The name is interpolated into the statement text, so it is
        # validated as a plain identifier first.
        safe_schema = _validate_schema_name(schema)
        create_if_missing = f"""
IF NOT EXISTS (SELECT 1 FROM sys.schemas WHERE name = '{safe_schema}')
    EXEC('CREATE SCHEMA [{safe_schema}] AUTHORIZATION [dbo]');
"""
        with engine.begin() as connection:
            connection.exec_driver_sql(create_if_missing)

def upload_dataframe_to_sql(df, table_name, initials, schema: str = None, prefix: str = "RAW"):
    """Write ``df`` to a date-stamped SQL Server table, replacing any existing one.

    The table is named ``{prefix}_{table_name}_{YYYYMMDD}_{initials}`` using
    today's date. When ``schema`` is given it is created first if missing and
    the table is written into it. The write runs in one transaction on the
    module-level ``engine`` in chunks of 10000 rows, without the index.

    Returns
    -------
    tuple[int, str]
        Row count of ``df`` and the table name, schema-qualified when a
        schema was supplied.

    Raises
    ------
    ValueError
        If ``df`` is ``None``, or its column labels contain a missing value
        or duplicates.
    """
    # Reject frames that cannot be mapped cleanly onto table columns.
    if df is None:
        raise ValueError("DataFrame is None.")
    column_labels = df.columns
    labels_unusable = column_labels.hasnans or column_labels.duplicated().any()
    if labels_unusable:
        raise ValueError("DataFrame contains columns with None or duplicate names.")

    # Unqualified table name; the schema is passed to to_sql separately.
    today = datetime.now().strftime('%Y%m%d')
    base_table_name = f'{prefix}_{table_name}_{today}_{initials}'

    # No-op when schema is None or empty.
    ensure_schema_exists(schema)

    write_options = dict(schema=schema, if_exists='replace', index=False, chunksize=10000)
    with engine.begin() as conn:
        df.to_sql(base_table_name, conn, **write_options)

    if schema:
        return len(df), f"{schema}.{base_table_name}"
    return len(df), base_table_name

initials = "EL"  # Replace with the desired initials

# Define the base table name
base_table_name = 'REVENUE'  # Replace with your desired base table name or use a mapped name

# Choose the target schema (e.g., 'INPUT', 'OUTPUT', 'ANALYSIS'); set to None to use dbo/default
target_schema = 'INPUT'

# Upload the DataFrame to SQL Server
# The globals() check lets this cell run even if the combine cell was skipped;
# in that case (or when the variable is None) nothing is uploaded.
if 'df_combined_all' in globals() and df_combined_all is not None:
    try:
        row_count, full_table_name = upload_dataframe_to_sql(df_combined_all, base_table_name, initials, schema=target_schema)
        print(f"Uploaded {row_count} rows to table {full_table_name}")
    except ValueError as e:
        # Only validation failures (bad frame, bad column labels, bad schema
        # name) are reported here; other errors propagate.
        print(f"Error: {e}")
else:
    # Covers both "variable not defined" and "variable is None".
    print("df_combined_all is not defined or is None; nothing was uploaded.")

# The end