In [1]:
import pandas as pd
from typing import Union, Tuple, List, Dict
import logging
from datetime import datetime

# Configure the root logger once for the whole script: INFO and above,
# each record prefixed with its timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def load_excel_file(file_path: str) -> pd.DataFrame:
    """Load an Excel file into a DataFrame with cleaned column headers.

    Byte-order marks (U+FEFF) are removed from the header names and the
    surrounding whitespace is trimmed.

    Args:
        file_path: Path of the workbook handed to ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or an empty DataFrame when reading or header
        clean-up fails for any reason (the failure is logged, not raised).
    """
    try:
        df = pd.read_excel(file_path)
        # Remove the BOM *before* trimming: str.strip() does not consider
        # U+FEFF to be whitespace, so stripping first would leave behind any
        # padding that sits between a leading BOM and the header text.
        df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()
        return df
    except Exception as e:
        # Deliberate best-effort: callers test for an empty frame instead of
        # handling exceptions.
        logging.error(f"Error loading Excel file {file_path}: {str(e)}")
        return pd.DataFrame()

def _cell_text(row: pd.Series, key: str) -> str:
    """Return the cell stored under *key* as text; missing or NaN cells become ''."""
    value = row.get(key, '')
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ''
    return str(value)


def process_table(table_name: str, df: pd.DataFrame) -> pd.DataFrame:
    """Expand one table's metadata row into one output row per description.

    The first row of *df* whose ``'index'`` equals *table_name* is used. Its
    ``'Column'`` cell is split on ``', '`` and its ``'Description'`` cell is
    split on ``'), '`` (after stripping outer parentheses). Each description
    is re-emitted as ``"<n>. <description>)"`` in the ``'Original Column'``
    output column; the column names themselves only determine the row count.

    Args:
        table_name: Value to look up in ``df['index']``. The text before the
            first ``'.'`` (if any) is reported as the database name.
        df: Metadata frame; must contain an ``'index'`` column.

    Returns:
        A DataFrame with the columns ``index``, ``name``, ``database``,
        ``area`` and ``Original Column``. An empty DataFrame is returned when
        the table is not found or has no column/description entries.

    Raises:
        KeyError: If *df* has no ``'index'`` column.
    """
    matches = df[df['index'] == table_name]

    if matches.empty:
        logging.warning(f"No table found with name: {table_name}")
        return pd.DataFrame()

    table_info = matches.iloc[0]

    database = table_name.split('.')[0] if '.' in table_name else ''

    # Blank Excel cells arrive as NaN (a float), which has no .split/.strip;
    # normalise to text first. An empty cell means "no entries" rather than
    # a single empty entry, which is what ''.split(...) would produce.
    column_text = _cell_text(table_info, 'Column')
    description_text = _cell_text(table_info, 'Description').strip('()')
    columns = column_text.split(', ') if column_text else []
    descriptions = description_text.split('), ') if description_text else []

    logging.info(f"Columns: {len(columns)}")
    logging.info(f"Descriptions: {len(descriptions)}")

    if len(columns) != len(descriptions):
        logging.warning(f"Mismatch in column and description counts for {table_name}")
        logging.warning(f"Columns: {columns}")
        logging.warning(f"Descriptions: {descriptions}")

        # Use the shorter length so both lists line up.
        min_length = min(len(columns), len(descriptions))
        columns = columns[:min_length]
        descriptions = descriptions[:min_length]

    row_count = len(columns)
    if row_count == 0:
        logging.warning(f"No column metadata available for {table_name}")
        return pd.DataFrame()

    # Every list below has exactly row_count entries by construction, so no
    # separate length-consistency check is needed.
    data = {
        'index': [table_name] * row_count,
        'name': [table_info.get('name', '')] * row_count,
        'database': [database] * row_count,
        'area': [table_info.get('area', '')] * row_count,
        # The split on '), ' removed each closing parenthesis; put it back.
        'Original Column': [f"{i+1}. {desc})" for i, desc in enumerate(descriptions)],
    }

    columns_df = pd.DataFrame(data)

    logging.info(f"Processed table: {table_name}")
    logging.info(f"Total rows: {len(columns_df)}")
    return columns_df

def update_excel_file(new_df: pd.DataFrame, file_name: str):
    """Append *new_df* to the workbook at *file_name*, creating it if absent.

    The existing file is read, *new_df* is concatenated below its rows with a
    fresh index, and the result is written back without the index column.
    When the file does not exist, *new_df* alone is written.
    """
    try:
        previous_rows = pd.read_excel(file_name)
    except FileNotFoundError:
        logging.warning(f"File {file_name} not found. Creating new file.")
        merged = new_df
    else:
        merged = pd.concat([previous_rows, new_df], ignore_index=True)

    merged.to_excel(file_name, index=False)
    logging.info(f"Updated information has been saved to {file_name}")

def main():
    """Interactively export column metadata for a dataset or a single table.

    Prompts for either a 1-based dataset number or a ``Database.table`` name.
    A numeric answer selects "Dataset" mode (``Fortydatasets.xlsx`` /
    ``AnalysedColumns.xlsx``); anything else selects "DB" mode
    (``all_selected_databases_info.xlsx`` / ``AnalysedColumnsDB.xlsx``).
    The expanded per-column rows are appended to ``AllColumnsInfo.xlsx`` and
    the selected dataset/table row to ``AllDatasetsInfo.xlsx``, followed by a
    printed summary. Problems are logged and the function returns early.
    """
    print(f"This script started on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Strip stray whitespace so a pasted table name still matches the sheet.
    desired_dataset_index = input("Insert number for dataset or Database.table for table: ").strip()
    print(desired_dataset_index)

    try:
        desired_dataset_index = int(desired_dataset_index)
        print(f"Dataset#: {desired_dataset_index}")
        DB_or_dataset = "Dataset"
        datasets_file_path = 'Fortydatasets.xlsx'
        analysed_columns_file_path = 'AnalysedColumns.xlsx'
    except ValueError:
        print(f"Database.table: {desired_dataset_index}")
        DB_or_dataset = "DB"
        datasets_file_path = 'all_selected_databases_info.xlsx'
        analysed_columns_file_path = 'AnalysedColumnsDB.xlsx'

    datasets_df = load_excel_file(datasets_file_path)
    # Only checked for presence/non-emptiness below; its contents are not
    # otherwise used by this function.
    analysed_columns_df = load_excel_file(analysed_columns_file_path)

    if datasets_df.empty or analysed_columns_df.empty:
        logging.error("Failed to load necessary Excel files. Exiting.")
        return

    logging.info(f"Columns in datasets_df: {datasets_df.columns.tolist()}")

    if DB_or_dataset == "DB":
        if desired_dataset_index not in datasets_df['index'].values:
            logging.error(f"Table {desired_dataset_index} not found in the dataset")
            return
        combined_df = process_table(desired_dataset_index, datasets_df)
    else:
        # Dataset numbers are 1-based. Reject 0 and negatives explicitly:
        # iloc[-1] would otherwise silently pick the last dataset.
        if not 1 <= desired_dataset_index <= len(datasets_df):
            logging.error(f"Dataset index {desired_dataset_index} is out of range")
            return
        dataset_info = datasets_df.iloc[desired_dataset_index - 1]
        tables = dataset_info['Tables'].split(', ')
        # NOTE(review): each table is looked up in the 'index' column of the
        # same frame the dataset row came from — confirm that file carries
        # per-table rows as well.
        table_frames = [
            process_table(f"{dataset_info['Database']}.{table}", datasets_df)
            for table in tables
        ]
        combined_df = pd.concat([pd.DataFrame(), *table_frames], ignore_index=True)

    if not combined_df.empty:
        update_excel_file(combined_df, "AllColumnsInfo.xlsx")

        new_rows = []
        if DB_or_dataset == "DB":
            # Work on a copy so adding 'database' never writes through to
            # (or warns about) the frame the row was sliced from.
            table_info = datasets_df[datasets_df['index'] == desired_dataset_index].iloc[0].copy()
            table_info['database'] = desired_dataset_index.split('.')[0] if '.' in desired_dataset_index else ''
            new_rows.append(table_info)
        else:
            new_rows.append(dataset_info)

        update_excel_file(pd.DataFrame(new_rows), "AllDatasetsInfo.xlsx")

        print(f"Processing completed for {'dataset' if DB_or_dataset == 'Dataset' else 'database table'}: {desired_dataset_index}")
        print(f"Database extracted: {combined_df['database'].iloc[0]}")
        print("\nFirst few rows of the combined DataFrame:")
        print(combined_df[['index', 'name', 'database', 'area']].head())
        print(f"\nTotal rows in combined DataFrame: {len(combined_df)}")
        print("\nUnique values in 'area' column:")
        print(combined_df['area'].unique())
    else:
        logging.warning("No data processed. Check your input and source files.")

# Run the interactive workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

This script started on: 2024-09-17 12:14:01
tpcds.store_sales
Database.table: tpcds.store_sales


2024-09-17 12:14:05,085 - INFO - Columns in datasets_df: ['index', 'name', 'database', 'area', 'instances', 'attributes', 'Column', 'Description', 'primary_key', 'foreign_keys']
2024-09-17 12:14:05,087 - INFO - Columns: 23
2024-09-17 12:14:05,087 - INFO - Descriptions: 23
2024-09-17 12:14:05,089 - INFO - Processed table: tpcds.store_sales
2024-09-17 12:14:05,090 - INFO - Total rows: 23
2024-09-17 12:14:05,199 - INFO - Updated information has been saved to AllColumnsInfo.xlsx
2024-09-17 12:14:05,244 - INFO - Updated information has been saved to AllDatasetsInfo.xlsx


Processing completed for database table: tpcds.store_sales
Database extracted: tpcds

First few rows of the combined DataFrame:
               index         name database    area
0  tpcds.store_sales  store_sales    tpcds  Retail
1  tpcds.store_sales  store_sales    tpcds  Retail
2  tpcds.store_sales  store_sales    tpcds  Retail
3  tpcds.store_sales  store_sales    tpcds  Retail
4  tpcds.store_sales  store_sales    tpcds  Retail

Total rows in combined DataFrame: 23

Unique values in 'area' column:
['Retail']


EXIT
