In [1]:
import pymysql
import pandas as pd

# Lookup table: database name -> application domain.
# Written grouped by domain and inverted so each domain label appears once.
domain_mapping = {
    database: domain
    for domain, members in {
        "Education": ("stats", "UW_std"),
        "Entertainment": ("imdb_full", "imdb_MovieLens"),
        "Finance": ("ccs", "financial"),
        "Geography": ("Countries", "Mondial"),
        "Government": ("legalActs",),
        "Industry": ("Mesh",),
        "Logistic": ("trains",),
        "Medicine": ("Biodegradability", "Carcinogenesis", "Hepatitis_std", "PTE"),
        "Retail": (
            "classicmodels", "Credit", "ftp", "northwind",
            "sakila", "SalesDB", "tpcds",
        ),
        "Sport": ("AustralianFootball", "Hockey", "lahman_2014"),
    }.items()
    for database in members
}

def get_table_info(host: str, port: int, user: str, password: str,
                   db_name: str, table_name: str) -> dict:
    """Collect summary metadata for one table of a MySQL database.

    Opens a dedicated connection to ``db_name``, queries the table's column
    list, row count, primary key and foreign keys, and always closes the
    connection before returning.

    Args:
        host: Server host name passed to ``pymysql.connect``.
        port: Server port passed to ``pymysql.connect``.
        user: Login name.
        password: Login password.
        db_name: Database (schema) that contains the table.
        table_name: Name of the table to inspect.

    Returns:
        A dict with the keys ``index`` ("<db>.<table>"), ``name``,
        ``database``, ``area`` (looked up in ``domain_mapping``, "Unknown"
        when the database is not listed), ``instances`` (row count),
        ``attributes`` (column count), ``Column`` (comma-separated column
        names), ``Description`` (comma-separated "name (type)" pairs),
        ``primary_key`` (comma-separated key columns or None) and
        ``foreign_keys`` ("col -> table.col" pairs or None).

    Raises:
        Any exception raised by pymysql while connecting or querying is
        propagated to the caller.
    """
    # Identifiers cannot be bound as query parameters, so the table name is
    # backtick-quoted with embedded backticks doubled to keep it one identifier.
    quoted_table = "`" + table_name.replace("`", "``") + "`"
    # In the INFORMATION_SCHEMA queries the names are plain string values, so
    # they are bound as parameters instead of being pasted into the SQL text.
    schema_args = (db_name, table_name)

    connection = pymysql.connect(host=host, port=port, user=user, password=password, database=db_name)

    try:
        with connection.cursor() as cursor:
            # Column information: first two fields of each row are name and type
            cursor.execute(f"DESCRIBE {quoted_table}")
            columns = cursor.fetchall()

            # Row count
            cursor.execute(f"SELECT COUNT(*) FROM {quoted_table}")
            row_count = cursor.fetchone()[0]

            # Primary key column(s), in their declared order within the key.
            # No row is returned when the table has no primary key.
            cursor.execute(
                """
                SELECT GROUP_CONCAT(COLUMN_NAME ORDER BY ORDINAL_POSITION)
                FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
                WHERE TABLE_SCHEMA = %s
                AND TABLE_NAME = %s
                AND CONSTRAINT_NAME = 'PRIMARY'
                GROUP BY TABLE_NAME
                """,
                schema_args,
            )
            primary_keys = cursor.fetchone()

            # Foreign keys: rows that reference another table
            cursor.execute(
                """
                SELECT COLUMN_NAME, REFERENCED_TABLE_NAME, REFERENCED_COLUMN_NAME
                FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
                WHERE TABLE_SCHEMA = %s
                AND TABLE_NAME = %s
                AND REFERENCED_TABLE_NAME IS NOT NULL
                """,
                schema_args,
            )
            foreign_keys = cursor.fetchall()

            table_info = {
                'index': f"{db_name}.{table_name}",
                'name': table_name,
                'database': db_name,
                'area': domain_mapping.get(db_name, "Unknown"),
                'instances': row_count,
                'attributes': len(columns),
                'Column': ", ".join([col[0] for col in columns]),
                'Description': ", ".join([f"{col[0]} ({col[1]})" for col in columns]),
                'primary_key': primary_keys[0] if primary_keys else None,
                'foreign_keys': ", ".join([f"{fk[0]} -> {fk[1]}.{fk[2]}" for fk in foreign_keys]) if foreign_keys else None
            }

            return table_info

    finally:
        connection.close()

# Connection settings for the database server
host, port = 'db.relational-data.org', 3306
user, password = 'guest', 'relational'

# Names of the databases to inspect, in processing order
databases = [
    "stats",
    "UW_std",
    "imdb_full",
    "imdb_MovieLens",
    "ccs",
    "financial",
    "Countries",
    "Mondial",
    "legalActs",
    "Mesh",
    "trains",
    "Biodegradability",
    "Carcinogenesis",
    "Hepatitis_std",
    "PTE",
    "classicmodels",
    "Credit",
    "ftp",
    "northwind",
    "sakila",
    "SalesDB",
    "tpcds",
    "AustralianFootball",
    "Hockey",
    "lahman_2014",
]

# Collect information for all tables in the selected databases.
# A failure on one database or table is reported and skipped so the remaining
# ones are still processed.
all_table_info = []
for db_name in databases:
    # Reset per iteration: if connect() fails, the cleanup below must not see
    # an unbound name or the already-closed connection of a previous database.
    connection = None
    try:
        connection = pymysql.connect(host=host, port=port, user=user, password=password, database=db_name)
        with connection.cursor() as cursor:
            cursor.execute("SHOW TABLES")
            tables = [table[0] for table in cursor.fetchall()]
    except Exception as e:
        print(f"Error connecting to database {db_name}: {str(e)}")
        continue
    finally:
        # The listing connection is only needed for SHOW TABLES;
        # get_table_info opens its own connection for every table.
        if connection is not None:
            connection.close()

    for table_name in tables:
        try:
            table_info = get_table_info(host, port, user, password, db_name, table_name)
            all_table_info.append(table_info)
            print(f"Processed: {db_name}.{table_name}")
        except Exception as e:
            print(f"Error processing {db_name}.{table_name}: {str(e)}")

# Create a DataFrame with one row per processed table
df = pd.DataFrame(all_table_info)

# Save to Excel
df.to_excel("all_selected_databases_info.xlsx", index=False)

print("Table information for all tables in selected databases has been saved to all_selected_databases_info.xlsx")
print(f"Total tables processed: {len(df)}")
print("\nUnique values in the 'area' column:")
# When no table could be processed the DataFrame has no columns at all, so
# indexing 'area' directly would raise KeyError; report an empty list instead.
print(df['area'].unique() if 'area' in df.columns else [])

Processed: stats.badges
Processed: stats.comments
Processed: stats.postHistory
Processed: stats.postLinks
Processed: stats.posts
Processed: stats.tags
Processed: stats.users
Processed: stats.votes
Processed: UW_std.advisedBy
Processed: UW_std.course
Processed: UW_std.person
Processed: UW_std.taughtBy
Processed: imdb_full.actors
Processed: imdb_full.business
Processed: imdb_full.countries
Processed: imdb_full.directors
Processed: imdb_full.distributors
Processed: imdb_full.editors
Processed: imdb_full.genres
Processed: imdb_full.language
Processed: imdb_full.movies
Processed: imdb_full.movies2actors
Processed: imdb_full.movies2directors
Processed: imdb_full.movies2editors
Processed: imdb_full.movies2producers
Processed: imdb_full.movies2writers
Processed: imdb_full.prodcompanies
Processed: imdb_full.producers
Processed: imdb_full.ratings
Processed: imdb_full.runningtimes
Processed: imdb_full.writers
Processed: imdb_MovieLens.actors
Processed: imdb_MovieLens.directors
Processed: imdb_Mov