# EDA

Se comparan los tipos de datos que pandas infiere al leer desde SQLite con los tipos de columna definidos en la base de datos NBA_HoopVision de SQL Server, con el fin de identificar incompatibilidades y mejorar el proceso de ETL.

In [1]:
import sqlite3
import sys
import urllib
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine, inspect

# --- Configuration ---
# Path of the SQLite database file that is inspected.
sqlite_db_file = 'nba.sqlite'

# ODBC driver, server name and database name used to reach SQL Server.
sql_server_driver = 'ODBC Driver 17 for SQL Server'
sql_server_name = 'IAMGROOT'
sql_server_db = 'NBA_HoopVision'

# --- SQL Server connection ---
print(f"🔌 Creando conexión a SQL Server: {sql_server_name}/{sql_server_db}...")
try:
    # Build the raw ODBC connection string and URL-encode it so it can be
    # passed through the `odbc_connect` query parameter of the engine URL.
    params = urllib.parse.quote_plus(
        f'DRIVER={{{sql_server_driver}}};'
        f'SERVER={sql_server_name};'
        f'DATABASE={sql_server_db};'
        'Trusted_Connection=yes;'
    )
    sql_engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
    sql_inspector = inspect(sql_engine)
except Exception as e:
    print(f"❌ Error fatal al conectar a SQL Server: {e}")
    # Exit with a non-zero status so a calling shell/scheduler sees the
    # failure (a bare sys.exit() would report success).
    sys.exit(1)

# --- Funciones de Inspección ---

def get_sqlite_pandas_types(db_path, sample_rows=1):
    """Return the pandas dtypes inferred for every table of a SQLite database.

    Each table listed in ``sqlite_master`` is read with
    ``SELECT * ... LIMIT sample_rows`` and the dtypes of the resulting
    DataFrame are recorded as strings.

    Args:
        db_path: Path of the SQLite database file.
        sample_rows: Number of rows read per table to infer the dtypes.
            Defaults to 1 (fast). NOTE(review): dtypes inferred from very
            few rows may be unreliable (e.g. if the sampled values are
            NULL); pass a larger value for a more robust inference.

    Returns:
        dict mapping table name -> {column name: dtype string}. A table that
        could not be read maps to ``{"Error": <message>}``. If the database
        itself cannot be inspected, the error is printed and whatever was
        collected so far (possibly an empty dict) is returned.
    """
    # Validate the sample size up front so a bad argument is not swallowed by
    # the broad error handling below.
    row_limit = int(sample_rows)

    schemas = {}
    conn = None
    print(f"\n--- 🕵️‍♂️ Inspeccionando SQLite ({db_path}) ---")
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = [row[0] for row in cursor.fetchall()]
        print(f"Tablas encontradas: {len(tables)}")

        for table_name in tables:
            try:
                # Identifiers cannot be bound as parameters, so the name is
                # embedded in the statement; double any embedded quote so the
                # quoted identifier stays well-formed.
                quoted_name = table_name.replace('"', '""')
                df = pd.read_sql_query(
                    f'SELECT * FROM "{quoted_name}" LIMIT {row_limit}', conn
                )
                schemas[table_name] = {
                    col: str(dtype) for col, dtype in df.dtypes.items()
                }
            except Exception as e:
                # Best effort: record the failure and keep going with the
                # remaining tables.
                print(f"   - ⚠️ No se pudo leer la tabla '{table_name}': {e}")
                schemas[table_name] = {"Error": str(e)}

    except Exception as e:
        print(f"❌ Error al inspeccionar SQLite: {e}")
    finally:
        if conn:
            conn.close()
    return schemas

def get_sql_server_types(inspector):
    """Collect the column types of every user table visible to *inspector*.

    Args:
        inspector: Object exposing ``get_schema_names()``,
            ``get_table_names(schema=...)`` and
            ``get_columns(table_name, schema=...)`` (the module builds it
            with ``sqlalchemy.inspect`` on the SQL Server engine).

    Returns:
        dict mapping table name -> {column name: type string}. A table whose
        columns could not be read maps to ``{"Error": <message>}``. Entries
        are keyed by the bare table name (no schema prefix), so a table name
        present in more than one schema keeps only the last one visited. If
        the inspection as a whole fails, the error is printed and whatever
        was collected so far is returned.
    """
    # Built-in schemas / fixed database roles that never hold user tables.
    skipped_schemas = frozenset((
        'guest', 'INFORMATION_SCHEMA', 'sys', 'db_owner',
        'db_accessadmin', 'db_securityadmin', 'db_ddladmin',
        'db_backupoperator', 'db_datareader', 'db_datawriter',
        'db_denydatareader', 'db_denydatawriter',
    ))

    collected = {}
    print(f"\n--- 🕵️‍♀️ Inspeccionando SQL Server ({sql_server_db}) ---")
    try:
        available_schemas = inspector.get_schema_names()
        # 'dbo' is usually the interesting one, but every schema is scanned.
        print(f"Esquemas encontrados: {available_schemas}")

        for schema in available_schemas:
            if schema in skipped_schemas:
                continue

            table_names = inspector.get_table_names(schema=schema)
            print(f"Tablas encontradas en '{schema}': {len(table_names)}")
            for table_name in table_names:
                try:
                    collected[table_name] = {
                        column['name']: str(column['type'])
                        for column in inspector.get_columns(table_name, schema=schema)
                    }
                except Exception as e:
                    # Best effort: report the qualified name and carry on.
                    print(f"   - ⚠️ No se pudo leer la tabla '{schema}.{table_name}': {e}")
                    collected[table_name] = {"Error": str(e)}

    except Exception as e:
        print(f"❌ Error al inspeccionar SQL Server: {e}")
    return collected

def analyze_mismatch(pandas_type, sql_type):
    """Classify how well a pandas dtype matches a SQL Server column type.

    Args:
        pandas_type: Dtype name as reported by pandas (e.g. ``'int64'``,
            ``'object'``). Compared case-insensitively.
        sql_type: SQL Server column type as text; it may carry a
            length/precision and a collation, e.g.
            ``'NVARCHAR(100) COLLATE "SQL_Latin1_General_CP1_CI_AS"'``.

    Returns:
        A short human-readable verdict string: ``"✅ OK"`` for compatible
        pairs, a ``🚨``/``⚠️`` warning for known-problematic pairs,
        ``"FALTA DATO"`` when either side is missing, and a ``❓`` message
        for combinations that need manual review.
    """
    if not pandas_type or not sql_type:
        return "FALTA DATO"

    p_type = str(pandas_type).lower()
    s_type_full = str(sql_type).upper()

    # The comparison loop substitutes 'N/A' for a column that is absent on
    # one side; that placeholder is truthy, so it must be checked explicitly.
    if p_type.strip() == 'n/a' or s_type_full.strip() == 'N/A':
        return "FALTA DATO"

    # Reduce the SQL type to its base name: drop the "(length/precision)"
    # part and any trailing COLLATE clause (a type without a length, such as
    # 'TEXT COLLATE "..."', would otherwise never match the lists below).
    s_type_base = s_type_full.partition('(')[0].partition(' COLLATE')[0].strip()

    integer_types = ('INT', 'INTEGER', 'BIGINT', 'SMALLINT', 'TINYINT')
    fractional_types = ('FLOAT', 'REAL', 'DECIMAL', 'NUMERIC')
    text_types = ('VARCHAR', 'NVARCHAR', 'TEXT', 'CHAR', 'NCHAR')
    temporal_types = ('DATE', 'DATETIME', 'DATETIME2', 'SMALLDATETIME', 'TIME')
    # Non-text targets that an 'object' column cannot be loaded into as-is.
    object_red_flags = integer_types + fractional_types + ('BIT', 'DATE', 'DATETIME', 'DATETIME2')

    # Compatible (common) pairs.
    if p_type.startswith('int') and s_type_base in integer_types:
        return "✅ OK"
    if p_type.startswith('float') and s_type_base in fractional_types:
        return "✅ OK"
    if p_type == 'object' and s_type_base in text_types:
        return "✅ OK"
    if p_type == 'bool' and s_type_base == 'BIT':
        return "✅ OK"
    if p_type.startswith('datetime64') and s_type_base in temporal_types:
        return "✅ OK"

    # Red flags (common).
    if p_type.startswith('float') and s_type_base in integer_types:
        return f"🚨 ¡{p_type.upper()} vs {s_type_base}! (Necesita Int64 nullable)"
    if p_type == 'object' and s_type_base in object_red_flags:
        return f"🚨 ¡OBJECT vs {s_type_base}! (Necesita conversión específica)"
    if p_type.startswith('int') and s_type_base == 'BIT':
        return f"⚠️ {p_type.upper()} vs BIT (¿Se mapea 0/1?)"

    # Less common: an 'object' column against any remaining non-text type
    # (text targets already returned OK above).
    if p_type == 'object':
        return f"❓ OBJECT vs {s_type_base} (Revisar si es texto o necesita conversión)"

    return "❓ Revisar (Combinación no cubierta)"

# --- Execution and comparison ---

sqlite_types = get_sqlite_pandas_types(sqlite_db_file)
sql_server_types = get_sql_server_types(sql_inspector)

print("\n\n--- 📊 Comparación de Esquemas ---")

# The SQLite tables drive the comparison: only tables present there are
# reported.
all_tables = sorted(sqlite_types)

for table in all_tables:
    print("\n=======================================")
    print(f" Tabla: {table}")
    print("=======================================")

    sqlite_cols = sqlite_types.get(table, {})
    sql_cols = sql_server_types.get(table, {})

    # Skip tables that failed on either side; there is nothing to compare.
    if "Error" in sqlite_cols:
        print(f"  - Error en SQLite: {sqlite_cols['Error']}")
        continue
    if not sql_cols or "Error" in sql_cols:
        print("  - ⚠️ ¡Tabla NO encontrada o con Error en SQL Server!")
        continue

    all_cols = sorted(sqlite_cols)

    # Fixed-width header followed by a matching dashed separator row.
    print(f"  {'Columna':<30} | {'SQLite (Pandas)':<15} | {'SQL Server':<15} | {'Análisis':<30}")
    print("  " + " | ".join('-' * width for width in (30, 15, 15, 30)))

    for col in all_cols:
        # 'N/A' stands in for a column that is missing on one side.
        p_type = sqlite_cols.get(col, 'N/A')
        s_type = sql_cols.get(col, 'N/A')
        analysis = analyze_mismatch(p_type, s_type)
        print(f"  {col:<30} | {p_type:<15} | {s_type:<15} | {analysis}")

    # Report columns that exist in SQL Server but not in SQLite.
    missing_in_sqlite = [c for c in sql_cols if c not in sqlite_cols]
    if missing_in_sqlite:
        print("\n  Columnas ADICIONALES en SQL Server:")
        for col in missing_in_sqlite:
            print(f"    - {col} ({sql_cols[col]})")

print("\n--- Comparación Finalizada ---")

# Dispose of the SQL Server engine now that the comparison is done.
if sql_engine:
    sql_engine.dispose()
    print("🚪 Conexión a SQL Server cerrada.")

🔌 Creando conexión a SQL Server: IAMGROOT/NBA_HoopVision...

--- 🕵️‍♂️ Inspeccionando SQLite (nba.sqlite) ---
Tablas encontradas: 16

--- 🕵️‍♀️ Inspeccionando SQL Server (NBA_HoopVision) ---
Esquemas encontrados: ['db_accessadmin', 'db_backupoperator', 'db_datareader', 'db_datawriter', 'db_ddladmin', 'db_denydatareader', 'db_denydatawriter', 'db_owner', 'db_securityadmin', 'dbo', 'guest', 'INFORMATION_SCHEMA', 'sys']
Tablas encontradas en 'dbo': 11


--- 📊 Comparación de Esquemas ---

 Tabla: common_player_info
  Columna                        | SQLite (Pandas) | SQL Server      | Análisis                      
  ------------------------------ | --------------- | --------------- | ------------------------------
  birthdate                      | object          | DATE            | 🚨 ¡OBJECT vs DATE! (Necesita conversión específica)
  country                        | object          | NVARCHAR(100) COLLATE "SQL_Latin1_General_CP1_CI_AS" | ✅ OK
  display_fi_last                | object  