### Enviroment 

In [2]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
import json

### Crear conexion a la BD postgres

In [36]:
credentials = "../credentials.json"

with open(credentials) as f:
    creds = json.load(f)

conn = psycopg2.connect(    
    host=creds["host"],
    database=creds["database"],
    user=creds["user"],
    password=creds["password"],
    port=creds["port"]
)

engine = create_engine(f'postgresql://{creds["user"]}:{creds["password"]}@{creds["host"]}:{creds["port"]}/{creds["database"]}')

In [37]:
# Crear nueva tabla (airbnb_EDA) con la misma información de la tabla ya enviada (airbnb_data)
df = pd.read_csv("../data/Airbnb_Open_Data.csv", low_memory=False, encoding='ISO-8859-1')

try:
    with engine.connect() as connection:
        # Paso 1: Eliminar tabla si existe
        connection.execute(text("DROP TABLE IF EXISTS airbnb_EDA;"))
        table_name = "airbnb_EDA"  # Nombre de la nueva tablan
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Tabla '{table_name}' creada y datos cargados en la base de datos '{creds['database']}'.")

        
        # Paso 2: Crear nueva tabla con estructura y datos
        connection.execute(text("""
            CREATE TABLE airbnb_EDA 
            AS 
            SELECT * FROM airbnb_data;
        """))
        
        # Paso 3: Verificar creación
        result = connection.execute(text("""
            SELECT COUNT(*) 
            FROM information_schema.tables 
            WHERE table_name = 'airbnb_EDA';
        """))
        
        if result.scalar() == 1:
            print("Tabla airbnb_EDA creada exitosamente!")
            
            # Verificar conteo de registros
            count_original = connection.execute(text("SELECT COUNT(*) FROM airbnb_data;")).scalar()
            count_copy = connection.execute(text("SELECT COUNT(*) FROM airbnb_EDA;")).scalar()
            
            print(f"\nRegistros en tabla original: {count_original}")
            print(f"Registros en tabla copia: {count_copy}")
            
            # Mostrar muestra de datos
            sample = pd.read_sql("SELECT * FROM airbnb_EDA LIMIT 5;", connection)
            print("\nMuestra de la nueva tabla:")
            print(sample)
            
        else:
            print("Error: No se pudo crear la tabla")

except Exception as e:
    print(f"Error durante la creación de la tabla: {str(e)}")
finally:
    engine.dispose()

Tabla 'airbnb_EDA' creada y datos cargados en la base de datos 'airbnb'.
Tabla airbnb_EDA creada exitosamente!

Registros en tabla original: 102599
Registros en tabla copia: 102599

Muestra de la nueva tabla:
        id                                              NAME      host id  \
0  1001254                Clean & quiet apt home by the park  80014485718   
1  1002102                             Skylit Midtown Castle  52335172823   
2  1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3  1002755                                              None  85098326012   
4  1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   

  host_identity_verified host name neighbourhood group neighbourhood  \
0            unconfirmed  Madaline            Brooklyn    Kensington   
1               verified     Jenna           Manhattan       Midtown   
2                   None     Elise           Manhattan        Harlem   
3            unconfirmed     Garry      

In [10]:
query = 'SELECT * FROM public."airbnb_EDA";'
df = pd.read_sql(query, conn)
df.head()

  df = pd.read_sql(query, conn)


Unnamed: 0,id,NAME,host id,host_identity_verified,host name,neighbourhood group,neighbourhood,lat,long,country,...,service fee,minimum nights,number of reviews,last review,reviews per month,review rate number,calculated host listings count,availability 365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [55]:
def rename_columns_with_spaces(engine):
    with engine.connect() as connection:
        # Obtener nombres de columnas con espacios
        query = text("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = 'airbnb_eda'  -- Nombres en minúsculas por defecto en PostgreSQL
            AND column_name LIKE '% %'
        """)
        result = connection.execute(query)
        columns_to_rename = [row[0] for row in result]

        if not columns_to_rename:
            print("No hay columnas con espacios para renombrar.")
            return

        # Generar y ejecutar sentencias ALTER TABLE
        for old_name in columns_to_rename:
            new_name = old_name.replace(' ', '_')
            alter_query = text(f"""
                ALTER TABLE public.airbnb_eda
                RENAME COLUMN "{old_name}" TO "{new_name}"
            """)

            try:
                connection.execute(alter_query)
                print(f"Renombrada: '{old_name}' -> '{new_name}'")
            except Exception as e:
                print(f"Error renombrando {old_name}: {str(e)}")

rename_columns_with_spaces(engine)

# Verificar cambios
query = "SELECT column_name FROM information_schema.columns WHERE table_name = 'airbnb_eda'"
df = pd.read_sql(query, engine)
print("\nColumnas actuales en airbnb_eda:")
print(df.head())

# Cerrar la conexión
engine.dispose()


No hay columnas con espacios para renombrar.

Columnas actuales en airbnb_eda:
Empty DataFrame
Columns: [column_name]
Index: []


In [56]:
# Renombrar columnas de la base de datos con un espacio dentro de los nombres

def rename_columns_with_spaces():
    with engine.connect() as connection:
        # Obtener nombres de columnas con espacios
        query = text("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = 'airbnb_EDA' 
            AND column_name LIKE '% %'
        """)
        result = connection.execute(query)
        columns_to_rename = [row[0] for row in result]
        
        if not columns_to_rename:
            print("No hay columnas con espacios para renombrar")
            return

        # Generar y ejecutar sentencias ALTER TABLE
        for old_name in columns_to_rename:
            new_name = old_name.replace(' ', '_')
            alter_query = text(f"""
                ALTER TABLE public."airbnb_EDA"
                RENAME COLUMN "{old_name}" TO {new_name}
            """)
            
            try:
                connection.execute(alter_query)
                print(f"Renombrada: '{old_name}' -> '{new_name}'")
                connection.commit()
            except Exception as e:
                print(f"Error renombrando {old_name}: {str(e)}")
                connection.rollback()

if __name__ == "__main__":
    rename_columns_with_spaces()
    
    # Verificar cambios
    query = "SELECT column_name FROM information_schema.columns WHERE table_name = 'airbnb_EDA'"
    df = pd.read_sql(query, conn)
    df.head()

Renombrada: 'availability 365' -> 'availability_365'
Renombrada: 'minimum nights' -> 'minimum_nights'
Renombrada: 'number of reviews' -> 'number_of_reviews'
Renombrada: 'reviews per month' -> 'reviews_per_month'
Renombrada: 'review rate number' -> 'review_rate_number'
Renombrada: 'calculated host listings count' -> 'calculated_host_listings_count'
Renombrada: 'host id' -> 'host_id'
Renombrada: 'Construction year' -> 'Construction_year'
Renombrada: 'host name' -> 'host_name'
Renombrada: 'neighbourhood group' -> 'neighbourhood_group'
Renombrada: 'country code' -> 'country_code'
Renombrada: 'room type' -> 'room_type'
Renombrada: 'last review' -> 'last_review'
Renombrada: 'service fee' -> 'service_fee'


  df = pd.read_sql(query, conn)


In [57]:
query = text("""
ALTER TABLE public."airbnb_EDA"
DROP COLUMN IF EXISTS host_name,
DROP COLUMN IF EXISTS lat,
DROP COLUMN IF EXISTS long,
DROP COLUMN IF EXISTS country_code,
DROP COLUMN IF EXISTS country,
DROP COLUMN IF EXISTS house_rules;
""")
with engine.connect() as connection:
    connection.execute(query)
    connection.commit()

In [58]:
query = 'SELECT COUNT(*) FROM public."airbnb_EDA"'

# Usamos engine.connect() para ejecutar la consulta
with engine.connect() as connection:
    count = pd.read_sql(query, connection)

print(count)

    count
0  102599


In [13]:
# ELIMINAR REGISTROS REPETIDOS
'DELETE FROM public.airbnb_EDA WHERE id IN (SELECT id FROM (SELECT id, ROW_NUMBER() OVER (PARTITION BY id ORDER BY id) AS rnum FROM public.airbnb_EDA) t WHERE t.rnum > 1);'

'DELETE FROM public.airbnb_EDA WHERE id IN (SELECT id FROM (SELECT id, ROW_NUMBER() OVER (PARTITION BY id ORDER BY id) AS rnum FROM public.airbnb_EDA) t WHERE t.rnum > 1);'

In [None]:
#limpiar valores nullos (null) por fecha 01/01/1900 y cambiar las fechas a int AAADDMM  20240106
def execute_sql_transformations():
    with engine.connect() as connection:
        try:
            # Transformación de fechas y manejo de nulos - Convertir last_review a formato numérico
            query1 = text("""
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE TEXT 
                USING last_review::TEXT;
            """)
            connection.execute(query1)
            connection.commit()
            
            query2 = text("""
                -- Crear columna temporal para fechas
                ALTER TABLE public."airbnb_EDA" ADD COLUMN temp_last_review DATE;
            """)
            connection.execute(query2)
            connection.commit()
            
            query3 = text("""
                -- Conversión segura con múltiples formatos
                UPDATE public."airbnb_EDA" 
                SET temp_last_review = 
                    CASE
                        WHEN last_review ~ '^\d{2}/\d{2}/\d{4}$' THEN TO_DATE(last_review, 'MM/DD/YYYY')
                        WHEN last_review ~ '^\d{4}-\d{2}-\d{2}$' THEN TO_DATE(last_review, 'YYYY-MM-DD')
                        ELSE NULL 
                    END;
            """)
            connection.execute(query3)
            connection.commit()
            
            query4 = text("""    
                -- Eliminar columna original y renombrar
                ALTER TABLE public."airbnb_EDA" DROP COLUMN last_review;
            """)
            connection.execute(query4)
            connection.commit()
            
            query5 = text("""
                ALTER TABLE public."airbnb_EDA" RENAME COLUMN temp_last_review TO last_review;
            """)
            connection.execute(query5)
            connection.commit()
            
            query5 = text("""
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE INTEGER 
                USING (
                    CASE 
                        WHEN last_review IS NULL THEN 99999999
                        ELSE EXTRACT(YEAR FROM last_review) * 10000 +
                            EXTRACT(MONTH FROM last_review) * 100 +
                            EXTRACT(DAY FROM last_review)
                    END
                );
            """)
            connection.execute(query5)
            connection.commit()
            
            # Manejo de valores nulos - Para columnas de texto
            text_columns_query = text("""
                SELECT column_name 
                FROM information_schema.columns 
                WHERE table_name = 'airbnb_EDA' 
                AND data_type = 'text'
            """)
            
            text_cols = [row[0] for row in connection.execute(text_columns_query)]
            print(f"probar -> {text_cols}")
            
            for col in text_cols:
                update_text = text(f"""
                    UPDATE public."airbnb_EDA"
                    SET "{col}" = COALESCE("{col}", 'not fill')
                    WHERE "{col}" IS NULL;
                """)
                connection.execute(update_text)
                connection.commit()
            
            # Para columnas numéricas (excluyendo last_review)
            numeric_columns_query = text("""
                SELECT column_name 
                FROM information_schema.columns 
                WHERE table_name = 'airbnb_EDA' 
                AND data_type IN ('integer', 'bigint', 'double precision', 'numeric')
                AND column_name != 'last_review'
            """)
            
            numeric_cols = [row[0] for row in connection.execute(numeric_columns_query)]
            
            for col in numeric_cols:
                update_num = text(f"""
                    UPDATE public."airbnb_EDA"
                    SET "{col}" = COALESCE({col}, -1)
                    WHERE "{col}" IS NULL;
                """)
                connection.execute(update_num)
                connection.commit()
            
            print("Manejo de nulos completado exitosamente!")
        
        except Exception as e:
            print(f"Error durante las transformaciones: {str(e)}")
            connection.rollback()
            raise

if __name__ == "__main__":
    execute_sql_transformations()


  query3 = text("""


probar -> ['NAME', 'host_identity_verified', 'neighbourhood_group', 'neighbourhood', 'cancellation_policy', 'room_type', 'price', 'service_fee', 'license']
Manejo de nulos completado exitosamente!


In [29]:
with engine.connect() as connection:
    df = pd.read_sql('SELECT * FROM public."airbnb_EDA"', connection)

df.head()
#df.info()  
#df.describe()
#df.isnull().sum()

Unnamed: 0,id,NAME,host_id,host_identity_verified,neighbourhood_group,neighbourhood,instant_bookable,cancellation_policy,room_type,construction_year,price,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Brooklyn,Kensington,False,strict,Private room,2020.0,$966,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,
1,1002102,Skylit Midtown Castle,52335172823,verified,Manhattan,Midtown,False,moderate,Entire home/apt,2007.0,$142,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Manhattan,Harlem,True,flexible,Private room,2005.0,$620,$124,3.0,0.0,,,5.0,1.0,352.0,
3,1002755,,85098326012,unconfirmed,Brooklyn,Clinton Hill,True,moderate,Entire home/apt,2005.0,$368,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Manhattan,East Harlem,False,moderate,Entire home/apt,2009.0,$204,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,


- Eliminar duplicados
- Conversion de datos: cambiar nulos string a "not fill" y nulos int a -1 
- *Manejo de valores atipicos
- Normalizacion numerica: cambiar las fechas a int donde el orden del numero sea AAAAMMDD

- eliminar columnas 
- cambiar nombre de las columnas (hay espacios en los nombres)