### Enviroment 

In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
import json

### Crear conexion a la BD postgres

In [2]:
credentials = "../credentials.json"

with open(credentials) as f:
    creds = json.load(f)

conn = psycopg2.connect(    
    host=creds["host"],
    database=creds["database"],
    user=creds["user"],
    password=creds["password"],
    port=creds["port"]
)

engine = create_engine(f'postgresql://{creds["user"]}:{creds["password"]}@{creds["host"]}:{creds["port"]}/{creds["database"]}')

In [3]:
# Crear nueva tabla (airbnb_EDA) con la misma información de la tabla ya enviada (airbnb_data)
df = pd.read_csv("../data/Airbnb_Open_Data.csv", low_memory=False, encoding='ISO-8859-1')

try:
    with engine.connect() as connection:
        # Paso 1: Eliminar tabla si existe
        connection.execute(text("DROP TABLE IF EXISTS airbnb_EDA;"))
        table_name = "airbnb_EDA"  # Nombre de la nueva tablan
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Tabla '{table_name}' creada y datos cargados en la base de datos '{creds['database']}'.")

        
        # Paso 2: Crear nueva tabla con estructura y datos
        connection.execute(text("""
            CREATE TABLE airbnb_EDA 
            AS 
            SELECT * FROM airbnb_data;
        """))
        
        # Paso 3: Verificar creación
        result = connection.execute(text("""
            SELECT COUNT(*) 
            FROM information_schema.tables 
            WHERE table_name = 'airbnb_EDA';
        """))
        
        if result.scalar() == 1:
            print("Tabla airbnb_EDA creada exitosamente!")
            
            # Verificar conteo de registros
            count_original = connection.execute(text("SELECT COUNT(*) FROM airbnb_data;")).scalar()
            count_copy = connection.execute(text("SELECT COUNT(*) FROM airbnb_EDA;")).scalar()
            
            print(f"\nRegistros en tabla original: {count_original}")
            print(f"Registros en tabla copia: {count_copy}")
            
            # Mostrar muestra de datos
            sample = pd.read_sql("SELECT * FROM airbnb_EDA LIMIT 5;", connection)
            print("\nMuestra de la nueva tabla:")
            print(sample)
            
        else:
            print("Error: No se pudo crear la tabla")

except Exception as e:
    print(f"Error durante la creación de la tabla: {str(e)}")
#finally:
#    engine.dispose()

Tabla 'airbnb_EDA' creada y datos cargados en la base de datos 'airbnb'.
Tabla airbnb_EDA creada exitosamente!

Registros en tabla original: 102599
Registros en tabla copia: 102599

Muestra de la nueva tabla:
        id                                              NAME      host id  \
0  1001254                Clean & quiet apt home by the park  80014485718   
1  1002102                             Skylit Midtown Castle  52335172823   
2  1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3  1002755                                              None  85098326012   
4  1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   

  host_identity_verified host name neighbourhood group neighbourhood  \
0            unconfirmed  Madaline            Brooklyn    Kensington   
1               verified     Jenna           Manhattan       Midtown   
2                   None     Elise           Manhattan        Harlem   
3            unconfirmed     Garry      

In [4]:
# Renombrar columnas de la base de datos con un espacio dentro de los nombres

def rename_columns_with_spaces():
    engine = create_engine(f'postgresql://{creds["user"]}:{creds["password"]}@{creds["host"]}:{creds["port"]}/{creds["database"]}')
    
    with engine.connect() as connection:
        # Obtener nombres de columnas con espacios
        query = text("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = 'airbnb_EDA' 
            AND column_name LIKE '% %'
        """)
        result = connection.execute(query)
        columns_to_rename = [row[0] for row in result]
        
        if not columns_to_rename:
            print("No hay columnas con espacios para renombrar")
            return

        # Generar y ejecutar sentencias ALTER TABLE
        for old_name in columns_to_rename:
            new_name = old_name.replace(' ', '_')
            alter_query = text(f"""
                ALTER TABLE public."airbnb_EDA"
                RENAME COLUMN "{old_name}" TO {new_name}
            """)
            
            try:
                connection.execute(alter_query)
                print(f"Renombrada: '{old_name}' -> '{new_name}'")
                connection.commit()
            except Exception as e:
                print(f"Error renombrando {old_name}: {str(e)}")
                connection.rollback()

if __name__ == "__main__":
    rename_columns_with_spaces()
    
    # Verificar cambios
    query = "SELECT column_name FROM information_schema.columns WHERE table_name = 'airbnb_EDA'"
    df = pd.read_sql(query, conn)
    df.head()

Renombrada: 'availability 365' -> 'availability_365'
Renombrada: 'minimum nights' -> 'minimum_nights'
Renombrada: 'number of reviews' -> 'number_of_reviews'
Renombrada: 'reviews per month' -> 'reviews_per_month'
Renombrada: 'review rate number' -> 'review_rate_number'
Renombrada: 'calculated host listings count' -> 'calculated_host_listings_count'
Renombrada: 'host id' -> 'host_id'
Renombrada: 'Construction year' -> 'Construction_year'
Renombrada: 'host name' -> 'host_name'
Renombrada: 'neighbourhood group' -> 'neighbourhood_group'
Renombrada: 'country code' -> 'country_code'
Renombrada: 'room type' -> 'room_type'
Renombrada: 'last review' -> 'last_review'
Renombrada: 'service fee' -> 'service_fee'


  df = pd.read_sql(query, conn)


In [5]:
query = text("""
ALTER TABLE public."airbnb_EDA"
DROP COLUMN IF EXISTS host_name,
DROP COLUMN IF EXISTS lat,
DROP COLUMN IF EXISTS long,
DROP COLUMN IF EXISTS country_code,
DROP COLUMN IF EXISTS country,
DROP COLUMN IF EXISTS neighbourhood,
DROP COLUMN IF EXISTS house_rules;
""")
with engine.connect() as connection:
    connection.execute(query)
    connection.commit()

In [6]:
#limpiar valores nullos (null) por fecha 01/01/1900 y cambiar las fechas a int AAADDMM  20240106
def execute_sql_transformations():
    with engine.connect() as connection:
        try:
            # Transformación de fechas y manejo de nulos - Convertir last_review a formato numérico
            query1 = text("""
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE TEXT 
                USING last_review::TEXT;
            """)
            connection.execute(query1)
            connection.commit()
            
            query2 = text("""
                -- Crear columna temporal para fechas
                ALTER TABLE public."airbnb_EDA" ADD COLUMN temp_last_review DATE;
            """)
            connection.execute(query2)
            connection.commit()
            
            query3 = text("""
                -- Conversión segura con múltiples formatos
                UPDATE public."airbnb_EDA" 
                SET temp_last_review = 
                    CASE
                        WHEN last_review ~ '^\d{2}/\d{2}/\d{4}$' THEN TO_DATE(last_review, 'MM/DD/YYYY')
                        WHEN last_review ~ '^\d{4}-\d{2}-\d{2}$' THEN TO_DATE(last_review, 'YYYY-MM-DD')
                        ELSE NULL 
                    END;
            """)
            connection.execute(query3)
            connection.commit()
            
            query4 = text("""    
                -- Eliminar columna original y renombrar
                ALTER TABLE public."airbnb_EDA" DROP COLUMN last_review;
            """)
            connection.execute(query4)
            connection.commit()
            
            query5 = text("""
                ALTER TABLE public."airbnb_EDA" RENAME COLUMN temp_last_review TO last_review;
            """)
            connection.execute(query5)
            connection.commit()
            
            query5 = text("""
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE INTEGER 
                USING (
                    CASE 
                        WHEN last_review IS NULL THEN 99999999
                        ELSE EXTRACT(YEAR FROM last_review) * 10000 +
                            EXTRACT(MONTH FROM last_review) * 100 +
                            EXTRACT(DAY FROM last_review)
                    END
                );
            """)
            connection.execute(query5)
            connection.commit()
            
            # Manejo de valores nulos - Para columnas de texto
            text_columns_query = text("""
                SELECT column_name 
                FROM information_schema.columns 
                WHERE table_name = 'airbnb_EDA' 
                AND data_type = 'text'
            """)
            
            text_cols = [row[0] for row in connection.execute(text_columns_query)]
            print(f"probar -> {text_cols}")
            
            for col in text_cols:
                update_text = text(f"""
                    UPDATE public."airbnb_EDA"
                    SET "{col}" = COALESCE("{col}", 'not fill')
                    WHERE "{col}" IS NULL;
                """)
                connection.execute(update_text)
                connection.commit()
            
            # Para columnas numéricas (excluyendo last_review)
            numeric_columns_query = text("""
                SELECT column_name 
                FROM information_schema.columns 
                WHERE table_name = 'airbnb_EDA' 
                AND data_type IN ('integer', 'bigint', 'double precision', 'numeric')
                AND column_name != 'last_review'
            """)
            
            numeric_cols = [row[0] for row in connection.execute(numeric_columns_query)]
            
            for col in numeric_cols:
                update_num = text(f"""
                    UPDATE public."airbnb_EDA"
                    SET "{col}" = COALESCE({col}, -1)
                    WHERE "{col}" IS NULL;
                """)
                connection.execute(update_num)
                connection.commit()
            
            print("Manejo de nulos completado exitosamente!")
        
        except Exception as e:
            print(f"Error durante las transformaciones: {str(e)}")
            connection.rollback()
            raise

if __name__ == "__main__":
    execute_sql_transformations()


  query3 = text("""


probar -> ['NAME', 'host_identity_verified', 'neighbourhood_group', 'cancellation_policy', 'room_type', 'price', 'service_fee', 'license']
Manejo de nulos completado exitosamente!


In [7]:
query = 'SELECT * FROM public."airbnb_EDA";'
df = pd.read_sql(query, conn)
df.head()

query = 'SELECT COUNT(*) FROM public."airbnb_EDA"'

# Usamos engine.connect() para ejecutar la consulta
with engine.connect() as connection:
    count = pd.read_sql(query, connection)

print(count)

with engine.connect() as connection:
    df = pd.read_sql('SELECT * FROM public."airbnb_EDA"', connection)

df.head()
#df.info()  
#df.describe()
#df.isnull().sum()

  df = pd.read_sql(query, conn)


    count
0  102599


Unnamed: 0,id,NAME,host_id,host_identity_verified,neighbourhood_group,instant_bookable,cancellation_policy,room_type,construction_year,price,service_fee,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,license,last_review
0,1229988,"Spacious,Sunny, private one bedroom",9133292331,verified,Brooklyn,True,strict,Entire home/apt,2005.0,$507,$101,20.0,70.0,0.73,4.0,1.0,-1.0,not fill,99999999
1,3186787,East Village,21573101315,unconfirmed,Manhattan,False,moderate,Shared room,2016.0,"$1,087",$217,1.0,94.0,1.46,1.0,1.0,-2.0,not fill,99999999
2,6634244,Charming Park Slope 2 bed w/Garden,27810143460,unconfirmed,Brooklyn,True,strict,Entire home/apt,2022.0,$262,$52,7.0,14.0,0.37,1.0,1.0,10.0,not fill,99999999
3,17170471,Beautiful 1 Bedroom in Grammercy,64526422064,verified,Manhattan,False,moderate,Entire home/apt,2009.0,$590,$118,2.0,0.0,-1.0,5.0,1.0,0.0,not fill,99999999
4,22821604,Luxury Apartment Building,37843097773,verified,Queens,True,moderate,Private room,2009.0,$579,$116,1.0,0.0,-1.0,4.0,1.0,365.0,not fill,99999999


- Eliminar duplicados
- Conversion de datos: cambiar nulos string a "not fill" y nulos int a -1 
- *Manejo de valores atipicos
- Normalizacion numerica: cambiar las fechas a int donde el orden del numero sea AAAAMMDD

- eliminar columnas 
- cambiar nombre de las columnas (hay espacios en los nombres)

In [8]:
with engine.begin() as connection:  # Usar begin() para manejar la transacción
    connection.execute(text("""
        UPDATE public."airbnb_EDA"
        SET neighbourhood_group = CASE 
            WHEN neighbourhood_group = 'brookln' THEN 'Brooklyn'
            WHEN neighbourhood_group = 'manhatan' THEN 'Manhattan'
            ELSE neighbourhood_group
        END
        WHERE neighbourhood_group IN ('brookln', 'manhatan');
    """))
    
print("Datos actualizados correctamente en la columna 'neighbourhood_group'.")


Datos actualizados correctamente en la columna 'neighbourhood_group'.


In [9]:
df = pd.read_sql('SELECT DISTINCT neighbourhood_group FROM public."airbnb_EDA"', engine)
df

Unnamed: 0,neighbourhood_group
0,not fill
1,Brooklyn
2,Bronx
3,Manhattan
4,Queens
5,Staten Island
