### Enviroment 

In [2]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
import json

### Crear conexion a la BD postgres

In [3]:
credentials = "../credentials.json"

with open(credentials) as f:
    creds = json.load(f)

conn = psycopg2.connect(    
    host=creds["host"],
    database=creds["database"],
    user=creds["user"],
    password=creds["password"],
    port=creds["port"]
)

engine = create_engine(f'postgresql://{creds["user"]}:{creds["password"]}@{creds["host"]}:{creds["port"]}/{creds["database"]}')

In [4]:
df = pd.read_csv("../data/Airbnb_Open_Data.csv", low_memory=False, encoding='ISO-8859-1')

try:
    with engine.connect() as connection:
        connection.execute(text("DROP TABLE IF EXISTS airbnb_EDA;"))
        table_name = "airbnb_EDA"  
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Tabla '{table_name}' creada y datos cargados en la base de datos '{creds['database']}'.")

        
        connection.execute(text("""
            CREATE TABLE airbnb_EDA 
            AS 
            SELECT * FROM airbnb_data;
        """))
        
      
        result = connection.execute(text("""
            SELECT COUNT(*) 
            FROM information_schema.tables 
            WHERE table_name = 'airbnb_EDA';
        """))
        
        if result.scalar() == 1:
            print("Tabla airbnb_EDA creada exitosamente!")
            
           
            count_original = connection.execute(text("SELECT COUNT(*) FROM airbnb_data;")).scalar()
            count_copy = connection.execute(text("SELECT COUNT(*) FROM airbnb_EDA;")).scalar()
            
            print(f"\nRegistros en tabla original: {count_original}")
            print(f"Registros en tabla copia: {count_copy}")
            
     
            sample = pd.read_sql("SELECT * FROM airbnb_EDA LIMIT 5;", connection)
            print("\nMuestra de la nueva tabla:")
            print(sample)
            
        else:
            print("Error: No se pudo crear la tabla")

except Exception as e:
    print(f"Error durante la creación de la tabla: {str(e)}")

Tabla 'airbnb_EDA' creada y datos cargados en la base de datos 'airbnb'.
Tabla airbnb_EDA creada exitosamente!

Registros en tabla original: 102599
Registros en tabla copia: 102599

Muestra de la nueva tabla:
        id                                              NAME      host id  \
0  1001254                Clean & quiet apt home by the park  80014485718   
1  1002102                             Skylit Midtown Castle  52335172823   
2  1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3  1002755                                              None  85098326012   
4  1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   

  host_identity_verified host name neighbourhood group neighbourhood  \
0            unconfirmed  Madaline            Brooklyn    Kensington   
1               verified     Jenna           Manhattan       Midtown   
2                   None     Elise           Manhattan        Harlem   
3            unconfirmed     Garry      

In [5]:
def rename_columns_with_spaces():
    engine = create_engine(f'postgresql://{creds["user"]}:{creds["password"]}@{creds["host"]}:{creds["port"]}/{creds["database"]}')
    
    with engine.connect() as connection:
        query = text("""
            SELECT column_name 
            FROM information_schema.columns 
            WHERE table_name = 'airbnb_EDA' 
            AND column_name LIKE '% %'
        """)
        result = connection.execute(query)
        columns_to_rename = [row[0] for row in result]
        
        if not columns_to_rename:
            print("No hay columnas con espacios para renombrar")
            return

        # Generar y ejecutar sentencias ALTER TABLE
        for old_name in columns_to_rename:
            new_name = old_name.replace(' ', '_')
            alter_query = text(f"""
                ALTER TABLE public."airbnb_EDA"
                RENAME COLUMN "{old_name}" TO {new_name}
            """)
            
            try:
                connection.execute(alter_query)
                print(f"Renombrada: '{old_name}' -> '{new_name}'")
                connection.commit()
            except Exception as e:
                print(f"Error renombrando {old_name}: {str(e)}")
                connection.rollback()

if __name__ == "__main__":
    rename_columns_with_spaces()

    query = "SELECT column_name FROM information_schema.columns WHERE table_name = 'airbnb_EDA'"
    df = pd.read_sql(query, conn)
    df.head()

Renombrada: 'availability 365' -> 'availability_365'
Renombrada: 'minimum nights' -> 'minimum_nights'
Renombrada: 'number of reviews' -> 'number_of_reviews'
Renombrada: 'reviews per month' -> 'reviews_per_month'
Renombrada: 'review rate number' -> 'review_rate_number'
Renombrada: 'calculated host listings count' -> 'calculated_host_listings_count'
Renombrada: 'host id' -> 'host_id'
Renombrada: 'Construction year' -> 'Construction_year'
Renombrada: 'host name' -> 'host_name'
Renombrada: 'neighbourhood group' -> 'neighbourhood_group'
Renombrada: 'country code' -> 'country_code'
Renombrada: 'room type' -> 'room_type'
Renombrada: 'last review' -> 'last_review'
Renombrada: 'service fee' -> 'service_fee'


  df = pd.read_sql(query, conn)


In [6]:
query = text("""
ALTER TABLE public."airbnb_EDA"
DROP COLUMN IF EXISTS host_name,
DROP COLUMN IF EXISTS country_code,
DROP COLUMN IF EXISTS country,
DROP COLUMN IF EXISTS neighbourhood,
DROP COLUMN IF EXISTS house_rules;
""")
with engine.connect() as connection:
    connection.execute(query)
    connection.commit()

In [None]:
def execute_sql_transformations():
    with engine.connect() as connection:
        try:

            query1 = text("""
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE TEXT 
                USING last_review::TEXT;
            """)
            connection.execute(query1)
            connection.commit()
            
            query2 = text("""
                -- Crear columna temporal para fechas si no existe
                DO $$ 
                BEGIN 
                    IF NOT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = 'airbnb_EDA' AND column_name = 'temp_last_review') THEN
                        ALTER TABLE public."airbnb_EDA" ADD COLUMN temp_last_review DATE;
                    END IF;
                END $$;
            """)
            connection.execute(query2)
            connection.commit()
            
            query3 = text("""
                -- Conversión segura con múltiples formatos
                UPDATE public."airbnb_EDA" 
                SET temp_last_review = 
                    CASE
                        -- Limpiar caracteres no numéricos antes de la conversión
                        WHEN last_review ~ '[^0-9/-]' THEN NULL

                        -- Normalizar y convertir fechas M/D/YYYY a MM/DD/YYYY antes de aplicar TO_DATE()
                        WHEN last_review ~ '^\d{1,2}/\d{1,2}/\d{4}$' 
                        THEN TO_DATE(
                            LPAD(SPLIT_PART(last_review, '/', 1), 2, '0') || '/' ||
                            LPAD(SPLIT_PART(last_review, '/', 2), 2, '0') || '/' ||
                            SPLIT_PART(last_review, '/', 3),
                            'MM/DD/YYYY'
                        )

                        -- Manejo del formato YYYY-MM-DD
                        WHEN last_review ~ '^\d{4}-\d{2}-\d{2}$' 
                        THEN TO_DATE(last_review, 'YYYY-MM-DD')

                        ELSE NULL 
                    END;
            """)
            connection.execute(query3)
            connection.commit()
            
            query4 = text("""    
                -- Asignar 1900-01-01 a valores NULL antes de la conversión a entero
                UPDATE public."airbnb_EDA"
                SET temp_last_review = '1900-01-01'
                WHERE temp_last_review IS NULL;
            """)
            connection.execute(query4)
            connection.commit()

            query5 = text("""
                -- Eliminar la columna original y renombrar la nueva columna
                ALTER TABLE public."airbnb_EDA" DROP COLUMN last_review;
                ALTER TABLE public."airbnb_EDA" RENAME COLUMN temp_last_review TO last_review;
            """)
            connection.execute(query5)
            connection.commit()
            
            query6 = text("""
                -- Convertir fechas a formato INTEGER AAAAMMDD
                ALTER TABLE public."airbnb_EDA" 
                ALTER COLUMN last_review TYPE INTEGER 
                USING (
                    EXTRACT(YEAR FROM last_review) * 10000 +
                    EXTRACT(MONTH FROM last_review) * 100 +
                    EXTRACT(DAY FROM last_review)
                );
            """)
            connection.execute(query6)
            connection.commit()
            
            print("Transformación de fechas y manejo de valores nulos completado correctamente")

        except Exception as e:
            print(f"Error durante las transformaciones: {str(e)}")
            connection.rollback()
            raise

if __name__ == "__main__":
    execute_sql_transformations()


Transformación de fechas y manejo de valores nulos completado correctamente


In [8]:
query = 'SELECT * FROM public."airbnb_EDA";'
df = pd.read_sql(query, conn)
df.head()

query = 'SELECT COUNT(*) FROM public."airbnb_EDA"'

with engine.connect() as connection:
    count = pd.read_sql(query, connection)

print(count)

with engine.connect() as connection:
    df = pd.read_sql('SELECT * FROM public."airbnb_EDA"', connection)

df.head()
#df.info()  
#df.describe()
#df.isnull().sum()

  df = pd.read_sql(query, conn)


    count
0  102599


Unnamed: 0,id,NAME,host_id,host_identity_verified,neighbourhood_group,lat,long,instant_bookable,cancellation_policy,room_type,...,price,service_fee,minimum_nights,number_of_reviews,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,license,last_review
0,1112901,,21389589535,unconfirmed,Queens,40.71546,-73.87854,,,Entire home/apt,...,$907,$181,7.0,38.0,0.38,4.0,5.0,294.0,,20190427
1,1138859,BROWNSTONE SUNDRENCHED BEAUTY,49595827306,,,40.688,-73.9171,True,moderate,Entire home/apt,...,"$1,028",$206,3.0,111.0,2.13,,1.0,150.0,,19000101
2,1159846,**Fantastic Williamsburg Apt**,98480978167,,,40.71031,-73.9583,False,moderate,Entire home/apt,...,$387,$77,,9.0,0.11,4.0,1.0,323.0,,20190216
3,1231093,,20797630440,verified,Brooklyn,40.6755,-73.95878,True,moderate,Private room,...,$445,$89,2.0,115.0,1.18,1.0,1.0,,,20170525
4,1399544,City Skyline Views from every room!,8120491427,verified,Queens,40.74558,-73.92324,True,moderate,Private room,...,$367,$73,2.0,95.0,1.02,5.0,2.0,387.0,,20190102


In [9]:
with engine.begin() as connection:
    connection.execute(text("""
        UPDATE public."airbnb_EDA"
        SET neighbourhood_group = CASE 
            WHEN neighbourhood_group = 'brookln' THEN 'Brooklyn'
            WHEN neighbourhood_group = 'manhatan' THEN 'Manhattan'
            ELSE neighbourhood_group
        END
        WHERE neighbourhood_group IN ('brookln', 'manhatan');
    """))
    
print("Datos actualizados correctamente en la columna 'neighbourhood_group'.")


Datos actualizados correctamente en la columna 'neighbourhood_group'.


In [10]:
df = pd.read_sql('SELECT DISTINCT neighbourhood_group FROM public."airbnb_EDA"', engine)
df

Unnamed: 0,neighbourhood_group
0,
1,Brooklyn
2,Bronx
3,Manhattan
4,Queens
5,Staten Island
