## Environment

In [1]:
import json

import pandas as pd
import psycopg2
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

Conexión a la base de datos mediante las credenciales definidas

In [2]:
# Read the database credentials from a JSON file kept next to the project.
credentials = "../credentials.json"

with open(credentials, encoding="utf-8") as f:
    creds = json.load(f)

DB_USER = creds["user"]
DB_PASSWORD = creds["password"]
DB_HOST = creds["host"]
DB_PORT = creds["port"]
DB_NAME = creds["database"]


def _build_db_url(database):
    """Return a SQLAlchemy URL pointing at ``database`` on the configured server.

    The URL is assembled with ``URL.create`` instead of string interpolation
    so that a user name or password containing characters such as ``@``,
    ``/`` or ``:`` is escaped rather than corrupting the connection string.
    NOTE(review): this assumes the password in the credentials file is stored
    in plain (not already URL-encoded) form -- confirm.
    """
    return URL.create(
        "postgresql+psycopg2",
        username=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        # Accept the port as either a number or a numeric string.
        port=int(DB_PORT) if DB_PORT not in (None, "") else None,
        database=database,
    )


# Load the CSV, decoding it as ISO-8859-1.
df = pd.read_csv("../data/Airbnb_Open_Data.csv", low_memory=False, encoding='ISO-8859-1')
print("Datos del CSV cargados correctamente.")
print(df.head())

# Count the total number of rows in the CSV.
total_filas_csv = len(df)
print(f"El archivo CSV tiene un total de {total_filas_csv} filas.")

# Create the target database if it does not exist yet. This goes through the
# "postgres" maintenance database, and in AUTOCOMMIT mode because PostgreSQL
# refuses to run CREATE DATABASE inside a transaction block.
creator_connection_string = _build_db_url("postgres").render_as_string(hide_password=False)
creator_engine = create_engine(creator_connection_string, isolation_level="AUTOCOMMIT")

try:
    with creator_engine.connect() as connection:
        # The name is passed as a bound parameter, not spliced into the SQL.
        result = connection.execute(
            text("SELECT 1 FROM pg_database WHERE datname = :name"),
            {"name": DB_NAME},
        )

        if not result.scalar():
            # Identifiers cannot be bound parameters. Quoting keeps the exact
            # name (including upper-case letters), so the database created
            # here is the same one the existence check above and the
            # connection below look for.
            quoted_db_name = creator_engine.dialect.identifier_preparer.quote(DB_NAME)
            connection.execute(text(f"CREATE DATABASE {quoted_db_name}"))
            print(f"Base de datos '{DB_NAME}' creada exitosamente!")
        else:
            print(f"La base de datos '{DB_NAME}' ya existe.")
finally:
    # This engine is only needed for the creation step; release its pool.
    creator_engine.dispose()

# Connect to the (now existing) target database.
connection_string = _build_db_url(DB_NAME).render_as_string(hide_password=False)
engine = create_engine(connection_string)

# Create the table and load the data; an existing table is replaced.
table_name = "airbnb_data"  # Name of the new table
df.to_sql(table_name, engine, if_exists='replace', index=False)

print(f"Tabla '{table_name}' creada y datos cargados en la base de datos '{DB_NAME}'.")

# Sanity check: show a few rows from the new table.
# table_name is a constant defined above, so interpolating it is safe here.
with engine.connect() as connection:
    result = connection.execute(text(f"SELECT * FROM {table_name} LIMIT 5;"))
    print(f"Primeros 5 registros de la tabla '{table_name}':")
    for row in result:
        print(row)

Datos del CSV cargados correctamente.
        id                                              NAME      host id  \
0  1001254                Clean & quiet apt home by the park  80014485718   
1  1002102                             Skylit Midtown Castle  52335172823   
2  1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3  1002755                                               NaN  85098326012   
4  1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   

  host_identity_verified host name neighbourhood group neighbourhood  \
0            unconfirmed  Madaline            Brooklyn    Kensington   
1               verified     Jenna           Manhattan       Midtown   
2                    NaN     Elise           Manhattan        Harlem   
3            unconfirmed     Garry            Brooklyn  Clinton Hill   
4               verified    Lyndon           Manhattan   East Harlem   

        lat      long        country  ... service fee minimum nigh

In [4]:
# Count the total number of rows in the DataFrame.
total_csv_rows = len(df)
print(f"El archivo CSV tiene un total de {total_csv_rows} filas.")

# Count the total number of rows stored in the table.
with engine.connect() as connection:
    result = connection.execute(text(f"SELECT COUNT(*) FROM {table_name};"))
    total_rows = result.scalar()  # Single value returned by COUNT(*)
    print(f"La tabla '{table_name}' tiene un total de {total_rows} filas.")

# Fail loudly instead of relying on someone comparing the two printed numbers.
assert total_rows == total_csv_rows, (
    f"Row count mismatch: DataFrame has {total_csv_rows} rows, "
    f"table '{table_name}' has {total_rows}"
)

El archivo CSV tiene un total de 102599 filas.
La tabla 'airbnb_data' tiene un total de 102599 filas.


In [5]:
# Preview the first 20 rows of the table.
# The result is stored under its own name so that `df` (the full CSV) is not
# replaced by a 20-row sample, which would silently change what any cell
# re-run afterwards sees in `df`.
with engine.connect() as connection:
    # Query for the first 20 records of the table.
    query = f"SELECT * FROM {table_name} LIMIT 20;"

    # Load the rows into a pandas DataFrame; text() matches how the other
    # cells hand SQL to SQLAlchemy.
    preview_df = pd.read_sql(text(query), connection)

    # Show the preview in tabular form.
    print(f"Primeros 20 registros de la tabla '{table_name}':")
    print(preview_df)

Primeros 20 registros de la tabla 'airbnb_data':
         id                                              NAME      host id  \
0   1001254                Clean & quiet apt home by the park  80014485718   
1   1002102                             Skylit Midtown Castle  52335172823   
2   1002403               THE VILLAGE OF HARLEM....NEW YORK !  78829239556   
3   1002755                                              None  85098326012   
4   1003689  Entire Apt: Spacious Studio/Loft by central park  92037596077   
5   1004098         Large Cozy 1 BR Apartment In Midtown East  45498551794   
6   1004650                                   BlissArtsSpace!  61300605564   
7   1005202                                   BlissArtsSpace!  90821839709   
8   1005754                   Large Furnished Room Near B'way  79384379533   
9   1006307                Cozy Clean Guest Room - Family Apt  75527839483   
10  1006859                Cute & Cozy Lower East Side 1 bdrm   1280143094   
11  1007411    

In [6]:
# Load the dataset from the CSV, decoding it as ISO-8859-1.
df = pd.read_csv("../data/Airbnb_Open_Data.csv", low_memory=False, encoding='ISO-8859-1')

# Initial review
print(f"Filas: {df.shape[0]}, Columnas: {df.shape[1]}")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that appends a stray "None" line to the output).
df.info()  # Dtypes and non-null counts
print(df.isnull().sum())  # Number of null values per column

Filas: 102599, Columnas: 26
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  object 
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  object 
 4   host name                       102193 non-null  object 
 5   neighbourhood group             102570 non-null  object 
 6   neighbourhood                   102583 non-null  object 
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  object 
 10  country code                    102468 non-null  object 
 11  instant_bookable                102494 non-null  o