In [None]:
import pandas as pd
pd.options.plotting.backend = "plotly"
from sqlalchemy import create_engine
import plotly.express as px

In [None]:
import os

# Connection URL for the data warehouse. It can be overridden through the
# DWH_AIRBNB_URL environment variable so that credentials do not have to live in
# the notebook; the fallback is the URL this notebook originally hard-coded.
connection_string = os.environ.get(
    "DWH_AIRBNB_URL",
    "postgresql+psycopg2://postgres:postgres@localhost:5433/dwh_airbnb",
)
engine = create_engine(connection_string)

In [None]:
# Read the whole <schema>.<table> relation into a DataFrame.
schema_name = "stg"
table_name = "listing"
query = f"SELECT * FROM {schema_name}.{table_name};"

df = pd.read_sql(sql=query, con=engine)

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df.info()

In [None]:
# Rows whose price is missing, restricted to the price and city columns, followed
# by how many of those rows fall in each city.
df_price = df.loc[:, ["price_dollar", "city"]]
missing_price_mask = df_price["price_dollar"].isna()
df_price_null = df_price[missing_price_mask]
display(df_price_null)
display(df_price_null["city"].value_counts())

Los nulos de la columna de precio parecen no tener relación con la ciudad.

In [None]:
# Columns removed from the working frame.
columns_to_drop = [
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "description",
    "neighborhood_overview",
    "picture_url",
    "host_url",
    "host_response_time",
    "host_response_rate_percentage",
    "host_acceptance_rate_percentage",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    "neighbourhood",
    "neighbourhood_group_cleansed",
    "calendar_updated",
]

# errors="ignore" keeps this cell idempotent: running it again after the columns
# are already gone is a no-op instead of a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

df.info()

In [None]:
import ast

# Collect every raw "amenities" value that cannot be parsed as a Python literal.
failures = []
for amenities_list in df["amenities"]:
    try:
        ast.literal_eval(amenities_list)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) must propagate rather than be
        # silently recorded as a bad value.
        failures.append(amenities_list)

In [None]:
# Number of "amenities" values that could not be parsed as a Python literal.
len(failures)

In [None]:
# Distinct amenity entries found across every row's parsed "amenities" value.
amenities_set = {
    amenity
    for raw_amenities in df["amenities"]
    for amenity in ast.literal_eval(raw_amenities)
}


In [None]:
# Print the distinct amenities, then how many there are, one per line.
print(amenities_set, len(amenities_set), sep="\n")

In [None]:
import numpy as np

# Number of amenities per row: parse each raw "amenities" value as a Python
# literal and take its length. Mapping over the column is correct for any index,
# whereas addressing rows by the labels 0..n-1 only works on a default RangeIndex
# (and is much slower, since it parses and assigns one cell at a time).
# The result is cast to float64 to keep the dtype the column had when it was
# pre-filled with np.zeros.
df["amenities_count"] = (
    df["amenities"]
    .map(lambda raw_amenities: len(ast.literal_eval(raw_amenities)))
    .astype("float64")
)


In [None]:
# Preview the first rows of the current working frame.
df.head()

In [None]:
# Number of rows for each distinct room_type value.
df["room_type"].value_counts()

In [None]:
def plot_price_box_plot_by_room_type(room_type):
    """Show a box plot of ``price_dollar`` for one room type, outliers removed.

    Reads the notebook-level ``df``. Prices outside the 1.5 * IQR fences, computed
    from that room type's prices only, are dropped before plotting.

    Args:
        room_type: Value of the ``room_type`` column selecting the rows to plot.
    """
    prices = df.loc[df["room_type"] == room_type, "price_dollar"]

    # Fences derived from the first and third quartiles.
    first_quartile = prices.quantile(0.25)
    third_quartile = prices.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    lower = first_quartile - fence_width
    upper = third_quartile + fence_width

    # Keep only prices inside the fences (both bounds inclusive).
    kept_prices = prices[prices.between(lower, upper)]

    figure = px.box(
        kept_prices.to_frame(name="price_dollar"),
        x=[room_type] * len(kept_prices),
        y="price_dollar",
        points=False,
    )
    figure.update_traces(boxmean=True)
    figure.show()

In [None]:
# Box plot of price_dollar for "entire home/apt" rows, with no price filtering applied.
df[df["room_type"] == "entire home/apt"].plot(x="room_type", y="price_dollar", kind='box')

In [None]:
# Box plot of price_dollar for "private room" rows, with no price filtering applied.
df[df["room_type"] == "private room"].plot(x="room_type", y="price_dollar", kind='box')

In [None]:
# Box plot of price_dollar for "hotel room" rows, with no price filtering applied.
df[df["room_type"] == "hotel room"].plot(x="room_type", y="price_dollar", kind='box')

In [None]:
# Box plot of price_dollar for "shared room" rows, with no price filtering applied.
df[df["room_type"] == "shared room"].plot(x="room_type", y="price_dollar", kind='box')

In [None]:
# Price box plot for "entire home/apt" through the helper that drops prices outside 1.5 * IQR.
plot_price_box_plot_by_room_type("entire home/apt")

In [None]:
# Price box plot for "private room" through the helper that drops prices outside 1.5 * IQR.
plot_price_box_plot_by_room_type("private room")

In [None]:
# Price box plot for "hotel room" through the helper that drops prices outside 1.5 * IQR.
plot_price_box_plot_by_room_type("hotel room")

In [None]:
# Price box plot for "shared room" through the helper that drops prices outside 1.5 * IQR.
plot_price_box_plot_by_room_type("shared room")

In [None]:
def plot_price_box_plot_by_numeric_value(var_name, df):
    """Show box plots of ``price_dollar`` grouped by ``var_name``, outliers removed.

    Rows whose ``price_dollar`` lies outside the 1.5 * IQR fences (computed over
    the whole frame) are dropped before plotting.

    Args:
        var_name: Column placed on the x axis.
        df: DataFrame containing ``price_dollar`` and ``var_name``.

    Raises:
        ValueError: If ``var_name`` is not a column of the frame.
    """
    prices = df["price_dollar"]

    # Fences derived from the first and third quartiles of the price.
    first_quartile = prices.quantile(0.25)
    third_quartile = prices.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    lower = first_quartile - fence_width
    upper = third_quartile + fence_width

    # Keep only rows whose price is inside the fences (both bounds inclusive).
    filtered_df = df[prices.between(lower, upper)]

    # The grouping column must exist before anything is plotted.
    if var_name not in filtered_df.columns:
        raise ValueError(f"La columna '{var_name}' no existe en el DataFrame.")

    figure = px.box(
        filtered_df,
        x=var_name,
        y="price_dollar",
        points=False,
    )
    figure.update_traces(boxmean=True)
    figure.show()

In [None]:
# Number of rows for each distinct "accommodates" value.
df["accommodates"].value_counts()

In [None]:
# Price box plots grouped by "accommodates", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("accommodates", df)

In [None]:
# Price box plots grouped by "bedrooms", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("bedrooms", df)

In [None]:
# Price box plots grouped by "beds", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("beds", df)

In [None]:
# Price box plots grouped by "bathrooms", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("bathrooms", df)

In [None]:
# Price box plots grouped by "amenities_count", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("amenities_count", df)

In [None]:
# Price box plots grouped by "instant_bookable", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("instant_bookable", df)

In [None]:
# Price box plots grouped by "country", after dropping prices outside 1.5 * IQR.
plot_price_box_plot_by_numeric_value("country", df)

In [None]:
# TODO: unfinished cell. The original loop over df["beds"] stopped at a bare `if`
# with no condition, which is a SyntaxError and prevents Restart & Run All.
# The intended per-row check is not known; restore the loop once it is decided:
#
# for i in range(df["beds"].shape[0]):
#     if <condition>:
#         ...

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

In [None]:
# Columns carried into the regression frame, in the order they will appear.
selected_columns = [
    # Location
    "latitude", "longitude",
    # Listing characteristics
    "room_type", "accommodates", "bathrooms", "bedrooms", "beds", "amenities_count",
    # Stay-length limits
    "minimum_nights", "maximum_nights",
    "country",
    "price_dollar",
]

In [None]:
# Take an explicit copy so that later column assignments on df_regression act on
# an independent frame and do not raise SettingWithCopyWarning against df.
df_regression = df[selected_columns].copy()
df_regression.info()