In [28]:
from bs4 import BeautifulSoup
import pandas as pd

# Location of the locally saved HTML page
file_path = '/Users/monzon8/Documents/Ironhack/PROYECTOS/02_Tiburones/Outer Banks Real Estate - Outer Banks Homes For Sale _ Zillow.html'
with open(file_path, mode='r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML with BeautifulSoup's built-in parser
soup = BeautifulSoup(html_content, 'html.parser')

# Collect the price and area elements through their specific CSS classes
pricesou = soup.find_all(
    class_="PropertyCardWrapper__StyledPriceLine-srp-8-105-0__sc-16e8gqd-1 dHAgxu"
)
areasou = soup.find_all(
    class_="StyledPropertyCardDataArea-c11n-8-105-0__sc-10i1r6-0 cCMTdc"
)

# Keep only the whitespace-stripped text of every matched element
price_listou = [tag.get_text(strip=True) for tag in pricesou]
area_listou = [tag.get_text(strip=True) for tag in areasou]

# Arrange both lists as the columns of a DataFrame
propertyou_data = dict(Price=price_listou, Area=area_listou)
df_outer_banks = pd.DataFrame(propertyou_data)

print(df_outer_banks)


        Price                                Area
0  $2,999,000   8bds10ba5,263sqft- House for sale
1  $3,999,000   9bds11ba4,968sqft- House for sale
2  $2,580,000    5bds8ba6,837sqft- House for sale
3  $2,900,000    5bds7ba4,500sqft- House for sale
4  $2,499,000  10bds12ba4,997sqft- House for sale
5  $2,900,000   8bds10ba5,258sqft- House for sale
6    $315,000    6bds2ba1,820sqft- House for sale
7    $829,000    5bds6ba2,434sqft- House for sale
8  $2,099,000    6bds7ba4,344sqft- House for sale


In [29]:
def extract_area_value(area_string):
    """
    Extract the area, in square feet, from a listing summary string.

    The number is read from the text between the first 'ba' marker and the
    first 'sqft' marker, without using regular expressions. For example,
    '8bds10ba5,263sqft- House for sale' yields 5263.

    Parameters
    ----------
    area_string : object
        Listing summary text. Any value that is not a ``str`` yields ``None``.

    Returns
    -------
    int or None
        The digits found between the two markers as an integer, or ``None``
        when the input is not a string, either marker is missing, or there
        are no digits between the markers.
    """
    if not isinstance(area_string, str):
        return None

    ba_index = area_string.find('ba')
    sf_index = area_string.find('sqft')
    if ba_index == -1 or sf_index == -1:
        return None

    # +2 skips the 'ba' marker itself; the slice is empty (-> None below)
    # when 'sqft' occurs before 'ba'.
    between_markers = area_string[ba_index + 2:sf_index]

    # Keep decimal digits only. str.isdigit() would also keep characters such
    # as a superscript two, which int() cannot parse and would raise
    # ValueError; str.isdecimal() matches exactly what int() accepts.
    digits = ''.join(char for char in between_markers if char.isdecimal())

    return int(digits) if digits else None

# Derive the numeric 'Square_Feet' column by running the extractor over each 'Area' entry
df_outer_banks['Square_Feet'] = (
    df_outer_banks.loc[:, 'Area'].apply(extract_area_value)
)

print(df_outer_banks)


        Price                                Area  Square_Feet
0  $2,999,000   8bds10ba5,263sqft- House for sale         5263
1  $3,999,000   9bds11ba4,968sqft- House for sale         4968
2  $2,580,000    5bds8ba6,837sqft- House for sale         6837
3  $2,900,000    5bds7ba4,500sqft- House for sale         4500
4  $2,499,000  10bds12ba4,997sqft- House for sale         4997
5  $2,900,000   8bds10ba5,258sqft- House for sale         5258
6    $315,000    6bds2ba1,820sqft- House for sale         1820
7    $829,000    5bds6ba2,434sqft- House for sale         2434
8  $2,099,000    6bds7ba4,344sqft- House for sale         4344


In [30]:
# Turn 'Price' text such as '$2,999,000' into a float.
# The pattern is a raw-string character class matching '$' and ',', so no
# backslash escape is needed inside the string literal.
df_outer_banks['Price'] = (
    df_outer_banks['Price']
    .replace(r'[$,]', '', regex=True)
    .astype(float)
)

# 'Square_Feet' holds the int/None values returned by extract_area_value, so
# there are no commas to strip; only the conversion to float is needed
# (missing values become NaN).
df_outer_banks['Square_Feet'] = df_outer_banks['Square_Feet'].astype(float)

# Price per square foot. Non-positive areas are masked to NaN first so that a
# zero area yields NaN instead of inf, which would otherwise distort any mean
# taken over this column.
df_outer_banks['$/f2'] = (
    df_outer_banks['Price']
    / df_outer_banks['Square_Feet'].where(df_outer_banks['Square_Feet'] > 0)
)

df_outer_banks

Unnamed: 0,Price,Area,Square_Feet,$/f2
0,2999000.0,"8bds10ba5,263sqft- House for sale",5263.0,569.827095
1,3999000.0,"9bds11ba4,968sqft- House for sale",4968.0,804.951691
2,2580000.0,"5bds8ba6,837sqft- House for sale",6837.0,377.358491
3,2900000.0,"5bds7ba4,500sqft- House for sale",4500.0,644.444444
4,2499000.0,"10bds12ba4,997sqft- House for sale",4997.0,500.10006
5,2900000.0,"8bds10ba5,258sqft- House for sale",5258.0,551.54051
6,315000.0,"6bds2ba1,820sqft- House for sale",1820.0,173.076923
7,829000.0,"5bds6ba2,434sqft- House for sale",2434.0,340.591619
8,2099000.0,"6bds7ba4,344sqft- House for sale",4344.0,483.195212


In [31]:
# Keep only the rows that have a price per square foot (NaN entries are dropped)
outer_banks_price_sqft = df_outer_banks.loc[:, '$/f2'].dropna()

# Average price per square foot over the remaining rows
outer_banks_mean = outer_banks_price_sqft.mean()

outer_banks_mean

493.8984493308887

In [24]:
# Print the Outer Banks mean of the '$/f2' column as plain text.
print(outer_banks_mean)


493.8984493308887


In [25]:
# Mean price per square foot for the other locations.
# NOTE(review): presumably pasted from the notebooks that analysed each
# location — confirm the source and keep these in sync with it.
ny_montauk_mean = 3923.891149813695
hawaii_mean = 1766.9549815547869
tx_southpadre_mean = 474.9413689275061

# outer_banks_mean is deliberately NOT re-assigned here: it is computed from
# df_outer_banks earlier in this notebook, and a pasted literal would silently
# override it with a stale figure whenever the listings change.


In [32]:
# Square feet in one square metre (1 m² ≈ 10.7639 ft², rounded to three
# decimals). Multiplying a $/ft² figure by it gives $/m².
SQFT_PER_SQM = 10.764

# Convert each location's mean from $/ft² to $/m².
hawai_oahu_mean_m = hawaii_mean * SQFT_PER_SQM

tx_southpadre_mean_m = tx_southpadre_mean * SQFT_PER_SQM

ny_montauk_mean_m = ny_montauk_mean * SQFT_PER_SQM

outer_banks_mean_m = outer_banks_mean * SQFT_PER_SQM

# Report: four locations are listed and the figures are per square metre, so
# the header says so; place names are spelled correctly.
print('La media de los precios por metro cuadrado de las 4 ubicaciones seleccionadas')
print('en casas de más de 5 dormitorios y con superficie suficiente:')
print(f"New York - Montauk Beach: ${ny_montauk_mean_m:,.2f}")
print(f"Texas - South Padre Island: ${tx_southpadre_mean_m:,.2f}")
print(f"North Carolina - Outer Banks: ${outer_banks_mean_m:,.2f}")
print(f"Hawaii - Oahu Island:  ${hawai_oahu_mean_m:,.2f}")


La media de los precios de las 3 ubicaciones seleccionadas
en casas de más de 5 dormitorios y con superficie suficiente:
New York - Montauk Beaxh: $42,236.76
Texas - South Padre Island: $5,112.27
North Caroline - Outer Banks: $5,316.32
Hawaii - Oahu Island:  $19,019.50
