# UCU - Proyecto Webscraping
Proyecto para la Licenciatura en Datos y Negocios de la Universidad Católica del Uruguay

In [21]:
# === Montevideo (prop.com.uy) ===
import requests
import json
from bs4 import BeautifulSoup
import re

from urllib.parse import urljoin  # resolves relative hrefs against the site root

# Browser-like User-Agent sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

# Number of listings to collect before stopping.
DESIRED = 10

mvd_props = []
base_url = 'https://prop.com.uy'
list_urls = [
    base_url + '/propiedades/comprar',   # typical listing page
    base_url + '/propiedades',           # alternative
    base_url                              # fallback
]

# Detail URLs already queued. Kept outside the loops on purpose: the candidate
# listing URLs may link to the same properties, and a set rebuilt for every
# listing page would let one property be scraped and stored more than once.
seen = set()

for page in range(1, 8):  # widen this range if more results are needed
    candidate_pages = [f'{u}?page={page}' for u in list_urls] + [list_urls[-1]]

    for url in candidate_pages:
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
        except requests.exceptions.RequestException:
            continue  # best effort: move on to the next candidate URL
        if resp.status_code != 200 or not resp.text:
            continue

        soup = BeautifulSoup(resp.text, 'html.parser')
        containers = soup.find_all('a', href=lambda h: h and (
            '/propiedades/' in h or '/propiedad/' in h
        ))

        links = []
        for a in containers:
            href = a.get('href')
            if not href:
                continue
            # urljoin also handles hrefs without a leading slash, which plain
            # concatenation would glue directly onto the host name.
            full = urljoin(base_url + '/', href.strip())
            if full not in seen:
                seen.add(full)
                links.append(full)

        for link in links:
            try:
                r = requests.get(link, headers=HEADERS, timeout=20)
            except requests.exceptions.RequestException:
                continue
            # Skip error/empty responses so they are not stored as blank records.
            if r.status_code != 200 or not r.text:
                continue
            s = BeautifulSoup(r.text, 'html.parser')

            # First text node mentioning a currency marker, bedrooms, or area.
            # NOTE(review): a bare '$' can also match unrelated text nodes
            # (e.g. inline scripts) — confirm against real pages.
            price_txt = s.find(string=lambda t: t and ('USD' in t or '$' in t))
            rooms_txt = s.find(string=lambda t: t and ('Dorm' in t or 'dorm' in t))
            size_txt = s.find(string=lambda t: t and ('m²' in t or 'm2' in t or 'M2' in t))

            # --- bedroom normalization: "<n> dormitorio(s)" ---
            habitaciones = None
            if rooms_txt:
                m = re.search(r'(\d+)', rooms_txt)
                if m:
                    n = int(m.group(1))
                    habitaciones = "1 dormitorio" if n == 1 else f"{n} dormitorios"

            mvd_props.append({
                'precio': price_txt.strip() if price_txt else None,
                'tamano': size_txt.strip() if size_txt else None,
                'habitaciones': habitaciones,
                'link': link
            })

            if len(mvd_props) >= DESIRED:
                break

        if len(mvd_props) >= DESIRED:
            break

    if len(mvd_props) >= DESIRED:
        break

# Save as JSON.
# NOTE(review): the other city cells in this notebook write the same file in
# "w" mode, so only the most recently executed cell's city survives — confirm
# this is intended.
data = {
    "ciudades": [
        {
            "nombre": "Montevideo",
            "propiedades": mvd_props[:DESIRED]
        }
    ]
}

with open("propiedades.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("Montevideo listo → propiedades.json")

Montevideo listo → propiedades.json


In [20]:
# === Salto (Durán & Piastri) — llegar a ≥10 propiedades con paginación amplia ===
import requests
import json
from bs4 import BeautifulSoup
import re

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
}

base_url = "https://www.duranypiastri.com"

# Stop after this many properties / give up after this many page numbers.
DESIRED, MAX_PAGES = 10, 30

# Main sale listing followed by guessed pagination variants of it (several are
# tried in case the site names the page parameter differently).
_listing_variants = (
    "/?op=1&pag=propiedades",
    "/?op=1&pag=propiedades&page={page}",
    "/?op=1&pag=propiedades&pagina={page}",
    "/?op=1&pag=propiedades&p={page}",
    "/?op=1&pag=propiedades&pg={page}",
)
# Generic fallbacks that paginate the site root.
_generic_fallbacks = (
    "/?page={page}",
    "/?pagina={page}",
)
list_patterns = [*_listing_variants, *_generic_fallbacks]

def normalize_link(base, href):
    """Turn an anchor's href into an absolute URL, or None if it is not a page link.

    Args:
        base: Site root used as prefix for relative hrefs, e.g. "https://host".
        href: Raw href attribute value; may be None, empty, or blank.

    Returns:
        The absolute URL, or None when href is missing/blank, a fragment, or
        uses a non-page scheme (javascript:, mailto:, tel:).
    """
    if not href:
        return None
    href = href.strip()
    # A whitespace-only href would otherwise fall through and yield base + "/".
    if not href:
        return None
    if href.lower().startswith(("javascript:", "mailto:", "tel:", "#")):
        return None
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Protocol-relative URL: it already names its host, so only the scheme
        # is borrowed from base instead of prefixing the whole base.
        scheme = base.split("://", 1)[0] if "://" in base else "https"
        return f"{scheme}:{href}"
    root = base.rstrip("/")  # avoid "host//path" when base ends with a slash
    if href.startswith("/"):
        return root + href
    return root + "/" + href

def extract_price_text(text):
    """Return the first price-looking snippet found in text, or None.

    A snippet introduced by "Venta" is preferred over a bare currency amount.
    The amount must start and end with a digit, so trailing sentence
    punctuation is not captured and a currency sign followed only by
    separators is not reported as a price.

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        The matched snippet (e.g. "Venta U$S 120.000"), or None if no currency
        marker followed by a number is found.
    """
    if not text:
        return None
    t = text.replace("\xa0", " ").replace("\n", " ")
    currency = r"(?:U\$S|USD|\$)"
    amount = r"\d+(?:[.,]\d+)*"  # digit groups joined by '.' or ','
    for pattern in (rf"Venta\s*{currency}\s*{amount}", rf"{currency}\s*{amount}"):
        m = re.search(pattern, t, re.I)
        if m:
            return m.group(0).strip()
    return None

def extract_rooms_fmt(text):
    """Find a bedroom count in text and format it as "<n> dormitorio(s)".

    The full word "Dormitorio" is searched first; the abbreviation "Dorm" is
    only tried when the full word does not appear after a number.

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        "1 dormitorio" for a count of one, "<n> dormitorios" otherwise, or
        None when no count is found.
    """
    if not text:
        return None
    flat = text.replace("\xa0"," ").replace("\n"," ")
    hit = (
        re.search(r"(\d+)\s*Dormitorio/?s?", flat, re.I)
        or re.search(r"(\d+)\s*Dorms?\.?", flat, re.I)
    )
    if hit is None:
        return None
    count = int(hit.group(1))
    if count == 1:
        return "1 dormitorio"
    return f"{count} dormitorios"

def extract_m2(text):
    """Return the first area in square metres found in text as "<number> m²".

    The whole number is captured, including '.'/',' separators, and it may not
    start in the middle of a longer digit run. A bare 1-4 digit capture would
    turn "1.200 m2" into "200 m²" and "12500 m2" into "2500 m²".

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        The number exactly as written followed by " m²" (e.g. "85 m²",
        "1.200 m²"), or None when no area is found.
    """
    if not text:
        return None
    t = text.replace("\xa0", " ").replace("\n", " ")
    m = re.search(
        r"(?<!\d)(\d+(?:[.,]\d+)*)\s*(?:m²|m2|mt2|mts2|m\^2)\b",
        t,
        re.I,  # also covers upper-case spellings such as "M2"
    )
    return f"{m.group(1)} m²" if m else None

salto_props = []
seen_detail = set()       # detail URLs already queued, so none is scraped twice
fetched_listings = set()  # listing URLs already downloaded successfully

# Walk page numbers until DESIRED properties have been collected.
for page in range(1, MAX_PAGES + 1):
    for pat in list_patterns:
        url = base_url + (pat.format(page=page) if "{page}" in pat else pat)
        # Patterns without a page placeholder resolve to the same URL on every
        # iteration; downloading them again only repeats a request whose links
        # are already in seen_detail.
        if url in fetched_listings:
            continue
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
        except requests.exceptions.RequestException:
            continue  # best effort: a failed listing may be retried on a later page
        if resp.status_code != 200 or not resp.text:
            continue
        fetched_listings.add(url)

        soup = BeautifulSoup(resp.text, "html.parser")

        # Candidate detail links (deliberately broad substrings).
        anchors = soup.find_all("a", href=lambda h: h and any(x in h for x in [
            "idprop=", "prop=", "/propiedad", "/propiedades", "ficha"
        ]))

        # Normalize and keep only links that look like detail pages.
        detail_links = []
        for a in anchors:
            detail_url = normalize_link(base_url, a.get("href"))
            if not detail_url:
                continue
            # Skip links that point back to the listing itself.
            if (
                "pag=propiedades" in detail_url
                and "idprop=" not in detail_url
                and "prop=" not in detail_url
                and "/propiedad" not in detail_url
            ):
                continue
            if detail_url not in seen_detail:
                seen_detail.add(detail_url)
                detail_links.append(detail_url)

        # Download and parse each detail page.
        for link in detail_links:
            try:
                r = requests.get(link, headers=HEADERS, timeout=20)
            except requests.exceptions.RequestException:
                continue
            if r.status_code != 200 or not r.text:
                continue
            page_text = BeautifulSoup(r.text, "html.parser").get_text(" ", strip=True)

            salto_props.append({
                "precio": extract_price_text(page_text),
                "tamano": extract_m2(page_text),
                "habitaciones": extract_rooms_fmt(page_text),
                "link": link
            })

            if len(salto_props) >= DESIRED:
                break

        if len(salto_props) >= DESIRED:
            break

    if len(salto_props) >= DESIRED:
        break
    # A page that yields nothing does not stop the search; later pages and
    # patterns are still tried.

# Save as JSON with a single city entry.
# NOTE(review): the other city cells in this notebook write the same file in
# "w" mode, so only the most recently executed cell's city survives — confirm
# this is intended.
data = {
    "ciudades": [
        {"nombre": "Salto", "propiedades": salto_props[:DESIRED]}
    ]
}

with open("propiedades.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Salto listo → propiedades.json (total: {len(salto_props)})")

Salto listo → propiedades.json (total: 10)


In [24]:
# === Punta del Este (Market del Este) — enlaces correctos + datos normalizados ===
import requests
import json
import re
from bs4 import BeautifulSoup

# Request headers: browser-like User-Agent plus an Accept-Language preference.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Accept-Language": "es-UY,es;q=0.9,en;q=0.8",
}

base_url = "https://www.marketdeleste.com"
seed_url = base_url + "/apartamentos/en-venta/?ref=&ciudad=2&dormitorios=&precioMaximo="

# Stop after this many properties / give up after this many page numbers
# (raise MAX_PAGES if more results are needed).
DESIRED, MAX_PAGES = 10, 50

# Listing URLs to try: the seed itself, the seed with guessed page parameters,
# and a path-style pagination variant.
list_patterns = [seed_url]
list_patterns.extend(
    seed_url + suffix
    for suffix in ("&page={page}", "&pagina={page}", "&paged={page}")
)
list_patterns.append(base_url + "/apartamentos/en-venta/page/{page}/")

# ---------------- helpers ----------------
def abs_url(base, href):
    """Turn an anchor's href into an absolute URL, or None if it is not a page link.

    Args:
        base: Site root used as prefix for relative hrefs, e.g. "https://host".
        href: Raw href attribute value; may be None, empty, or blank.

    Returns:
        The absolute URL, or None when href is missing/blank, a fragment, or
        uses a non-page scheme (javascript:, mailto:, tel:).
    """
    if not href:
        return None
    href = href.strip()
    # A whitespace-only href would otherwise fall through and yield base + "/".
    if not href:
        return None
    if href.lower().startswith(("javascript:", "mailto:", "tel:", "#")):
        return None
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Protocol-relative URL: it already names its host, so only the scheme
        # is borrowed from base instead of prefixing the whole base.
        scheme = base.split("://", 1)[0] if "://" in base else "https"
        return f"{scheme}:{href}"
    root = base.rstrip("/")  # avoid "host//path" when base ends with a slash
    if href.startswith("/"):
        return root + href
    return root + "/" + href

def is_list_url(url):
    """Tell whether url looks like a listing or pagination page, not a detail page.

    Used to keep listing/paginator links out of the scraped results. A missing
    or empty url is reported as a listing so callers skip it.

    NOTE(review): "/en-venta/?" is not among the markers, so a listing URL of
    that shape without a page parameter is not recognised — confirm whether
    that is intended.

    Args:
        url: Absolute URL string, or None.

    Returns:
        True if url is falsy or contains any listing/pagination marker.
    """
    if not url:
        return True
    list_markers = (
        "/en-venta?",
        "/en-venta/&",
        "/en-venta/#",
        "/en-venta/page/",
        "pagina=",
        "paged=",
        "page=",
    )
    for marker in list_markers:
        if marker in url:
            return True
    return False

def price_text(text):
    """Return the first currency amount found in text, or None.

    The amount must start and end with a digit, so trailing sentence
    punctuation is not captured and a currency sign followed only by
    separators is not reported as a price.

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        The matched snippet (e.g. "U$S 250.000"), or None if no currency
        marker (U$S, US$, USD, $) followed by a number is found.
    """
    if not text:
        return None
    t = text.replace("\xa0", " ").replace("\n", " ")
    m = re.search(r"(?:U\$S|US\$|USD|\$)\s*\d+(?:[.,]\d+)*", t, re.I)
    return m.group(0).strip() if m else None

def price_to_int_usd(text):
    """Convert a price snippet such as "U$S 120.000" to an integer amount.

    Thousands separators are dropped. A trailing decimal part (one or two
    digits after the last '.' or ',') is discarded instead of being fused into
    the integer, so "120.000,50" becomes 120000 rather than 12000050.

    The currency marker is not inspected: the digits are returned whatever
    currency the snippet names.

    Args:
        text: Price snippet; may be None or empty.

    Returns:
        The whole-unit amount as int, or None when text has no digits.
    """
    if not text:
        return None
    # Drop trailing punctuation first so a sentence-final '.' cannot hide decimals.
    cleaned = text.strip().rstrip(".,")
    # A 3-digit tail is a thousands group and is kept; 1-2 digits are decimals.
    whole = re.sub(r"(?<=\d)[.,]\d{1,2}$", "", cleaned)
    digits = re.sub(r"\D", "", whole)
    return int(digits) if digits else None

def rooms_fmt(text):
    """Find a bedroom count in text and format it as "<n> dormitorio(s)".

    A single case-insensitive search for "<number> Dorm" is used. It covers
    both the abbreviation ("2 dorm.", "3 Dorms") and the full word
    ("3 Dormitorios"), so a separate fallback for the full word could never
    be reached and is omitted.

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        "1 dormitorio" for a count of one, "<n> dormitorios" otherwise, or
        None when no count is found.
    """
    if not text:
        return None
    t = text.replace("\xa0", " ").replace("\n", " ")
    m = re.search(r"(\d+)\s*Dorm", t, re.I)
    if m is None:
        return None
    n = int(m.group(1))
    return "1 dormitorio" if n == 1 else f"{n} dormitorios"

def size_m2(text):
    """Return the first area in square metres found in text as "<number> m²".

    The whole number is captured, including '.'/',' separators, and it may not
    start in the middle of a longer digit run. A bare 1-4 digit capture would
    turn "1.200 m2" into "200 m²" and "12500 m2" into "2500 m²".

    Args:
        text: Plain text of a page; may be None or empty.

    Returns:
        The number exactly as written followed by " m²" (e.g. "85 m²",
        "1.200 m²"), or None when no area is found.
    """
    if not text:
        return None
    t = text.replace("\xa0", " ").replace("\n", " ")
    m = re.search(
        r"(?<!\d)(\d+(?:[.,]\d+)*)\s*(?:m²|m2|mt2|mts2|m\^2)\b",
        t,
        re.I,  # also covers upper-case spellings such as "M2"
    )
    return f"{m.group(1)} m²" if m else None

def get_detail_fields(detail_html):
    """Extract price, size and bedroom count from a detail page's HTML.

    The page is flattened to plain text and handed to the text helpers
    price_text / price_to_int_usd, size_m2 and rooms_fmt.

    Args:
        detail_html: HTML source of the page.

    Returns:
        A tuple (price, size, rooms) holding the results of price_to_int_usd,
        size_m2 and rooms_fmt respectively; each is None when not found.
    """
    page_text = BeautifulSoup(detail_html, "html.parser").get_text(" ", strip=True)
    return (
        price_to_int_usd(price_text(page_text)),
        size_m2(page_text),
        rooms_fmt(page_text),
    )

# ---------------- scraping ----------------
props = []
seen = set()              # candidate links already queued (dedupe by URL)
fetched_listings = set()  # listing URLs already downloaded successfully

for page in range(1, MAX_PAGES + 1):
    for pat in list_patterns:
        url = pat.format(page=page) if "{page}" in pat else pat
        # The seed pattern has no page placeholder and resolves to the same URL
        # on every iteration; downloading it again only repeats a request whose
        # links are already in `seen`.
        if url in fetched_listings:
            continue
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
        except requests.exceptions.RequestException:
            continue  # best effort: a failed listing may be retried on a later page
        if resp.status_code != 200 or not resp.text:
            continue
        fetched_listings.add(url)
        soup = BeautifulSoup(resp.text, "html.parser")

        # Candidate detail links: non-listing URLs containing a typical
        # property/apartment path segment.
        anchors = soup.find_all("a", href=True)
        cand_links = []
        for a in anchors:
            candidate = abs_url(base_url, a.get("href"))
            if not candidate or is_list_url(candidate):
                continue
            if any(seg in candidate for seg in ["/propiedad/", "/propiedades/", "/apartamento", "/apartamentos"]):
                if candidate not in seen:
                    seen.add(candidate)
                    cand_links.append(candidate)

        # Fallback when the filter found nothing: take every non-listing link
        # and let the content check below decide whether it is a detail page.
        if not cand_links:
            for a in anchors:
                candidate = abs_url(base_url, a.get("href"))
                if candidate and candidate not in seen and not is_list_url(candidate):
                    seen.add(candidate)
                    cand_links.append(candidate)

        # Open each candidate, validate it as a detail page, extract fields.
        for link in cand_links:
            try:
                r = requests.get(link, headers=HEADERS, timeout=20)
            except requests.exceptions.RequestException:
                continue
            if r.status_code != 200 or not r.text:
                continue

            price_num, m2_txt, rooms_str = get_detail_fields(r.text)

            # A page counts as a detail page only if at least one of price,
            # size or bedrooms was recovered.
            if price_num is None and m2_txt is None and rooms_str is None:
                continue

            props.append({
                "precio": price_num,        # integer amount if parsed, else None
                "tamano": m2_txt,           # '<n> m²' if found
                "habitaciones": rooms_str,  # '<n> dormitorio(s)' if found
                "link": link                # absolute URL that answered with 200
            })

            if len(props) >= DESIRED:
                break

        if len(props) >= DESIRED:
            break

    if len(props) >= DESIRED:
        break

# ---------------- save ----------------
# NOTE(review): the other city cells in this notebook write the same file in
# "w" mode, so only the most recently executed cell's city survives — confirm
# this is intended.
data = {
    "ciudades": [
        {"nombre": "Punta del Este", "propiedades": props[:DESIRED]}
    ]
}

with open("propiedades.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Punta del Este listo → propiedades.json (total: {len(props)})")

Punta del Este listo → propiedades.json (total: 10)
