# 1) Librerías

In [None]:
import os, time, random, re, sys
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import urllib.robotparser as rp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from thefuzz import process

# ------------ WebScraping página 1 -------------

**Marcas por país**

# 1) User‑Agent, sesión y robots.txt

In [3]:
BASE1 = "https://www.motor.mapfre.es/coches/noticias-coches/marcas-coche/"
UA = "ETL-Class/1.0 (contact: jhonny.hurtado@uao.edu.co)"
HEADERS = {"User-Agent": UA}

# One shared session so every request carries the identifying User-Agent
session = requests.Session()
session.headers.update(HEADERS)

# Check robots.txt (if it is missing or unreadable, carry on but report it)
robots_url = urljoin(BASE1, "/robots.txt")
rpobj = rp.RobotFileParser(robots_url)
try:
    rpobj.read()
    print("robots.txt leído:", robots_url)
    print("¿Permitido el acceso al dominio BASE?", rpobj.can_fetch(UA, BASE1))
except Exception as e:
    print("No se pudo leer robots.txt:", e)

robots.txt leído: https://www.motor.mapfre.es/robots.txt
¿Permitido el acceso al dominio BASE? True


# 2) Helper de descarga robusto (reintentos, backoff, variantes http/https)

In [4]:
def fetch_html(url, max_retries=3, base_sleep=0.5, session=None, verbose=True):
    """Download ``url`` and return the response body as text, or ``None`` on failure.

    Up to three URL variants are tried in order: the URL as given, the URL
    without trailing slashes, and the URL with ``https://`` replaced by
    ``http://``. Variants that turn out identical are requested only once.

    Args:
        url: Address to download.
        max_retries: Attempts per variant.
        base_sleep: Base delay in seconds; it doubles on every attempt and a
            random jitter of up to 0.25 s is added.
        session: Object exposing ``get(url, timeout=...)``. When omitted, the
            notebook-level ``session`` is looked up at call time, so a session
            re-created in a later cell is picked up without redefining this
            function.
        verbose: Print every request and every network error.

    Returns:
        The response text of the first successful request, otherwise ``None``.
    """
    if session is None:
        # Late binding: a default evaluated at definition time would keep
        # pointing at whichever session existed when the cell was run.
        session = globals().get("session")
        if session is None:
            session = requests.Session()

    # dict.fromkeys drops repeated variants while preserving their order
    variants = list(dict.fromkeys(
        [url, url.rstrip('/'), url.replace('https://', 'http://')]
    ))
    last_variant = variants[-1]

    for v in variants:
        for attempt in range(1, max_retries + 1):
            try:
                r = session.get(v, timeout=20)
                if verbose: print(f"GET {v} -> {r.status_code}")
                if r.ok:
                    return r.text
                if r.status_code == 404:
                    break  # this variant does not exist; try the next one
            except requests.RequestException as e:
                # Only network-level failures are retried; programming errors surface
                if verbose: print("Error de red:", e)
            if v == last_variant and attempt == max_retries:
                break  # nothing left to try, so do not wait
            sleep = base_sleep * (2**(attempt-1)) + random.uniform(0, 0.25)
            time.sleep(sleep)
    return None

# Preferred parser: lxml; otherwise fall back to the built-in html.parser
PARSER = "html.parser"
try:
    import lxml  # imported only to check that it is available
except Exception:
    print("⚠️ lxml no disponible; usando 'html.parser'.")
else:
    PARSER = "lxml"

⚠️ lxml no disponible; usando 'html.parser'.


# 3) Parseo de páginas de listado

In [5]:
from bs4 import BeautifulSoup


def parse_country_brands(html):
    """Build a ``{brand: country}`` mapping from the brands-by-country page.

    Every ``<h3>`` is read as a country name and the first ``<ul>`` sibling that
    follows it as that country's list of brands. A brand listed under several
    headings keeps the country of the last heading processed.

    Args:
        html: Page markup. ``None`` or an empty string (a failed download)
            yields an empty mapping instead of raising.

    Returns:
        dict mapping brand text to country text.
    """
    if not html:
        return {}

    soup = BeautifulSoup(html, PARSER)
    mapping = {}  # dict: brand -> country

    for h3 in soup.select("h3"):
        country = h3.get_text(strip=True)
        ul = h3.find_next_sibling("ul")
        if ul is None:
            continue
        for li in ul.select("li"):
            brand = li.get_text(strip=True)
            if brand:  # skip empty list items
                mapping[brand] = country

    return mapping

# Download the brands page (same address as BASE1) and map each brand to its country
html = fetch_html(BASE1)
if html is None:
    # fetch_html returns None once every retry has failed; stop with a clear message
    raise RuntimeError(f"No se pudo descargar {BASE1}")
brands_by_country = parse_country_brands(html)

# Sample output
for brand, country in list(brands_by_country.items())[:15]:
    print(brand, "->", country)


GET https://www.motor.mapfre.es/coches/noticias-coches/marcas-coche/ -> 200
Apollo Automobile -> Alemania
Audi -> Alemania
Alpina -> Alemania
Bitter Cars -> Alemania
BMW -> Alemania
Brabus -> Alemania
Ford -> Estados Unidos
Hauser -> Alemania
Mercedes-Benz -> Alemania
Opel -> Alemania
Porsche -> Alemania
Ruf -> Alemania
Smart -> Alemania
Volkswagen -> Alemania
Brabham Automotive -> Australia


In [6]:
# Build the scraped dataset: one row per (brand, country) pair

df_marcas_paises = pd.DataFrame(list(brands_by_country.items()), columns=["brand", "country"])
print(df_marcas_paises.head())

               brand   country
0  Apollo Automobile  Alemania
1               Audi  Alemania
2             Alpina  Alemania
3        Bitter Cars  Alemania
4                BMW  Alemania


# 4) Normalización

In [7]:
# Normalize the text columns: cast to str, trim surrounding whitespace, lower-case
cols_texto = ["brand", "country"]
for col in cols_texto:
    df_marcas_paises[col] = df_marcas_paises[col].astype(str).str.strip().str.lower()


df_marcas_paises.head()

Unnamed: 0,brand,country
0,apollo automobile,alemania
1,audi,alemania
2,alpina,alemania
3,bitter cars,alemania
4,bmw,alemania


# 5) Unir datasets

In [10]:
# Load the cleaned used-car listings
df_used_cars_base_clean = pd.read_csv("../data/raw/used_cars_base_clean.csv")

# Attach the country of each brand. Lower-casing can collapse two scraped
# spellings into one brand, so keep a single row per brand and let pandas
# verify the many-to-one relationship: the left join must not multiply listings.
df_cars_final = df_used_cars_base_clean.merge(
    df_marcas_paises.drop_duplicates(subset="brand", keep="last"),
    on="brand",
    how="left",
    validate="many_to_one",
)


print(df_cars_final.shape)
df_cars_final.head()



(300, 13)


Unnamed: 0,listing_id,brand,model,year,mileage_km,price_usd,city,state,fuel,transmission,seller_type,listing_date,country
0,1,chevrolet,onix,2016,90791,14551.54,bucaramanga,santander,gasoline,manual,dealer,2024-02-17,estados unidos
1,2,hyundai,i30,2024,114658,21847.97,medellín,antioquia,gasoline,automatic,private,2024-04-22,corea del sur
2,3,volkswagen,t-cross,2008,91468,5265.93,medellín,antioquia,gasoline,automatic,dealer,2024-02-17,alemania
3,4,mazda,mazda2,2019,63498,19512.69,cali,valle del cauca,diesel,automatic,dealer,2024-07-12,japón
4,5,chevrolet,tracker,2019,27864,18018.49,cali,valle del cauca,electric,automatic,dealer,2024-07-13,estados unidos


In [73]:
# Inspect column dtypes and non-null counts of the loaded listings
df_used_cars_base_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   listing_id    300 non-null    int64  
 1   brand         300 non-null    object 
 2   model         300 non-null    object 
 3   year          300 non-null    int64  
 4   mileage_km    300 non-null    int64  
 5   price_usd     300 non-null    float64
 6   city          300 non-null    object 
 7   state         300 non-null    object 
 8   fuel          300 non-null    object 
 9   transmission  300 non-null    object 
 10  seller_type   300 non-null    object 
 11  listing_date  300 non-null    object 
dtypes: float64(1), int64(3), object(8)
memory usage: 28.3+ KB


# ------------ WebScraping página 2 -------------

**Precios Originales**

El dataset contiene el precio al que se venden los vehículos ya usados (de segunda); añadiremos una columna extra con los precios de fábrica.

# 1) User‑Agent, sesión y robots.txt

In [11]:
BASE2 = "https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=1"
UA = "ETL-Class/1.0 (contact: jhonny.hurtado@uao.edu.co)"
HEADERS = {"User-Agent": UA}

# Fresh session for the second site, carrying the same identifying User-Agent
session = requests.Session()
session.headers.update(HEADERS)

# Check robots.txt (if it is missing or unreadable, carry on but report it)
robots_url = urljoin(BASE2, "/robots.txt")
rpobj = rp.RobotFileParser(robots_url)
try:
    rpobj.read()
    print("robots.txt leído:", robots_url)
    print("¿Permitido el acceso al dominio BASE?", rpobj.can_fetch(UA, BASE2))
except Exception as e:
    print("No se pudo leer robots.txt:", e)

robots.txt leído: https://www.autocosmos.com.co/robots.txt
¿Permitido el acceso al dominio BASE? True


# 2) Helper de descarga robusto

In [12]:
def fetch_html(url, max_retries=3, base_sleep=0.5, session=None, verbose=True):
    """Download ``url`` and return the response body as text, or ``None`` on failure.

    Up to three URL variants are tried in order: the URL as given, the URL
    without trailing slashes, and the URL with ``https://`` replaced by
    ``http://``. Variants that turn out identical are requested only once.

    Args:
        url: Address to download.
        max_retries: Attempts per variant.
        base_sleep: Base delay in seconds; it doubles on every attempt and a
            random jitter of up to 0.25 s is added.
        session: Object exposing ``get(url, timeout=...)``. When omitted, the
            notebook-level ``session`` is looked up at call time, so a session
            re-created in another cell is picked up without redefining this
            function.
        verbose: Print every request and every network error.

    Returns:
        The response text of the first successful request, otherwise ``None``.
    """
    if session is None:
        # Late binding: a default evaluated at definition time would keep
        # pointing at whichever session existed when the cell was run.
        session = globals().get("session")
        if session is None:
            session = requests.Session()

    # dict.fromkeys drops repeated variants while preserving their order
    variants = list(dict.fromkeys(
        [url, url.rstrip('/'), url.replace('https://', 'http://')]
    ))
    last_variant = variants[-1]

    for v in variants:
        for attempt in range(1, max_retries + 1):
            try:
                r = session.get(v, timeout=20)
                if verbose: print(f"GET {v} -> {r.status_code}")
                if r.ok:
                    return r.text
                if r.status_code == 404:
                    break  # this variant does not exist; try the next one
            except requests.RequestException as e:
                # Only network-level failures are retried; programming errors surface
                if verbose: print("Error de red:", e)
            if v == last_variant and attempt == max_retries:
                break  # nothing left to try, so do not wait
            sleep = base_sleep * (2**(attempt-1)) + random.uniform(0, 0.25)
            time.sleep(sleep)
    return None

# Preferred parser: lxml; otherwise fall back to the built-in html.parser
PARSER = "html.parser"
try:
    import lxml  # imported only to check that it is available
except Exception:
    print("⚠️ lxml no disponible; usando 'html.parser'.")
else:
    PARSER = "lxml"

⚠️ lxml no disponible; usando 'html.parser'.


# 3) Parseo de páginas de listado

In [13]:
# --- 1. Parse one listing page ---
def parse_list_page(html):
    """Extract the vehicle cards and the next-page URL from a listing page.

    Args:
        html: Page markup. ``None`` or an empty string (a failed download)
            yields ``([], None)`` instead of raising.

    Returns:
        ``(items, next_url)`` where ``items`` is a list of dicts with the keys
        ``title``, ``price_raw`` and ``detail_url`` (each ``None`` when the
        element is missing from the card) and ``next_url`` is the absolute URL
        of the next page, or ``None`` when there is no usable next link.
    """
    if not html:
        return [], None

    soup = BeautifulSoup(html, PARSER)
    items = []

    # Each vehicle card is an <article class="card listing-card ...">
    for card in soup.select("article.card.listing-card"):
        # Title (e.g. "KIA K3 Cross Vibrant")
        title_tag = card.select_one("meta[itemprop='name']")
        title = title_tag.get("content") if title_tag else None

        # Price (e.g. "$81.990.000")
        price_tag = card.select_one("span.listing-card__price-value")

        # Detail URL: first anchor inside the card
        link_tag = card.select_one("a")
        href = link_tag.get("href") if link_tag else None

        items.append({
            "title": title.strip() if title is not None else None,
            "price_raw": price_tag.get_text(strip=True) if price_tag else None,
            "detail_url": urljoin(BASE2, href) if href is not None else None,
        })

    # Link to the next page, if any. An anchor without href (or with an empty
    # one) is treated as "no next page" instead of raising KeyError.
    next_link = soup.select_one("a.pagenav.btn.m-next")
    next_href = next_link.get("href") if next_link else None
    next_url = urljoin(BASE2, next_href) if next_href else None

    return items, next_url



# 5) Crawler: recorrer N páginas con pausas

In [14]:
 #--- 2. Crawler para recorrer varias páginas ---
def crawl_list(BASE2, max_pages=50, sleep=(0.7, 1.5)):
    """Follow the listing pagination from ``BASE2`` and collect every parsed row.

    Args:
        BASE2: URL of the first listing page (this parameter shadows the
            notebook-level constant of the same name inside the function).
        max_pages: Maximum number of pages to download.
        sleep: ``(min, max)`` bounds in seconds for the random pause between
            consecutive page requests.

    Returns:
        List with the row dicts produced by ``parse_list_page`` for all pages.
        Crawling stops when a download fails, when there is no next link, when
        ``max_pages`` is reached, or when the next link points to a page that
        was already visited.
    """
    url = BASE2
    data = []
    pages = 0
    visited = set()  # guards against pagination links that loop back

    while url and pages < max_pages and url not in visited:
        visited.add(url)
        html = fetch_html(url)
        if html is None:
            break
        rows, url = parse_list_page(html)
        data.extend(rows)
        pages += 1
        print(f"Acumulados: {len(data)} | Página: {pages} | Siguiente: {url}")
        # Pause only when another page is actually going to be requested
        if url and url not in visited and pages < max_pages:
            time.sleep(random.uniform(*sleep))
    return data

# Crawl at most 50 listing pages, starting at BASE2
raw_list = crawl_list(BASE2, max_pages=50)

GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=1 -> 200
Acumulados: 48 | Página: 1 | Siguiente: https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=2
GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=2 -> 200
Acumulados: 96 | Página: 2 | Siguiente: https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=3
GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=3 -> 200
Acumulados: 144 | Página: 3 | Siguiente: https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=4
GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=4 -> 200
Acumulados: 192 | Página: 4 | Siguiente: https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=5
GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=5 -> 200
Acumulados: 240 | Página: 5 | Siguiente: https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=6
GET https://www.autocosmos.com.co/auto/nuevo?pr=400&cd=5995&pidx=6 -> 200
Acumulados: 288 | Página: 6 | Siguient

In [15]:
# Build the scraped dataset from the list of row dicts returned by the crawler

df_precios = pd.DataFrame(raw_list)
print(df_precios.head())

              title     price_raw  \
0  Volkswagen Nivus  $101.990.000   
1      Jeep Compass  $185.990.000   
2  Volkswagen Nivus   $90.990.000   
3      Jeep Compass  $185.990.000   
4       Mazda CX-90  $262.350.000   

                                          detail_url  
0  https://www.autocosmos.com.co/auto/nuevo/volks...  
1  https://www.autocosmos.com.co/auto/nuevo/jeep/...  
2  https://www.autocosmos.com.co/auto/nuevo/volks...  
3  https://www.autocosmos.com.co/auto/nuevo/jeep/...  
4  https://www.autocosmos.com.co/auto/nuevo/mazda...  


# 6) Normalización

In [18]:
# Normalize the scraped text columns: cast to str, trim whitespace, lower-case.
# NOTE: astype(str) also turns missing values into text (None becomes "none").
cols_texto = ["title", "price_raw"]
for col in cols_texto:
    df_precios[col] = df_precios[col].astype(str).str.strip().str.lower()


df_precios.head()

Unnamed: 0,title,price_raw,detail_url
0,volkswagen nivus,$101.990.000,https://www.autocosmos.com.co/auto/nuevo/volks...
1,jeep compass,$185.990.000,https://www.autocosmos.com.co/auto/nuevo/jeep/...
2,volkswagen nivus,$90.990.000,https://www.autocosmos.com.co/auto/nuevo/volks...
3,jeep compass,$185.990.000,https://www.autocosmos.com.co/auto/nuevo/jeep/...
4,mazda cx-90,$262.350.000,https://www.autocosmos.com.co/auto/nuevo/mazda...


**Limpieza**

In [19]:
# Trimmed price text; astype(str) keeps this safe when a value is not a string
price_text = df_precios["price_raw"].astype(str).str.strip()

# Keep only the digits (drops $, "us", thousands separators, etc.)
price_digits = price_text.str.replace(r"[^0-9]", "", regex=True)

# A listing without any digit has no usable price (e.g. the text "none" left
# by a card without a price tag); converting it to int would raise ValueError.
has_price = price_digits.ne("")
n_sin_precio = int((~has_price).sum())
if n_sin_precio:
    print(f"Filas sin precio descartadas: {n_sin_precio}")
df_precios = df_precios.loc[has_price].copy()

# Currency column: COP unless the price text starts with "us"
df_precios["currency"] = "COP"
df_precios.loc[price_text[has_price].str.startswith("us"), "currency"] = "USD"

# Numeric price; explicit int64 so the width does not depend on the platform
df_precios["price_clean"] = price_digits[has_price].astype("int64")

df_precios = df_precios.drop(columns=["price_raw"]).reset_index(drop=True)
print(df_precios.head())


              title                                         detail_url  \
0  volkswagen nivus  https://www.autocosmos.com.co/auto/nuevo/volks...   
1      jeep compass  https://www.autocosmos.com.co/auto/nuevo/jeep/...   
2  volkswagen nivus  https://www.autocosmos.com.co/auto/nuevo/volks...   
3      jeep compass  https://www.autocosmos.com.co/auto/nuevo/jeep/...   
4       mazda cx-90  https://www.autocosmos.com.co/auto/nuevo/mazda...   

  currency  price_clean  
0      COP    101990000  
1      COP    185990000  
2      COP     90990000  
3      COP    185990000  
4      COP    262350000  


**Creación de la columna key**

Se creó esta columna combinando `brand` y `model` para luego unir los datasets.

In [61]:

# Join key for the listings: "<brand> <model>", lower-cased and trimmed
df_cars_final["key"] = (
    df_cars_final["brand"].str.lower().str.strip() + " " +
    df_cars_final["model"].str.lower().str.strip()
)

# --- Scraped dataset (new-car prices): the lower-cased title is the key ---
df_precios["key"] = df_precios["title"].str.lower()


print("Head de df_precios:")
print(df_precios.head())


print("\nHead de df_cars_final:")
print(df_cars_final.head())


Head de df_precios:
                title                                         detail_url  \
0  ford f-150 híbrida  https://www.autocosmos.com.co/auto/nuevo/ford/...   
1   nissan kicks play  https://www.autocosmos.com.co/auto/nuevo/nissa...   
2        jeep compass  https://www.autocosmos.com.co/auto/nuevo/jeep/...   
3   volkswagen virtus  https://www.autocosmos.com.co/auto/nuevo/volks...   
4          mazda mx-5  https://www.autocosmos.com.co/auto/nuevo/mazda...   

  currency  price_clean                 key  
0      COP    339990000  ford f-150 híbrida  
1      COP    104990000   nissan kicks play  
2      COP    185990000        jeep compass  
3      COP     89990000   volkswagen virtus  
4      COP    182650000          mazda mx-5  

Head de df_cars_final:
   listing_id       brand    model  year  mileage_km  price_usd         city  \
0           1   chevrolet     onix  2016       90791   14551.54  bucaramanga   
1           2     hyundai      i30  2024      114658   21847.97

# 7) Unimos Datasets

**Datos únicos**

In [None]:
# One row per key: when a title was scraped more than once, keep="first"
# retains the first occurrence (and therefore its price).
df_precios_unique = df_precios.drop_duplicates(subset="key", keep="first")

# Exact-match join: listings whose key has no scraped counterpart get NaN
# in price_clean / currency.
used_cars_base_scrapping = df_cars_final.merge(
    df_precios_unique[["key", "price_clean", "currency"]],
    on="key",
    how="left"
)

**Diccionario de precios para búsquedas rápidas**

In [None]:

# Lookup tables keyed by the scraped key: key -> price and key -> currency
price_dict = dict(zip(df_precios_unique["key"], df_precios_unique["price_clean"]))
currency_dict = dict(zip(df_precios_unique["key"], df_precios_unique["currency"]))



**Función para buscar coincidencia difusa**

In [None]:
def fuzzy_match(key, choices, threshold=60):
    """Return the entry of ``choices`` that best matches ``key``, or ``None``.

    Args:
        key: Text to look up. Anything that is not a non-blank string
            (``None``, NaN, ``""``) returns ``None`` without calling thefuzz.
        choices: Candidate strings handed to ``process.extractOne``.
        threshold: Minimum score the best candidate must reach to be accepted.
            NOTE(review): 60 is a permissive cutoff and can accept loosely
            related strings; pass a higher value when a wrong match is costly.

    Returns:
        The best-scoring candidate when its score is >= ``threshold``,
        otherwise ``None``.
    """
    # Missing or blank keys cannot be matched against anything
    if not isinstance(key, str) or not key.strip():
        return None

    match = process.extractOne(key, choices)
    if not match or match[1] < threshold:
        return None
    return match[0]

**Extraemos información**

In [59]:
# Run the fuzzy search only for the rows that found no exact match
missing_mask = used_cars_base_scrapping["price_clean"].isna()
choices = list(df_precios_unique["key"].unique())

# Many listings share the same key, so resolve each distinct key once and
# reuse the answer instead of repeating the fuzzy search row by row.
missing_keys = used_cars_base_scrapping.loc[missing_mask, "key"]
fuzzy_lookup = {k: fuzzy_match(k, choices) for k in missing_keys.unique()}
used_cars_base_scrapping.loc[missing_mask, "matched_key"] = missing_keys.map(fuzzy_lookup)

# NOTE(review): with fuzzy_match's default cutoff, loosely related models can
# be paired (a past run matched "ford focus" to "ford f-150 hibrida"); review
# `matched_key` before trusting the prices filled in below.

# Fill the gaps with the price / currency of the matched key. Rows that already
# had an exact match are left untouched; unmatched rows stay missing.
matched_key = used_cars_base_scrapping["matched_key"]
used_cars_base_scrapping["price_clean"] = used_cars_base_scrapping["price_clean"].fillna(
    matched_key.map(price_dict)
)
used_cars_base_scrapping["currency"] = used_cars_base_scrapping["currency"].fillna(
    matched_key.map(currency_dict)
)

In [60]:
# Count how many listings ended up with a price (exact or fuzzy) and how many did not
print("Coincidencias exactas o difusas:", used_cars_base_scrapping["price_clean"].notna().sum())
print("Coincidencias faltantes:", used_cars_base_scrapping["price_clean"].isna().sum())

Coincidencias exactas o difusas: 300
Coincidencias faltantes: 0


In [61]:
# Preview the first 20 rows, including the key / matched_key helper columns
used_cars_base_scrapping.head(20)

Unnamed: 0,listing_id,brand,model,year,mileage_km,price_usd,city,state,fuel,transmission,seller_type,listing_date,country,key,price_clean,currency,matched_key
0,1,chevrolet,onix,2016,90791,14551.54,bucaramanga,santander,gasoline,manual,dealer,2024-02-17,estados unidos,chevrolet onix,89800000.0,COP,chevrolet onix turbo
1,2,hyundai,i30,2024,114658,21847.97,medellín,antioquia,gasoline,automatic,private,2024-04-22,corea del sur,hyundai i30,78990000.0,COP,hyundai hb20 getz
2,3,volkswagen,t-cross,2008,91468,5265.93,medellín,antioquia,gasoline,automatic,dealer,2024-02-17,alemania,volkswagen t-cross,89990000.0,COP,
3,4,mazda,mazda2,2019,63498,19512.69,cali,valle del cauca,diesel,automatic,dealer,2024-07-12,japón,mazda mazda2,97900000.0,COP,mazda 2
4,5,chevrolet,tracker,2019,27864,18018.49,cali,valle del cauca,electric,automatic,dealer,2024-07-13,estados unidos,chevrolet tracker,100610000.0,COP,chevrolet tracker turbo
5,6,kia,cerato,2019,57174,16977.99,medellín,antioquia,gasoline,automatic,dealer,2024-03-28,corea del sur,kia cerato,104990000.0,COP,kia k3
6,7,ford,focus,2013,55301,14160.78,bucaramanga,santander,gasoline,manual,dealer,2024-04-27,estados unidos,ford focus,339990000.0,COP,ford f-150 hibrida
7,8,toyota,rav4,2020,98385,16784.75,bucaramanga,santander,diesel,manual,private,2024-07-21,japón,toyota rav4,109900000.0,COP,toyota corolla hev
8,9,volkswagen,polo,2016,137552,14496.13,bucaramanga,santander,gasoline,manual,private,2024-04-22,alemania,volkswagen polo,79990000.0,COP,
9,10,renault,kwid,2010,147209,8032.13,bogotá,cundinamarca,gasoline,manual,private,2024-02-02,francia,renault kwid,59990000.0,COP,


In [62]:
# Remove the helper columns (matched_key and key) now that prices are attached
used_cars_base_scrapping = used_cars_base_scrapping.drop(columns=["matched_key", "key"])


used_cars_base_scrapping.head(20)

Unnamed: 0,listing_id,brand,model,year,mileage_km,price_usd,city,state,fuel,transmission,seller_type,listing_date,country,price_clean,currency
0,1,chevrolet,onix,2016,90791,14551.54,bucaramanga,santander,gasoline,manual,dealer,2024-02-17,estados unidos,89800000.0,COP
1,2,hyundai,i30,2024,114658,21847.97,medellín,antioquia,gasoline,automatic,private,2024-04-22,corea del sur,78990000.0,COP
2,3,volkswagen,t-cross,2008,91468,5265.93,medellín,antioquia,gasoline,automatic,dealer,2024-02-17,alemania,89990000.0,COP
3,4,mazda,mazda2,2019,63498,19512.69,cali,valle del cauca,diesel,automatic,dealer,2024-07-12,japón,97900000.0,COP
4,5,chevrolet,tracker,2019,27864,18018.49,cali,valle del cauca,electric,automatic,dealer,2024-07-13,estados unidos,100610000.0,COP
5,6,kia,cerato,2019,57174,16977.99,medellín,antioquia,gasoline,automatic,dealer,2024-03-28,corea del sur,104990000.0,COP
6,7,ford,focus,2013,55301,14160.78,bucaramanga,santander,gasoline,manual,dealer,2024-04-27,estados unidos,339990000.0,COP
7,8,toyota,rav4,2020,98385,16784.75,bucaramanga,santander,diesel,manual,private,2024-07-21,japón,109900000.0,COP
8,9,volkswagen,polo,2016,137552,14496.13,bucaramanga,santander,gasoline,manual,private,2024-04-22,alemania,79990000.0,COP
9,10,renault,kwid,2010,147209,8032.13,bogotá,cundinamarca,gasoline,manual,private,2024-02-02,francia,59990000.0,COP


In [63]:
# Number of missing values per column
print(used_cars_base_scrapping.isnull().sum())

listing_id      0
brand           0
model           0
year            0
mileage_km      0
price_usd       0
city            0
state           0
fuel            0
transmission    0
seller_type     0
listing_date    0
country         0
price_clean     0
currency        0
dtype: int64


In [64]:
# Save to CSV (utf-8-sig adds a BOM to the file; the index is not written)
used_cars_base_scrapping.to_csv("../data/staging/used_cars_scrapping.csv", index=False, encoding="utf-8-sig")

# 8) Guardar dataset final y subir a PostgreSQL

In [None]:
# Write the final dataset to the staging CSV (same path and options as the "Guardar en CSV" cell)
used_cars_base_scrapping.to_csv("../data/staging/used_cars_scrapping.csv", index=False, encoding="utf-8-sig")

In [65]:
# Connection parameters: read from the environment when set, so real
# credentials never have to be written into the notebook. The fallbacks are
# the local docker-compose development values.
usuario = os.environ.get("POSTGRES_USER", "etluser")
password = os.environ.get("POSTGRES_PASSWORD", "etlpass")
host = os.environ.get("POSTGRES_HOST", "localhost")   # or the server address when not on the same machine
puerto = os.environ.get("POSTGRES_PORT", "5432")
base_datos = os.environ.get("POSTGRES_DB", "dw")

# Create the connection engine
engine = create_engine(f"postgresql://{usuario}:{password}@{host}:{puerto}/{base_datos}")

# Upload to PostgreSQL, replacing the table if it already exists
try:
    used_cars_base_scrapping.to_sql("used_cars_scrapping", engine, if_exists="replace", index=False)
finally:
    # Release the pooled connections whether or not the upload succeeded
    engine.dispose()

print("Dataset subido correctamente a PostgreSQL en la tabla 'used_cars_scrapping'")

Dataset subido correctamente a PostgreSQL en la tabla 'used_cars_scrapping'


In [68]:
# Inspect column dtypes and non-null counts of df_cars_final
df_cars_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148 entries, 2 to 298
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   listing_id    148 non-null    int64  
 1   brand         148 non-null    object 
 2   model         148 non-null    object 
 3   year          148 non-null    int64  
 4   mileage_km    148 non-null    int64  
 5   price_usd     148 non-null    float64
 6   city          148 non-null    object 
 7   state         148 non-null    object 
 8   fuel          148 non-null    object 
 9   transmission  148 non-null    object 
 10  seller_type   148 non-null    object 
 11  listing_date  148 non-null    object 
 12  country       148 non-null    object 
 13  key           148 non-null    object 
 14  price_clean   148 non-null    float64
 15  currency      148 non-null    object 
dtypes: float64(2), int64(3), object(11)
memory usage: 19.7+ KB
