### Librerías

In [23]:
from pymongo import MongoClient
import pandas as pd

from urllib.parse import urlparse
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

### Conexión Mongo

In [None]:
client = MongoClient("mongodb://localhost:27017/")

### Obtención de Datos

In [4]:
# base de datos y colección
db = client["local_data_inmobiliaria"]
collection = db["companies"]

In [26]:
rows = []

for doc in collection.find().limit(500):
    company = doc.get("name")
    domain = doc.get("primaryDomain")

    for src in doc.get("dataSources", []):
        rows.append({
            "company": company,
            "domain": domain,
            "url": src.get("url"),
            "links": src.get("links", {})
        })

df_pages = pd.DataFrame(rows)
df_pages.head()

Unnamed: 0,company,domain,url,links
0,Reity,reity.cl,https://reity.cl,"{'head': [], 'header': [], 'main': ['https://r..."
1,Reity,reity.cl,https://reity.cl/blog/product-update-mercado-s...,"{'head': [], 'header': [], 'main': ['https://...."
2,Reity,reity.cl,https://ayuda.reity.cl/es/collections/7476943-...,{'head': ['https://ayuda.reity.cl/es/collectio...
3,Reity,reity.cl,https://reity.cl/signin,"{'head': [], 'header': [], 'main': ['https://r..."
4,Reity,reity.cl,https://reity.cl/signup,"{'head': ['https://reity.cl//signup'], 'header..."


### Vectorización

In [27]:
def url_to_tokens(url):
    if not isinstance(url, str):
        return []

    url = url.strip()
    if len(url) < 3:
        return []

    # aceptar URLs completas o paths
    if not (url.startswith("http") or url.startswith("/")):
        return []

    tokens = []

    try:
        parsed = urlparse(url)

        # dominio
        if parsed.netloc:
            domain = parsed.netloc.replace("www.", "")
            tokens.extend(domain.split("."))

        # path
        path_tokens = re.split(r"[\/\-_]", parsed.path)
        tokens.extend([t for t in path_tokens if len(t) > 2])

    except Exception:
        return []

    return tokens

def extract_text_from_links(links_dict):
    tokens = []

    for section in ["head", "body", "footer"]:
        urls = links_dict.get(section, [])
        if isinstance(urls, list):
            for url in urls:
                tokens.extend(url_to_tokens(url))

    return " ".join(tokens)

In [29]:
df_pages["text"] = df_pages["links"].apply(extract_text_from_links)
df_pages[["url", "text"]]

Unnamed: 0,url,text
0,https://reity.cl,apps apple com app reity id6468899299 play goo...
1,https://reity.cl/blog/product-update-mercado-s...,apps apple com app reity id6468899299 play ...
2,https://ayuda.reity.cl/es/collections/7476943-...,ayuda reity cl collections 7476943 preguntas f...
3,https://reity.cl/signin,
4,https://reity.cl/signup,reity cl signup
...,...,...
70284,https://concreteinvesting.com/blog/progetti-im...,gmpg org xfn concreteinvesting com blog proget...
70285,https://concreteinvesting.com/blog/progetti-im...,
70286,https://concreteinvesting.com/blog/wp-json/wp/...,
70287,https://concreteinvesting.com/blog/wp-json/oem...,


### Tokenización

In [32]:
texts = df_pages["text"].fillna("")

vectorizer = TfidfVectorizer(
    min_df=2,
    max_features=3000
)

X = vectorizer.fit_transform(texts)

kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)
df_pages["cluster"] = kmeans.fit_predict(X)

df_pages[["url", "cluster"]]

Unnamed: 0,url,cluster
0,https://reity.cl,0
1,https://reity.cl/blog/product-update-mercado-s...,4
2,https://ayuda.reity.cl/es/collections/7476943-...,0
3,https://reity.cl/signin,0
4,https://reity.cl/signup,0
...,...,...
70284,https://concreteinvesting.com/blog/progetti-im...,4
70285,https://concreteinvesting.com/blog/progetti-im...,0
70286,https://concreteinvesting.com/blog/wp-json/wp/...,0
70287,https://concreteinvesting.com/blog/wp-json/oem...,0


In [33]:
df_pages["cluster"].value_counts()


cluster
0    34951
4    30944
1     1844
2     1326
3     1224
Name: count, dtype: int64

In [34]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(kmeans.n_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print(f"Cluster {i}: {top_terms}")

Cluster 0: ['nl', 'de', 'ch', 'co', 'io', 'd7', 'com', 'bolero', 'cl', 'be']
Cluster 1: ['5xx', 'error', 'landing', 'cloudflare', 'com', 'free', 'status', 'developers', 'troubleshooting', 'support']
Cluster 2: ['help', 'facebook', 'com', 'invest', 'materials', 'pre', 'dispute', 'education', 'terms', 'twitter']
Cluster 3: ['login', 'linkedin', 'com', 'licdn', 'static', 'legal', 'policy', 'aero', 'copyright', 'policies']
Cluster 4: ['com', 'co', 'eu', 'blog', 'mozzeno', 'twitter', 'json', 'legal', 'inveslar', 'assetmonk']


### Clusterización

In [48]:
SOCIAL_DOMAIN = {
    "facebook.com",
    "instagram.com",
    "linkedin.com",
    "twitter.com",
    "x.com",
    "tiktok.com",
    "threads.net",
    "pinterest.com",
    "github.com",
    "medium.com",
    "farcaster.xyz",
    "notion.site"
}

APP_DOMAIN = {
    "apps.apple.com",
    "play.google.com"
}

PROPERTY_DOMAIN = {
    "airbnb.cl"
}

LEGAL_DOMAIN = {
    "sii.cl",
    "buk.cl",
    "digitaloceanspaces.com"
}

TO_BE_IGNORED = {
    "intercom.com",
    "googleapis.com"
}

MULTIMEDIA_DOMAIN = {
    "intercomcdn.com"
}

YOUTUBE_DOMAIN = "youtube.com"

In [37]:
NEWS_KEYWORDS = ["news", "blog", "press", "update", "market", "fintech", "media"]
ONCHAIN_KEYWORDS = ["etherscan", "wallet", "tx", "blockchain", "smartcontract", "defi", "onchain"]
INVEST_KEYWORDS = ["invest", "investment", "fund", "portfolio", "capital"]

In [49]:
def categorize_url_multilabel(url, text=""):
    # proteger URL no string
    if not isinstance(url, str) or len(url.strip()) == 0:
        return ["other"]

    url = url.lower()
    text = "" if text is None else str(text).lower()

    try:
        parsed = urlparse(url)
        netloc = parsed.netloc.lower()
        path = parsed.path.lower()
        if ":" in netloc:
            netloc = netloc.split(":")[0]
    except Exception:
        netloc = ""
        path = ""

    categories = []

    # 1. dominio fijo
    if any(netloc == d or netloc.endswith("." + d) for d in SOCIAL_DOMAIN):
        categories.append("social")
    if any(netloc == d or netloc.endswith("." + d) for d in APP_DOMAIN):
        categories.append("app_store")
    if any(netloc == d or netloc.endswith("." + d) for d in PROPERTY_DOMAIN):
        categories.append("property")
    if any(netloc == d or netloc.endswith("." + d) for d in LEGAL_DOMAIN):
        categories.append("legal")
    if any(netloc == d or netloc.endswith("." + d) for d in TO_BE_IGNORED):
        categories.append("ignore")
    if any(netloc == d or netloc.endswith("." + d) for d in MULTIMEDIA_DOMAIN):
        categories.append("multimedia")
    if netloc == YOUTUBE_DOMAIN:
        if path.startswith("/@"):
            categories.append("youtube_profile")
        elif path.startswith("/watch"):
            categories.append("youtube_video")
        else:
            categories.append("youtube_other")

    # 2. keywords URL/text
    if any(k in url for k in NEWS_KEYWORDS) or any(k in text for k in NEWS_KEYWORDS):
        categories.append("news")
    if any(k in url for k in ONCHAIN_KEYWORDS) or any(k in text for k in ONCHAIN_KEYWORDS):
        categories.append("on_chain")
    if any(k in url for k in INVEST_KEYWORDS) or any(k in text for k in INVEST_KEYWORDS):
        categories.append("investment")

    # 3. fallback
    if len(categories) == 0:
        categories.append("other")

    return categories


In [50]:
df_pages["category"] = df_pages.apply(
    lambda row: categorize_url_multilabel(row.get("url", ""), row.get("text", "")),
    axis=1
)

print(df_pages["category"].value_counts())


category
[other]                                 21617
[news, investment]                      19204
[news]                                  10957
[investment]                            10225
[social]                                 2457
[news, on_chain, investment]             1480
[social, investment]                      919
[social, news]                            792
[on_chain]                                641
[social, news, investment]                627
[youtube_other]                           525
[news, on_chain]                          326
[youtube_video]                           156
[app_store]                               100
[on_chain, investment]                     58
[ignore]                                   55
[app_store, investment]                    40
[youtube_profile]                          23
[social, on_chain]                         15
[social, news, on_chain]                   12
[social, news, on_chain, investment]       10
[youtube_other, on_chain]

In [56]:
# Revisión de Clusters por Muestra Aleatoria

# excluir 'other' de la muestra
df_filtered = df_pages[~df_pages["category"].apply(lambda cats: "other" in cats)]

# tomar 20 muestras aleatorias
sample = df_filtered.sample(20)[["url", "category"]]

# mostrar con columna completa de texto si quieres
pd.set_option("display.max_colwidth", None)
sample


Unnamed: 0,url,category
51142,https://help.leoneinvestments.it/category/tassazione/feed,[investment]
8055,https://crowdbase.eu/en/glossary/cash-flow-statement,"[news, investment]"
23764,https://sobre.inco.vc/investindo-pela-inco?hsLang=pt-br#restri%C3%A7%C3%B5es-para-investimentos,[investment]
14434,https://twitter.com/intent/tweet?url=https%3A%2F%2Fassetmonk.com%2Farticles%2Ffixed-income%2Fhow-to-invest-10-lakhs-rupees-for-monthly-income%2F,"[social, investment]"
32294,https://blog.100ladrillos.com/wp-json/wp/v2/posts/5244,[news]
50161,https://it.october.eu/prestatori/investitori-istituzionali,"[news, investment]"
57687,https://landlordinvest.com/blog/landlordinvest-launches-secondary-market,"[news, investment]"
4279,https://facebook.com/sharer/sharer.php?u=https://www.shojin.co.uk/insights/news/first-asian-real-estate-investment-for-shojin,"[social, news, investment]"
28110,https://blog.hurst.capital/blog/o-que-e-criptomoedas-e-como-funciona,"[news, investment]"
55173,https://support.mozzeno.com/nl/support/solutions/articles/7000070062-hoe-vraag-ik-een-terugkoop-van-notes-aan-,"[news, investment]"
