# Companies Queries
#### Funciones y ejemplos para extraer data desde modelo en MongoDB

## Conectarse con la DB y datos de ejemplo

In [None]:
from mongo import get_db

# Connect to MongoDB through get_db() (imported from `mongo`).
db = get_db()

# Select the "companies" collection.
companies = db["companies"]

# Example inputs used as arguments by the query calls in this notebook.
slug = "wbuild"
datasource_url = "https://wbuild.io"

## Slugs
#### Funciones para traer slugs y verificar duplicados

In [None]:
from companies_querys import (
    get_all_slugs, 
    get_unique_slugs, 
    get_repeated_slugs
    )

# get_all_slugs(companies, include_empty: bool = False) -> List[str]
# Returns every slug in the collection; may include repeats.
slugs = get_all_slugs(companies)
# get_unique_slugs(companies, include_empty: bool = False) -> List[str]
# Returns unique slugs using distinct.
slugs_unique = get_unique_slugs(companies)
# get_repeated_slugs(companies, include_empty: bool = False) -> List[Dict[str, Any]]
# Returns repeated slugs with their count, sorted by count descending.
slugs_repetidos = get_repeated_slugs(companies)

In [None]:
# Show each slug listing followed by its size; a blank line separates
# consecutive listings (none after the last one).
_slug_reports = (
    ("get_all_slugs (incluye repetidos)", slugs),
    ("get_unique_slugs (unicos)", slugs_unique),
    ("get_repeated_slugs (slugs repetidos)", slugs_repetidos),
)
for _position, (_title, _values) in enumerate(_slug_reports):
    if _position > 0:
        print("")
    print(_title)
    print(_values)
    print(len(_values))

## PrimaryDomain
#### Funciones para traer primaryDomain de companies y verificar duplicados

In [None]:
from companies_querys import (
    get_all_primary_domains,
    get_unique_primary_domains,
    get_repeated_primary_domains,
)

# get_all_primary_domains(companies, include_empty: bool = False) -> List[str]
# Returns every primaryDomain; may include repeats.
primary_domains = get_all_primary_domains(companies)
# get_unique_primary_domains(companies, include_empty: bool = False) -> List[str]
# Returns unique primaryDomain values using distinct.
primary_domains_unique = get_unique_primary_domains(companies)
# get_repeated_primary_domains(companies, include_empty: bool = False) -> List[Dict[str, Any]]
# Returns repeated primaryDomain values with their count, sorted by count descending.
primary_domains_repetidos = get_repeated_primary_domains(companies)


In [None]:
# Show each primaryDomain listing followed by its size; a blank line
# separates consecutive listings (none after the last one).
_domain_reports = (
    ("get_all_primary_domains (incluye repetidos)", primary_domains),
    ("get_unique_primary_domains (unicos)", primary_domains_unique),
    ("get_repeated_primary_domains (primaryDomain repetidos)", primary_domains_repetidos),
)
for _position, (_title, _values) in enumerate(_domain_reports):
    if _position > 0:
        print("")
    print(_title)
    print(_values)
    print(len(_values))

## DataSources
#### Funciones para extraer data general y especifica del dataSource del modelo

### URLs: Extracción y verificación de las urls en dataSource

#### get_unique_datasource_urls()

In [None]:
from companies_querys import get_unique_datasource_urls

# get_unique_datasource_urls(companies, slug: str) -> List[str]
# Returns the unique urls found in dataSources.url for the company.
datasource_urls_unique = get_unique_datasource_urls(companies, slug)

In [None]:
# Heading, the unique urls, and how many there are: one item per output line.
print(
    "get_unique_datasource_urls (urls unicas en dataSources.url)",
    datasource_urls_unique,
    len(datasource_urls_unique),
    sep="\n",
)

#### get_repeated_datasource_urls()

In [None]:
from companies_querys import get_repeated_datasource_urls

# get_repeated_datasource_urls(companies, slug: str) -> List[Dict[str, Any]]
# Returns the urls repeated in dataSources.url together with their count.
datasource_urls_repeated = get_repeated_datasource_urls(companies, slug)

In [None]:
# Heading, the repeated urls, and how many there are: one item per output line.
print(
    "get_repeated_datasource_urls (urls repetidas en dataSources.url con count)",
    datasource_urls_repeated,
    len(datasource_urls_repeated),
    sep="\n",
)

#### unique_company_urls_from_primary_domain()

In [None]:
from companies_querys import unique_company_urls_from_primary_domain

# unique_company_urls_from_primary_domain(companies, slug: str, mode: str = "loose") -> List[str]
# Unique dataSources URLs that match primaryDomain; `mode` selects the matching rule.
urls_loose = unique_company_urls_from_primary_domain(companies, slug, mode="loose")  # loose matching
urls_strict = unique_company_urls_from_primary_domain(companies, slug, mode="strict")  # strict matching


In [None]:
# Show the URLs matched under each mode, each preceded by a description of
# the mode and followed by the number of URLs found.
print("unique_company_urls_from_primary_domain")
_mode_reports = (
    (
        "mode=loose Incluye primaryDomain y subdominios. Tambien acepta casos tipo ejemplo.algo.com si contiene el label base del primaryDomain",
        urls_loose,
    ),
    (
        "mode=strict Solo incluye primaryDomain y subdominios directos. No acepta casos tipo ejemplo.algo.com si no es subdominio del primaryDomain",
        urls_strict,
    ),
)
for _position, (_description, _urls) in enumerate(_mode_reports):
    if _position > 0:
        print("")
    print(_description)
    print(_urls)
    print(len(_urls))

### Links: Extracción de links de navegación dentro de urls del dataSource

#### get_links_from_company_datasource()

In [None]:
from companies_querys import get_links_from_company_datasource

# get_links_from_company_datasource(companies, slug: str, datasource_url: str, sections=None) -> List[str]
# Returns unique links from dataSources.links for a specific url within a company.
# `sections` controls which parts the links are extracted from:
# - None: head header main footer
# - "main": only that section
# - ["header", "main"]: a combination of sections

links_all = get_links_from_company_datasource(companies, slug, datasource_url)  # sections=None
links_main = get_links_from_company_datasource(companies, slug, datasource_url, sections="main")
links_footer = get_links_from_company_datasource(companies, slug, datasource_url, sections="footer")
links_combo = get_links_from_company_datasource(companies, slug, datasource_url, sections=[ "main", "footer"])

In [None]:
# Show the links obtained for each `sections` setting, each followed by
# the number of links found.
print("get_links_from_company_datasource sections=None (head header main footer)")
print(links_all)
print(len(links_all))
print("")

print("get_links_from_company_datasource sections=main (solo main)")
print(links_main)
print(len(links_main))
print("")

# The label must match the query that produced links_footer, which was
# fetched with sections="footer" (not "head").
print("get_links_from_company_datasource sections=footer (solo footer)")
print(links_footer)
print(len(links_footer))
print("")

print("get_links_from_company_datasource sections=[ main, footer] (combinatoria)")
print(links_combo)
print(len(links_combo))

### Texts: Extracción de textos dentro de urls del dataSource

#### get_texts_from_company_datasource()

In [None]:
from companies_querys import get_texts_from_company_datasource

# get_texts_from_company_datasource(companies, slug: str, datasource_url: str, sections=None, dedupe: bool = True) -> List[str]
# Returns texts from dataSources.texts for a specific url within a company.
# `sections` controls which parts the texts are extracted from:
# - None: head header main footer
# - "main": only that section
# - ["header", "main"]: a combination of sections
# `dedupe` controls whether the output is deduplicated:
# - True: unique list without duplicates
# - False: list as-is, without deduplication

texts_all = get_texts_from_company_datasource(companies, slug, datasource_url)  # sections=None dedupe=True
texts_main = get_texts_from_company_datasource(companies, slug, datasource_url, sections="main")  # dedupe=True
texts_header = get_texts_from_company_datasource(companies, slug, datasource_url, sections="header")  # dedupe=True
texts_combo_no_dedupe = get_texts_from_company_datasource(companies, slug, datasource_url, sections=["header", "main", "footer"], dedupe=False)


In [None]:
# Show the texts obtained for each sections/dedupe setting, then a "total"
# line with the number of texts; a blank line separates consecutive
# listings (none after the last one).
_text_reports = (
    (
        "get_texts_from_company_datasource sections=None dedupe=True (head header main footer)",
        texts_all,
    ),
    (
        "get_texts_from_company_datasource sections=main dedupe=True (solo main)",
        texts_main,
    ),
    (
        "get_texts_from_company_datasource sections=header dedupe=True (solo header)",
        texts_header,
    ),
    (
        "get_texts_from_company_datasource sections=[header, main, footer] dedupe=False (combinatoria sin deduplicar)",
        texts_combo_no_dedupe,
    ),
)
for _position, (_title, _texts) in enumerate(_text_reports):
    if _position > 0:
        print("")
    print(_title)
    print(_texts)
    print("total")
    print(len(_texts))

In [None]:
# NOTE(review): this looks like an unfinished scratch/template cell — confirm intent.
# Prints an empty line.
print()
# len() is called with no argument, which raises TypeError at runtime.
# TODO: pass the collection whose size should be reported.
print("Cantidad: " + str(len()))