In [1]:
import numpy as np
from scipy.spatial.distance import jaccard, cosine
from bs4 import BeautifulSoup
import re
import requests
import openai
import os
import json
from datetime import datetime
import pandas as pd
import tqdm
from matplotlib import pyplot as plt
import glob as glob

from nltk.corpus import wordnet as wn
from nltk import download
from nltk.metrics.distance import edit_distance
#download('wordnet')

In [2]:
def read_json_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return json.load(file)
def read_plain_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

In [3]:
ranking_files = [(file, datetime.strptime(file.split("_")[-1].split(".")[0], "%d%m%Y") )for file in glob.glob("../data/SCImago*Spanish_*.xlsx")]
ranking_files_sort = sorted(ranking_files, 
                            key=lambda x: x[1]
                            )
most_recent_file = ranking_files_sort[-1][0]
ranking_files, ranking_files_sort, most_recent_file

([('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0))],
 [('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0))],
 '../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx')

In [4]:
n_examples = 60
rankings_file_path = os.path.join("..\data", most_recent_file)
media_ranks = pd.read_excel(rankings_file_path).sort_values("Global_rank")
media_ranks["Media"] = media_ranks["Media"].str.strip()
media_ranks["Domain"] = media_ranks["Domain"].str.strip()
media_ranks[["Media", "Domain", "Overall"]].head(n_examples) \
                                           .to_csv("../data/spain_media name_to_url.txt", 
                                                   index=False, 
                                                   header=False,
                                                   sep=";")
media_ranks.head(15)

Unnamed: 0,Media,Domain,Country,Region,Language,Global_rank,Overall
190,El País,elpais.com,Spain,Western Europe,Spanish,7,84.5
189,ABC,abc.es,Spain,Western Europe,Spanish,28,78.25
187,La Vanguardia,lavanguardia.com,Spain,Western Europe,Spanish/Catalan,43,76.75
188,El Español,elespanol.com,Spain,Western Europe,Spanish,43,76.75
186,El Mundo,elmundo.es,Spain,Western Europe,Spanish,60,74.75
185,El Periódico de Catalunya,elperiodico.com,Spain,Western Europe,Spanish,89,72.75
183,La Razón,larazon.es,Spain,Western Europe,Spanish,115,71.25
184,Europa Press,europapress.es,Spain,Western Europe,Spanish,115,71.25
182,20 Minutos,20minutos.es,Spain,Western Europe,Spanish,124,70.75
181,El Confidencial,elconfidencial.com,Spain,Western Europe,Spanish,151,69.25


In [61]:
urls = [
    "https://www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html",
    "https://www.elperiodico.com/es/politics/",
    "https://www.elperiodico.com/es/politics",
    "https://www.elperiodico.com/es/",
    "https://www.elperiodico.com/es",
    "a./politics/province_1/",
    "/politics/province",
    "aa.a./politics/a",
    "/politics",
    "politics/a",
    "https://especiales.publico.es/opinion/",
    "https://www.elperiodico.com/es/sant-cugat/", 
    "https://www.publico.es/opinion#analytics-cabecera-comprimida:submenu", 
    "https://www.lavozdigital.es/micuenta/intereses/",
    "https://www.libertaddigital.com/espana/",
    "https://www.libertaddigital.com/internacional/",
    "/galicia/", 
    "https://www.larazon.es/gente/poder/"
    "://www.libertaddigital.com/internacional/",
    "//www.libertaddigital.com/internacional/",
    "/www.lavozdigital.es/micuenta/intereses/",
]
regex = "^(?:https://www[.]|https://|//www[.]|//|/www[.]|/[0-9a-zA-Z]*[.][0-9a-zA-Z]*|[0-9a-zA-Z]*[.]{,1}[0-9a-zA-Z]*)[^/]*/([^/]*$|[^/]*/$)"
regex = r"^(?:\w*:?)\/{2}[^\/]+/[^\/]*/?$"
replacement = "https://www."
for i, url in enumerate(urls):
    url_mod = url.replace(replacement, "")
    print(f"- {i} -")
    s = re.search(regex, 
                  url)
    print(url)
    if s:
        print("\t", s)

    s = re.search(regex, 
                  url_mod)
    if s:
        print(url_mod, "\n\t", s)

- 0 -
https://www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html
- 1 -
https://www.elperiodico.com/es/politics/
- 2 -
https://www.elperiodico.com/es/politics
- 3 -
https://www.elperiodico.com/es/
	 <re.Match object; span=(0, 31), match='https://www.elperiodico.com/es/'>
- 4 -
https://www.elperiodico.com/es
	 <re.Match object; span=(0, 30), match='https://www.elperiodico.com/es'>
- 5 -
a./politics/province_1/
- 6 -
/politics/province
- 7 -
aa.a./politics/a
- 8 -
/politics
- 9 -
politics/a
- 10 -
https://especiales.publico.es/opinion/
	 <re.Match object; span=(0, 38), match='https://especiales.publico.es/opinion/'>
https://especiales.publico.es/opinion/ 
	 <re.Match object; span=(0, 38), match='https://especiales.publico.es/opinion/'>
- 11 -
https://www.elperiodico.com/es/sant-cugat/
- 12 -
https://www.publico.es/opinion#analytics-cabecera-comprimida:submenu
	 <re.Match object; span=(0, 68), match='https://www.publico.es/opinion#analytics-cabe

In [29]:
urls = [
    "https://www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html",
    "https://www.elperiodico.com/es/politics/",
    "https://www.elperiodico.com/es/politics",
    "https://www.elperiodico.com/es/",
    "https://www.elperiodico.com/es",
    "a./politics/province_1/",
    "/politics/province",
    "aa.a./politics/a",
    "/politics",
    "politics/a",
    "https://especiales.publico.es/opinion/",
    "https://www.elperiodico.com/es/sant-cugat/", 
    "https://www.publico.es/opinion#analytics-cabecera-comprimida:submenu", 
    "https://www.lavozdigital.es/micuenta/intereses/",
    "https://www.libertaddigital.com/espana/",
    "https://www.libertaddigital.com/internacional/",
    "/galicia/", 
    "www.larazon.es/gente/poder/",
    "especiales.publico.es/opinion/",
    "www.elperiodico.com/es/sant-cugat/", 
    "www.publico.es/opinion#analytics-cabecera-comprimida:submenu", 
    "www.lavozdigital.es/micuenta/intereses/",
    "www.libertaddigital.com/espana/",
    "www.libertaddigital.com/internacional/"
]
regex = r"^(?:https://www[.]|https://|//www[.]|//|/www[.]|/[0-9a-zA-Z]*[.][0-9a-zA-Z]*|[0-9a-zA-Z]*[.]{,1}[0-9a-zA-Z]*)[^/]*/([^/]*$|[^/]*/$)"
regex = r"^(?!(\w*:?/{1,2}))[^\/]+/[^\/]+\/?$"
replacement = "https://www."
for i, url in enumerate(urls):
    url_mod = url.replace(replacement, "")
    print(f"- {i} -")
    s = re.search(regex, 
                  url)
    if s:
        print(url, "\n\t", s)

    #s = re.search(regex, 
    #              url_mod)
    #if s:
    #    print("mod -", url_mod, "\n\t", s)

- 0 -
- 1 -
- 2 -
- 3 -
- 4 -
- 5 -
- 6 -
- 7 -
- 8 -
- 9 -
- 10 -
- 11 -
- 12 -
- 13 -
- 14 -
- 15 -
- 16 -
- 17 -
- 18 -
especiales.publico.es/opinion/ 
	 <re.Match object; span=(0, 30), match='especiales.publico.es/opinion/'>
- 19 -
- 20 -
www.publico.es/opinion#analytics-cabecera-comprimida:submenu 
	 <re.Match object; span=(0, 60), match='www.publico.es/opinion#analytics-cabecera-comprim>
- 21 -
- 22 -
www.libertaddigital.com/espana/ 
	 <re.Match object; span=(0, 31), match='www.libertaddigital.com/espana/'>
- 23 -
www.libertaddigital.com/internacional/ 
	 <re.Match object; span=(0, 38), match='www.libertaddigital.com/internacional/'>


In [72]:
urls = [
    "https://www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html",
    "https://www.elperiodico.com/es/politics/",
    "https://www.elperiodico.com/es/politics",
    "https://www.elperiodico.com/es/",
    "https://www.elperiodico.com/es",
    "a./politics/province_1/",
    "/politics/province",
    "aa.a./politics/a",
    "/politics",
    "politics/a",
    "https://especiales.publico.es/opinion/",
    "https://www.elperiodico.com/es/sant-cugat/", 
    "https://www.publico.es/opinion#analytics-cabecera-comprimida:submenu", 
    "https://www.lavozdigital.es/micuenta/intereses/",
    "https://www.libertaddigital.com/espana/",
    "https://www.libertaddigital.com/internacional/",
    "/galicia/", 
    "politica/", 
    "/leisures", 
    "/galicia/vigo",
    "/galicia/potevedra-viva",
    "/galicia/potevedra-cine/",
    "/galicia/potevedra-cine/",
    "/galicia/potevedra-cine/drama",
    "/galicia/potevedra-cine/drama/",
    "www.larazon.es/gente/poder/",
    "especiales.publico.es/opinion/",
    "www.elperiodico.com/es/sant-cugat/", 
    "www.publico.es/opinion#analytics-cabecera-comprimida:submenu", 
    "www.lavozdigital.es/micuenta/intereses/",
    "www.libertaddigital.com/espana/",
    "www.libertaddigital.com/internacional/"
]
regex = r"^\/[^\/-]+-?[^\/-]*\/?$"
replacement = "https://www."
for i, url in enumerate(urls):
    url_mod = url.replace(replacement, "")
    print(f"- {i} -")
    s = re.search(regex, 
                  url)
    if s:
        print(url, "\n\t", s)

    #s = re.search(regex, 
    #              url_mod)
    #if s:
    #    print("mod -", url_mod, "\n\t", s)

- 0 -
- 1 -
- 2 -
- 3 -
- 4 -
- 5 -
- 6 -
- 7 -
- 8 -
/politics 
	 <re.Match object; span=(0, 9), match='/politics'>
- 9 -
- 10 -
- 11 -
- 12 -
- 13 -
- 14 -
- 15 -
- 16 -
/galicia/ 
	 <re.Match object; span=(0, 9), match='/galicia/'>
- 17 -
- 18 -
/leisures 
	 <re.Match object; span=(0, 9), match='/leisures'>
- 19 -
- 20 -
- 21 -
- 22 -
- 23 -
- 24 -
- 25 -
- 26 -
- 27 -
- 28 -
- 29 -
- 30 -
- 31 -


In [75]:
url1.replace("https:", ""), \
url1.replace("https:", ""), \
url1.replace("https:", ""), \
url1.replace("https:", ""), \
url1.replace("https:", "")

('//www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html',
 '//www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html',
 '//www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html',
 '//www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html',
 '//www.elperiodico.com/es/economia/evolucion-liderazgo-empresa-aniversario-periodico-sh/index.html')

In [94]:
url = urls[-1]
url[:url.find("#")] + "/"

'https://www.publico.es/opinion/'

In [271]:
main_url = "https://lavozdegalicia.es/"
domain = main_url.replace("https://", "")
response = requests.get("https://lavozdegalicia.es/", 
                        headers=HEADERS, 
                        timeout=6.0)
parsed_html = BeautifulSoup(response.content, 
                            "html.parser")
a_tags = parsed_html.html.find_all(lambda t: t.name == "a")

In [272]:
urls = [x.attrs.get("href", 0) for x in a_tags]
urls

['/',
 '/',
 '/galicia/',
 '/localidades/',
 '/economia/',
 '/espana/',
 '/internacional/',
 '/opinion/',
 '/deportes/',
 '/sociedad/',
 '/cultura/',
 '/somosagro/',
 '/somosmar/',
 '/alsol/',
 '/coruna/',
 '/amarina/',
 '/arousa/',
 '/barbanza/',
 '/carballo/',
 '/deza/',
 '/ferrol/',
 '/lemos/',
 '/lugo/',
 '/ourense/',
 '/pontevedra/',
 '/santiago/',
 '/vigo/',
 '/noticia/internacional/2023/09/10/marruecos-entierra-muertos-medio-carrera-hallar-supervivientes/00031694358824221504644.htm',
 '/noticia/internacional/2023/09/10/marruecos-entierra-muertos-medio-carrera-hallar-supervivientes/00031694358824221504644.htm',
 'https://www.lavozdegalicia.es/noticia/internacional/2023/09/10/tafagajt-aldea-marroqui-entierra-cuatro-vecinos/00031694361767779608837.htm',
 'https://www.lavozdegalicia.es/noticia/opinion/2023/09/10/estrecho-gibraltar-concentracion-estructuras-tectonicas-letales/0003_202309G10P15991.htm',
 '/video/internacional/2023/09/10/devastacion-marruecos-vista-drone/0031_202309znR

In [306]:
for a_tag in a_tags:
    link = a_tag.attrs.get("href", "")
    # Skipping conditionals
    link_lower = link.lower().strip()
    link_domain = "/".join(link_lower.split("/")[:3] + [""])

    # Drop if 'garbage' nodes
    #if "#" in link_lower \
    if "@" in link_lower \
        or "php" in link_lower \
        or "javascript" in link_lower \
        or "mailto" in link_lower \
        or "cookie" in link_lower \
        or "feed" in link_lower \
        or "contact" in link_lower \
        or ("aviso" in link_lower and "legal" in link_lower) \
        or "inici" in link_lower \
        or "session" in link_lower \
        or "sesion" in link_lower \
        or "ads" in link_lower \
        or "publicidad" in link_lower \
        or "privacidad" in link_lower \
        or "condiciones" in link_lower \
        or "tags" in link_lower \
        or "premium" in link_lower \
        or "archiv" in link_lower \
        or "sorteo" in link_lower \
        or "loter" in link_lower \
        or "newsletter" in link_lower \
        or "podcast" in link_lower \
        or "logout" in link_lower \
        or "login" in link_lower \
        or "notifica" in link_lower \
        or "push" in link_lower \
        or "servicio" in link_lower \
        or "esquela" in link_lower \
        or "defunci" in link_lower \
        or "favorito" in link_lower \
        or "firma" in link_lower \
        or "suscri" in link_lower \
        or "subscrib" in link_lower \
        or "pasatiempo" in link_lower \
        or "compra" in link_lower \
        or "tienda" in link_lower \
        or "gráfico" in link_lower or "gráfico" in link_lower \
        or "humor" in link_lower \
        or "foto" in link_lower \
        or "grafic" in link_lower \
        or "galeria" in link_lower \
        or "galería" in link_lower \
        or "opinion" in link_lower or "opinión" in link_lower \
        or "hemeroteca" in link_lower \
        or "video" in link_lower or "vídeo" in link_lower \
        or "play" in link_lower \
        or "patrocin" in link_lower \
        or "autor" in link_lower \
        or "author" in link_lower \
        or "mapa" in link_lower \
        or "map" in link_lower \
        or "blog" in link_lower \
        or "index" in link_lower \
        or "condicion" in link_lower \
        or "docs" in link_lower \
        or re.compile("\d{3,}").search(link_lower):
        continue

    if not link_lower \
        or (link_lower == response.url) \
        or (link_lower == domain) \
        or (not link_lower.startswith("/") and not link_domain.replace("https://", "").startswith(domain)) \
        or ("?" in link_lower or "=" in link_lower or "%" in link_lower or "%" in link_lower):
        continue
    else:
        print("ok", link_lower, link_domain.replace("https://", ""), domain)
    link_search = re.search(regex, link_lower)
    if link_search:
        # Starts with double slash (protocol calling)
        if link_lower.startswith("//"):
            link_lower = "https:" + link_lower[1:]
        elif link_lower.startswith("/"):
            if "www." in link_lower:
                link_lower = "https:/" + link_lower
            else:
                link_lower = response.url[:-1] + link_lower
        # Starts with 'world wide web' prefix
        elif link_lower.startswith("www."):
            link_lower = "https://" + link_lower
        elif link_lower.startswith("//www."):
            link_lower = "https:" + link_lower
    else:
        continue


    if "#" in link_lower:
        link_lower = link_lower[:link_lower.find("#")] + "/"

ok / // lavozdegalicia.es/
ok / // lavozdegalicia.es/
ok /galicia/ /galicia// lavozdegalicia.es/
ok /localidades/ /localidades// lavozdegalicia.es/
ok /economia/ /economia// lavozdegalicia.es/
ok /espana/ /espana// lavozdegalicia.es/
ok /internacional/ /internacional// lavozdegalicia.es/
ok /deportes/ /deportes// lavozdegalicia.es/
ok /sociedad/ /sociedad// lavozdegalicia.es/
ok /cultura/ /cultura// lavozdegalicia.es/
ok /somosagro/ /somosagro// lavozdegalicia.es/
ok /somosmar/ /somosmar// lavozdegalicia.es/
ok /alsol/ /alsol// lavozdegalicia.es/
ok /coruna/ /coruna// lavozdegalicia.es/
ok /amarina/ /amarina// lavozdegalicia.es/
ok /arousa/ /arousa// lavozdegalicia.es/
ok /barbanza/ /barbanza// lavozdegalicia.es/
ok /carballo/ /carballo// lavozdegalicia.es/
ok /deza/ /deza// lavozdegalicia.es/
ok /ferrol/ /ferrol// lavozdegalicia.es/
ok /lemos/ /lemos// lavozdegalicia.es/
ok /lugo/ /lugo// lavozdegalicia.es/
ok /ourense/ /ourense// lavozdegalicia.es/
ok /pontevedra/ /pontevedra// lavoz

In [295]:
link_domain, domain

('/docs/condiciones_generales.htm/', 'lavozdegalicia.es/')

In [294]:
"https://lavozdegalicia.es/".replace("https://", "")

'lavozdegalicia.es/'

In [287]:
print(link_lower == response.url, (link_lower == domain),
      (not link_lower.startswith("/") and not link_domain.replace("https://", "").startswith(domain)),
      ("?" in link_lower or "=" in link_lower or "%" in link_lower or "%" in link_lower))

False False False False


In [275]:
link_domain

'/docs/condiciones_generales.htm/'

In [164]:
link_domain = "/".join(link_lower.split("/")[:3] + [""])
link_lower, link_domain

('', '/')

In [125]:
re.sub(r"^(?:\w*:?/{1,2})", 
       "", 
       "//www.elconfidencial.com/inmobiliario/")

'www.elconfidencial.com/inmobiliario/'

In [157]:
# regex with
regex_https = r"^(?:\w*:?)\/{2}[^\/]+/[^\/]*/?$"
regex_no_https = r"^(?!(\w*:?/{1,2}))[^\/]+/[^\/]+\/?$"
#regex_node = "^(?:https://[^/]+)/[^/]*/?"
regex_node =  r"^\/[^\/-]+-?[^\/-]*\/?$"
regex_sub = r"^(?:\w*:?/{1,2})"

MAX_N_HYPHENS = 1
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
file_name = "spain_media name_to_url.txt"
file_path = os.path.join("..", "data", file_name)
if os.path.exists(file_path):
    print("Reading file of urls of regions...")
    name_to_domain_urls = read_plain_domain_urls_file(file_path)
    all_sections = []
skip_head = True
for line in name_to_domain_urls:
    #if skip_head:
    #    skip_head = False
    #    continue
    _, domain, score = line.split(";")
    media = domain.replace("www.", "")
    domain = domain.strip()
    domain_url = "https://" + domain
    try:
        response = requests.get(domain_url, 
                                headers=HEADERS, 
                                timeout=6.0)
    except requests.exceptions.Timeout:
        # Handle the timeout exception
        print("The request timed out.", domain_url)
        continue
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions
        print(f"Retry request with 'https://www.' prefix", domain_url)
        domain_url = "https://www." + domain
        response = requests.get(domain_url, 
                                headers=HEADERS, 
                                timeout=6.0)
        print(f"ok ({response.status_code})")
        #continue
    domain = response.url.split("/")[2]
    domain_url = "/".join(response.url.split("/")[:3] + [""])
    parsed_html = BeautifulSoup(response.content, 
                                "html.parser")
    try:
        a_tags = parsed_html.html.find_all("a", 
                                           href=True)
    except Exception as e:
        print(e)
    valid_links_complete = []
    with open("Skipped urls.txt", "w") as file_skipped:
        for a_tag in a_tags:
            link = a_tag.attrs.get("href", "")
            if not re.search(regex_https, link) \
                and not re.search(regex_no_https, link) \
                and not re.search(regex_node, link):
                continue
            # Skipping conditionals
            link_lower = link.lower()
            link_domain = "/".join(link_lower.split("/")[:3] + [""])

            # Drop if 'garbage' nodes
            #if "#" in link_lower \
            if link_lower.count("-") > MAX_N_HYPHENS \
                or "@" in link_lower \
                or "php" in link_lower \
                or "javascript" in link_lower \
                or "mailto" in link_lower \
                or "cookie" in link_lower \
                or "feed" in link_lower \
                or "contact" in link_lower \
                or ("aviso" in link_lower and "legal" in link_lower) \
                or "inici" in link_lower \
                or "session" in link_lower \
                or "sesion" in link_lower \
                or "ads" in link_lower \
                or "publicidad" in link_lower \
                or "privacidad" in link_lower \
                or "condiciones" in link_lower \
                or "tags" in link_lower \
                or "premium" in link_lower \
                or "archiv" in link_lower \
                or "sorteo" in link_lower \
                or "loter" in link_lower \
                or "newsletter" in link_lower \
                or "podcast" in link_lower \
                or "logout" in link_lower \
                or "login" in link_lower \
                or "notifica" in link_lower \
                or "push" in link_lower \
                or "servicio" in link_lower \
                or "esquela" in link_lower \
                or "defunci" in link_lower \
                or "favorito" in link_lower \
                or "firma" in link_lower \
                or "suscri" in link_lower \
                or "subscrib" in link_lower \
                or "pasatiempo" in link_lower \
                or "compra" in link_lower \
                or "tienda" in link_lower \
                or "gráfico" in link_lower or "gráfico" in link_lower \
                or "humor" in link_lower \
                or "foto" in link_lower \
                or "grafic" in link_lower \
                or "galeria" in link_lower \
                or "galería" in link_lower \
                or "opinion" in link_lower or "opinión" in link_lower \
                or "hemeroteca" in link_lower \
                or "video" in link_lower or "vídeo" in link_lower \
                or "play" in link_lower \
                or "patrocin" in link_lower \
                or "autor" in link_lower \
                or "author" in link_lower \
                or "mapa" in link_lower \
                or "map" in link_lower \
                or "blog" in link_lower \
                or "index" in link_lower \
                or "mi-" in link_lower \
                or "obituario" in link_lower \
                or "visual" in link_lower \
                or re.compile("\d{3,}").search(link_lower):
                continue
                
            if not link_lower \
                or (link_lower == response.url) \
                or (link_lower == domain) \
                or (link_lower.startswith("//") and not re.sub(regex_sub, "", link_lower).startswith(domain)) \
                or (not link_lower.startswith("/") and not re.sub(regex_sub, "", link_lower).startswith(domain)) \
                or ("?" in link_lower or "=" in link_lower or "%" in link_lower or "%" in link_lower):
                continue
            
            # Starts with double slash (protocol calling)
            if link_lower.startswith("//"):
                link_lower = "https:" + link_lower
            elif link_lower.startswith("/"):
                if "www." in link_lower:
                    link_lower = "https:/" + link_lower
                else:
                    link_lower = response.url[:-1] + link_lower
            # Starts with 'world wide web' prefix
            elif link_lower.startswith("www."):
                link_lower = "https://" + link_lower
            elif link_lower.startswith("//www."):
                link_lower = "https:" + link_lower
            
            if "#" in link_lower:
                link_lower = link_lower[:link_lower.find("#")] + "/"
            valid_links_complete.append(link_lower)
        print(f"{media}, domain: {domain}, url: {response.url}, domain_url: {domain_url}:", len(valid_links_complete))
        all_sections.extend(valid_links_complete)
    valid_links_complete_unique = list(set(all_sections))
    
scores = []
for link in valid_links_complete_unique:
    for media in media_ranks["Domain"]:
        if media in link:
            score = media_ranks.loc[(media_ranks["Domain"] == media), "Overall"].item()
            scores.append(score)
            break
len(valid_links_complete_unique), len(scores)

Reading file of urls of regions...
elpais.com, domain: elpais.com, url: https://elpais.com/, domain_url: https://elpais.com/: 71
abc.es, domain: www.abc.es, url: https://www.abc.es/, domain_url: https://www.abc.es/: 42
lavanguardia.com, domain: www.lavanguardia.com, url: https://www.lavanguardia.com/, domain_url: https://www.lavanguardia.com/: 58
elespanol.com, domain: www.elespanol.com, url: https://www.elespanol.com/, domain_url: https://www.elespanol.com/: 88
elmundo.es, domain: www.elmundo.es, url: https://www.elmundo.es/, domain_url: https://www.elmundo.es/: 62
elperiodico.com, domain: www.elperiodico.com, url: https://www.elperiodico.com/es/, domain_url: https://www.elperiodico.com/: 3
Retry request with 'https://www.' prefix https://larazon.es
ok (200)
larazon.es, domain: www.larazon.es, url: https://www.larazon.es/, domain_url: https://www.larazon.es/: 56
europapress.es, domain: www.europapress.es, url: https://www.europapress.es/, domain_url: https://www.europapress.es/: 88
20

(1782, 1782)

#### Analysis of medias nodes

In [182]:
urls_serie = pd.Series(valid_links_complete_unique)
urls_serie[urls_serie.str.contains("www.elmundo")].head(60)

41             https://www.elmundo.es/internacional.html
113                    https://www.elmundo.es/papel.html
176                      https://www.elmundo.es/loc.html
211                    https://www.elmundo.es/salud.html
293                   https://www.elmundo.es/espana.html
393               https://www.elmundo.es/pais-vasco.html
445                https://www.elmundo.es/vida-sana.html
483                   https://www.elmundo.es/viajes.html
520                 https://www.elmundo.es/economia.html
575     https://www.elmundo.es/comunidad-valenciana.html
630                     https://www.elmundo.es/como.html
635                https://www.elmundo.es/andalucia.html
639                   https://www.elmundo.es/madrid.html
679                 https://www.elmundo.es/deportes.html
795               https://www.elmundo.es/television.html
863               https://www.elmundo.es/la-lectura.html
872                   https://www.elmundo.es/yodona.html
886                 https://www

In [142]:
urls_one_node = urls_serie[(urls_serie.str.split("/").apply(lambda x: [y for y in x if y]).str.len() == 3)]
urls_one_node

0       https://www.libertaddigital.com/internacional/
1                    https://www.laverdad.es/eltiempo/
2         https://www.elperiodicodearagon.com/cultura/
3          https://www.laopiniondezamora.es/benavente/
4                   https://www.diariosur.es/interior/
                             ...                      
1787                   https://www.eldia.es/la-laguna/
1788     https://www.elperiodicodearagon.com/historia/
1789            https://www.lasprovincias.es/valencia/
1790               https://www.canarias7.es/pantallas/
1791                     https://www.larazon.es/salud/
Length: 1786, dtype: object

In [231]:
x =     urls_serie.str.replace("/{0,2}$", "", 
                           regex=True) \
              .str.replace("([^\/][.][^\/]$)?", "",
                           regex=True) \º
              .str.extract(".*/{1}(?P<node>.*)", expand=True)

In [263]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

In [316]:
len(section_urls)

439

In [378]:
n_min_counts = 10
spanish_stemmer = SnowballStemmer('spanish')
section_urls = urls_serie.str.replace("/{0,2}$", "", 
                           regex=True) \
                         .str.replace("([^\/][.][^\/]$)?", "",
                           regex=True) \
                         .str.extract(".*/{1}(?P<node>.*)", 
                                      expand=True
                                     )
print(section_urls.shape)
section_urls["node"] = section_urls.node.str.replace("[.].*", 
                                                     "", 
                                                     regex=True)
section_urls["stemmed_node"] = section_urls["node"].apply(lambda x: spanish_stemmer.stem(x))
section_urls[["section", "domain"]] = urls_serie.str.extract("(?P<section>https?://(?P<domain>[^/]+/).*)")
nodes_count = section_urls["stemmed_node"].value_counts().rename("freq_node_score")
section_urls = section_urls.merge((nodes_count - nodes_count.min()) / (nodes_count.max() - nodes_count.min()), 
                                  how="inner",
                                  left_on="stemmed_node",
                                  right_index=True,
                                 )
domain_nodes_count = section_urls["domain"].value_counts().rename("freq_domain_score")
section_urls = section_urls.merge((domain_nodes_count - domain_nodes_count.min()) / (domain_nodes_count.max() - domain_nodes_count.min()), 
                                  how="inner",
                                  left_on="domain",
                                  right_index=True,
                                 )
nodes_count_more_n = nodes_count[nodes_count > n_min_counts]
frequent_sections = nodes_count_more_n.index.tolist()
section_urls.shape, frequent_sections, nodes_count_more_n

(1782, 1)


((1782, 6),
 ['deport',
  'economi',
  'socied',
  'cultur',
  'internacional',
  'motor',
  'rss',
  'tecnologi',
  'espan',
  'salud',
  'suces',
  'andaluci',
  'cienci',
  'medio-ambient',
  'nacional',
  'oci',
  'viv',
  'polit',
  'municipi',
  'plan',
  'pantall',
  'xlsemanal',
  'antropi',
  'tendencias21',
  'gent',
  'quienes-som',
  'lo-ultim',
  'eltiemp',
  'gastronomi',
  'viaj'],
 stemmed_node
 deport           59
 economi          56
 socied           50
 cultur           46
 internacional    45
 motor            41
 rss              28
 tecnologi        27
 espan            26
 salud            23
 suces            20
 andaluci         20
 cienci           18
 medio-ambient    17
 nacional         17
 oci              16
 viv              16
 polit            16
 municipi         16
 plan             15
 pantall          15
 xlsemanal        15
 antropi          15
 tendencias21     14
 gent             13
 quienes-som      13
 lo-ultim         12
 eltiemp          1

In [379]:
section_urls

Unnamed: 0,node,stemmed_node,section,domain,freq_node_score,freq_domain_score
0,internacional,internacional,https://www.libertaddigital.com/internacional/,www.libertaddigital.com/,0.758621,0.338028
1059,cultura,cultur,https://www.libertaddigital.com/cultura/,www.libertaddigital.com/,0.775862,0.338028
1428,deportes,deport,https://www.libertaddigital.com/deportes/,www.libertaddigital.com/,1.000000,0.338028
234,rss,rss,https://www.libertaddigital.com/rss/,www.libertaddigital.com/,0.465517,0.338028
1391,andalucia,andaluci,https://www.libertaddigital.com/andalucia/,www.libertaddigital.com/,0.327586,0.338028
...,...,...,...,...,...,...
1715,gl,gl,https://www.elprogreso.es/gl,www.elprogreso.es/,0.000000,0.042254
897,user,user,https://www.elconfidencialdigital.com/user,www.elconfidencialdigital.com/,0.034483,0.028169
1528,encuesta,encuest,https://www.elconfidencialdigital.com/encuesta,www.elconfidencialdigital.com/,0.051724,0.028169
1552,album,album,https://www.elconfidencialdigital.com/album,www.elconfidencialdigital.com/,0.051724,0.028169


In [380]:
section_urls.domain.value_counts()

domain
www.diariodesevilla.es/             72
www.diariodecadiz.es/               68
www.eldiario.es/                    51
www.europapress.es/                 45
lavozdegalicia.es/                  44
www.lasprovincias.es/               42
www.elcomercio.es/                  41
www.diariosur.es/                   40
okdiario.com/                       40
www.informacion.es/                 39
www.diariocordoba.com/              39
www.levante-emv.com/                39
www.diariovasco.com/                39
www.lne.es/                         39
www.elespanol.com/                  39
www.eldia.es/                       38
www.hoy.es/                         38
www.elperiodicodearagon.com/        37
www.elperiodicomediterraneo.com/    37
www.laverdad.es/                    37
sevilla.abc.es/                     37
www.farodevigo.es/                  37
www.abc.es/                         36
www.lavanguardia.com/               36
www.elnortedecastilla.es/           35
www.lavozdigital.e

In [381]:
filter_section_urls = section_urls["node"].apply(lambda node: any(section in node for section in frequent_sections))
section_urls = section_urls[filter_section_urls]
section_urls.loc[:, "temp_domain"] = section_urls.domain.str.replace("www.", "") \
                                                 .str.replace("/", "")
section_urls.dropna(inplace=True)
section_urls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section_urls.loc[:, "temp_domain"] = section_urls.domain.str.replace("www.", "") \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section_urls.dropna(inplace=True)


Unnamed: 0,node,stemmed_node,section,domain,freq_node_score,freq_domain_score,temp_domain
0,internacional,internacional,https://www.libertaddigital.com/internacional/,www.libertaddigital.com/,0.758621,0.338028,libertaddigital.com
1059,cultura,cultur,https://www.libertaddigital.com/cultura/,www.libertaddigital.com/,0.775862,0.338028,libertaddigital.com
1428,deportes,deport,https://www.libertaddigital.com/deportes/,www.libertaddigital.com/,1.000000,0.338028,libertaddigital.com
234,rss,rss,https://www.libertaddigital.com/rss/,www.libertaddigital.com/,0.465517,0.338028,libertaddigital.com
1391,andalucia,andaluci,https://www.libertaddigital.com/andalucia/,www.libertaddigital.com/,0.327586,0.338028,libertaddigital.com
...,...,...,...,...,...,...,...
1064,politica,polit,https://www.elplural.com/politica,www.elplural.com/,0.258621,0.239437,elplural.com
1441,quienes-somos,quienes-som,https://cronicaglobal.elespanol.com/quienes-so...,cronicaglobal.elespanol.com/,0.206897,0.169014,cronicaglobal.elespanol.com
803,politica,polit,https://cronicaglobal.elespanol.com/politica/,cronicaglobal.elespanol.com/,0.258621,0.169014,cronicaglobal.elespanol.com
417,primeras-planas,primeras-plan,https://cronicaglobal.elespanol.com/primeras-p...,cronicaglobal.elespanol.com/,0.000000,0.169014,cronicaglobal.elespanol.com


#### Common sections

In [414]:
columns_to_save = [
    "section",
    "domain", 
    "Overall",
    "freq_domain_score",
    "freq_node_score"
]
cols = ["domain_score", 
        "freq_domain_score", 
        "freq_node_score"]

sections_with_score = pd.merge(section_urls, 
                               media_ranks,
                               how="inner",
                               left_on="temp_domain",
                               right_on="Domain")[columns_to_save] \
                        .rename(columns={"Overall": "domain_score"}) 

sections_with_score["domain_score"] = (sections_with_score.domain_score - sections_with_score.domain_score.min()) / \
                                    (sections_with_score.domain_score.max() - sections_with_score.domain_score.min())
sections_with_score["score"] = sections_with_score[cols].pow(2).sum(axis=1).map(lambda x: np.sqrt(x))
sections_with_score.drop(cols, 
                         axis=1,
                         inplace=True
                        )
sections_without_score = pd.merge(section_urls, 
                                  media_ranks,
                                  how="right",
                                  left_on="temp_domain",
                                  right_on="Domain") \
                           .rename(columns={"Overall": "score"})
sections_with_score

Unnamed: 0,section,domain,score
0,https://www.libertaddigital.com/internacional/,www.libertaddigital.com/,0.925635
1,https://www.libertaddigital.com/cultura/,www.libertaddigital.com/,0.939818
2,https://www.libertaddigital.com/deportes/,www.libertaddigital.com/,1.131943
3,https://www.libertaddigital.com/rss/,www.libertaddigital.com/,0.705692
4,https://www.libertaddigital.com/andalucia/,www.libertaddigital.com/,0.623384
...,...,...,...
773,https://www.elplural.com/politica,www.elplural.com/,0.370126
774,https://cronicaglobal.elespanol.com/quienes-so...,cronicaglobal.elespanol.com/,0.452499
775,https://cronicaglobal.elespanol.com/politica/,cronicaglobal.elespanol.com/,0.478366
776,https://cronicaglobal.elespanol.com/primeras-p...,cronicaglobal.elespanol.com/,0.402430


In [403]:
import numpy as np

In [412]:
cols = ["domain_score", 
        "freq_domain_score", 
        "freq_node_score"]
%timeit sections_with_score[cols].apply(lambda x: np.linalg.norm(x), axis=1)
%timeit sections_with_score[cols].pow(2).sum(axis=1).map(lambda x: np.sqrt(x))

7.84 ms ± 32.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.77 ms ± 31.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [383]:
sections_with_score.value_counts("domain") 

domain
www.elcomercio.es/                  26
www.hoy.es/                         23
www.abc.es/                         23
www.lavozdigital.es/                22
www.laverdad.es/                    22
www.elnortedecastilla.es/           20
www.canarias7.es/                   20
www.elcorreo.com/                   20
www.diariodesevilla.es/             20
www.diariosur.es/                   19
sevilla.abc.es/                     19
www.diariocordoba.com/              19
www.diariodecadiz.es/               19
www.larioja.com/                    19
www.ideal.es/                       19
www.lasprovincias.es/               18
www.eldia.es/                       18
www.informacion.es/                 17
www.elperiodicodearagon.com/        17
www.larazon.es/                     17
www.diariovasco.com/                17
www.lne.es/                         17
www.laopiniondemalaga.es/           16
www.laopinioncoruna.es/             16
www.elperiodicomediterraneo.com/    16
www.farodevigo.es/

In [384]:
sections_with_score.domain.nunique()

55

In [359]:
not_found_cases = sections_without_score[sections_without_score.isnull().any(axis=1)]
print(not_found_cases)
not_found_cases.Domain.nunique(), not_found_cases.shape

    node stemmed_node section domain  freq_score temp_domain  \
69   NaN          NaN     NaN    NaN         NaN         NaN   
350  NaN          NaN     NaN    NaN         NaN         NaN   
653  NaN          NaN     NaN    NaN         NaN         NaN   
691  NaN          NaN     NaN    NaN         NaN         NaN   
766  NaN          NaN     NaN    NaN         NaN         NaN   
..   ...          ...     ...    ...         ...         ...   
909  NaN          NaN     NaN    NaN         NaN         NaN   
910  NaN          NaN     NaN    NaN         NaN         NaN   
911  NaN          NaN     NaN    NaN         NaN         NaN   
912  NaN          NaN     NaN    NaN         NaN         NaN   
913  NaN          NaN     NaN    NaN         NaN         NaN   

                            Media                          Domain Country  \
69      El Periódico de Catalunya                 elperiodico.com   Spain   
350        El Periódico de España                          epe.es   Spain   


(136, (136, 13))

### Save preprocess digital media urls to .csv file

In [415]:
files_section = glob.glob("../data/final_url_sections_v*.csv")
if len(files_section) > 0:
    version_n = max(int(x.split("_")[-1][1:-4]) for x in files_section) + 1
else:
    version_n = 0
sections_with_score.to_csv(f"../data/final_url_sections_v{version_n}.csv", 
                           index=False, 
                           header=True,
                           sep=";"
                          )
glob.glob("../data/final_url_sections_v*.csv")

['../data\\final_url_sections_v10.csv',
 '../data\\final_url_sections_v11.csv',
 '../data\\final_url_sections_v12.csv',
 '../data\\final_url_sections_v13.csv',
 '../data\\final_url_sections_v3.csv',
 '../data\\final_url_sections_v4.csv',
 '../data\\final_url_sections_v5.csv',
 '../data\\final_url_sections_v6.csv',
 '../data\\final_url_sections_v7.csv',
 '../data\\final_url_sections_v8.csv',
 '../data\\final_url_sections_v9.csv']

In [48]:
with open("nodes_08-09-2023.txt", "w") as f:
    f.write(nodes_count_more_2.index.dropna().str.strip().to_series().to_string(index=False))

In [322]:
nodes_count.iloc[:30]

deportes          58
economia          57
sociedad          55
salud             49
motor             47
internacional     47
cultura           39
tecnologia        38
ciencia           34
futbol            34
cine              31
gente             29
planes            29
sucesos           29
espana            29
gastronomia       27
andalucia         25
series            24
baloncesto        23
tv                22
politica          22
historia          21
medio-ambiente    20
ciclismo          18
madrid            18
ocio              17
viajes            16
television        16
educacion         16
tendencias21      16
Name: count, dtype: int64

### Word similarity analysis

In [14]:
words = media_nodes.tolist()
words

['cartas-al-director',
 'gradario',
 'economia',
 'wappissima',
 'series',
 'diego_j-_geniz',
 'deportes',
 'galerias',
 'deportes',
 'ocio',
 'mapaweb',
 'tintalibre',
 'carnaval',
 'mascotas',
 'agenda',
 'julia_alarcon_villanueva',
 'antropia',
 'motor',
 'campo-de-gibraltar',
 'costa',
 'lopd',
 'comunicacion',
 'television',
 'eldigitalcastillalamancha',
 'deportes',
 'luis_sanchez-molini',
 'user',
 'resultados_deportivos',
 'vital',
 'movil',
 'm-_h',
 'mundo',
 'ocio',
 'pantallas',
 'bocabierta',
 'tendencias21',
 'tecnologia',
 'mercados',
 'levante-ud',
 'ultima-hora',
 'alberto_perez_de_vargas',
 'television',
 'tiempo',
 'suplementotecnologico',
 'ciencia',
 'caceres',
 'obituarios',
 'sociedad',
 'motociclismo',
 'comunicados',
 'events',
 'mapaweb',
 'antropia',
 'provincia',
 'antropia',
 'encuesta',
 'libros-premios-madrid',
 'gl',
 'pantallas',
 'xlsemanal',
 'life',
 'carlos_navarro_antolin',
 'elandroidelibre',
 'ciencia',
 'medio-ambiente',
 'realmurcia',
 'tecnolo

In [166]:
processed = []
scores = {}
for w1 in tqdm.tqdm(words):
    processed.append(w1)
    for w2 in words:
        if w2 not in processed:
            scores[(w1, w2)] = edit_distance(w1, w2)
sorted_scores = sorted(scores.items(), key=lambda k: (k[0][0], k[1]), reverse=False)
sorted_scores = dict(sorted_scores)
sorted_scores

100%|████████████████████████████████████████████████████████████████████████████████| 722/722 [00:16<00:00, 44.00it/s]


{('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ribera-alta-del-ebro'): 80,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-carinena'): 81,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-belchite'): 83,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-borja'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ejea-y-sus-pueblos'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-valdejalon'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'agricultura-medio-am

In [67]:
dict(filter(lambda k: k[1] < 3, sorted_scores.items()))

{('alicante', 'alacanti'): 2,
 ('alsol', 'alcoy'): 2,
 ('amarina', 'marina'): 1,
 ('asturianu', 'asturianos'): 2,
 ('asturias', 'asturianu'): 2,
 ('asturias', 'asturianos'): 2,
 ('cadiz', 'cadizcf'): 2,
 ('cine', 'life'): 2,
 ('concurso', 'concursos'): 1,
 ('cultura', 'culturas'): 1,
 ('eltiempo', 'tiempo'): 2,
 ('es', 'yes'): 1,
 ('local', 'social'): 2,
 ('local', 'global'): 2,
 ('lugo', 'vigo'): 2,
 ('lugo', 'cdlugo'): 2,
 ('malaga', 'alava'): 2,
 ('marina', 'merida'): 2,
 ('motor', 'motogp'): 2,
 ('planeta', 'llanera'): 2,
 ('rocio', 'ocio'): 1,
 ('rocio', 'elrocio'): 2,
 ('sevilla', 'sevillafc'): 2,
 ('siero', 'sierra'): 2,
 ('social', 'epsocial'): 2,
 ('tendencias21', 'tendencias'): 2,
 ('viajar', 'viajes'): 2,
 ('viajes', 'virales'): 2,
 ('vida', 'vital'): 2,
 ('vida', 'elda'): 2,
 ('vigo', 'vida'): 2}

## News similarity

In [50]:
import sqlite3

conn = sqlite3.connect("../../db.sqlite3")
cursor = conn.cursor()

In [51]:
urls = cursor.execute("""
    SELECT url
        FROM news
"""
)

In [53]:
urls.fetchall()

[('https://cronicaglobal.elespanol.com/politica/20230907/erc-la-amnistia-formula-indepes-negociar-referendum/792670926_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/erc-remontada-el-cambio-de-junts-dialogo/792670881_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/la-generalitat-destina-de-tv3-catalunya-radio/792670940_0.html',),
 ('https://elpais.com/actualidad/noticias-del-dia/2023-09-08/las-cinco-noticias-clave-del-8-de-septiembre.html',),
 ('https://elpais.com/america-colombia/2023-09-08/la-bolsa-de-colombia-se-hunde-ante-la-amenaza-de-una-rebaja-crediticia.html',),
 ('https://elpais.com/america-colombia/2023-09-08/petro-tiene-al-enemigo-en-casa.html',),
 ('https://elpais.com/america-colombia/2023-09-08/un-sondeo-lo-dice-en-onlyfans-esta-el-futuro.html',),
 ('https://elpais.com/america-futura/2023-09-08/ricardo-mourinho-vicepresidente-del-banco-europeo-de-inversiones-latinoamerica-clama-por-accion-climatica.html',),
 ('https://elpais.com/ame