In [1]:
import numpy as np
from scipy.spatial.distance import jaccard, cosine
from bs4 import BeautifulSoup
import re
import requests
import openai
import os
import json
from datetime import datetime
import pandas as pd
import tqdm
from matplotlib import pyplot as plt
import glob as glob

from nltk.corpus import wordnet as wn
from nltk import download
from nltk.metrics.distance import edit_distance
from nltk.stem import SnowballStemmer
#download('wordnet')

In [2]:
def read_json_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return json.load(file)
def read_plain_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

In [3]:
ranking_files = [(file, datetime.strptime(file.split("_")[-1].split(".")[0], "%d%m%Y") )for file in glob.glob("../data/SCImago*Spanish_*.xlsx")]
ranking_files_sort = sorted(ranking_files, 
                            key=lambda x: x[1]
                            )
most_recent_file = ranking_files_sort[-1][0]
ranking_files, ranking_files_sort, most_recent_file

([('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0))],
 [('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0))],
 '../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx')

In [4]:
n_examples = 60
rankings_file_path = os.path.join("..\data", most_recent_file)
media_ranks = pd.read_excel(rankings_file_path).sort_values("Global_rank")
media_ranks["Media"] = media_ranks["Media"].str.strip()
media_ranks["Domain"] = media_ranks["Domain"].str.strip()
media_ranks[["Media", "Domain", "Overall"]].head(n_examples) \
                                           .to_csv("../data/spain_media name_to_url.txt", 
                                                   index=False, 
                                                   header=False,
                                                   sep=";")
media_ranks.head(15)

Unnamed: 0,Media,Domain,Country,Region,Language,Global_rank,Overall
190,El País,elpais.com,Spain,Western Europe,Spanish,7,84.5
189,ABC,abc.es,Spain,Western Europe,Spanish,28,78.25
187,La Vanguardia,lavanguardia.com,Spain,Western Europe,Spanish/Catalan,43,76.75
188,El Español,elespanol.com,Spain,Western Europe,Spanish,43,76.75
186,El Mundo,elmundo.es,Spain,Western Europe,Spanish,60,74.75
185,El Periódico de Catalunya,elperiodico.com,Spain,Western Europe,Spanish,89,72.75
183,La Razón,larazon.es,Spain,Western Europe,Spanish,115,71.25
184,Europa Press,europapress.es,Spain,Western Europe,Spanish,115,71.25
182,20 Minutos,20minutos.es,Spain,Western Europe,Spanish,124,70.75
181,El Confidencial,elconfidencial.com,Spain,Western Europe,Spanish,151,69.25


In [5]:
# regex with
regex_https = r"^(?:\w*:?)\/{2}[^\/]+/[^\/]*/?$"
regex_no_https = r"^(?!(\w*:?/{1,2}))[^\/]+/[^\/]+\/?$"
#regex_node = "^(?:https://[^/]+)/[^/]*/?"
regex_node =  r"^\/[^\/-]+-?[^\/-]*\/?$"
regex_sub = r"^(?:\w*:?/{1,2})"

MAX_N_HYPHENS = 1
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
file_name = "spain_media name_to_url.txt"
file_path = os.path.join("..", "data", file_name)
if os.path.exists(file_path):
    print("Reading file of urls of regions...")
    name_to_domain_urls = read_plain_domain_urls_file(file_path)
    all_sections = []
skip_head = True
for line in name_to_domain_urls:
    #if skip_head:
    #    skip_head = False
    #    continue
    _, domain, score = line.split(";")
    media = domain.replace("www.", "")
    domain = domain.strip()
    domain_url = "https://" + domain
    try:
        response = requests.get(domain_url, 
                                headers=HEADERS, 
                                timeout=6.0)
    except requests.exceptions.Timeout:
        # Handle the timeout exception
        print("The request timed out.", domain_url)
        continue
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions
        print(f"Retry request with 'https://www.' prefix", domain_url)
        domain_url = "https://www." + domain
        response = requests.get(domain_url, 
                                headers=HEADERS, 
                                timeout=6.0)
        print(f"ok ({response.status_code})")
        #continue
    domain = response.url.split("/")[2]
    domain_url = "/".join(response.url.split("/")[:3] + [""])
    parsed_html = BeautifulSoup(response.content, 
                                "html.parser")
    try:
        a_tags = parsed_html.html.find_all("a", 
                                           href=True)
    except Exception as e:
        print(e)
    valid_links_complete = []
    with open("Skipped urls.txt", "w") as file_skipped:
        for a_tag in a_tags:
            link = a_tag.attrs.get("href", "")
            if not re.search(regex_https, link) \
                and not re.search(regex_no_https, link) \
                and not re.search(regex_node, link):
                continue
            # Skipping conditionals
            link_lower = link.lower()
            link_domain = "/".join(link_lower.split("/")[:3] + [""])

            # Drop if 'garbage' nodes
            #if "#" in link_lower \
            if link_lower.count("-") > MAX_N_HYPHENS \
                or "@" in link_lower \
                or "php" in link_lower \
                or "javascript" in link_lower \
                or "mailto" in link_lower \
                or "cookie" in link_lower \
                or "feed" in link_lower \
                or "contact" in link_lower \
                or ("aviso" in link_lower and "legal" in link_lower) \
                or "inici" in link_lower \
                or "session" in link_lower \
                or "sesion" in link_lower \
                or "ads" in link_lower \
                or "publicidad" in link_lower \
                or "privacidad" in link_lower \
                or "condiciones" in link_lower \
                or "tags" in link_lower \
                or "premium" in link_lower \
                or "archiv" in link_lower \
                or "sorteo" in link_lower \
                or "loter" in link_lower \
                or "newsletter" in link_lower \
                or "podcast" in link_lower \
                or "logout" in link_lower \
                or "login" in link_lower \
                or "notifica" in link_lower \
                or "push" in link_lower \
                or "servicio" in link_lower \
                or "esquela" in link_lower \
                or "defunci" in link_lower \
                or "favorito" in link_lower \
                or "firma" in link_lower \
                or "suscri" in link_lower \
                or "subscrib" in link_lower \
                or "pasatiempo" in link_lower \
                or "compra" in link_lower \
                or "tienda" in link_lower \
                or "gráfico" in link_lower or "gráfico" in link_lower \
                or "humor" in link_lower \
                or "foto" in link_lower \
                or "grafic" in link_lower \
                or "galeria" in link_lower \
                or "galería" in link_lower \
                or "opinion" in link_lower or "opinión" in link_lower \
                or "hemeroteca" in link_lower \
                or "video" in link_lower or "vídeo" in link_lower \
                or "play" in link_lower \
                or "patrocin" in link_lower \
                or "autor" in link_lower \
                or "author" in link_lower \
                or "mapa" in link_lower \
                or "map" in link_lower \
                or "blog" in link_lower \
                or "index" in link_lower \
                or "mi-" in link_lower \
                or "obituario" in link_lower \
                or "visual" in link_lower \
                or "rss" in link_lower \
                or re.compile("\d{3,}").search(link_lower):
                continue
                
            if not link_lower \
                or (link_lower == response.url) \
                or (link_lower == domain) \
                or (link_lower.startswith("//") and not re.sub(regex_sub, "", link_lower).startswith(domain)) \
                or (not link_lower.startswith("/") and not re.sub(regex_sub, "", link_lower).startswith(domain)) \
                or ("?" in link_lower or "=" in link_lower or "%" in link_lower or "%" in link_lower):
                continue
            
            # Starts with double slash (protocol calling)
            if link_lower.startswith("//"):
                link_lower = "https:" + link_lower
            elif link_lower.startswith("/"):
                if "www." in link_lower:
                    link_lower = "https:/" + link_lower
                else:
                    link_lower = response.url[:-1] + link_lower
            # Starts with 'world wide web' prefix
            elif link_lower.startswith("www."):
                link_lower = "https://" + link_lower
            elif link_lower.startswith("//www."):
                link_lower = "https:" + link_lower
            
            
            if "#" in link_lower:
                link_lower = link_lower[:link_lower.find("#")] + "/"
            if not link_lower.endswith("/"):
                link_lower = link_lower + "/"
            valid_links_complete.append(link_lower)
        print(f"{media}, domain: {domain}, url: {response.url}, domain_url: {domain_url}:", len(valid_links_complete))
        all_sections.extend(valid_links_complete)
    valid_links_complete_unique = list(set(all_sections))
len(valid_links_complete_unique)

Reading file of urls of regions...
elpais.com, domain: elpais.com, url: https://elpais.com/, domain_url: https://elpais.com/: 72
abc.es, domain: www.abc.es, url: https://www.abc.es/, domain_url: https://www.abc.es/: 42
lavanguardia.com, domain: www.lavanguardia.com, url: https://www.lavanguardia.com/, domain_url: https://www.lavanguardia.com/: 57
elespanol.com, domain: www.elespanol.com, url: https://www.elespanol.com/, domain_url: https://www.elespanol.com/: 87
elmundo.es, domain: www.elmundo.es, url: https://www.elmundo.es/, domain_url: https://www.elmundo.es/: 58
elperiodico.com, domain: www.elperiodico.com, url: https://www.elperiodico.com/es/, domain_url: https://www.elperiodico.com/: 3
Retry request with 'https://www.' prefix https://larazon.es
ok (200)
larazon.es, domain: www.larazon.es, url: https://www.larazon.es/, domain_url: https://www.larazon.es/: 56
europapress.es, domain: www.europapress.es, url: https://www.europapress.es/, domain_url: https://www.europapress.es/: 87
20

1681

In [6]:
valid_links_complete_unique

['https://www.laopiniondezamora.es/sociedad/',
 'https://www.laverdad.es/economia/',
 'https://www.lne.es/oviedo/',
 'https://www.elnortedecastilla.es/somoscampo/',
 'https://www.diariodesevilla.es/con_cuchillo_y_tenedor/',
 'https://www.elcorreo.com/xlsemanal/',
 'https://www.diariodecadiz.es/vivir_en_cadiz/',
 'https://www.lne.es/siero/',
 'https://www.larazon.es/lifestyle/',
 'https://www.larazon.es/sucesos/',
 'https://www.elcomercio.es/pantallas/',
 'https://www.informacion.es/economia/',
 'https://www.ultimahora.es/comer-beber.html/',
 'https://www.huffingtonpost.es/tecnologia/',
 'https://www.elcorreo.com/gipuzkoa/',
 'https://www.abc.es/gente/',
 'https://www.huffingtonpost.es/pillalo/',
 'https://www.elespanol.com/series/',
 'https://www.eldiario.es/galicia/',
 'https://www.hoy.es/ciencia/',
 'https://www.larazon.es/medio-ambiente/',
 'https://www.elperiodicodearagon.com/real-zaragoza/',
 'https://lavozdegalicia.es/andarmiudino/',
 'https://www.lavanguardia.com/television/',
 

#### Analysis of medias nodes

In [7]:
urls_serie = pd.Series(valid_links_complete_unique)
urls_serie[urls_serie.str.contains("elpais")].head(60)

116     https://elpais.com/america-colombia/
158              https://elpais.com/ciencia/
173             https://elpais.com/deportes/
193              https://elpais.com/ajedrez/
242           https://elpais.com/especiales/
246           https://elpais.com/escaparate/
255                https://elpais.com/chile/
308              https://elpais.com/quadern/
320           https://elpais.com/television/
380        https://elpais.com/internacional/
419              https://elpais.com/cultura/
443             https://elpais.com/sociedad/
492       https://elpais.com/planeta-futuro/
497           https://elpais.com/actualidad/
512               https://elpais.com/espana/
580              https://elpais.com/america/
673               https://elpais.com/mexico/
817                https://elpais.com/ideas/
820               https://elpais.com/juegos/
824       https://elpais.com/america-futura/
847          https://elpais.com/gastronomia/
873           https://elpais.com/tecnologia/
875       

In [8]:
urls_one_node = urls_serie[(urls_serie.str.split("/").apply(lambda x: [y for y in x if y]).str.len() == 3)]
urls_one_node

0              https://www.laopiniondezamora.es/sociedad/
1                       https://www.laverdad.es/economia/
2                              https://www.lne.es/oviedo/
3            https://www.elnortedecastilla.es/somoscampo/
4       https://www.diariodesevilla.es/con_cuchillo_y_...
                              ...                        
1676                    https://www.diariodeleon.es/user/
1677       https://www.elperiodicoextremadura.com/motogp/
1678                   https://www.diariosur.es/eltiempo/
1679                         https://www.abc.es/eltiempo/
1680         https://www.elnortedecastilla.es/valladolid/
Length: 1675, dtype: object

In [9]:
n_min_counts = 10
spanish_stemmer = SnowballStemmer('spanish')
section_urls = urls_serie.str.replace("/{0,2}$", "", 
                           regex=True) \
                         .str.replace("([^\/][.][^\/]$)?", "",
                           regex=True) \
                         .str.extract(".*/{1}(?P<node>.*)", 
                                      expand=True
                                     )
print(section_urls.shape)
section_urls["node"] = section_urls.node.str.replace("[.].*", 
                                                     "", 
                                                     regex=True)
section_urls["stemmed_node"] = section_urls["node"].apply(lambda x: spanish_stemmer.stem(x))
section_urls[["section", "domain"]] = urls_serie.str.extract("(?P<section>https?://(?P<domain>[^/]+/).*)")
nodes_count = section_urls["stemmed_node"].value_counts().rename("freq_node_score")
section_urls = section_urls.merge((nodes_count - nodes_count.min()) / (nodes_count.max() - nodes_count.min()), 
                                  how="inner",
                                  left_on="stemmed_node",
                                  right_index=True,
                                 )
domain_nodes_count = section_urls["domain"].value_counts().rename("freq_domain_score")
section_urls = section_urls.merge((domain_nodes_count - domain_nodes_count.min()) / (domain_nodes_count.max() - domain_nodes_count.min()), 
                                  how="inner",
                                  left_on="domain",
                                  right_index=True,
                                 )
nodes_count_more_n = nodes_count[nodes_count > n_min_counts]
frequent_sections = nodes_count_more_n.index.tolist()
section_urls.shape, frequent_sections, nodes_count_more_n

(1681, 1)


((1681, 6),
 ['deport',
  'economi',
  'socied',
  'cultur',
  'motor',
  'internacional',
  'espan',
  'tecnologi',
  'salud',
  'suces',
  'cienci',
  'nacional',
  'medio-ambient',
  'viv',
  'oci',
  'andaluci',
  'municipi',
  'xlsemanal',
  'antropi',
  'tendencias21',
  'polit',
  'quienes-som',
  'lo-ultim',
  'pantall',
  'plan',
  'eltiemp',
  'gent',
  'gastronomi',
  'ultima-hor'],
 stemmed_node
 deport           52
 economi          49
 socied           43
 cultur           43
 motor            40
 internacional    39
 espan            24
 tecnologi        24
 salud            21
 suces            20
 cienci           18
 nacional         17
 medio-ambient    17
 viv              16
 oci              16
 andaluci         16
 municipi         16
 xlsemanal        15
 antropi          15
 tendencias21     14
 polit            13
 quienes-som      13
 lo-ultim         12
 pantall          12
 plan             12
 eltiemp          12
 gent             12
 gastronomi       12
 

In [10]:
section_urls

Unnamed: 0,node,stemmed_node,section,domain,freq_node_score,freq_domain_score
0,sociedad,socied,https://www.laopiniondezamora.es/sociedad/,www.laopiniondezamora.es/,0.823529,0.397059
490,economia,economi,https://www.laopiniondezamora.es/economia/,www.laopiniondezamora.es/,0.941176,0.397059
1057,sucesos,suces,https://www.laopiniondezamora.es/sucesos/,www.laopiniondezamora.es/,0.372549,0.397059
1140,medio-ambiente,medio-ambient,https://www.laopiniondezamora.es/medio-ambiente/,www.laopiniondezamora.es/,0.313725,0.397059
1280,motor,motor,https://www.laopiniondezamora.es/motor/,www.laopiniondezamora.es/,0.764706,0.397059
...,...,...,...,...,...,...
1238,cronica-directo,cronica-direct,https://cronicaglobal.elespanol.com/cronica-di...,cronicaglobal.elespanol.com/,0.000000,0.161765
1654,pensamiento,pensamient,https://cronicaglobal.elespanol.com/pensamiento/,cronicaglobal.elespanol.com/,0.000000,0.161765
744,es,es,https://www.epe.es/es/es/,www.epe.es/,0.019608,0.000000
960,es,es,https://www.elperiodico.com/es/es/,www.elperiodico.com/,0.019608,0.014706


In [11]:
section_urls.domain.value_counts()

domain
www.diariodesevilla.es/             69
www.diariodecadiz.es/               67
www.eldiario.es/                    49
www.europapress.es/                 44
lavozdegalicia.es/                  43
www.lasprovincias.es/               41
okdiario.com/                       40
www.elespanol.com/                  39
www.diariosur.es/                   38
www.levante-emv.com/                38
www.eldia.es/                       38
www.informacion.es/                 36
www.lne.es/                         36
www.diariovasco.com/                36
www.laverdad.es/                    36
www.diariocordoba.com/              36
www.lavanguardia.com/               35
www.elperiodicomediterraneo.com/    35
www.farodevigo.es/                  35
www.elperiodicodearagon.com/        34
www.elcomercio.es/                  34
www.larazon.es/                     34
www.20minutos.es/                   33
sevilla.abc.es/                     33
elpais.com/                         32
www.hoy.es/       

In [12]:
nodes_count_more_n

stemmed_node
deport           52
economi          49
socied           43
cultur           43
motor            40
internacional    39
espan            24
tecnologi        24
salud            21
suces            20
cienci           18
nacional         17
medio-ambient    17
viv              16
oci              16
andaluci         16
municipi         16
xlsemanal        15
antropi          15
tendencias21     14
polit            13
quienes-som      13
lo-ultim         12
pantall          12
plan             12
eltiemp          12
gent             12
gastronomi       12
ultima-hor       11
Name: freq_node_score, dtype: int64

In [13]:
section_urls["node"].str.replace("[.](.*)", "", regex=True)

0              sociedad
490            economia
1057            sucesos
1140     medio-ambiente
1280              motor
             ...       
1238    cronica-directo
1654        pensamiento
744                  es
960                  es
1229              cuore
Name: node, Length: 1681, dtype: object

In [14]:
section_urls["node"].str.replace("[.](.*)", "", regex=True) 

0              sociedad
490            economia
1057            sucesos
1140     medio-ambiente
1280              motor
             ...       
1238    cronica-directo
1654        pensamiento
744                  es
960                  es
1229              cuore
Name: node, Length: 1681, dtype: object

In [15]:
section_urls[section_urls["node"].str.contains("www")]

Unnamed: 0,node,stemmed_node,section,domain,freq_node_score,freq_domain_score
778,www,www,https://www.publico.es//,www.publico.es/,0.039216,0.161765
935,www,www,https://www.elconfidencial.com/,www.elconfidencial.com/,0.039216,0.279412
456,www,www,https://www.periodistadigital.com/,www.periodistadigital.com/,0.039216,0.441176


In [16]:
filter_section_urls = section_urls["node"].apply(lambda node: any(section in node for section in frequent_sections))
section_urls = section_urls[filter_section_urls]
section_urls.loc[:, "temp_domain"] = section_urls.domain.str.replace("www.", "") \
                                                 .str.replace("/", "")
section_urls.dropna(inplace=True)
section_urls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section_urls.loc[:, "temp_domain"] = section_urls.domain.str.replace("www.", "") \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section_urls.dropna(inplace=True)


Unnamed: 0,node,stemmed_node,section,domain,freq_node_score,freq_domain_score,temp_domain
0,sociedad,socied,https://www.laopiniondezamora.es/sociedad/,www.laopiniondezamora.es/,0.823529,0.397059,laopiniondezamora.es
490,economia,economi,https://www.laopiniondezamora.es/economia/,www.laopiniondezamora.es/,0.941176,0.397059,laopiniondezamora.es
1057,sucesos,suces,https://www.laopiniondezamora.es/sucesos/,www.laopiniondezamora.es/,0.372549,0.397059,laopiniondezamora.es
1140,medio-ambiente,medio-ambient,https://www.laopiniondezamora.es/medio-ambiente/,www.laopiniondezamora.es/,0.313725,0.397059,laopiniondezamora.es
1280,motor,motor,https://www.laopiniondezamora.es/motor/,www.laopiniondezamora.es/,0.764706,0.397059,laopiniondezamora.es
...,...,...,...,...,...,...,...
1308,ciencia-tecnologia,ciencia-tecnologi,https://www.libertaddigital.com/ciencia-tecnol...,www.libertaddigital.com/,0.000000,0.338235,libertaddigital.com
74,asuntos-sociales,asuntos-social,https://www.lainformacion.com/asuntos-sociales/,www.lainformacion.com/,0.000000,0.088235,lainformacion.com
1004,politica,polit,https://cronicaglobal.elespanol.com/politica/,cronicaglobal.elespanol.com/,0.235294,0.161765,cronicaglobal.elespanol.com
1409,quienes-somos,quienes-som,https://cronicaglobal.elespanol.com/quienes-so...,cronicaglobal.elespanol.com/,0.235294,0.161765,cronicaglobal.elespanol.com


#### Common sections

In [17]:
columns_to_save = [
    "section",
    "domain", 
    "Overall",
    "freq_domain_score",
    "freq_node_score"
]
cols = ["domain_score", 
        "freq_domain_score", 
        "freq_node_score"]

sections_with_score = pd.merge(section_urls, 
                               media_ranks,
                               how="inner",
                               left_on="temp_domain",
                               right_on="Domain")[columns_to_save] \
                        .rename(columns={"Overall": "domain_score"}) 

sections_with_score["domain_score"] = (sections_with_score.domain_score - sections_with_score.domain_score.min()) / \
                                    (sections_with_score.domain_score.max() - sections_with_score.domain_score.min())
sections_with_score["score"] = sections_with_score[cols].pow(2).sum(axis=1).map(lambda x: np.sqrt(x))
sections_with_score.drop(cols, 
                         axis=1,
                         inplace=True
                        )
sections_without_score = pd.merge(section_urls, 
                                  media_ranks,
                                  how="right",
                                  left_on="temp_domain",
                                  right_on="Domain") \
                           .rename(columns={"Overall": "score"})
sections_with_score

Unnamed: 0,section,domain,score
0,https://www.laopiniondezamora.es/sociedad/,www.laopiniondezamora.es/,0.914624
1,https://www.laopiniondezamora.es/economia/,www.laopiniondezamora.es/,1.021836
2,https://www.laopiniondezamora.es/sucesos/,www.laopiniondezamora.es/,0.545095
3,https://www.laopiniondezamora.es/medio-ambiente/,www.laopiniondezamora.es/,0.506715
4,https://www.laopiniondezamora.es/motor/,www.laopiniondezamora.es/,0.862039
...,...,...,...
685,https://www.libertaddigital.com/ciencia-tecnol...,www.libertaddigital.com/,0.530505
686,https://www.lainformacion.com/asuntos-sociales/,www.lainformacion.com/,0.218599
687,https://cronicaglobal.elespanol.com/politica/,cronicaglobal.elespanol.com/,0.463589
688,https://cronicaglobal.elespanol.com/quienes-so...,cronicaglobal.elespanol.com/,0.463589


In [18]:
sections_with_score.domain.nunique(), sections_with_score.value_counts("domain") 

(55,
 domain
 www.laverdad.es/                    21
 www.elcomercio.es/                  19
 www.ideal.es/                       19
 www.diariocordoba.com/              18
 www.lasprovincias.es/               18
 www.elcorreo.com/                   18
 www.hoy.es/                         18
 www.diariosur.es/                   18
 www.larioja.com/                    17
 www.diariodesevilla.es/             17
 www.canarias7.es/                   17
 www.eldia.es/                       16
 www.diariodecadiz.es/               16
 sevilla.abc.es/                     16
 www.diariovasco.com/                16
 www.lavozdigital.es/                16
 www.larazon.es/                     16
 www.elnortedecastilla.es/           16
 www.informacion.es/                 15
 www.laopinioncoruna.es/             15
 www.elperiodicodearagon.com/        15
 www.abc.es/                         15
 www.lne.es/                         15
 www.laopiniondezamora.es/           14
 www.elperiodicomediterrane

In [19]:
not_found_cases = sections_without_score[sections_without_score.isnull().any(axis=1)]
print(not_found_cases)
not_found_cases.Domain.nunique(), not_found_cases.shape

    node stemmed_node section domain  freq_node_score  freq_domain_score  \
56   NaN          NaN     NaN    NaN              NaN                NaN   
316  NaN          NaN     NaN    NaN              NaN                NaN   
583  NaN          NaN     NaN    NaN              NaN                NaN   
614  NaN          NaN     NaN    NaN              NaN                NaN   
679  NaN          NaN     NaN    NaN              NaN                NaN   
..   ...          ...     ...    ...              ...                ...   
821  NaN          NaN     NaN    NaN              NaN                NaN   
822  NaN          NaN     NaN    NaN              NaN                NaN   
823  NaN          NaN     NaN    NaN              NaN                NaN   
824  NaN          NaN     NaN    NaN              NaN                NaN   
825  NaN          NaN     NaN    NaN              NaN                NaN   

    temp_domain                         Media                          Domain  \
56    

(136, (136, 14))

### Save preprocess digital media urls to .csv file

In [47]:
result = sections_with_score.groupby('domain')[['section', 'score']].apply(lambda x: tuple(x.values)).reset_index()
result.columns = ['domain', 'data']

# Convert the DataFrame to JSON
json_data = result.set_index("domain").to_json(orient='index')
json_data

'{"cronicaglobal.elespanol.com\\/":{"data":[["https:\\/\\/cronicaglobal.elespanol.com\\/politica\\/",0.4635891336],["https:\\/\\/cronicaglobal.elespanol.com\\/quienes-somos\\/",0.4635891336],["https:\\/\\/cronicaglobal.elespanol.com\\/primeras-planas\\/",0.3994390604]]},"efe.com\\/":{"data":[["https:\\/\\/efe.com\\/economia\\/",1.0051769112],["https:\\/\\/efe.com\\/andalucia\\/",0.4594264515],["https:\\/\\/efe.com\\/deportes\\/",1.0604562575],["https:\\/\\/efe.com\\/cultura\\/",0.8959733066],["https:\\/\\/efe.com\\/espana\\/",0.5726698771],["https:\\/\\/efe.com\\/quienes-somos\\/",0.424182503],["https:\\/\\/efe.com\\/portada-espana\\/",0.3529411765]]},"elpais.com\\/":{"data":[["https:\\/\\/elpais.com\\/sociedad\\/",1.3733278603],["https:\\/\\/elpais.com\\/economia\\/",1.4469422478],["https:\\/\\/elpais.com\\/tecnologia\\/",1.1879444574],["https:\\/\\/elpais.com\\/gente\\/",1.1199773608],["https:\\/\\/elpais.com\\/ciencia\\/",1.1484510572],["https:\\/\\/elpais.com\\/deportes\\/",1.48587

In [51]:
len()

55

In [43]:
json_data

'{"domain":"cronicaglobal.elespanol.com\\/","data":[["https:\\/\\/cronicaglobal.elespanol.com\\/politica\\/",0.4635891336],["https:\\/\\/cronicaglobal.elespanol.com\\/quienes-somos\\/",0.4635891336],["https:\\/\\/cronicaglobal.elespanol.com\\/primeras-planas\\/",0.3994390604]]}\n{"domain":"efe.com\\/","data":[["https:\\/\\/efe.com\\/economia\\/",1.0051769112],["https:\\/\\/efe.com\\/andalucia\\/",0.4594264515],["https:\\/\\/efe.com\\/deportes\\/",1.0604562575],["https:\\/\\/efe.com\\/cultura\\/",0.8959733066],["https:\\/\\/efe.com\\/espana\\/",0.5726698771],["https:\\/\\/efe.com\\/quienes-somos\\/",0.424182503],["https:\\/\\/efe.com\\/portada-espana\\/",0.3529411765]]}\n{"domain":"elpais.com\\/","data":[["https:\\/\\/elpais.com\\/sociedad\\/",1.3733278603],["https:\\/\\/elpais.com\\/economia\\/",1.4469422478],["https:\\/\\/elpais.com\\/tecnologia\\/",1.1879444574],["https:\\/\\/elpais.com\\/gente\\/",1.1199773608],["https:\\/\\/elpais.com\\/ciencia\\/",1.1484510572],["https:\\/\\/elpai

In [None]:
"""files_section = glob.glob("../data/final_url_sections_v*.csv")
if len(files_section) > 0:
    version_n = max(int(x.split("_")[-1][1:-4]) for x in files_section) + 1
else:
    version_n = 0
sections_with_score.to_csv(f"../data/final_url_sections_v{version_n}.csv", 
                           index=False, 
                           header=True,
                           sep=";"
                           )
glob.glob("../data/final_url_sections_v*.csv")"""

In [52]:
files_section = glob.glob("../data/final_url_sections_v*.json")
if len(files_section) > 0:
    version_n = max(int(x.split("_")[-1][1:-4]) for x in files_section) + 1
else:
    version_n = 0
result = sections_with_score.groupby('domain')[['section', 'score']].apply(lambda x: tuple(x.values)).reset_index()
result.columns = ['domain', 'data']

# Convert the DataFrame to JSON
result.set_index("domain") \
      .to_json(f"../data/final_url_sections_v{version_n}.json", 
               orient='index')
glob.glob("../data/final_url_sections_v*.json")

['../data\\final_url_sections_v15.json']

In [48]:
with open("nodes_08-09-2023.txt", "w") as f:
    f.write(nodes_count_more_n.index.dropna().str.strip().to_series().to_string(index=False))

In [21]:
nodes_count.iloc[:30]

stemmed_node
deport           59
economi          55
socied           50
cultur           46
internacional    45
motor            41
rss              28
tecnologi        27
espan            26
salud            22
suces            20
andaluci         20
cienci           18
medio-ambient    17
nacional         17
oci              16
municipi         16
viv              16
polit            16
plan             15
xlsemanal        15
antropi          15
pantall          15
tendencias21     14
gent             13
quienes-som      13
gastronomi       12
eltiemp          12
lo-ultim         12
viaj             11
Name: freq_node_score, dtype: int64

### Word similarity analysis

In [14]:
words = media_nodes.tolist()
words

['cartas-al-director',
 'gradario',
 'economia',
 'wappissima',
 'series',
 'diego_j-_geniz',
 'deportes',
 'galerias',
 'deportes',
 'ocio',
 'mapaweb',
 'tintalibre',
 'carnaval',
 'mascotas',
 'agenda',
 'julia_alarcon_villanueva',
 'antropia',
 'motor',
 'campo-de-gibraltar',
 'costa',
 'lopd',
 'comunicacion',
 'television',
 'eldigitalcastillalamancha',
 'deportes',
 'luis_sanchez-molini',
 'user',
 'resultados_deportivos',
 'vital',
 'movil',
 'm-_h',
 'mundo',
 'ocio',
 'pantallas',
 'bocabierta',
 'tendencias21',
 'tecnologia',
 'mercados',
 'levante-ud',
 'ultima-hora',
 'alberto_perez_de_vargas',
 'television',
 'tiempo',
 'suplementotecnologico',
 'ciencia',
 'caceres',
 'obituarios',
 'sociedad',
 'motociclismo',
 'comunicados',
 'events',
 'mapaweb',
 'antropia',
 'provincia',
 'antropia',
 'encuesta',
 'libros-premios-madrid',
 'gl',
 'pantallas',
 'xlsemanal',
 'life',
 'carlos_navarro_antolin',
 'elandroidelibre',
 'ciencia',
 'medio-ambiente',
 'realmurcia',
 'tecnolo

In [166]:
processed = []
scores = {}
for w1 in tqdm.tqdm(words):
    processed.append(w1)
    for w2 in words:
        if w2 not in processed:
            scores[(w1, w2)] = edit_distance(w1, w2)
sorted_scores = sorted(scores.items(), key=lambda k: (k[0][0], k[1]), reverse=False)
sorted_scores = dict(sorted_scores)
sorted_scores

100%|████████████████████████████████████████████████████████████████████████████████| 722/722 [00:16<00:00, 44.00it/s]


{('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ribera-alta-del-ebro'): 80,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-carinena'): 81,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-belchite'): 83,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-borja'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ejea-y-sus-pueblos'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-valdejalon'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'agricultura-medio-am

In [67]:
dict(filter(lambda k: k[1] < 3, sorted_scores.items()))

{('alicante', 'alacanti'): 2,
 ('alsol', 'alcoy'): 2,
 ('amarina', 'marina'): 1,
 ('asturianu', 'asturianos'): 2,
 ('asturias', 'asturianu'): 2,
 ('asturias', 'asturianos'): 2,
 ('cadiz', 'cadizcf'): 2,
 ('cine', 'life'): 2,
 ('concurso', 'concursos'): 1,
 ('cultura', 'culturas'): 1,
 ('eltiempo', 'tiempo'): 2,
 ('es', 'yes'): 1,
 ('local', 'social'): 2,
 ('local', 'global'): 2,
 ('lugo', 'vigo'): 2,
 ('lugo', 'cdlugo'): 2,
 ('malaga', 'alava'): 2,
 ('marina', 'merida'): 2,
 ('motor', 'motogp'): 2,
 ('planeta', 'llanera'): 2,
 ('rocio', 'ocio'): 1,
 ('rocio', 'elrocio'): 2,
 ('sevilla', 'sevillafc'): 2,
 ('siero', 'sierra'): 2,
 ('social', 'epsocial'): 2,
 ('tendencias21', 'tendencias'): 2,
 ('viajar', 'viajes'): 2,
 ('viajes', 'virales'): 2,
 ('vida', 'vital'): 2,
 ('vida', 'elda'): 2,
 ('vigo', 'vida'): 2}

## News similarity

In [50]:
import sqlite3

conn = sqlite3.connect("../../db.sqlite3")
cursor = conn.cursor()

In [51]:
urls = cursor.execute("""
    SELECT url
        FROM news
"""
)

In [53]:
urls.fetchall()

[('https://cronicaglobal.elespanol.com/politica/20230907/erc-la-amnistia-formula-indepes-negociar-referendum/792670926_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/erc-remontada-el-cambio-de-junts-dialogo/792670881_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/la-generalitat-destina-de-tv3-catalunya-radio/792670940_0.html',),
 ('https://elpais.com/actualidad/noticias-del-dia/2023-09-08/las-cinco-noticias-clave-del-8-de-septiembre.html',),
 ('https://elpais.com/america-colombia/2023-09-08/la-bolsa-de-colombia-se-hunde-ante-la-amenaza-de-una-rebaja-crediticia.html',),
 ('https://elpais.com/america-colombia/2023-09-08/petro-tiene-al-enemigo-en-casa.html',),
 ('https://elpais.com/america-colombia/2023-09-08/un-sondeo-lo-dice-en-onlyfans-esta-el-futuro.html',),
 ('https://elpais.com/america-futura/2023-09-08/ricardo-mourinho-vicepresidente-del-banco-europeo-de-inversiones-latinoamerica-clama-por-accion-climatica.html',),
 ('https://elpais.com/ame