In [1]:
import numpy as np
from scipy.spatial.distance import jaccard, cosine
from bs4 import BeautifulSoup
import re
import requests
import openai
import os
import json
from datetime import datetime
import pandas as pd
import tqdm
from matplotlib import pyplot as plt
import glob as glob

from nltk.corpus import wordnet as wn
from nltk import download
from nltk.metrics.distance import edit_distance
#download('wordnet')

In [2]:
def read_json_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return json.load(file)
def read_plain_domain_urls_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

In [3]:
ranking_files = [(file, datetime.strptime(file.split("_")[-1].split(".")[0], "%d%m%Y") )for file in glob.glob("../data/SCImago*Spanish_*.xlsx")]
ranking_files_sort = sorted(ranking_files, 
                            key=lambda x: x[1]
                            )
most_recent_file = ranking_files_sort[-1][0]
ranking_files, ranking_files_sort, most_recent_file

([('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0))],
 [('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0))],
 '../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx')

In [4]:
n_examples = 60
rankings_file_path = os.path.join("..\data", most_recent_file)
media_ranks = pd.read_excel(rankings_file_path).sort_values("Global_rank")
media_ranks["Media"] = media_ranks["Media"].str.strip()
media_ranks["Domain"] = media_ranks["Domain"].str.strip()
media_ranks[["Media", "Domain", "Overall"]].head(n_examples) \
                                           .to_csv("../data/spain_media name_to_url.txt", 
                                                   index=False, 
                                                   header=False,
                                                   sep=";")
media_ranks.head(15)

Unnamed: 0,Media,Domain,Country,Region,Language,Global_rank,Overall
190,El País,elpais.com,Spain,Western Europe,Spanish,7,84.5
189,ABC,abc.es,Spain,Western Europe,Spanish,28,78.25
187,La Vanguardia,lavanguardia.com,Spain,Western Europe,Spanish/Catalan,43,76.75
188,El Español,elespanol.com,Spain,Western Europe,Spanish,43,76.75
186,El Mundo,elmundo.es,Spain,Western Europe,Spanish,60,74.75
185,El Periódico de Catalunya,elperiodico.com,Spain,Western Europe,Spanish,89,72.75
183,La Razón,larazon.es,Spain,Western Europe,Spanish,115,71.25
184,Europa Press,europapress.es,Spain,Western Europe,Spanish,115,71.25
182,20 Minutos,20minutos.es,Spain,Western Europe,Spanish,124,70.75
181,El Confidencial,elconfidencial.com,Spain,Western Europe,Spanish,151,69.25


In [30]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
file_name = "spain_media name_to_url.txt"
file_path = os.path.join("..", "data", file_name)
if os.path.exists(file_path):
    print("Reading file of urls of regions...")
    name_to_media_urls = read_plain_media_urls_file(file_path)
    all_sections = []
skip_head = True
scores = []
for media_line in name_to_media_urls:
    #if skip_head:
    #    skip_head = False
    #    continue
    _, media, score = media_line.split(";")
    media_url = "https://www." + media.strip()
    try:
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=10)
    except requests.exceptions.Timeout:
        # Handle the timeout exception
        print("The request timed out.")
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions
        print(f"An error occurred: {str(e)}")
        media_url = media_url.replace("www.", "")
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=10) 

    parsed_hmtl = BeautifulSoup(response.content, 
                                "html.parser")
    try:
        links = [x.attrs.get("href", None) for x in parsed_hmtl.body.find_all("a")]
    except Exception as e:
        print(e)

    links_serie = pd.Series(links).dropna()
    nodes = links_serie.str.replace(media_url, "", regex=True)

    nodes_split = nodes.str.split("/")
    nodes_split_clean = nodes_split.apply(lambda x: [elem for elem in x if elem])

    nodes_split_clean_filter = nodes_split_clean.str.len().eq(1)

    valid_links = links_serie[nodes_split_clean_filter]
    #valid_links_complete = [media_url + x if not x.startswith(media_url) else x for x in valid_links]
    valid_links_complete = []
    with open("Skipped urls.txt", "w") as file_skipped:
        for link_node in valid_links:
            link = link_node
            link_lower = link.lower()
            # Drop if 'garbage' node
            if link == media_url \
                or len(link.replace("/", "")) == 1 \
                or "#" in link_lower \
                or ":" in link_lower \
                or "@" in link_lower \
                or "php" in link_lower \
                or "javascript" in link_lower \
                or "mailto" in link_lower \
                or "cookie" in link_lower \
                or "feed" in link_lower \
                or "contact" in link_lower \
                or ("aviso" in link_lower and "legal" in link_lower) \
                or "inici" in link_lower \
                or "session" in link_lower \
                or "sesion" in link_lower \
                or "ads" in link_lower \
                or "publicidad" in link_lower \
                or "privacidad" in link_lower \
                or "condiciones" in link_lower \
                or "tags" in link_lower \
                or "premium" in link_lower \
                or "archiv" in link_lower \
                or "sorteo" in link_lower \
                or "loter" in link_lower \
                or "newsletter" in link_lower \
                or "podcast" in link_lower \
                or "logout" in link_lower \
                or "login" in link_lower \
                or link.endswith(".html") \
                or "notifica" in link_lower \
                or "push" in link_lower \
                or "servicio" in link_lower \
                or "esquela" in link_lower \
                or "defunci" in link_lower \
                or "favorito" in link_lower \
                or "firma" in link_lower \
                or "suscri" in link_lower \
                or "subscrib" in link_lower \
                or "pasatiempo" in link_lower \
                or "compra" in link_lower \
                or "tienda" in link_lower \
                or "gráfico" in link_lower or "gráfico" in link_lower \
                or "humor" in link_lower \
                or "foto" in link_lower \
                or "opinion" in link_lower or "opinión" in link_lower \
                or "hemeroteca" in link_lower \
                or "video" in link_lower or "vídeo" in link_lower \
                or "play" in link_lower \
                or "patrocin" in link_lower \
                or "autor" in link_lower \
                or re.compile("\d{3,}").search(link):
                continue
            if not link.startswith(media_url):
                if not link.startswith("//www."):
                    link_parts = link.split("/")
                    link_parts = [x for x in link_parts if x]
                    if len(link_parts) == 1:
                        if link.startswith("/"):
                            link = media_url + link
                        else:
                            link = media_url + "/" + link
                    else:
                        continue
                else:
                    continue
            else:
                link_parts = link.split("/")
                link_parts = [x for x in link_parts if x][2:]
                if len(link_parts) > 1 or len(link_parts) == 0:
                    continue
                else:
                    continue
            if not link.endswith("/"):
                link += "/"
            valid_links_complete.append(link)
            scores.append(score)
        print(f"Valid links of {media_url.strip()}:", len(valid_links))
        all_sections.extend(valid_links_complete)
    valid_links_complete_unique = list(set(all_sections))

Reading file of urls of regions...
Valid links of https://www.elpais.com: 28
Valid links of https://www.abc.es: 58
Valid links of https://www.lavanguardia.com: 83
Valid links of https://www.elespanol.com: 113
Valid links of https://www.elmundo.es: 87
Valid links of https://www.elperiodico.com: 17
Valid links of https://www.larazon.es: 63
Valid links of https://www.europapress.es: 107
Valid links of https://www.20minutos.es: 49
Valid links of https://www.elconfidencial.com: 31
Valid links of https://www.eldiario.es: 121
Valid links of https://www.sevilla.abc.es: 51
Valid links of https://www.lavozdegalicia.es: 105
Valid links of https://www.libertaddigital.com: 48
An error occurred: HTTPSConnectionPool(host='www.cronicaglobal.elespanol.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001A61F208B80>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Valid links of https://cronicaglo

In [26]:
valid_links_complete_unique

['https://www.elperiodicoextremadura.com/deportes/',
 'https://sevilla.abc.es/andalucia/',
 'https://www.levante-emv.com/urban/',
 'https://www.noticiasdenavarra.com/vivir/',
 'https://www.elnortedecastilla.es/sociedad/',
 'https://www.diariodemallorca.es/sucesos/',
 'https://www.europapress.es/madrid/',
 'https://www.diariodesevilla.es/jose_aguilar/',
 'https://www.europapress.es/internacional/',
 'https://www.lavanguardia.com/economia/',
 'https://www.diariodesevilla.es/suplementotecnologico/',
 'https://www.elplural.com/comunicacion/',
 'https://www.laopiniondemalaga.es/malagacf/',
 'https://www.abc.es/sociedad/',
 'https://www.noticiasdenavarra.com/pamplona/',
 'https://www.ideal.es/historias-visuales/',
 'https://www.eldia.es/lo-ultimo/',
 'https://www.diariodecadiz.es/noticias-provincia-cadiz/',
 'https://elpais.com/ultimas-noticias/',
 'https://www.eldia.es/motogp/',
 'https://www.farodevigo.es/espana/',
 'https://www.elespanol.com/ciencia/',
 'https://www.deia.eus/promociones/'

#### Analysis of medias nodes

In [10]:
urls_serie = pd.Series(valid_links_complete_unique)
urls_serie[urls_serie.str.contains("larazon")].head(60)

0       https://www.larazon.es/deportes/baloncesto/eur...
34          https://www.larazon.es/internacional/america/
76      https://www.larazon.es/deportes/futbol/primera...
86                    https://www.larazon.es/gastronomia/
108             https://www.larazon.es/lifestyle/belleza/
138                                https://www.larazon.es
174                 https://www.larazon.es/internacional/
201                        https://www.larazon.es/viajes/
218       https://www.larazon.es/deportes/futbol/mundial/
242          https://www.larazon.es/comunidad-valenciana/
261            https://www.larazon.es/deportes/formula-1/
333                       https://www.larazon.es/sucesos/
336         https://www.larazon.es/deportes/motociclismo/
382              https://www.larazon.es/cultura/historia/
419                         https://www.larazon.es/gente/
484             https://www.larazon.es/deportes/ciclismo/
495                       https://www.larazon.es/acceder/
502           

In [392]:
urls_serie[(urls_serie.str.split("/").str.len() < 4)].sort_values()

3343                      https://elpais.com
438                     https://okdiario.com
3162                  https://sevilla.abc.es
3362                      https://www.abc.es
1246                https://www.canarias7.es
3615         https://www.diariodemallorca.es
770                 https://www.diariosur.es
3673             https://www.diariovasco.com
2935               https://www.elcomercio.es
691                 https://www.elcorreo.com
3033                    https://www.eldia.es
2120                 https://www.eldiario.es
1285         https://www.eldiariomontanes.es
1477                  https://www.elmundo.es
417         https://www.elnortedecastilla.es
3519                https://www.elplural.com
2525              https://www.europapress.es
2222               https://www.farodevigo.es
350                       https://www.hoy.es
3490                    https://www.ideal.es
3546              https://www.informacion.es
1268                  https://www.larazon.es
2997      

In [393]:
urls_one_node = urls_serie[(urls_serie.str.split("/").apply(lambda x: [y for y in x if y]).str.len() == 3)]
urls_one_node

0                  https://www.laopinioncoruna.es/motogp/
1                       https://www.elespanol.com/social/
3              https://www.diariovasco.com/internacional/
7                               https://www.lne.es/siero/
8                               https://www.lne.es/salud/
                              ...                        
3694       https://www.lasprovincias.es/revista-valencia/
3695    https://www.elperiodicoextremadura.com/femenin...
3696                https://www.larazon.es/internacional/
3702           https://www.diariodeleon.es/portadas-papel
3706      https://www.elperiodicodearagon.com/baloncesto/
Length: 1821, dtype: object

In [538]:
n_min_counts = 10
section_urls = urls_serie.str.replace("/{0,2}$", "", 
                                      regex=True) \
                         .str.extract(".*/{1}(?P<node>.*)", 
                                      expand=True)
nodes_count = section_urls["node"].str.replace("[.](.*)", 
                                               "", 
                                               regex=True) \
                                  .value_counts().rename("freq_score")

section_urls = section_urls.merge((nodes_count - nodes_count.min()) / (nodes_count.max() - nodes_count.min()), 
                   how="inner",
                   left_on="node",
                   right_index=True,
                  )
nodes_count_more_n = nodes_count[nodes_count > n_min_counts]
frequent_sections = nodes_count_more_n.index.tolist()
frequent_sections

In [543]:
section_urls.node.str.contains("www").sum()

57

In [539]:
section_urls[section_urls["node"].str.contains("[.]")]

Unnamed: 0,node
16,lena.html
42,cudillero.html
61,langreo.html
62,calendario.html
73,codigo-etico.html
...,...
3648,deportes.html
3662,alimente.elconfidencial.com
3670,municipios.html
3673,www.diariovasco.com


In [536]:
section_urls

Unnamed: 0,node
0,motogp
1,social
2,educacion
3,internacional
4,nervion
...,...
3702,portadas-papel
3703,baloncesto
3704,recetas-faciles
3705,nytimes


In [523]:
nodes_count_more_n

node
deportes           60
economia           58
www                57
sociedad           56
motor              50
                   ..
dietas             11
programacion-tv    11
agenda             11
fichajes           11
fitness            11
Name: freq_score, Length: 70, dtype: int64

In [531]:
 section_urls["node"].str.replace("[.](.*)", "", regex=True) 

0                motogp
1                social
2             educacion
3         internacional
4               nervion
             ...       
3702     portadas-papel
3703         baloncesto
3704    recetas-faciles
3705            nytimes
3706         baloncesto
Name: node, Length: 3707, dtype: object

In [532]:
 section_urls["node"].str.replace("[.](.*)", "", regex=True) 

0                motogp
1                social
2             educacion
3         internacional
4               nervion
             ...       
3702     portadas-papel
3703         baloncesto
3704    recetas-faciles
3705            nytimes
3706         baloncesto
Name: node, Length: 3707, dtype: object

In [522]:
section_urls[section_urls["node"].str.contains("www")]

Unnamed: 0,node,freq_score


In [508]:
filter_section_urls = urls_serie.apply(lambda url: any(section in url for section in frequent_sections))
section_urls[["section", "domain"]] = urls_serie[filter_section_urls].str.extract("(?P<section>https?://(?P<domain>[^/]+/).*)")

section_urls["temp_domain"] = section_urls.domain.str.replace("www.", "") \
                                                 .str.replace("/", "")
section_urls.dropna(inplace=True)
section_urls

Unnamed: 0,node,freq_score,section,domain,temp_domain
0,motogp,0.210526,https://www.laopinioncoruna.es/motogp/,www.laopinioncoruna.es/,laopinioncoruna.es
91,motogp,0.210526,https://www.farodevigo.es/motogp/,www.farodevigo.es/,farodevigo.es
245,motogp,0.210526,https://www.eldia.es/motogp/,www.eldia.es/,eldia.es
1343,motogp,0.210526,https://www.noticiasdenavarra.com/motogp/,www.noticiasdenavarra.com/,noticiasdenavarra.com
1669,motogp,0.210526,https://www.elperiodicomediterraneo.com/motogp/,www.elperiodicomediterraneo.com/,elperiodicomediterraneo.com
...,...,...,...,...,...
3663,forzabreo,0.000000,https://lavozdegalicia.es/forzabreo/,lavozdegalicia.es/,lavozdegalicia.es
3682,balonmano.html,0.000000,https://www.elmundo.es/deportes/balonmano.html,www.elmundo.es/,elmundo.es
3693,espazo-agora,0.000000,https://www.farodevigo.es/actualidad/espazo-ag...,www.farodevigo.es/,farodevigo.es
3698,latest,0.000000,https://cronicaglobal.elespanol.com/rss/latest/,cronicaglobal.elespanol.com/,cronicaglobal.elespanol.com


#### Common sections

In [505]:
columns_to_save = [
    "section",
    "domain", 
    "Overall",
    "freq_score"
]

sections_with_score = pd.merge(section_urls, 
                               media_ranks,
                               how="inner",
                               left_on="temp_domain",
                               right_on="Domain")[columns_to_save] \
                        .rename(columns={"Overall": "domain_score"}) 

sections_with_score["domain_score"] = (sections_with_score.domain_score - sections_with_score.domain_score.min()) / \
                                    (sections_with_score.domain_score.max() - sections_with_score.domain_score.min())

sections_without_score = pd.merge(section_urls, 
                                  media_ranks,
                                  how="right",
                                  left_on="temp_domain",
                                  right_on="Domain") \
                           .rename(columns={"Overall": "score"})
sections_with_score

Unnamed: 0,section,domain,domain_score,freq_score
0,https://www.laopinioncoruna.es/motogp/,www.laopinioncoruna.es/,0.000000,0.210526
1,https://www.laopinioncoruna.es/salud/,www.laopinioncoruna.es/,0.000000,0.666667
2,https://www.laopinioncoruna.es/cultura/,www.laopinioncoruna.es/,0.000000,0.666667
3,https://www.laopinioncoruna.es/deportes/futbol/,www.laopinioncoruna.es/,0.000000,0.578947
4,https://www.laopinioncoruna.es/salud/dietas/,www.laopinioncoruna.es/,0.000000,0.175439
...,...,...,...,...
2122,https://www.ultimahora.es/guia_util/transporte...,www.ultimahora.es/,0.121739,0.000000
2123,https://www.ultimahora.es/guia_util/emergencia...,www.ultimahora.es/,0.121739,0.000000
2124,https://www.elprogreso.es/rss/listado,www.elprogreso.es/,0.000000,0.035088
2125,https://www.diariodeleon.es/rss/listado,www.diariodeleon.es/,0.034783,0.035088


In [467]:
not_found_cases = sections_without_score[sections_without_score.isnull().any(axis=1)]
not_found_cases.Domain.nunique(), not_found_cases.shape

(131, (131, 11))

### Save preprocess digital media urls to .csv file

In [462]:
files_section = glob.glob("../data/final_url_sections_v*.csv")
if len(files_section) > 0:
    version_n = max(int(x.split("_")[-1][1:-4]) for x in files_section) + 1
else:
    version_n = 0
final_section_urls.to_csv(f"../data/final_url_sections_v{version_n}.csv", 
                          index=False, 
                          header=True,
                          sep=";"
                         )
glob.glob("../data/final_url_sections_v*.csv")

['../data\\final_url_sections_v3.csv',
 '../data\\final_url_sections_v4.csv',
 '../data\\final_url_sections_v5.csv',
 '../data\\final_url_sections_v6.csv',
 '../data\\final_url_sections_v7.csv',
 '../data\\final_url_sections_v8.csv']

In [48]:
with open("nodes_08-09-2023.txt", "w") as f:
    f.write(nodes_count_more_2.index.dropna().str.strip().to_series().to_string(index=False))

In [322]:
nodes_count.iloc[:30]

deportes          58
economia          57
sociedad          55
salud             49
motor             47
internacional     47
cultura           39
tecnologia        38
ciencia           34
futbol            34
cine              31
gente             29
planes            29
sucesos           29
espana            29
gastronomia       27
andalucia         25
series            24
baloncesto        23
tv                22
politica          22
historia          21
medio-ambiente    20
ciclismo          18
madrid            18
ocio              17
viajes            16
television        16
educacion         16
tendencias21      16
Name: count, dtype: int64

### Word similarity analysis

In [14]:
words = media_nodes.tolist()
words

['cartas-al-director',
 'gradario',
 'economia',
 'wappissima',
 'series',
 'diego_j-_geniz',
 'deportes',
 'galerias',
 'deportes',
 'ocio',
 'mapaweb',
 'tintalibre',
 'carnaval',
 'mascotas',
 'agenda',
 'julia_alarcon_villanueva',
 'antropia',
 'motor',
 'campo-de-gibraltar',
 'costa',
 'lopd',
 'comunicacion',
 'television',
 'eldigitalcastillalamancha',
 'deportes',
 'luis_sanchez-molini',
 'user',
 'resultados_deportivos',
 'vital',
 'movil',
 'm-_h',
 'mundo',
 'ocio',
 'pantallas',
 'bocabierta',
 'tendencias21',
 'tecnologia',
 'mercados',
 'levante-ud',
 'ultima-hora',
 'alberto_perez_de_vargas',
 'television',
 'tiempo',
 'suplementotecnologico',
 'ciencia',
 'caceres',
 'obituarios',
 'sociedad',
 'motociclismo',
 'comunicados',
 'events',
 'mapaweb',
 'antropia',
 'provincia',
 'antropia',
 'encuesta',
 'libros-premios-madrid',
 'gl',
 'pantallas',
 'xlsemanal',
 'life',
 'carlos_navarro_antolin',
 'elandroidelibre',
 'ciencia',
 'medio-ambiente',
 'realmurcia',
 'tecnolo

In [166]:
processed = []
scores = {}
for w1 in tqdm.tqdm(words):
    processed.append(w1)
    for w2 in words:
        if w2 not in processed:
            scores[(w1, w2)] = edit_distance(w1, w2)
sorted_scores = sorted(scores.items(), key=lambda k: (k[0][0], k[1]), reverse=False)
sorted_scores = dict(sorted_scores)
sorted_scores

100%|████████████████████████████████████████████████████████████████████████████████| 722/722 [00:16<00:00, 44.00it/s]


{('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ribera-alta-del-ebro'): 80,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-carinena'): 81,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-belchite'): 83,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-borja'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ejea-y-sus-pueblos'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-valdejalon'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'agricultura-medio-am

In [67]:
dict(filter(lambda k: k[1] < 3, sorted_scores.items()))

{('alicante', 'alacanti'): 2,
 ('alsol', 'alcoy'): 2,
 ('amarina', 'marina'): 1,
 ('asturianu', 'asturianos'): 2,
 ('asturias', 'asturianu'): 2,
 ('asturias', 'asturianos'): 2,
 ('cadiz', 'cadizcf'): 2,
 ('cine', 'life'): 2,
 ('concurso', 'concursos'): 1,
 ('cultura', 'culturas'): 1,
 ('eltiempo', 'tiempo'): 2,
 ('es', 'yes'): 1,
 ('local', 'social'): 2,
 ('local', 'global'): 2,
 ('lugo', 'vigo'): 2,
 ('lugo', 'cdlugo'): 2,
 ('malaga', 'alava'): 2,
 ('marina', 'merida'): 2,
 ('motor', 'motogp'): 2,
 ('planeta', 'llanera'): 2,
 ('rocio', 'ocio'): 1,
 ('rocio', 'elrocio'): 2,
 ('sevilla', 'sevillafc'): 2,
 ('siero', 'sierra'): 2,
 ('social', 'epsocial'): 2,
 ('tendencias21', 'tendencias'): 2,
 ('viajar', 'viajes'): 2,
 ('viajes', 'virales'): 2,
 ('vida', 'vital'): 2,
 ('vida', 'elda'): 2,
 ('vigo', 'vida'): 2}

## News similarity

In [50]:
import sqlite3

conn = sqlite3.connect("../../db.sqlite3")
cursor = conn.cursor()

In [51]:
urls = cursor.execute("""
    SELECT url
        FROM news
"""
)

In [53]:
urls.fetchall()

[('https://cronicaglobal.elespanol.com/politica/20230907/erc-la-amnistia-formula-indepes-negociar-referendum/792670926_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/erc-remontada-el-cambio-de-junts-dialogo/792670881_0.html',),
 ('https://cronicaglobal.elespanol.com/politica/20230907/la-generalitat-destina-de-tv3-catalunya-radio/792670940_0.html',),
 ('https://elpais.com/actualidad/noticias-del-dia/2023-09-08/las-cinco-noticias-clave-del-8-de-septiembre.html',),
 ('https://elpais.com/america-colombia/2023-09-08/la-bolsa-de-colombia-se-hunde-ante-la-amenaza-de-una-rebaja-crediticia.html',),
 ('https://elpais.com/america-colombia/2023-09-08/petro-tiene-al-enemigo-en-casa.html',),
 ('https://elpais.com/america-colombia/2023-09-08/un-sondeo-lo-dice-en-onlyfans-esta-el-futuro.html',),
 ('https://elpais.com/america-futura/2023-09-08/ricardo-mourinho-vicepresidente-del-banco-europeo-de-inversiones-latinoamerica-clama-por-accion-climatica.html',),
 ('https://elpais.com/ame