In [22]:
import numpy as np
from scipy.spatial.distance import jaccard, cosine
from bs4 import BeautifulSoup
import re
import requests
import openai
import os
import json
from datetime import datetime
import pandas as pd
import tqdm
from matplotlib import pyplot as plt
import glob as glob

from nltk.corpus import wordnet as wn
from nltk import download
from nltk.metrics.distance import edit_distance
#download('wordnet')

In [25]:
def read_json_media_urls_file(file_path):
    with open(file_path, "r") as file:
        return json.load(file)
def read_plain_media_urls_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

In [26]:
ranking_files = [(file, datetime.strptime(file.split("_")[-1].split(".")[0], "%d%m%Y") )for file in glob.glob("../data/SCImago*Spanish_*.xlsx")]
ranking_files_sort = sorted(ranking_files, 
                            key=lambda x: x[1]
                            )
most_recent_file = ranking_files_sort[-1][0]
ranking_files, ranking_files_sort, most_recent_file

([('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0))],
 [('../data\\SCImago Media Ranking - Spain - Spanish_20082023.xlsx',
   datetime.datetime(2023, 8, 20, 0, 0)),
  ('../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx',
   datetime.datetime(2023, 9, 2, 0, 0))],
 '../data\\SCImago Media Ranking - Spain - Spanish_02092023.xlsx')

In [27]:
rankings_file_path = os.path.join("..\data", most_recent_file)
media_ranks = pd.read_excel(rankings_file_path).sort_values("Global_rank")
media_ranks["Media"] = media_ranks["Media"].str.strip()
media_ranks["Domain"] = media_ranks["Domain"].str.strip()
media_ranks[["Media", "Domain"]].head(50).to_csv("../data/spain_media name_to_url.txt", 
                                        index=False, 
                                        header=False,
                                        sep=";")
media_ranks.head(15)

Unnamed: 0,Media,Domain,Country,Region,Language,Global_rank,Overall
190,El País,elpais.com,Spain,Western Europe,Spanish,7,84.5
189,ABC,abc.es,Spain,Western Europe,Spanish,28,78.25
187,La Vanguardia,lavanguardia.com,Spain,Western Europe,Spanish/Catalan,43,76.75
188,El Español,elespanol.com,Spain,Western Europe,Spanish,43,76.75
186,El Mundo,elmundo.es,Spain,Western Europe,Spanish,60,74.75
185,El Periódico de Catalunya,elperiodico.com,Spain,Western Europe,Spanish,89,72.75
183,La Razón,larazon.es,Spain,Western Europe,Spanish,115,71.25
184,Europa Press,europapress.es,Spain,Western Europe,Spanish,115,71.25
182,20 Minutos,20minutos.es,Spain,Western Europe,Spanish,124,70.75
181,El Confidencial,elconfidencial.com,Spain,Western Europe,Spanish,151,69.25


In [5]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
file_name = "spain_media name_to_url.txt"
file_path = os.path.join("..", "data", file_name)
if os.path.exists(file_path):
    print("Reading file of urls of regions...")
    name_to_media_urls = read_plain_media_urls_file(file_path)
    all_sections = []
skip_head = True
for media_line in name_to_media_urls:
    #if skip_head:
    #    skip_head = False
    #    continue
    _, media = media_line.split(";")
    media_url = "https://www." + media.strip()
    try:
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=10)
    except requests.exceptions.Timeout:
        # Handle the timeout exception
        print("The request timed out.")
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions
        print(f"An error occurred: {str(e)}")
        media_url = media_url.replace("www.", "")
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=10) 

    parsed_hmtl = BeautifulSoup(response.content, 
                                "html.parser")
    try:
        links = [x.attrs.get("href", None) for x in parsed_hmtl.body.find_all("a")]
    except Exception as e:
        print(e)

    links_serie = pd.Series(links).dropna()
    nodes = links_serie.str.replace(media_url, "", regex=True)

    nodes_split = nodes.str.split("/")
    nodes_split_clean = nodes_split.apply(lambda x: [elem for elem in x if elem])

    nodes_split_clean_filter = nodes_split_clean.str.len().eq(1)

    valid_links = links_serie[nodes_split_clean_filter]
    #valid_links_complete = [media_url + x if not x.startswith(media_url) else x for x in valid_links]
    valid_links_complete = []
    with open("Skipped urls.txt", "w") as file_skipped:
        for link_node in valid_links:
            link = link_node
            link_lower = link.lower()
            # Drop if 'garbage' node
            if link == media_url \
                or len(link.replace("/", "")) == 1 \
                or "#" in link_lower \
                or ":" in link_lower \
                or "@" in link_lower \
                or "php" in link_lower \
                or "javascript" in link_lower \
                or "mailto" in link_lower \
                or "cookie" in link_lower \
                or "feed" in link_lower \
                or "contact" in link_lower \
                or ("aviso" in link_lower and "legal" in link_lower) \
                or "inici" in link_lower \
                or "session" in link_lower \
                or "sesion" in link_lower \
                or "ads" in link_lower \
                or "publicidad" in link_lower \
                or "privacidad" in link_lower \
                or "condiciones" in link_lower \
                or "tags" in link_lower \
                or "premium" in link_lower \
                or "archiv" in link_lower \
                or "sorteo" in link_lower \
                or "loter" in link_lower \
                or "newsletter" in link_lower \
                or "podcast" in link_lower \
                or "logout" in link_lower \
                or "login" in link_lower \
                or link.endswith(".html") \
                or "notifica" in link_lower \
                or "push" in link_lower \
                or "servicio" in link_lower \
                or "esquela" in link_lower \
                or "defunci" in link_lower \
                or "favorito" in link_lower \
                or "firma" in link_lower \
                or "suscri" in link_lower \
                or "subscrib" in link_lower \
                or "pasatiempo" in link_lower \
                or "compra" in link_lower \
                or "tienda" in link_lower \
                or "gráfico" in link_lower or "gráfico" in link_lower \
                or "humor" in link_lower \
                or "foto" in link_lower \
                or "opinion" in link_lower or "opinión" in link_lower \
                or "hemeroteca" in link_lower \
                or "video" in link_lower or "vídeo" in link_lower \
                or "play" in link_lower \
                or "patrocin" in link_lower \
                or "autor" in link_lower \
                or re.compile("\d{3,}").search(link):
                continue
            if not link.startswith(media_url):
                if not link.startswith("//www."):
                    link_parts = link.split("/")
                    link_parts = [x for x in link_parts if x]
                    if len(link_parts) == 1:
                        if link.startswith("/"):
                            link = media_url + link
                        else:
                            link = media_url + "/" + link
                    else:
                        continue
                else:
                    continue
            else:
                link_parts = link.split("/")
                link_parts = [x for x in link_parts if x][2:]
                if len(link_parts) > 1 or len(link_parts) == 0:
                    continue
                else:
                    continue
            if not link.endswith("/"):
                link += "/"
            valid_links_complete.append(link)
        print(f"Valid links of {media_url.strip()}:", len(valid_links))
        all_sections.extend(valid_links_complete)
    valid_links_complete_unique = list(set(all_sections))

Reading file of urls of regions...
Valid links of https://www.elpais.com: 28
Valid links of https://www.abc.es: 60
Valid links of https://www.lavanguardia.com: 81
Valid links of https://www.elespanol.com: 120
Valid links of https://www.elmundo.es: 86
Valid links of https://www.elperiodico.com: 18
Valid links of https://www.larazon.es: 62
Valid links of https://www.europapress.es: 107
Valid links of https://www.20minutos.es: 48
Valid links of https://www.elconfidencial.com: 30
Valid links of https://www.eldiario.es: 121
Valid links of https://www.sevilla.abc.es: 52
Valid links of https://www.lavozdegalicia.es: 105
Valid links of https://www.libertaddigital.com: 49
An error occurred: HTTPSConnectionPool(host='www.cronicaglobal.elespanol.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B46D6457C0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Valid links of https://cronicaglo

In [39]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
file_name = "spain_media name_to_url.txt"
file_path = os.path.join("..", "data", file_name)
if os.path.exists(file_path):
    print("Reading file of urls of regions...")
    name_to_media_urls = read_plain_media_urls_file(file_path)
    all_sections = []
skip_head = True
for media_line in name_to_media_urls:
    #if skip_head:
    #    skip_head = False
    #    continue
    _, media = media_line.split(";")
    media = media.strip()
    media_url = "https://" + media
    try:
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=6.0)
    except requests.exceptions.Timeout:
        # Handle the timeout exception
        print("The request timed out.")
        continue
    except requests.exceptions.RequestException as e:
        # Handle other request exceptions
        print(f"Retry request with 'https://www.' prefix")
        media_url = "https://www." + media
        response = requests.get(media_url, 
                                headers=HEADERS, 
                                timeout=6.0)
        print(f"ok ({response.status_code})")
        #continue
    domain = response.url.split("/")[2]
    domain_url = "/".join(response.url.split("/")[:3] + [""])
    parsed_hmtl = BeautifulSoup(response.content, 
                                "html.parser")
    try:
        links = [x.attrs.get("href", None) for x in parsed_hmtl.body.find_all("a")]
    except Exception as e:
        print(e)

    links_serie = pd.Series(links).dropna()
    nodes = links_serie.str.replace(media_url, "", regex=True)

    nodes_split = nodes.str.split("/")
    nodes_split_clean = nodes_split.apply(lambda x: [elem for elem in x if elem])

    nodes_split_clean_filter = nodes_split_clean.str.len().eq(1)

    valid_links = links_serie[nodes_split_clean_filter]
    #valid_links_complete = [media_url + x if not x.startswith(media_url) else x for x in valid_links]
    valid_links_complete = []
    with open("Skipped urls.txt", "w") as file_skipped:
        for link_node in valid_links:
            link = link_node
            link_lower = link.lower()
            if link.startswith("//www."):
                continue
            # Drop if 'garbage' nodes
            if link == media_url \
                or len(link.replace("/", "")) == 1 \
                or "#" in link_lower \
                or ":" in link_lower \
                or "@" in link_lower \
                or "php" in link_lower \
                or "javascript" in link_lower \
                or "mailto" in link_lower \
                or "cookie" in link_lower \
                or "feed" in link_lower \
                or "contact" in link_lower \
                or ("aviso" in link_lower and "legal" in link_lower) \
                or "inici" in link_lower \
                or "session" in link_lower \
                or "sesion" in link_lower \
                or "ads" in link_lower \
                or "publicidad" in link_lower \
                or "privacidad" in link_lower \
                or "condiciones" in link_lower \
                or "tags" in link_lower \
                or "premium" in link_lower \
                or "archiv" in link_lower \
                or "sorteo" in link_lower \
                or "loter" in link_lower \
                or "newsletter" in link_lower \
                or "podcast" in link_lower \
                or "logout" in link_lower \
                or "login" in link_lower \
                or link.endswith(".html") \
                or "notifica" in link_lower \
                or "push" in link_lower \
                or "servicio" in link_lower \
                or "esquela" in link_lower \
                or "defunci" in link_lower \
                or "favorito" in link_lower \
                or "firma" in link_lower \
                or "suscri" in link_lower \
                or "subscrib" in link_lower \
                or "pasatiempo" in link_lower \
                or "compra" in link_lower \
                or "tienda" in link_lower \
                or "gráfico" in link_lower or "gráfico" in link_lower \
                or "humor" in link_lower \
                or "foto" in link_lower \
                or "opinion" in link_lower or "opinión" in link_lower \
                or "hemeroteca" in link_lower \
                or "video" in link_lower or "vídeo" in link_lower \
                or "play" in link_lower \
                or "patrocin" in link_lower \
                or "autor" in link_lower \
                or re.compile("\d{3,}").search(link):
                continue
            if not link.startswith(domain):
                link_parts = link.split("/")
                nodes = [x for x in link_parts if x][:2]
                if len(nodes) < 1:
                    continue
                # Get only urls with one node
                elif len(nodes) == 1 and link.startswith("/"):
                    link = domain_url[:-1] + link
            if not link.endswith("/"):
                link += "/"
            valid_links_complete.append(link)
        print(f"Valid links of {media_url}, {domain}, {domain_url}:", len(valid_links))
        all_sections.extend(valid_links_complete)
    valid_links_complete_unique = list(set(all_sections))

Reading file of urls of regions...
Valid links of https://elpais.com, elpais.com, https://elpais.com/: 84
Valid links of https://abc.es, www.abc.es, https://www.abc.es/: 37
Valid links of https://lavanguardia.com, www.lavanguardia.com, https://www.lavanguardia.com/: 12
Valid links of https://elespanol.com, www.elespanol.com, https://www.elespanol.com/: 56
Valid links of https://elmundo.es, www.elmundo.es, https://www.elmundo.es/: 7
Valid links of https://elperiodico.com, www.elperiodico.com, https://www.elperiodico.com/: 15
Retry request with 'https://www.' prefix
ok (200)
Valid links of https://www.larazon.es, www.larazon.es, https://www.larazon.es/: 62
Valid links of https://europapress.es, www.europapress.es, https://www.europapress.es/: 50
Valid links of https://20minutos.es, www.20minutos.es, https://www.20minutos.es/: 7
Valid links of https://elconfidencial.com, www.elconfidencial.com, https://www.elconfidencial.com/: 10
Valid links of https://eldiario.es, www.eldiario.es, https:

In [40]:
len(all_sections), len(valid_links_complete_unique)

(1977, 742)

#### Analysis of medias nodes

In [41]:
urls_serie = pd.Series(valid_links_complete_unique)
urls_serie.head()

0           https://www.levante-emv.com/en-valencia/
1                         https://www.abc.es/espana/
2              https://www.levante-emv.com/comarcas/
3    https://www.diariodesevilla.es/ana_s-_ameneiro/
4                  https://www.hoy.es/internacional/
dtype: object

In [48]:
urls_no_https = urls_serie.str.extract("[/]{2}(?P<url>.*)[/]")
urls_no_https

Unnamed: 0,url
0,www.levante-emv.com/en-valencia
1,www.abc.es/espana
2,www.levante-emv.com/comarcas
3,www.diariodesevilla.es/ana_s-_ameneiro
4,www.hoy.es/internacional
...,...
737,www.diariodecadiz.es/mundo
738,www.abc.es/familia
739,lavozdegalicia.es/lemos
740,www.elespanol.com/ciencia


In [49]:
media_count = urls_no_https.value_counts()
media_count

url                                  
cronicaglobal.elespanol.com/business     1
www.elplural.com/autonomias              1
www.elperiodicodearagon.com/salud        1
www.elperiodicodearagon.com/sd-huesca    1
www.elperiodicodearagon.com/sociedad     1
                                        ..
www.diariodemallorca.es/blogs            1
www.diariodemallorca.es/cultura          1
www.diariodemallorca.es/deportes         1
www.diariodemallorca.es/economia         1
www.publico.es/public                    1
Name: count, Length: 741, dtype: int64

In [51]:
n_min_counts = 2
media_nodes = urls_serie.str.replace("/{0,2}$", "", 
                                     regex=True). \
                         str.extract(".*/{1}(.*)", 
                                     expand=False)
media_nodes_count = media_nodes.value_counts()
media_nodes_count_more_2 = media_nodes_count[media_nodes_count > n_min_counts]
#media_nodes_more_2_relfreq = (media_nodes_count_more_2 / media_nodes_count_more_2.sum())
#media_nodes_more_2_relfreq_smooth4 = media_nodes_more_2_relfreq.rolling(window=4, min_periods=1).mean()

common_sections = media_nodes_count_more_2.index.tolist()
filter_final_section_urls = urls_serie.apply(lambda url: any(section in url for section in common_sections))
final_section_urls = urls_serie[filter_final_section_urls].str.extract("(?P<section>https?://(?P<media>[^/]+/).*)")
#final_section_urls_concat = final_section_urls.section + ";" + final_section_urls.media
#final_section_urls_concat
final_section_urls

Unnamed: 0,section,media
1,https://www.abc.es/espana/,www.abc.es/
2,https://www.levante-emv.com/comarcas/,www.levante-emv.com/
4,https://www.hoy.es/internacional/,www.hoy.es/
5,https://www.deia.eus/promociones/,www.deia.eus/
6,https://www.lne.es/sucesos/,www.lne.es/
...,...,...
734,https://www.eldia.es/sociedad/,www.eldia.es/
737,https://www.diariodecadiz.es/mundo/,www.diariodecadiz.es/
739,https://lavozdegalicia.es/lemos/,lavozdegalicia.es/
740,https://www.elespanol.com/ciencia/,www.elespanol.com/


In [55]:
files_section = glob.glob("../data/final_url_sections_v*.csv")
if len(files_section) > 0:
    version_n = max(int(x.split("_")[-1][1:-4]) for x in files_section) + 1
else:
    version_n = 0
final_section_urls.to_csv(f"../data/final_url_sections_v{version_n}.csv", 
                          index=False, 
                          header=True,
                          sep=";"
                         )
glob.glob("../data/final_url_sections_v*.csv")

['../data\\final_url_sections_v3.csv',
 '../data\\final_url_sections_v4.csv',
 '../data\\final_url_sections_v5.csv',
 '../data\\final_url_sections_v6.csv',
 '../data\\final_url_sections_v7.csv']

In [42]:
temp = final_section_urls.loc[final_section_urls["section"].str.contains("https")]
subchunks = temp["section"] + ";" + temp["media"]
subchunks.tolist()

['https://www.elplural.com/sociedad/;www.elplural.com/',
 'https://www.noticiasdenavarra.com/vivir/;www.noticiasdenavarra.com/',
 'https://www.diariodesevilla.es/tecnologia/;www.diariodesevilla.es/',
 'https://www.canarias7.es/cultura/;www.canarias7.es/',
 'https://www.lavozdegalicia.es/andarmiudino/;www.lavozdegalicia.es/',
 'https://www.farodevigo.es/medio-ambiente/;www.farodevigo.es/',
 'https://www.hoy.es/economia/;www.hoy.es/',
 'https://www.diariocordoba.com/tendencias21/;www.diariocordoba.com/',
 'https://www.informacion.es/salud/;www.informacion.es/',
 'https://www.diariocordoba.com/deportes/;www.diariocordoba.com/',
 'https://www.diariocordoba.com/buzzeando/;www.diariocordoba.com/',
 'https://www.informacion.es/deportes/;www.informacion.es/',
 'https://www.diariosur.es/xlsemanal/;www.diariosur.es/',
 'https://www.huffingtonpost.es/economia/;www.huffingtonpost.es/',
 'https://www.canarias7.es/vivir/;www.canarias7.es/',
 'https://www.hoy.es/deportes/;www.hoy.es/',
 'https://www.

In [199]:
CURRENT_DATE, CURRENT_TIME = str(datetime.today()).split(" ")
CURRENT_DATE, CURRENT_TIME

('2023-08-22', '20:42:25.856657')

In [200]:
CURRENT_TIME.split(" ")[-1].split(".")[0].replace(":", "-")

'20-42-25'

In [190]:
def test1():
    temp = final_section_urls.loc[final_section_urls["section"].str.contains("https")]
    return temp.loc[:, "media"] + ";" + temp.loc[:, "section"]
def test2():
    return  final_section_urls.loc[final_section_urls["section"].str.contains("https"), "section"] + ";" + final_section_urls.loc[final_section_urls["section"].str.contains("https"), "media"]

In [191]:
%timeit test1()
%timeit test2()

556 µs ± 8.21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
749 µs ± 6.83 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Word similarity analysis

In [50]:
words = media_nodes.tolist()

In [166]:
processed = []
scores = {}
for w1 in tqdm.tqdm(words):
    processed.append(w1)
    for w2 in words:
        if w2 not in processed:
            scores[(w1, w2)] = edit_distance(w1, w2)
sorted_scores = sorted(scores.items(), key=lambda k: (k[0][0], k[1]), reverse=False)
sorted_scores = dict(sorted_scores)
sorted_scores

100%|████████████████████████████████████████████████████████████████████████████████| 722/722 [00:16<00:00, 44.00it/s]


{('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ribera-alta-del-ebro'): 80,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-carinena'): 81,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-belchite'): 83,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-del-campo-de-borja'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-ejea-y-sus-pueblos'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'la-cronica-de-valdejalon'): 85,
 ('Denuncian que Ayuso y Almeida han falsificado documentos en las obras de la estación de Madrid Río',
  'agricultura-medio-am

In [67]:
dict(filter(lambda k: k[1] < 3, sorted_scores.items()))

{('alicante', 'alacanti'): 2,
 ('alsol', 'alcoy'): 2,
 ('amarina', 'marina'): 1,
 ('asturianu', 'asturianos'): 2,
 ('asturias', 'asturianu'): 2,
 ('asturias', 'asturianos'): 2,
 ('cadiz', 'cadizcf'): 2,
 ('cine', 'life'): 2,
 ('concurso', 'concursos'): 1,
 ('cultura', 'culturas'): 1,
 ('eltiempo', 'tiempo'): 2,
 ('es', 'yes'): 1,
 ('local', 'social'): 2,
 ('local', 'global'): 2,
 ('lugo', 'vigo'): 2,
 ('lugo', 'cdlugo'): 2,
 ('malaga', 'alava'): 2,
 ('marina', 'merida'): 2,
 ('motor', 'motogp'): 2,
 ('planeta', 'llanera'): 2,
 ('rocio', 'ocio'): 1,
 ('rocio', 'elrocio'): 2,
 ('sevilla', 'sevillafc'): 2,
 ('siero', 'sierra'): 2,
 ('social', 'epsocial'): 2,
 ('tendencias21', 'tendencias'): 2,
 ('viajar', 'viajes'): 2,
 ('viajes', 'virales'): 2,
 ('vida', 'vital'): 2,
 ('vida', 'elda'): 2,
 ('vigo', 'vida'): 2}

### Analysis of invalid urls

In [97]:
with open("../data/No valid urls_8.txt", "r") as file:
    data = pd.read_table(file, 
                         header=None,
                        ).to_series()
data.rename(columns={0: "url"}, inplace=True)

AttributeError: 'DataFrame' object has no attribute 'to_series'

In [107]:
filter_farodevigo = data.url.str.contains("farodevigo")
data_farodevigo = data[filter_farodevigo].replace(";(.*)", "", regex=True)
data_farodevigo.values

array([['https://www.farodevigo.es/tags/baloncesto/'],
       ['https://www.farodevigo.es/autores/javier-puga-llopis.html'],
       ['https://www.farodevigo.es/autores/jorge-garnelo.html'],
       ['https://www.farodevigo.es/promociones/'],
       ['https://www.farodevigo.es/autores/jose-manuel-otero-lastres.html'],
       ['https://www.farodevigo.es/autores/alfonso-rueda-valenzuela.html'],
       ['https://www.farodevigo.es/autores/alfonso-gonzalez-jerez.html'],
       ['https://www.farodevigo.es/vida-y-estilo/decoracion/'],
       ['https://www.farodevigo.es/deportes/'],
       ['https://www.farodevigo.es/motor/industria/'],
       ['https://www.farodevigo.es/conozcanos/'],
       ['https://www.farodevigo.es/autores/alfonso-lono.html'],
       ['https://www.farodevigo.es/autores/j-fraiz.html'],
       ['https://www.farodevigo.es/escola-en-camino/'],
       ['https://www.farodevigo.es/autores/pilar-garces.html'],
       ['https://www.farodevigo.es/autores/eduardo-martinez-de-la-fe.htm

In [113]:
current_date, current_time = str(datetime.today()).split(" ")
current_time = current_time.split(" ")[-1].split(".")[0].replace(":", "-")
current_time

'17-34-26'

In [114]:
if not os.path.exists("./scrapping statistics"):
    os.makedirs("./scrapping statistics")
if not os.path.exists(os.path.join("./scrapping statistics", f"processes_{current_date}")):
    os.makedirs(os.path.join("./scrapping_statistics", f"processes_{current_date}"))

In [119]:
pid = "0"
stats_path = os.path.join("scrapping statistics", 
                          f"processes_{current_date}", 
                          f"process_{current_date}_{current_time}_pid_{pid}.csv",
                          
                          )
stats_path

'scrapping statistics\\processes_2023-08-22\\process_2023-08-22_17-34-26_0.csv'

In [151]:
if not os.path.exists(os.path.join("./scrapping stats", f"processes_{current_date}")):
    os.makedirs(os.path.join("scrapping stats", f"processes_{current_date}"))
else:
    print("exists 2")

exists 2


In [145]:
os.listdir()

['.ipynb_checkpoints',
 'Analysis of audited media ojdinteractiva.ipynb',
 'data',
 'db.sqlite3',
 'development and utest_generalizable_scrapper.ipynb',
 'Errors log.txt',
 'Extraction of audited media scimago media.ipynb',
 'Extraction of audited media.ipynb',
 'Medias url preprocessing before scrapping and loading.ipynb',
 'News scrapper and loader_general http extraction with Selenium.py',
 'obsolete',
 'registro de medios sin json disponible en codigo html.txt',
 'Scrapper and loader test.py',
 'Scrapper and loader.py',
 'scrapping_stats',
 'Skipped urls.txt',
 'Unit tests.ipynb',
 'URL cross-comparation.ipynb',
 'utilities.py',
 'Wrappers.ipynb',
 '__pycache__']

In [144]:
os.path.exists(os.path.join("scrapping stats", f"processes_{current_date}"))

False