In [2]:
import pandas as pd
import numpy as np
import os
import re
from matplotlib import pyplot as plt
import xml
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests

In [3]:
# Download the page at `url` and parse the response body as HTML.
url = "https://www.ojdinteractiva.es/medios-digitales#"
# Without a timeout, requests waits indefinitely on an unresponsive server;
# 30 seconds is an arbitrary but generous upper bound.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page downstream.
response.raise_for_status()
parsed_html = BeautifulSoup(response.content, "html.parser")

def find_data_rows(tag):
    """Return True when ``tag`` is a ``tr`` element that has a ``data-offer`` attribute.

    The function only reads ``tag.name`` and ``tag.attrs``; it is passed as the
    filter callable to ``find_all`` when the table rows are collected.
    """
    if tag.name != "tr":
        return False
    return "data-offer" in tag.attrs
# Only the last table with class "medios" is searched for data rows.
media_tables = parsed_html.find_all("table", attrs={"class": "medios"})
tags_with_data = media_tables[-1].find_all(find_data_rows)

data = []
for row_tag in tags_with_data:
    # Join the row's text pieces with newlines, collapse runs of newlines
    # into one, then split the result into a list of fragments.
    fragments = re.sub("\n+", "\n", row_tag.get_text("\n")).split("\n")
    # Rows with at most 12 fragments skip the first 2, longer rows skip the
    # first 4; the last fragment is always discarded.
    # NOTE(review): presumably longer rows carry extra leading cells — confirm
    # against the page markup.
    n_leading = 2 if len(fragments) <= 12 else 4
    data.append(fragments[n_leading:-1])
# Column labels for the scraped rows, listed in the positional order in which
# they are assigned to the row values when the DataFrame is built.
keys = (
    "name", "category", "sub_category",
    "daily_unique_ips_avg", "total_unique_ips", "variation",
    "sequences_of_visits", "visit_duration_avg", "pages",
)

In [12]:
# Build the table from the scraped rows, leave out the "variation" column and
# turn the "--" placeholder into NaN so it is treated as a missing value.
traffic = (
    pd.DataFrame(data, columns=keys)
    .drop("variation", axis=1)
    .replace("--", np.nan)
)

In [14]:
# Columns whose values arrive as text containing "." and "," separators.
numeric_cols = ["daily_unique_ips_avg", "total_unique_ips", "sequences_of_visits", "pages"]

# Remove every "." and turn "," into "." so the text can be cast to float.
# DataFrame.replace processes the whole selection in one call; the former
# apply(..., axis=1) produced the same values but invoked the lambda once per
# row (its argument was a row, despite being named `col`).
traffic[numeric_cols] = (
    traffic[numeric_cols]
    .replace(["[.]", "[,]"], ["", "."], regex=True)
    .astype(float)
)

# Split "<hours>:<minutes>:<seconds>" strings into three float columns; values
# that do not match the pattern (including NaN) yield NaN in every part.
visit_duration_splits = (
    traffic["visit_duration_avg"]
    .str.extract("(?P<hours>[0-9]+):(?P<minutes>[0-9]+):(?P<seconds>[0-9]+)")
    .astype(float)
)

# The hours part is not kept as a column of its own. Fold it into the minutes
# so a duration of one hour or more is not silently truncated; when hours is 0
# the result equals the plain minutes value.
total_minutes = visit_duration_splits["hours"] * 60 + visit_duration_splits["minutes"]

traffic = pd.concat(
    [
        traffic.drop("visit_duration_avg", axis=1),
        total_minutes.rename("minutes"),
        visit_duration_splits["seconds"],
    ],
    axis=1,
)

In [15]:
# Show the cleaned traffic table as this cell's output.
traffic

Unnamed: 0,name,category,sub_category,daily_unique_ips_avg,total_unique_ips,sequences_of_visits,pages,minutes,seconds
0,7DIESACTUALITAT.COM,Noticias e Información,Noticias globales y actualidad,434.0,9303.0,17193.0,28929.0,1.0,39.0
1,7TELEVALENCIA.COM,Noticias e Información,Noticias globales y actualidad,7682.0,182966.0,251828.0,312441.0,1.0,4.0
2,7TVANDALUCIA.ES,Entretenimiento,Broadcast,1643.0,37572.0,63859.0,180869.0,1.0,25.0
3,ACTUALIDADRIOJABAJA.COM,Noticias e Información,Noticias globales y actualidad,2510.0,59854.0,84214.0,106288.0,0.0,38.0
4,ACTUALITATVALENCIANA.COM,Noticias e Información,Noticias globales y actualidad,3854.0,98610.0,127268.0,293923.0,1.0,8.0
...,...,...,...,...,...,...,...,...,...
558,XCATALUNYA.CAT,Noticias e Información,Noticias globales y actualidad,15658.0,239224.0,549739.0,646193.0,0.0,37.0
559,XTRADIO.ES,Entretenimiento,Broadcast,56.0,1331.0,1965.0,3589.0,1.0,39.0
560,ZA49.ES,Noticias e Información,Noticias globales y actualidad,4862.0,94374.0,162722.0,217017.0,0.0,33.0
561,ZAMORA3PUNTO0.COM,Noticias e Información,Noticias globales y actualidad,1708.0,46142.0,55974.0,94094.0,1.0,8.0


In [18]:
from datetime import datetime

# Timestamp used in the file name: date, a space, then the time with "_" in
# place of ":" (same text the previous string slicing produced).
datetime_str = datetime.today().strftime("%Y-%m-%d %H_%M_%S")
file_name = f"../data/spain_audience_total_traffic_{datetime_str}.csv"
# Create the target folder when it is missing; otherwise to_csv raises OSError.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
traffic.to_csv(file_name)
print(f"Saved at {datetime_str}")

Saved at 2023-07-09 11_08_51


In [19]:
# Preview the first rows of the traffic table.
traffic.head()

Unnamed: 0,name,category,sub_category,daily_unique_ips_avg,total_unique_ips,sequences_of_visits,pages,minutes,seconds
0,7DIESACTUALITAT.COM,Noticias e Información,Noticias globales y actualidad,434.0,9303.0,17193.0,28929.0,1.0,39.0
1,7TELEVALENCIA.COM,Noticias e Información,Noticias globales y actualidad,7682.0,182966.0,251828.0,312441.0,1.0,4.0
2,7TVANDALUCIA.ES,Entretenimiento,Broadcast,1643.0,37572.0,63859.0,180869.0,1.0,25.0
3,ACTUALIDADRIOJABAJA.COM,Noticias e Información,Noticias globales y actualidad,2510.0,59854.0,84214.0,106288.0,0.0,38.0
4,ACTUALITATVALENCIANA.COM,Noticias e Información,Noticias globales y actualidad,3854.0,98610.0,127268.0,293923.0,1.0,8.0
