In [1]:
# local files
import variables as var

# libraries
import requests
import csv
import json
import hashlib
import os
import pandas as pd

import subprocess
# Report versions as seen by the kernel that runs this notebook. Shelling out
# to `pip` may inspect a different interpreter than the kernel's, costs one
# subprocess per library, and knows nothing about standard-library modules.
import sys
from importlib import metadata

print("\nLibrary versions from pip:")
libraries = ['requests', 'csv', 'json', 'hashlib', 'os', 'pandas']
python_version = sys.version.split()[0]
# Only available on Python 3.10+; fall back to an empty set on older kernels.
stdlib_names = getattr(sys, "stdlib_module_names", frozenset())
for lib in libraries:
    try:
        version_info = f"Version: {metadata.version(lib)}"
    except metadata.PackageNotFoundError:
        # No distribution metadata: the module either ships with the
        # interpreter (csv, json, hashlib, os) or is not installed at all.
        if lib in stdlib_names:
            version_info = f"standard library (Python {python_version})"
        else:
            version_info = "no installed distribution found (standard-library module or not installed)"
    print(f"{lib}: {version_info}")


Library versions from pip:
requests: Version: 2.32.3
csv: None
json: None
hashlib: None
os: None
pandas: Version: 2.2.3


In [2]:
def short_hash(text, length=4):
    """
    Generate a shortened MD5 hash of the given text.

    The hash is used only to derive short, stable identifiers; MD5 is not
    suitable for anything security-related.

    Args:
        text (str): The input text to hash (encoded as UTF-8 before hashing).
        length (int): The number of leading hex characters to return.

    Returns:
        str: The first `length` characters of the hexadecimal MD5 digest of `text`.

    Raises:
        ValueError: If `length` is negative or greater than the length of the
            full hex digest (32 characters).
    """
    digest = hashlib.md5(text.encode('utf-8')).hexdigest()
    # Slicing would silently accept out-of-range values (a negative length
    # trims from the end instead), so validate explicitly.
    if not 0 <= length <= len(digest):
        raise ValueError(
            f"length must be between 0 and {len(digest)}, got {length}"
        )
    return digest[:length]

In [3]:
def process_apirequest(api, savename, timeout=300):
    """
    Fetch JSON-lines data from an API endpoint and save the 'timestamp' and 'url' fields as CSV.

    The response body is read line by line; each non-blank line is parsed as a
    JSON object. Lines that are not valid JSON objects are reported and skipped.

    Args:
        api (str): The API endpoint URL to fetch data from.
        savename (str): Output path without extension; the data is written to
            '{savename}.csv'.
        timeout (float or tuple): Timeout in seconds passed to `requests.get`.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
        OSError: If the output file cannot be written.

    Writes:
        A UTF-8 CSV file '{savename}.csv' with the following columns:
        - 'timestamp': The timestamp from the JSON data ('' if absent).
        - 'url': The URL from the JSON data ('' if absent).
        The file only appears once it is complete: rows are first written to
        '{savename}.csv.part', which is then renamed.
    """
    response = requests.get(api, timeout=timeout)
    # Fail before touching the disk: an error page must not become a
    # header-only CSV that later runs mistake for a finished download.
    response.raise_for_status()

    final_path = f'{savename}.csv'
    # A failed or interrupted run leaves only this '.part' file behind; it does
    # not end in '.csv' and is overwritten on the next attempt.
    partial_path = f'{final_path}.part'
    with open(partial_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['timestamp', 'url'])
        for line in response.text.splitlines():
            if not line.strip():
                continue
            try:
                # Parse each JSON line individually
                json_obj = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping malformed JSON: {line}")
                continue
            if not isinstance(json_obj, dict):
                # Valid JSON but not an object (e.g. a list), so no fields to read.
                print(f"Skipping malformed JSON: {line}")
                continue
            csv_writer.writerow([json_obj.get('timestamp', ''), json_obj.get('url', '')])
    os.replace(partial_path, final_path)

def do_api_requests(link, source):
    """
    Perform API requests to retrieve archived URLs and timestamps from the Arquivo.pt service.

    This function constructs a list of API request URLs with different parameters and iterates over them,
    processing each request and saving the results to 'data/urls/{source}_{hash}_{i}.csv', where
    {hash} is the short hash of `link` and {i} is the position of the request in the list.
    Requests whose output file already exists are skipped, so the function can be re-run cheaply.

    Args:
        link (str): The base URL to be queried in the API requests (matched as a prefix via a trailing '*').
        source (str): A string identifier for the source of the URLs, used in naming the output files.
            It should not contain '_', because the source name is later recovered from the
            part of the file name before the first underscore.

    Returns:
        None
    """
    # A fresh checkout has no output folder yet; create it instead of failing
    # on the first write.
    os.makedirs("data/urls", exist_ok=True)

    apis_formats = [
        f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp",
        # Alternative time-windowed queries, kept for reference:
        #f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp&to={2005}&limit=1500",
        #f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp&to={2010}&limit=3000",
        #f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp&from={2010}",
        #f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp&from={2015}&limit=20000",
        #f"https://arquivo.pt/wayback/cdx?url={link}*&output=json&filter==mime:text/html&fields=url,timestamp&from={2020}&limit=10000",
    ]
    # The hash depends only on the link, so compute it once for all requests.
    link_hash = short_hash(link)
    for i, api in enumerate(apis_formats):
        savename = f"data/urls/{source}_{link_hash}_{i}"
        if os.path.isfile(f"{savename}.csv"):
            print(f"Skipping saved file {savename}.")
        else:
            process_apirequest(api, savename)
            print(f"Processed {savename}.")

In [4]:
# Walk the configured (link, source) pairs and fetch the URL listing for each;
# listings already saved on disk are skipped inside do_api_requests.
for entry in var.news_sources:
    link, source = entry
    do_api_requests(link, source)

Skipping saved file data/urls/RTP_d43a_0.
Skipping saved file data/urls/RTP_d02b_0.
Skipping saved file data/urls/RTP_0a5c_0.
Skipping saved file data/urls/RTP_e6de_0.
Skipping saved file data/urls/RTP_3710_0.
Skipping saved file data/urls/RTP_4e69_0.
Skipping saved file data/urls/RTP_ad94_0.
Skipping saved file data/urls/RTP_7bd2_0.
Skipping saved file data/urls/RTP_f020_0.
Skipping saved file data/urls/Público_1723_0.
Skipping saved file data/urls/Público_5a7a_0.
Skipping saved file data/urls/Público_bdda_0.
Skipping saved file data/urls/Público_97ae_0.
Skipping saved file data/urls/Público_6d80_0.
Skipping saved file data/urls/Público_a569_0.
Skipping saved file data/urls/Público_3e1d_0.
Skipping saved file data/urls/Público_1c93_0.
Skipping saved file data/urls/Público_0dcd_0.
Skipping saved file data/urls/Público_7a4e_0.
Skipping saved file data/urls/Público_b6ba_0.
Skipping saved file data/urls/Correio da Manhã_c8ca_0.
Skipping saved file data/urls/Correio da Manhã_d735_0.
Skippi

In [5]:
# Combine all CSV files into a single DataFrame
data_folder = 'data/urls'
dfs = []
# Iterate in sorted order: os.listdir returns entries in arbitrary order, and
# the concatenation order decides which of several tied rows survives below.
for file in sorted(os.listdir(data_folder)):
    if file.endswith('.csv'):
        # Read both columns as text. Letting pandas infer numbers would turn the
        # timestamp column into floats as soon as one field is empty, which
        # breaks the month prefix and any string built from the timestamp.
        source_df = pd.read_csv(f'{data_folder}/{file}', dtype={'timestamp': str, 'url': str})
        # File names look like '{source}_{hash}_{i}.csv'; the source is the
        # part before the first underscore.
        source_df["source"] = file.split('_')[0]
        dfs.append(source_df)
df = pd.concat(dfs)

# Remove duplicates and sort by timestamp
# (stable sort: rows with equal timestamps keep their concatenation order, so
# repeated runs over the same files give the same row order)
df = df.drop_duplicates().sort_values(by='timestamp', kind='stable')

# Filter out URLs that have been archived multiple times in the same month, after June 2010
# (timestamps start with YYYYMM; captures up to and including 2010-06 are all kept,
# later ones keep only the last capture of each URL per month)
df['timestamp_prefix'] = df['timestamp'].astype(str).str[:6]
df01 = df[df['timestamp_prefix'] <= '201006']
df02 = df[df['timestamp_prefix'] > '201006']
df02 = df02.drop_duplicates(subset=['timestamp_prefix', "url"], keep='last')
df = pd.concat([df01, df02])
df = df.drop(columns=['timestamp_prefix'])

# Add archive URLs
def get_archive(timestamp, url):
    """
    Build the Arquivo.pt replay link for one archived capture.

    Args:
        timestamp: Capture timestamp, inserted as-is into the link.
        url: The archived URL, appended as-is after the timestamp.

    Returns:
        str: 'https://arquivo.pt/noFrame/replay/{timestamp}/{url}'.
    """
    replay_template = "https://arquivo.pt/noFrame/replay/{}/{}"
    return replay_template.format(timestamp, url)
# Walk the two columns in lockstep instead of DataFrame.apply(axis=1): apply
# builds a Series object for every row, which is very slow on millions of rows
# and does not return a Series when the frame is empty. The resulting strings
# are the same.
df['archive'] = [
    get_archive(timestamp, url)
    for timestamp, url in zip(df['timestamp'], df['url'])
]

# Renumber the rows 0..n-1 and expose that position as a stable 'id' column.
df = df.reset_index(drop=True)
df['id'] = df.index
df.to_csv('data/urls.csv', index=False)

In [6]:
# Size of the combined table as (rows, columns)
print(df.shape)

# How many URLs each source contributed (avg. ~20 news/day)
df.loc[:, "source"].value_counts()

(3056418, 5)


source
Correio da Manhã      471684
RTP                   358896
Notícias ao Minuto    325957
SAPO                  310735
SIC Notícias          293420
Público               261402
Expresso              157668
Jornal de Negócios    139033
IOL                   135812
Observador            111529
Record                 96406
AEIOU                  81705
NiT                    79326
Lusa                   75907
O Mirante              55556
Dinheiro Vivo          49116
Diário de Notícias     16317
Jornal de Notícias     14424
CNN Portugal           12827
TSF                     8698
Name: count, dtype: int64