In [4]:
import boto3
import polars as pl
import botocore
import tarfile
import os
from io import BytesIO
from pypdf import PdfReader, PdfWriter
import gzip
from io import BytesIO
from fastwarc import ArchiveIterator
from fastwarc.stream_io import GZipStream
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse.lang import detect_fast
from resiliparse.parse.html import HTMLTree
from fastwarc.warc import is_http
from surt import surt
import tldextract
import idna
import re
from urllib.parse import urljoin, urlparse
from resiliparse.parse.encoding import detect_encoding, bytes_to_str

# S3 client used by every cell below.  Credentials come from the ASCII_*
# environment variables; os.getenv yields None for a variable that is not set.
client = boto3.client(
    's3',
    aws_secret_access_key=os.getenv('ASCII_AWS_SECRET_ACCESS_KEY'),
    aws_access_key_id=os.getenv('ASCII_AWS_ACCESS_KEY_ID'),
)

  _RE_HAS_PROTOCOL = re.compile(b"^([a-zA-Z][a-zA-Z0-9\+\-\.]*):")
  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  re.compile(b"^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
  input, safe=b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~''').encode(
  _RE_WWWDIGITS = re.compile(b'www\d*\.')


In [None]:
# Object keys (relative to the `commoncrawl` bucket) of the CC-NEWS WARC files
# to process.  All of them live under the same 2025/11 prefix, so only the
# file names are spelled out.
crawls = [
    f'crawl-data/CC-NEWS/2025/11/{warc_name}'
    for warc_name in (
        'CC-NEWS-20251115220136-05256.warc.gz',
        'CC-NEWS-20251115234907-05257.warc.gz',
        'CC-NEWS-20251116014712-05258.warc.gz',
        'CC-NEWS-20251116034034-05259.warc.gz',
        'CC-NEWS-20251116054722-05260.warc.gz',
        'CC-NEWS-20251116072531-05261.warc.gz',
        'CC-NEWS-20251116085944-05262.warc.gz',
        'CC-NEWS-20251116103419-05263.warc.gz',
        'CC-NEWS-20251116120640-05264.warc.gz',
        'CC-NEWS-20251116134805-05265.warc.gz',
        'CC-NEWS-20251116152516-05266.warc.gz',
        'CC-NEWS-20251116170848-05267.warc.gz',
        'CC-NEWS-20251116190217-05268.warc.gz',
    )
]
# Fetch the CC-NEWS index page object; the body is streamed lazily, so nothing
# is downloaded until `response['Body']` is read.
key = 'crawl-data/CC-NEWS/index.html'
response = client.get_object(
    Key=key,
    Bucket='commoncrawl',
)

In [None]:
# Gzip-compressed object under the CC-NEWS 2025/11 prefix (judging by its
# name, the listing of WARC paths for that month).
key_gz = 'crawl-data/CC-NEWS/2025/11/warc.paths.gz'

response = client.get_object(Key=key_gz, Bucket='commoncrawl')

# Download the whole object, then inflate and decode it in memory.
gzipped_body_bytes = response['Body'].read()
decompressed_bytes = gzip.decompress(gzipped_body_bytes)
content_str = str(decompressed_bytes, 'utf-8')

print(content_str)

In [None]:
# Dotted-quad IPv4 literal, optionally prefixed with "www.".
ip_pattern = re.compile(
    r"^(?:www\.)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\Z"
)

# One host-name label: 1-63 ASCII characters, starting and ending with a
# letter or digit, with letters, digits, "_" and "-" allowed in between.
host_part_pattern = re.compile(
    r"^[a-z0-9]([a-z0-9_-]{0,61}[a-z0-9])?\Z",
    flags=re.ASCII | re.IGNORECASE,
)


In [None]:
def get_surt_host(url):  # noqa: C901
    """Return the registered domain of ``url`` with its labels reversed.

    The host is reduced to its registered domain with ``tldextract`` (private
    public-suffix-list entries included), lower-cased, validated label by
    label and finally reversed, so ``https://www.example.com/page`` becomes
    ``com.example``.  Labels that are not plain ASCII host labels are
    converted with ``idna.encode`` before validation.

    Parameters
    ----------
    url : str
        URL to extract the host from.

    Returns
    -------
    str or None
        The reversed, dot-joined host labels, or ``None`` when no acceptable
        host can be derived: no host at all, total length outside 1..253,
        an IPv4 literal, a single-label host, a label longer than 63
        characters, or a label that is invalid even after IDNA encoding.
    """
    extracted = tldextract.extract(url, include_psl_private_domains=True)
    host = extracted.top_domain_under_public_suffix

    if not host:
        # No recognised public suffix: fall back to the host labels that
        # tldextract did find.  Empty pieces are skipped so the result never
        # starts with a stray "." and is genuinely empty when nothing was
        # found, which lets the urlparse fallback below actually run.
        host = ".".join(
            piece for piece in (extracted.subdomain, extracted.domain) if piece
        )
    if not host:
        try:
            # Fallback to urlparse if tldextract fails
            host = urlparse(url).hostname
        except Exception as e:
            print(f"Failed to parse URL {url}: {e}")
            return None
        if not host:
            return None

    host = host.strip().lower()
    if len(host) < 1 or len(host) > 253:
        return None
    if ip_pattern.match(host):
        return None
    parts = host.split(".")
    if parts[-1] == "":
        # trailing dot is allowed, strip it
        parts = parts[0:-1]
    if len(parts) <= 1:
        # do not accept single-word hosts, must be at least `domain.tld'
        return None
    if len(parts) > 2 and parts[0] == "www":
        # strip leading 'www' to reduce number of "duplicate" hosts,
        # but leave at least 2 trailing parts (www.com is a valid domain)
        parts = parts[1:]
    for i, part in enumerate(parts):
        if len(part) > 63:
            return None
        if not host_part_pattern.match(part):
            try:
                idn = idna.encode(part).decode("ascii")
            except Exception:
                # Best effort: whatever idna raises for this label
                # (IDNAError, UnicodeError, ...), the host is unusable.
                print("Invalid host name: {}".format(url))
                return None

            # TODO: idna verifies the resulting string for length restrictions or invalid chars,
            #       maybe no further verification is required:
            if host_part_pattern.match(idn):
                parts[i] = idn
            else:
                print("Invalid host name: {}".format(url))
                return None
    parts.reverse()
    return ".".join(parts)

In [None]:
from fastwarc.warc import ArchiveIterator, WarcRecordType

# Extract one row per HTTP response record from every WARC file in `crawls`
# and write each file's rows to the partitioned parquet dataset.
# `tmp` keeps the rows of all files so that later cells can inspect them.
tmp = []
for warc_key in crawls:
    # Keys look like `crawl-data/CC-NEWS/<year>/<month>/<file>`; the last
    # three components become the partition columns.
    year, month, path = warc_key.split('/')[-3:]

    # Every WARC file is its own S3 object and needs its own gzip stream.
    s3_response = client.get_object(Bucket='commoncrawl', Key=warc_key)
    s3_stream = s3_response['Body']
    file_records = []
    try:
        stream = GZipStream(s3_stream)
        for record in ArchiveIterator(stream, record_types=WarcRecordType.response, func_filter=is_http):
            uri = record.headers.get('WARC-Target-URI')
            body_bytes = record.reader.read()
            html = bytes_to_str(body_bytes, detect_encoding(body_bytes))
            text = extract_plain_text(html)

            # Up to three (language, score) candidates for the page text.
            r = detect_fast(text, n_results=3)
            langs = [candidate[0] for candidate in r]
            confs = [candidate[1] for candidate in r]

            file_records.append({
                'uri': uri,
                'tree': html,
                'text': text,
                # Guard against an empty candidate list instead of raising
                # IndexError in the middle of a file.
                'main_lang': langs[0] if langs else 'unknown',
                'langs': langs,
                'confs': confs,
                'http_date': record.http_date,
                'http_last_modified': record.http_last_modified,
                'http_charset': record.http_charset,
                'surt_uri': surt(uri),
                'host': get_surt_host(uri),
            })
    finally:
        # Release the HTTP connection even if parsing fails half-way.
        s3_stream.close()

    tmp.extend(file_records)
    if not file_records:
        # Nothing to write for this file (an empty list cannot be turned
        # into a DataFrame).
        continue

    # Only this file's rows are written, so earlier files are not duplicated
    # under the current file's partition.  The schema is inferred from all
    # rows so that a column which is None in the first rows is still typed.
    pl.from_dicts(file_records, infer_schema_length=None).with_columns(
        pl.lit(path).alias('path'),
        pl.lit(year).alias('year'),
        pl.lit(month).alias('month'),
    ).write_parquet('/data/raid5/data/picatto/ascii/news/', partition_by=['year', 'month', 'path', 'main_lang'])

In [None]:
import os

# Quick look at what the relative path '../../../data' resolves to from the
# notebook's working directory.
os.listdir(path='../../../data')

['pixi.toml',
 '.gitignore',
 '.gitattributes',
 '.git',
 'src',
 'docs',
 '.pixi',
 'pixi.lock',
 'data']

In [None]:
# Build a DataFrame from the extracted rows collected in `tmp` and preview
# its first five rows.
tmp_df = pl.from_dicts(tmp)
tmp_df.head(5)