In [20]:
import subprocess

from huggingface_hub import HfApi

In [21]:
api = HfApi()

In [22]:
repo_files = api.list_repo_tree(
        "allenai/c4",
        repo_type="dataset",
        recursive=True,
    )

In [23]:
repo_files = list(repo_files)

In [24]:
repo_files[100].path

'en.noblocklist/c4-train.00093-of-01024.json.gz'

In [25]:
test_download_urls = [
    "https://huggingface.co/datasets/allenai/c4/resolve/main/" + i.path
    for i in repo_files[:50]
]

In [26]:
with open("test_urls.txt", "w") as f:
    for url in test_download_urls:
        f.write(url + "\n")

In [29]:
aria_command = [
    'aria2c',
    '-d', "testdl",       # Destination directory
    '-c',                     # Resume partial downloads
    '--console-log-level=warn', # Reduce verbosity
    '-j', '4',                 # Number of parallel downloads (adjust based on HPC policy/files)
    '-i', 'test_urls.txt',     # Input file with URLs
    # Add other aria2c options as needed
]

try:
    result = subprocess.run(aria_command, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    print(f"Return code: {e.returncode}")
    print(f"Stdout: {e.stdout}")
    print(f"Stderr: {e.stderr}")

Return code: 3
Stdout: 
04/09 13:46:31 [[1;31mERROR[0m] CUID#10 - Download aborted. URI=https://huggingface.co/datasets/allenai/c4/resolve/main/multilingual
Exception: [AbstractCommand.cc:351] errorCode=3 URI=https://huggingface.co/datasets/allenai/c4/resolve/main/multilingual
  -> [HttpSkipResponseCommand.cc:219] errorCode=3 Resource not found

04/09 13:46:31 [[1;31mERROR[0m] CUID#7 - Download aborted. URI=https://huggingface.co/datasets/allenai/c4/resolve/main/en.noblocklist
Exception: [AbstractCommand.cc:351] errorCode=3 URI=https://huggingface.co/datasets/allenai/c4/resolve/main/en.noblocklist
  -> [HttpSkipResponseCommand.cc:219] errorCode=3 Resource not found

04/09 13:46:31 [[1;31mERROR[0m] CUID#8 - Download aborted. URI=https://huggingface.co/datasets/allenai/c4/resolve/main/en.noclean
Exception: [AbstractCommand.cc:351] errorCode=3 URI=https://huggingface.co/datasets/allenai/c4/resolve/main/en.noclean
  -> [HttpSkipResponseCommand.cc:219] errorCode=3 Resource not found



In [30]:
from datasets import load_dataset

In [32]:
ds = load_dataset("json", data_dir="testdl")

Downloading data: 100%|██████████| 43/43 [00:00<00:00, 39421.87files/s]
Generating train split: 16519370 examples [01:34, 174478.92 examples/s]


In [33]:
import duckdb

In [34]:
con = duckdb.connect()

In [45]:
con.execute("SELECT url FROM READ_JSON('testdl/*', format = 'newline_delimited', compression='gzip') LIMIT 10").fetchall()

[('http://www.stonetalk.org/webpage/2018/07',),
 ('https://hub.awin.com/tag/family/',),
 ('http://krugerquarterhorses.com/wwwboard/messages/rx/lexapro-to-buy/',),
 ('https://www.hsiangneng.com/dc-gear-motors.htm',),
 ('https://ceswhite.wordpress.com/2014/12/22/award-winner-most-underrated-music-video/',),
 ('https://www.macworld.com/article/1150757/dvdripping-roundup.html',),
 ('https://www.shapeoftraining.co.uk/aboutus/1744.asp',),
 ('http://www.seobook.com/publicize-your-publicity-create-successful-viral-marketing-campaigns',),
 ('https://www.otherpress.com/books/the-book-of-fathers/',),
 ('http://realestate.elpasotimes.com/property/tx/houston/77019/-/3333-allen-parkway-703/5c96e52fa78e9254890000ca/',)]

In [None]:
import duckdb
from duckdb.typing import VARCHAR
import tldextract

def extract_domain(url: str) -> str:
    if url is None:
        return None
    # Use tldextract to parse the URL
    extracted = tldextract.extract(url)

    # Return the full domain information as a formatted string
    return f"{extracted.domain}.{extracted.suffix}"

# Create a connection to DuckDB
con = duckdb.connect()

# Register the function as a UDF
con.create_function(
    "extract_domain",      # Name of the UDF in SQL queries
    extract_domain,        # The Python function to use
    [VARCHAR],             # Input parameter type (URL as string)
    VARCHAR,               # Return type (domain as string)
    null_handling="special"
)

<duckdb.duckdb.DuckDBPyConnection at 0x1069d8af0>

In [6]:
con.execute("COPY (SELECT url, extract_domain(url) FROM READ_JSON('testdl/*', format = 'newline_delimited', compression='gzip')) TO 'test_out.parquet' ").fetchall()

[(14982685,)]