### Start spark session

In [3]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("GetData")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.262"
    )
    .config(
        "spark.hadoop.fs.s3a.impl",
        "org.apache.hadoop.fs.s3a.S3AFileSystem"
    )
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
    )
    .getOrCreate()
)


### Get all urls from gdelt metadata and store in dataframe

In [36]:
from pyspark.sql.functions import col

df = spark.read.option("multiLine", True).json("./data/bquxjob_4cd3ca9c_19b553d5fe2.json")
df = df.select(col("URL"))
df = df.withColumnRenamed("URL", "url")
df.show()
print(df.count())


+--------------------+
|                 url|
+--------------------+
|https://www.wthit...|
|https://www.nytim...|
|https://news.yaho...|
|https://www.slash...|
|https://www.jdsup...|
|https://japantoda...|
|https://bgr.com/t...|
|https://techmoran...|
|https://gazette.c...|
|https://www.prwee...|
|https://biztoc.co...|
|https://www.engad...|
|https://nypost.co...|
|https://gizmodo.c...|
|https://www.sott....|
|https://www.bnnbl...|
|https://www.times...|
|https://www.metro...|
|https://www.popsc...|
|https://www.singa...|
+--------------------+
only showing top 20 rows

3415


### Find common crawl indexes for each url

In [38]:
import requests

indexes  = [
  # 'CC-MAIN-2024-10',
  # 'CC-MAIN-2024-18',
  # 'CC-MAIN-2024-22',
  # 'CC-MAIN-2024-26',
  # 'CC-MAIN-2024-30',
  # 'CC-MAIN-2024-33',
  # 'CC-MAIN-2024-38',
  # 'CC-MAIN-2024-42',
  'CC-MAIN-2025-05',
  'CC-MAIN-2025-08',
  'CC-MAIN-2025-13',
  'CC-MAIN-2025-18',
  'CC-MAIN-2025-21',
  'CC-MAIN-2025-26',
  'CC-MAIN-2025-30',
  'CC-MAIN-2025-33',
  'CC-MAIN-2025-38',
  'CC-MAIN-2025-43',
  'CC-MAIN-2025-47',
  'CC-MAIN-2025-51'
]

In [27]:
headers = {"User-Agent": "Mozilla/5.0 (compatible; CommonCrawlFetcher/1.0)"}

In [28]:
from urllib.parse import quote_plus

def find_url_in_indexes(url):
  encoded_url = quote_plus(url)
  for index in indexes:
    print("trying", index)
    api_url = f"https://index.commoncrawl.org/{index}-index?url={encoded_url}&output=json"
    try:
      response = requests.get(api_url, headers=headers, timeout=10)
      if response.status_code == 200 and response.text.strip():
        first_line = response.text.strip().splitlines()[0]
        import json
        record = json.loads(first_line)
        return {
            "filename": record["filename"],
            "offset": int(record["offset"]),
            "length": int(record["length"])
        }, index
    except requests.RequestException:
      continue
  return None, None

### Fetch warc content

In [29]:
import gzip
import io

def fetch_warc_record(filename, offset, length):
    url = f"https://data.commoncrawl.org/{filename}"
    headers_range = {
        "Range": f"bytes={offset}-{offset + length - 1}",
        "User-Agent": headers["User-Agent"]
    }

    response = requests.get(url, headers=headers_range, timeout=10)
    response.raise_for_status()

    # Decompress WARC in memory
    with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz:
        warc_record = gz.read().decode("utf-8", errors="replace")
    
    return warc_record


In [30]:
def extract_html_from_warc(warc_record):
    # WARC headers + HTTP headers end with a double newline
    split_marker = "\r\n\r\n"
    parts = warc_record.split(split_marker, 2)
    if len(parts) == 3:
        html = parts[2]
        return html
    return None


### Save in S3

In [31]:
import boto3
from botocore.exceptions import NoCredentialsError
from urllib.parse import quote

BUCKET_NAME = "bigdata-mapping-ai-project"
REGION = "eu-central-1"

s3 = boto3.client("s3", region_name=REGION)

def upload_to_s3(html, index, url):
    safe_url = quote(url, safe="")
    key = f"common-crawl/{index}/{safe_url}.html"

    try:
        s3.put_object(
            Bucket=BUCKET_NAME,
            Key=key,
            Body=html.encode("utf-8"),
            ContentType="text/html; charset=utf-8"
        )
        print(f"Uploaded s3://{BUCKET_NAME}/{key}")

    except NoCredentialsError:
        print("No AWS credentials found. Run `aws configure`.")
    except Exception as e:
        print(f"Upload failed: {e}")


### Run

In [39]:
for url in df.collect():
  print("trying: ", url)
  result, index = find_url_in_indexes(url['url'])
  if not result or not index:
    continue
  print("found index:", index)

  record = fetch_warc_record(result['filename'], result['offset'], result['length'])
  if not record:
    continue
  print("fetched content")

  html = extract_html_from_warc(record)
  if not html:
    continue
  print("extracted content")

  upload_to_s3(html, index, url['url'])

trying:  Row(url='https://www.wthitv.com/news/leading-tech-firms-pledge-to-address-election-risks-posed-by-ai/article_f1b5f219-44dd-5272-a119-561d39606374.html')
trying CC-MAIN-2025-05
trying CC-MAIN-2025-08
trying CC-MAIN-2025-13
trying CC-MAIN-2025-18
trying CC-MAIN-2025-21
trying CC-MAIN-2025-26
trying CC-MAIN-2025-30
trying CC-MAIN-2025-33
trying CC-MAIN-2025-38
trying CC-MAIN-2025-43
trying CC-MAIN-2025-47
trying CC-MAIN-2025-51
trying:  Row(url='https://www.nytimes.com/2024/02/16/technology/openai-artificial-intelligence-deal-valuation.html')
trying CC-MAIN-2025-05
trying CC-MAIN-2025-08
trying CC-MAIN-2025-13
trying CC-MAIN-2025-18
trying CC-MAIN-2025-21
trying CC-MAIN-2025-26
trying CC-MAIN-2025-30
trying CC-MAIN-2025-33
trying CC-MAIN-2025-38
trying CC-MAIN-2025-43
trying CC-MAIN-2025-47
trying CC-MAIN-2025-51
trying:  Row(url='https://news.yahoo.com/openai-valued-80-billion-deal-224312281.html')
trying CC-MAIN-2025-05
trying CC-MAIN-2025-08
trying CC-MAIN-2025-13
trying CC-MA