In [2]:
import os
import requests
import numpy as np
from pathlib import Path
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO


In [4]:
# Root folder of the plaintext corpus; the download cells build their output
# paths from this value.
base_dir = "data/plaintext"

# One subfolder per kind of sample. exist_ok=True makes re-running this cell
# harmless when the folders are already there.
for subfolder in ("text", "code", "raw_images"):
    target_dir = base_dir + "/" + subfolder
    os.makedirs(target_dir, exist_ok=True)

In [10]:
# Wikipedia article titles (as they appear in the URL) whose paragraph text
# is saved as the plain-text samples.
wiki_pages = [
    "Cryptography",
    "Machine_learning",
    "Computer_security",
    "Information_theory",
    "Operating_system"
]

# Request headers sent with each page fetch; the image-download cell reuses
# this dict as well.
headers = {
    "User-Agent": "Mozilla/5.0"
}

for title in wiki_pages:
    url = f"https://en.wikipedia.org/wiki/{title}"
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on a 4xx/5xx answer: without this check the body of an
    # error page would be parsed and saved as if it were the article.
    response.raise_for_status()
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    # Keep only the text of the <p> elements, one paragraph per line.
    text = "\n".join(p.get_text() for p in soup.find_all("p"))
    print(f"length of {title}: {len(text)} characters")
    with open(f"{base_dir}/text/{title}.txt", "w", encoding="utf-8") as f:
        f.write(text)

length of Cryptography: 52190 characters
length of Machine_learning: 59239 characters
length of Computer_security: 72979 characters
length of Information_theory: 29527 characters
length of Operating_system: 34528 characters


In [11]:
# Local file name -> raw download URL for each source-code sample.
# NOTE(review): every URL points at a branch head (master/main), not a pinned
# commit, so the downloaded bytes may differ between runs.
code_sources = {
    "aes.py": "https://raw.githubusercontent.com/bozhu/AES-Python/master/aes.py",
    "sha256.c": "https://raw.githubusercontent.com/B-Con/crypto-algorithms/master/sha256.c",
    "base64.py": "https://raw.githubusercontent.com/python/cpython/main/Lib/base64.py",
    "md5.c": "https://raw.githubusercontent.com/B-Con/crypto-algorithms/master/md5.c",
    "random.py": "https://raw.githubusercontent.com/python/cpython/main/Lib/random.py"
}

for name, url in code_sources.items():
    r = requests.get(url, timeout=10)
    # Stop on a 4xx/5xx answer instead of writing the error body to disk
    # under the source file's name.
    r.raise_for_status()
    print(f"length of {name}: {len(r.content)} bytes")
    # Written in binary mode so the bytes are stored exactly as received.
    with open(f"{base_dir}/code/{name}", "wb") as f:
        f.write(r.content)

length of aes.py: 9287 bytes
length of sha256.c: 5263 bytes
length of base64.py: 22057 bytes
length of md5.c: 5832 bytes
length of random.py: 37260 bytes


In [None]:
# Images to download, decode to RGB and dump as raw pixel bytes.
image_urls = [
    "https://upload.wikimedia.org/wikipedia/commons/3/3f/Fronalpstock_big.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/9/99/Black_square.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/d/d4/Wikimedia-servers-Sept04.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/5/50/Vd-Orig.png",
    "https://upload.wikimedia.org/wikipedia/commons/a/a9/Example.jpg"
]
# Number of images written so far. It doubles as the index in the next output
# file name, so saved files are numbered 0, 1, 2, ... without gaps.
saved = 0

for url in image_urls:
    # Downloading is best-effort: any URL that cannot be fetched or decoded is
    # skipped, but the reason is printed so a short count can be explained.
    try:
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Skipping {url}: request failed ({exc})")
        continue

    if r.status_code != 200:
        print(f"Skipping {url}: HTTP status {r.status_code}")
        continue

    content_type = r.headers.get("Content-Type", "")
    if not content_type.startswith("image"):
        print(f"Skipping {url}: unexpected Content-Type {content_type!r}")
        continue

    try:
        img = Image.open(BytesIO(r.content)).convert("RGB")
    except Exception as exc:
        print(f"Skipping {url}: could not decode image ({exc})")
        continue

    print(f"Saving image {saved} from {url} with size {img.size}")
    arr = np.array(img, dtype=np.uint8)
    # tofile() writes only the array's bytes, with no header: the image size
    # printed above is not stored in the .rgb file.
    arr.tofile(f"{base_dir}/raw_images/img_{saved}.rgb")
    saved += 1

saved

Saving image 0 from https://upload.wikimedia.org/wikipedia/commons/3/3f/Fronalpstock_big.jpg with size (10109, 4542)
Saving image 1 from https://upload.wikimedia.org/wikipedia/commons/9/99/Black_square.jpg with size (360, 360)
Saving image 2 from https://upload.wikimedia.org/wikipedia/commons/d/d4/Wikimedia-servers-Sept04.jpg with size (450, 600)
Saving image 3 from https://upload.wikimedia.org/wikipedia/commons/5/50/Vd-Orig.png with size (100, 100)
Saving image 4 from https://upload.wikimedia.org/wikipedia/commons/a/a9/Example.jpg with size (172, 178)


5

In [18]:
# Sanity check: print how many entries each data folder holds.
for d in ["text", "code", "csv", "raw_images"]:
    folder = Path(f"{base_dir}/{d}")
    # The directory-setup loop visible in this notebook creates only text,
    # code and raw_images, so "csv" may not exist on a fresh run. Report that
    # instead of letting iterdir() raise FileNotFoundError.
    if folder.is_dir():
        print(d, len(list(folder.iterdir())))
    else:
        print(d, "missing")

text 5
code 5
csv 5
raw_images 5
