In [4]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download category page
url = "https://en.wikisource.org/wiki/Category:German_speeches"

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0 Safari/537.36")}

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the
# category listing.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
# Extract all <li><a> links
# The links of interest live inside the <div id="mw-pages"> container.
pages_div = soup.find("div", id="mw-pages")
if pages_div is None:
    # soup.find returns None when nothing matches; raise a readable error
    # instead of the AttributeError that .select on None would produce.
    raise ValueError('No <div id="mw-pages"> element found in the downloaded page')

# Only anchors that are direct children of a list item are selected.
# NOTE(review): this collects the links of this single page only; if the
# category listing spans several pages, the follow-up pages are not visited.
links = pages_div.select("li > a")

# Prefix used to turn site-relative hrefs into full URLs.
base = "https://en.wikisource.org"

records = []

In [6]:
# Build structured rows
# Matches a parenthesised group at the very end of the title that contains a
# four-digit number (e.g. "... (1 September 1939)"); group 1 is the text
# between the parentheses.
DATE_IN_TITLE = re.compile(r"\(([^()]*\d{4}[^()]*)\)$")


def extract_date(title):
    """Return the text of a trailing "(... YYYY ...)" group in ``title``.

    Returns ``None`` when the title does not end with such a group.
    """
    match = DATE_IN_TITLE.search(title)
    return match.group(1) if match else None


# Start from an empty list every time so that re-running this cell does not
# append duplicate rows to a list left over from an earlier run.
records = []
for link in links:
    title = link.get_text()
    href = link.get("href", "")
    records.append({
        "title": title,
        "date": extract_date(title),
        # urljoin resolves site-relative ("/wiki/...") and protocol-relative
        # ("//host/...") hrefs against base and leaves absolute URLs as they
        # are; an empty href stays empty.
        "URL": urljoin(base, href) if href else href})

In [7]:
# Turn the collected row dicts into a DataFrame whose index is labelled "index"
df = pd.DataFrame(records).rename_axis("index")
df

Unnamed: 0_level_0,title,date,URL
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Adolf Hitler Explains His Reasons for Invading...,,https://en.wikisource.org/wiki/Adolf_Hitler_Ex...
1,Adolf Hitler's Address at the Opening of the W...,30 September 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...
2,Adolf Hitler's Address to the Reichstag (1 Sep...,1 September 1939,https://en.wikisource.org/wiki/Adolf_Hitler%27...
3,Adolf Hitler's Address to the Reichstag (26 Ap...,26 April 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...
4,Adolf Hitler's Address to the Reichstag (4 May...,4 May 1941,https://en.wikisource.org/wiki/Adolf_Hitler%27...
5,Adolf Hitler's Address to the Wehrmacht (1 Jan...,1 January 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...
6,Adolf Hitler's Address to the Wehrmacht (1 Jan...,1 January 1943,https://en.wikisource.org/wiki/Adolf_Hitler%27...
7,Adolf Hitler's Appeal for the Fourth War Winte...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...
8,Adolf Hitler's Appeal for the Second War Winte...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...
9,Adolf Hitler's Appeal to the French People on ...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...


In [16]:
# Create function for scraping URLs
def scrape_speech(url, timeout=30):
    """Download a page and return its raw HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The response body as text (the complete HTML document, not just the
        speech text).

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status (4xx/5xx).
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/123.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Without this check an error page would be returned as if it were the
    # requested document.
    response.raise_for_status()
    return response.text

In [17]:
# Scrape URLs and add text column to dataframe
# Collect the results in a list and assign the column once, rather than
# growing the column cell by cell.
texts = []
for speech_url in df["URL"]:
    try:
        texts.append(scrape_speech(speech_url))
    except requests.RequestException:
        # Only network/HTTP failures are turned into the placeholder; a bare
        # except would also swallow KeyboardInterrupt and programming errors.
        texts.append("URL not available")
df["text"] = texts

In [18]:
# Display the DataFrame again, now including the 'text' column written by the
# scraping loop (raw HTML of each page, or the placeholder string on failure).
df

Unnamed: 0_level_0,title,date,URL,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Adolf Hitler Explains His Reasons for Invading...,,https://en.wikisource.org/wiki/Adolf_Hitler_Ex...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
1,Adolf Hitler's Address at the Opening of the W...,30 September 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
2,Adolf Hitler's Address to the Reichstag (1 Sep...,1 September 1939,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
3,Adolf Hitler's Address to the Reichstag (26 Ap...,26 April 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
4,Adolf Hitler's Address to the Reichstag (4 May...,4 May 1941,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
5,Adolf Hitler's Address to the Wehrmacht (1 Jan...,1 January 1942,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
6,Adolf Hitler's Address to the Wehrmacht (1 Jan...,1 January 1943,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
7,Adolf Hitler's Appeal for the Fourth War Winte...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
8,Adolf Hitler's Appeal for the Second War Winte...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
9,Adolf Hitler's Appeal to the French People on ...,,https://en.wikisource.org/wiki/Adolf_Hitler%27...,"<!DOCTYPE html>\n<html class=""client-nojs vect..."


In [23]:
# Write the DataFrame to a CSV file (row index omitted) and echo the file name
outfile = "german_speeches.csv"
df.to_csv(path_or_buf=outfile, index=False)
outfile

'german_speeches.csv'

In [24]:
# Show the current working directory, i.e. the folder against which the
# relative CSV file name above is resolved.
# NOTE(review): the saved output of this cell exposes an absolute local path
# (including the user name); consider clearing it before sharing the notebook.
import os
os.getcwd()

'C:\\Users\\maart\\Documents\\1 - Digital humanities\\Collecting Data\\Assignment3\\web-scraping\\code'