In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re

In [4]:


# Dictionary of countries with their possible alternative names
COUNTRY_ALIASES = {
    "Afghanistan": ["Afghanistan"],
    "Albania": ["Albania"],
    "Algeria": ["Algeria"],
    "Andorra": ["Andorra"],
    "Angola": ["Angola"],
    "Antigua and Barbuda": ["Antigua and Barbuda", "Antigua", "Barbuda"],
    "Argentina": ["Argentina"],
    "Armenia": ["Armenia"],
    "Australia": ["Australia"],
    "Austria": ["Austria"],
    "Azerbaijan": ["Azerbaijan"],
    "Bahamas": ["Bahamas", "The Bahamas"],
    "Bahrain": ["Bahrain"],
    "Bangladesh": ["Bangladesh"],
    "Barbados": ["Barbados"],
    "Belarus": ["Belarus"],
    "Belgium": ["Belgium"],
    "Belize": ["Belize"],
    "Benin": ["Benin"],
    "Bhutan": ["Bhutan"],
    "Bolivia": ["Bolivia", "Plurinational State of Bolivia"],
    "Bosnia and Herzegovina": ["Bosnia and Herzegovina", "Bosnia", "Herzegovina"],
    "Botswana": ["Botswana"],
    "Brazil": ["Brazil"],
    "Brunei": ["Brunei", "Brunei Darussalam"],
    "Bulgaria": ["Bulgaria"],
    "Burkina Faso": ["Burkina Faso"],
    "Burundi": ["Burundi"],
    "Cabo Verde": ["Cabo Verde", "Cape Verde"],
    "Cambodia": ["Cambodia"],
    "Cameroon": ["Cameroon"],
    "Canada": ["Canada"],
    "Central African Republic": ["Central African Republic", "CAR"],
    "Chad": ["Chad"],
    "Chile": ["Chile"],
    "China": ["China", "PRC", "People's Republic of China"],
    "Colombia": ["Colombia"],
    "Comoros": ["Comoros", "Union of the Comoros"],
    "Congo": ["Congo", "Republic of the Congo"],
    "Democratic Republic of the Congo": ["Democratic Republic of the Congo", "DR Congo", "DRC", "Congo-Kinshasa"],
    "Costa Rica": ["Costa Rica"],
    "Cote dIvoire": ["Cote dIvoire", "Ivory Coast"],
    "Croatia": ["Croatia"],
    "Cuba": ["Cuba"],
    "Cyprus": ["Cyprus"],
    "Czech Republic": ["Czech Republic", "Czechia"],
    "Denmark": ["Denmark"],
    "Djibouti": ["Djibouti"],
    "Dominica": ["Dominica"],
    "Dominican Republic": ["Dominican Republic"],
    "Ecuador": ["Ecuador"],
    "Egypt": ["Egypt"],
    "El Salvador": ["El Salvador"],
    "Equatorial Guinea": ["Equatorial Guinea"],
    "Eritrea": ["Eritrea"],
    "Estonia": ["Estonia"],
    "Eswatini": ["Eswatini", "Swaziland"],
    "Ethiopia": ["Ethiopia"],
    "Fiji": ["Fiji"],
    "Finland": ["Finland"],
    "France": ["France"],
    "Gabon": ["Gabon"],
    "Gambia": ["Gambia", "The Gambia"],
    "Georgia": ["Georgia"],
    "Germany": ["Germany", "Deutschland"],
    "Ghana": ["Ghana"],
    "Greece": ["Greece"],
    "Grenada": ["Grenada"],
    "Guatemala": ["Guatemala"],
    "Guinea": ["Guinea"],
    "Guinea-Bissau": ["Guinea-Bissau", "Guinea Bissau"],
    "Guyana": ["Guyana"],
    "Haiti": ["Haiti"],
    "Honduras": ["Honduras"],
    "Hungary": ["Hungary"],
    "Iceland": ["Iceland"],
    "India": ["India"],
    "Indonesia": ["Indonesia"],
    "Iran": ["Iran", "Islamic Republic of Iran"],
    "Iraq": ["Iraq"],
    "Ireland": ["Ireland"],
    "Israel": ["Israel"],
    "Italy": ["Italy"],
    "Jamaica": ["Jamaica"],
    "Japan": ["Japan"],
    "Jordan": ["Jordan"],
    "Kazakhstan": ["Kazakhstan"],
    "Kenya": ["Kenya"],
    "Kiribati": ["Kiribati"],
    "North Korea": ["North Korea", "DPRK"],
    "South Korea": ["South Korea", "Republic of Korea"],
    "Kosovo": ["Kosovo"],
    "Kuwait": ["Kuwait"],
    "Kyrgyzstan": ["Kyrgyzstan"],
    "Laos": ["Laos", "Lao People's Democratic Republic"],
    "Latvia": ["Latvia"],
    "Lebanon": ["Lebanon"],
    "Lesotho": ["Lesotho"],
    "Liberia": ["Liberia"],
    "Libya": ["Libya"],
    "Liechtenstein": ["Liechtenstein"],
    "Lithuania": ["Lithuania"],
    "Luxembourg": ["Luxembourg"],
    "Madagascar": ["Madagascar"],
    "Malawi": ["Malawi"],
    "Malaysia": ["Malaysia"],
    "Maldives": ["Maldives"],
    "Mali": ["Mali"],
    "Malta": ["Malta"],
    "Marshall Islands": ["Marshall Islands"],
    "Mauritania": ["Mauritania"],
    "Mauritius": ["Mauritius"],
    "Mexico": ["Mexico"],
    "Micronesia": ["Micronesia", "Federated States of Micronesia"],
    "Moldova": ["Moldova"],
    "Monaco": ["Monaco"],
    "Mongolia": ["Mongolia"],
    "Montenegro": ["Montenegro"],
    "Morocco": ["Morocco"],
    "Mozambique": ["Mozambique"],
    "Myanmar": ["Myanmar", "Burma"],
    "Namibia": ["Namibia"],
    "Nauru": ["Nauru"],
    "Nepal": ["Nepal"],
    "Netherlands": ["Netherlands", "Holland"],
    "New Zealand": ["New Zealand"],
    "Nicaragua": ["Nicaragua"],
    "Niger": ["Niger"],
    "Nigeria": ["Nigeria"],
    "North Macedonia": ["North Macedonia", "Macedonia"],
    "Norway": ["Norway"],
    "Oman": ["Oman"],
    "Pakistan": ["Pakistan"],
    "Palau": ["Palau"],
    "Panama": ["Panama"],
    "Papua New Guinea": ["Papua New Guinea"],
    "Paraguay": ["Paraguay"],
    "Peru": ["Peru"],
    "Philippines": ["Philippines"],
    "Poland": ["Poland"],
    "Portugal": ["Portugal"],
    "Qatar": ["Qatar"],
    "Romania": ["Romania"],
    "Russia": ["Russia", "Russian Federation"],
    "Rwanda": ["Rwanda"],
    "Saint Kitts and Nevis": ["Saint Kitts and Nevis"],
    "Saint Lucia": ["Saint Lucia"],
    "Saint Vincent and the Grenadines": ["Saint Vincent and the Grenadines"],
    "Samoa": ["Samoa"],
    "San Marino": ["San Marino"],
    "Sao Tome and Principe": ["Sao Tome and Principe"],
    "Saudi Arabia": ["Saudi Arabia"],
    "Senegal": ["Senegal"],
    "Serbia": ["Serbia"],
    "Seychelles": ["Seychelles"],
    "Sierra Leone": ["Sierra Leone"],
    "Singapore": ["Singapore"],
    "Slovakia": ["Slovakia"],
    "Slovenia": ["Slovenia"],
    "Solomon Islands": ["Solomon Islands"],
    "Somalia": ["Somalia"],
    "South Africa": ["South Africa"],
    "South Sudan": ["South Sudan"],
    "Spain": ["Spain"],
    "Sri Lanka": ["Sri Lanka"],
    "Sudan": ["Sudan"],
    "Suriname": ["Suriname"],
    "Sweden": ["Sweden"],
    "Switzerland": ["Switzerland"],
    "Syria": ["Syria"],
    "Taiwan": ["Taiwan", "Republic of China"],
    "Tajikistan": ["Tajikistan"],
    "Tanzania": ["Tanzania"],
    "Thailand": ["Thailand"],
    "Timor-Leste": ["Timor-Leste", "East Timor"],
    "Togo": ["Togo"],
    "Tonga": ["Tonga"],
    "Trinidad and Tobago": ["Trinidad and Tobago"],
    "Tunisia": ["Tunisia"],
    "Turkey": ["Turkey"],
    "Turkmenistan": ["Turkmenistan"],
    "Tuvalu": ["Tuvalu"],
    "Uganda": ["Uganda"],
    "Ukraine": ["Ukraine"],
    "United Arab Emirates": ["United Arab Emirates", "UAE"],
    "United Kingdom": ["United Kingdom", "UK", "Great Britain", "Britain"],
    "United States": ["United States", "United States of America", "USA", "US"],
    "Uruguay": ["Uruguay"],
    "Uzbekistan": ["Uzbekistan"],
    "Vanuatu": ["Vanuatu"],
    "Vatican City": ["Vatican City", "Holy See"],
    "Venezuela": ["Venezuela", "Bolivarian Republic of Venezuela"],
    "Vietnam": ["Vietnam"],
    "Yemen": ["Yemen"],
    "Zambia": ["Zambia"],
    "Zimbabwe": ["Zimbabwe"]
}

In [6]:
def extract_country_associations(card):
    found_countries = set()
    # Get all text content from the card
    card_text = card.get_text(" ", strip=True)
    
    # Check each country and its aliases in the text
    for country, aliases in COUNTRY_ALIASES.items():
        for alias in aliases:
            if alias.lower() in card_text.lower():
                found_countries.add(country)
                break  # If one alias matches, no need to check the rest for this country
    
    # Also check the href of the first link in the card if available
    link = card.find("a")
    if link and 'href' in link.attrs:
        href = link['href']
        for country, aliases in COUNTRY_ALIASES.items():
            for alias in aliases:
                if alias.lower() in href.lower():
                    found_countries.add(country)
                    break

    if found_countries:
        # Return countries as a single space separated string
        return " ".join(sorted(found_countries))
    return "Not specified"

In [10]:
def extract_dates(details_text):
    # Extract case opened date using the "Summons to appear:" pattern
    case_opened_date = "Not available"
    opened_match = re.search(r"Summons to appear:\s*([\d]{1,2}\s+\w+\s+\d{4})", details_text, re.IGNORECASE)
    if opened_match:
        case_opened_date = opened_match.group(1)

    # Extract arrest date using common phrases
    arrest_date = "Not arrested"
    arrest_match = re.search(
        r"(transferred to ICC custody on|surrendered on|arrest warrant issued on|arrested on)\s*([\d]{1,2}\s+\w+\s+\d{4})",
        details_text, re.IGNORECASE)
    if arrest_match:
        arrest_date = arrest_match.group(2)

    return arrest_date, case_opened_date

In [12]:
def extract_name_alias(card):
    # Extract the full name from the card header h5
    header = card.find("div", class_="card-header")
    full_name = header.h5.get_text(strip=True) if header and header.h5 else "Not found"
    # If the full name contains a comma then use the part after the comma as a potential alias
    if "," in full_name:
        parts = [part.strip() for part in full_name.split(",")]
        name = parts[0]
        alias = " ".join(parts[1:])  # join remaining parts as alias if more than one
    else:
        name = full_name
        alias = full_name
    return name, alias


In [30]:

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}


url = "https://www.icc-cpi.int/defendants"
session = requests.Session()
session.headers.update(headers)
response = session.get(url)
#response = requests.get(url, headers = headers)
response

<Response [403]>

In [22]:
url = f"https://www.icc-cpi.int/defendants?page=1"
print(f"Scraping {url}")
response = requests.get(url)
if response.status_code != 200:
    print(f"Failed to retrieve page 1")
return []
soup = BeautifulSoup(response.content, "html.parser")
    
    # Select all cards inside the view-content section based on the provided HTML structure
cards = soup.select("div.view-content div.mb-2 div.card")
cases = []
    
for card in cards:
    name, alias = extract_name_alias(card)
    body = card.find("div", class_="card-body")
    details_text = body.get_text(" ", strip=True) if body else ""
    arrest_date, case_opened_date = extract_dates(details_text)
    country_associations = extract_country_associations(card)
    cases.append({
        "Name": name,
        "Aliases": alias,
        "ArrestDate": arrest_date,
        "CaseOpenedDate": case_opened_date,
        "CountryAssociations": country_associations
        })
    return cases

Scraping https://www.icc-cpi.int/defendants?page=1
Failed to retrieve page 1


SyntaxError: 'return' outside function (2312344421.py, line 6)

In [8]:
def scrape_icc_cases(page):
    url = f"https://www.icc-cpi.int/defendants?page={page}"
    print(f"Scraping {url}")
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Select all cards inside the view-content section based on the provided HTML structure
    cards = soup.select("div.view-content div.mb-2 div.card")
    cases = []
    
    for card in cards:
        # Extract name and potential alias from the card header
        name, alias = extract_name_alias(card)
        
        # Extract all text from the card-body for date extraction
        body = card.find("div", class_="card-body")
        details_text = body.get_text(" ", strip=True) if body else ""
        arrest_date, case_opened_date = extract_dates(details_text)
        
        # Extract country associations using both card text and link href
        country_associations = extract_country_associations(card)
        
        cases.append({
            "Name": name,
            "Aliases": alias,
            "ArrestDate": arrest_date,
            "CaseOpenedDate": case_opened_date,
            "CountryAssociations": country_associations
        })
        
    return cases

In [14]:
def main():
    all_cases = []
    # Loop through pages 1 to 4
    for page in range(1, 7):
        cases = scrape_icc_cases(page)
        all_cases.extend(cases)
    
    # Write the extracted data to a CSV file
    with open("icc_cases.csv", "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Name", "Aliases", "ArrestDate", "CaseOpenedDate", "CountryAssociations"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for case in all_cases:
            writer.writerow(case)
    
    print("Scraping completed and data saved to icc_cases.csv")

if __name__ == "__main__":
    main()


Scraping https://www.icc-cpi.int/defendants?page=1
Failed to retrieve page 1
Scraping https://www.icc-cpi.int/defendants?page=2
Failed to retrieve page 2
Scraping https://www.icc-cpi.int/defendants?page=3
Failed to retrieve page 3
Scraping https://www.icc-cpi.int/defendants?page=4
Failed to retrieve page 4
Scraping completed and data saved to icc_cases.csv
