In [6]:
import pandas as pd


In [9]:
# Read the CSV file, fallback to download if not found
import os
csv_path = 'prison.csv'
if not os.path.exists(csv_path):
    # Download from Wikipedia if not present
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    url = "https://en.wikipedia.org/wiki/List_of_helicopter_prison_escapes"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table")
    desired_table = None
    header_row = None
    rows = None
    for table in tables:
        rows = table.find_all("tr")
        if len(rows) == 0:
            continue
        header_row = [cell.text.strip() for cell in rows[0].find_all(["th", "td"])]
        if any("Date" in h for h in header_row) and any("Prison name" in h for h in header_row):
            desired_table = table
            break
    if desired_table is None or header_row is None or len(header_row) == 0:
        raise ValueError("Could not find the desired table or header row on the page. Last header_row found: {}".format(header_row))
    data = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if not cells:
            continue
        row_data = [cell.text.strip() for cell in cells]
        data.append(row_data)
    df = pd.DataFrame(data, columns=header_row)
    df.to_csv(csv_path, index=False)
else:
    df = pd.read_csv(csv_path)

In [11]:
# Convert the Date column to datetime format for proper sorting, handling mixed/incomplete formats
df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format=True)
if df['Date'].isna().sum() > 0:
    print(f"Warning: {df['Date'].isna().sum()} date(s) could not be parsed and are set as NaT.")



  df['Date'] = pd.to_datetime(df['Date'], errors='coerce', infer_datetime_format=True)


In [12]:
# Sort the data by date in ascending order (oldest to newest)
sorted_df = df.sort_values('Date', ascending=True)


In [13]:
# Save the sorted data to a new CSV file
sorted_df.to_csv('prison_dates.csv', index=False)

In [14]:
print("Data has been sorted by date and saved to 'prison_dates.csv'")
print(f"Date range: {sorted_df['Date'].min()} to {sorted_df['Date'].max()}")

Data has been sorted by date and saved to 'prison_dates.csv'
Date range: 1971-08-19 00:00:00 to 2020-09-25 00:00:00
