 Imports and Constants

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import logging

# ICPAC countries whose records are kept; each result row's 'Country' cell
# is matched against these names exactly.
icpac_countries = [
    "Kenya",
    "Somalia",
    "Ethiopia",
    "South Sudan",
    "Uganda",
    "Tanzania",
    "Rwanda",
    "Burundi",
    "Sudan",
    "Djibouti",
    "Eritrea",
]

# GLIDE endpoints: the search form and the report page
search_url = "https://www.glidenumber.net/glide/public/search/search.jsp"
report_url = "https://www.glidenumber.net/glide/public/result/report.jsp"

# Emit timestamped messages at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


Function Definition

In [4]:
# Zero-based <td> position of each extracted field within a result row.
# NOTE(review): positions 18 and 19 are not mapped to any field; the actual
# table layout is not visible from this notebook -- confirm against the site.
_RESULT_COLUMNS = {
    'GLIDE Number': 0,
    'Event': 1,
    'Country': 2,
    'Date': 3,
    'Event Code': 4,
    'GLIDE Serial': 5,
    'Country Code': 6,
    'Year': 7,
    'Month': 8,
    'Day': 9,
    'Time': 10,
    'Location': 11,
    'Duration': 12,
    'Magnitude': 13,
    'Info Source': 14,
    'Comments': 15,
    'Latitude': 16,
    'Longitude': 17,
    'Date Created': 20,
    'Updated': 21,
}

# Rows with fewer <td> cells than this are skipped entirely.
_MIN_COLUMNS = 11


def _row_to_record(cols):
    """Map the <td> cells of one result row to a record dict.

    Fields whose column position is beyond the end of the row become "".
    """
    return {
        field: cols[idx].text.strip() if idx < len(cols) else ""
        for field, idx in _RESULT_COLUMNS.items()
    }


# Function to perform search and extract data
def perform_search_and_extract(timeout=30, max_pages=None):
    """Page through the GLIDE search results and collect ICPAC-country records.

    Posts the search form page by page, parses every table row of each
    response, and keeps the rows whose 'Country' cell is one of
    ``icpac_countries``. Pagination stops when a page has no data rows, a
    request fails, the server returns the same page twice in a row, or
    ``max_pages`` pages have been processed.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
        max_pages: Optional upper bound on the number of pages to fetch;
            ``None`` means no bound.

    Returns:
        list[dict]: One dict per kept row, keyed by the field names in
        ``_RESULT_COLUMNS``. On a request failure the records collected so
        far are returned (the error is logged, not raised).
    """
    data = []
    page = 1
    previous_signature = None

    while True:
        if max_pages is not None and page > max_pages:
            logging.info(f"Reached max_pages={max_pages}. Ending extraction.")
            break

        # NOTE(review): the form field names and values are assumptions about
        # the GLIDE search form -- verify against the live site.
        search_payload = {
            'continent': 'Africa',
            # Pass the actual country list (requests encodes a list value as
            # repeated form fields), not the literal string 'icpac_countries'.
            'country': icpac_countries,
            'event': 'Any',
            'hits_per_page': 25,
            'sort_order': 'desc',
            'page': page
        }

        try:
            # A timeout keeps a stalled connection from hanging the notebook;
            # requests raises a RequestException subclass when it expires.
            search_response = requests.post(search_url, data=search_payload, timeout=timeout)
            search_response.raise_for_status()  # Check for request errors
        except requests.RequestException as e:
            logging.error(f"Search failed with exception: {e}")
            break

        # Parse the HTML content
        soup = BeautifulSoup(search_response.content, 'html.parser')

        # Extract table rows from the search results
        rows = soup.find_all('tr')[1:]  # Skip header row
        if not rows:
            logging.info(f"No more data found on page {page}. Ending extraction.")
            break  # Exit loop if no rows are found (end of pages)

        # If the server ignores the 'page' field it returns identical rows
        # forever; stop instead of looping endlessly and duplicating records.
        signature = [row.get_text() for row in rows]
        if signature == previous_signature:
            logging.warning(f"Page {page} repeats the previous page. Ending extraction.")
            break
        previous_signature = signature

        for row in rows:
            cols = row.find_all('td')
            if len(cols) < _MIN_COLUMNS:  # Ensure there are enough columns
                continue
            record = _row_to_record(cols)
            if record['Country'] in icpac_countries:
                data.append(record)

        logging.info(f"Page {page} processed with {len(rows)} records.")
        page += 1  # Move to the next page
        time.sleep(1)  # Pause between requests

    return data


Data Extraction

In [None]:
# Extract data
# perform_search_and_extract() is a regular (synchronous) function that returns
# a list, so it is called directly; awaiting its return value raises TypeError.
icpac_data = perform_search_and_extract()

Data Handling and Output

In [None]:
# Tabulate the extracted records (one row per record) and write them to a
# CSV file in the working directory, leaving out the DataFrame index column.
df = pd.DataFrame(data=icpac_data)
df.to_csv(
    'icpac_disaster_data.csv',
    index=False,
)
logging.info(
    "Data extraction completed and saved to icpac_disaster_data.csv"
)

In [None]:
# Offer the CSV as a browser download when running in Google Colab.
# google.colab is only importable inside Colab, so the import is guarded to
# keep the notebook runnable end-to-end in other environments.
try:
    from google.colab import files
except ImportError:
    logging.info("google.colab is not available; skipping browser download of icpac_disaster_data.csv")
else:
    files.download('icpac_disaster_data.csv')
