In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [2]:
url = 'https://en.wikipedia.org/wiki/1975_Pacific_hurricane_season'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Step 2: Initialize lists to store the scraped data
hurricane_names = []
date_start = []
date_end = []
num_deaths = []
areas_affected = []

In [4]:
# Step 3: Scrape hurricane names
name_headers = soup.find_all(class_='mw-heading mw-heading3')
for header in name_headers:
    name = header.text.replace('[edit]', '').strip()  # Remove [edit] from the name
    hurricane_names.append(name)

In [5]:
# Step 4: Scrape the date ranges (start and end dates)
infoboxes = soup.find_all('td', {'class': 'infobox-data'})
for box in infoboxes:
    if '–' in box.text:
        # Split the start and end dates by '–'
        date_range = box.text.split('–')
        if len(date_range) == 2:
            date_start.append(date_range[0].strip())
            date_end.append(date_range[1].strip())

In [6]:
num_deaths = []

# Find all the sections with the class 'mw-heading mw-heading3'
headings = soup.find_all('div', class_='mw-heading mw-heading3')

# Iterate over each heading
for heading in headings:
    # Find the next sibling after the heading to start searching for content
    start = heading.find_next_sibling()

    # Continue until the next heading or the end of the document
    while start:
        # If another heading is encountered, stop this section
        if start.name == 'div' and 'mw-heading' in start.get('class', []):
            break

        # If the sibling is a paragraph, process it
        if start.name == 'p':
            text = start.text.lower()
            num = '0'  # Default to 0 if no match is found

            # Look for phrases like "killed X", "X fatalities", or "X deaths"
            match = re.search(r'(?:killed\s+(\d+)\s+people)|(\d+)\s+(fatalities|deaths)', text)
            if match:
                if match.group(1):  
                    num = match.group(1)
                elif match.group(2):  
                    num = match.group(2)
                    
            num_deaths.append(num)
            break  # Stop after processing the first paragraph

        # Move to the next sibling in the document
        start = start.find_next_sibling()

# Print the result for verification
print("Number of Deaths:", num_deaths)

Number of Deaths: ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '30', '0', '0']


In [7]:
areas_affected = []

# Find all the sections with the class 'mw-heading mw-heading3'
headings = soup.find_all('div', class_='mw-heading mw-heading3')

# List of directions to look for
directions = ['south of', 'north of', 'east of', 'west of', 'northeast of', 'northwest of', 'southeast of', 'southwest of', 'eastern']

stop_after_second_word = ['lily', 'organized','and']
stop_at_fifth_word = 'organized'
stop_at_fifth_word = 'from'

# Iterate over each heading
for heading in headings:
    # Find the next sibling after the heading to start searching for content
    start = heading.find_next_sibling()

    # Continue until the next heading or the end of the document
    while start:
        # If another heading is encountered, stop this section
        if start.name == 'div' and 'mw-heading' in start.get('class', []):
            break

        # If the sibling is a paragraph, process it
        if start.name == 'p':
            text = start.text.lower()
            areas_text = 'no areas'  # Default to 'no areas' if no match found
            
            # Iterate through each direction to find a match in the text
            for direction in directions:
                if direction in text:
                    # Split the paragraph into sentences
                    sentences = text.split('.')
                    
                    # Search each sentence for the direction
                    for sentence in sentences:
                        if direction in sentence:
                            # Find the part of the sentence after the direction
                            location_part = sentence.split(direction)[-1].strip()
                            # Split the part after direction into words
                            location_words = location_part.split()

                            # Initialize a list to hold the selected words
                            selected_words = []
                            
                            # Loop through the words (up to 5) and apply conditions
                            for idx, word in enumerate(location_words[:5]):
                                selected_words.append(word)

                                # Stop early if the second word is in stop_after_second_word
                                if idx == 1 and word in stop_after_second_word:
                                    selected_words = selected_words[:1]  # Only keep the first word after direction
                                    break

                                # Stop early if the fourth word is in stop_after_second_word
                                if idx == 3 and word in stop_after_second_word:
                                    selected_words = selected_words[:3]  # Keep up to the third word
                                    break

                                # Stop early if the fifth word is 'from'
                                if idx == 4 and word == stop_at_fifth_word:
                                    selected_words = selected_words[:4]  # Keep up to the fourth word
                                    break
                            
                            # Join the direction with the selected words
                            areas_text = f"{direction} " + ' '.join(selected_words)
                            break  # Break after finding the first match
                    break  # Break after finding the first matching direction
            
            # Append the areas affected to the list (default is 'no areas')
            areas_affected.append(areas_text)
            break

        # Move to the next sibling in the document
        start = start.find_next_sibling()

# Print the result for verification
print("Areas affected:", areas_affected)

Areas affected: ['south of the tres marias islands', 'south of the tip of the baja', 'south of acapulco', 'no areas', 'south of acapulco', 'no areas', 'west of cabo san lucas', 'no areas', 'south of the gulf of tehuantepec', 'south of acapulco', 'no areas', 'south of alaska', 'south of manzanillo', 'eastern pacific ocean', 'no areas', 'south of mexico', 'no areas', 'eastern pacific']


In [8]:
print(len(hurricane_names), len(date_start), len(date_end), len(num_deaths), len(areas_affected))
max_len = max(len(hurricane_names), len(date_start), len(date_end), len(num_deaths), len(areas_affected))
hurricane_names += ['N/A'] * (max_len - len(hurricane_names))
date_start += ['N/A'] * (max_len - len(date_start))
date_end += ['N/A'] * (max_len - len(date_end))
num_deaths += ['0'] * (max_len - len(num_deaths))
areas_affected += ['no areas'] * (max_len - len(areas_affected))

18 17 17 18 18


In [9]:
# Step 8: Structure the data into a DataFrame
data = {
    'Hurricane Storm Name': hurricane_names,
    'Start Date': date_start,
    'End Date': date_end,
    'Number of Deaths': num_deaths,
    'Areas Affected': areas_affected,
}
df = pd.DataFrame(data)

In [10]:
# Step 9: Save the DataFrame to a CSV file
df.to_csv('1975_Pacific_hurricane_season.csv', index=False)

# Print a sample of the DataFrame to verify
print(df)

        Hurricane Storm Name    Start Date      End Date Number of Deaths  \
0           Hurricane Agatha        June 2        June 5                0   
1     Tropical Storm Bridget       June 28        July 3                0   
2         Hurricane Carlotta        July 2       July 11                0   
3           Hurricane Denise        July 5       July 15                0   
4     Tropical Storm Eleanor       July 10       July 12                0   
5    Tropical Storm Francene       July 27       July 30                0   
6   Tropical Storm Georgette     August 11     August 14                0   
7      Tropical Storm Hilary     August 13     August 17                0   
8             Hurricane Ilsa     August 18     August 26                0   
9            Hurricane Jewel     August 24     August 31                0   
10         Hurricane Katrina     August 29   September 7                0   
11         Unnamed hurricane     August 31   September 5                0   

        Hurricane Storm Name    Start Date      End Date Number of Deaths  \
0           Hurricane Agatha        June 2        June 5                0   
1     Tropical Storm Bridget       June 28        July 3                0   
2         Hurricane Carlotta        July 2       July 11                0   
3           Hurricane Denise        July 5       July 15                0   
4     Tropical Storm Eleanor       July 10       July 12                0   
5    Tropical Storm Francene       July 27       July 30                0   
6   Tropical Storm Georgette     August 11     August 14                0   
7      Tropical Storm Hilary     August 13     August 17                0   
8             Hurricane Ilsa     August 18     August 26                0   
9            Hurricane Jewel     August 24     August 31                0   
10         Hurricane Katrina     August 29   September 7                0   
11         Unnamed hurricane     August 31   September 5                0   