In [1]:
## Scrape InciWeb for information about the parties involved in
## each forest fire

In [2]:
## Import base packages 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from seaborn import set_style
set_style("whitegrid")
## Import BeautifulSoup for HTML parsing
from bs4 import BeautifulSoup
import requests
import csv

In [21]:

def scrape_inciweb_data(start_tail_num, loop_count, timeout=30):
    """
    Scrapes incident data from InciWeb based on a starting tail number and loop count.

    One page is requested per tail number. Pages whose title is the InciWeb "404" title are
    skipped, as are pages without a <title> element (they cannot be identified as incidents).

    Parameters:
    - start_tail_num (int): The starting incident tail number to begin scraping.
    - loop_count (int): The number of consecutive tail numbers to request.
    - timeout (float): Seconds to wait for each HTTP response before giving up.

    Returns:
    - DataFrame: A Pandas DataFrame with one row per incident page found. Columns are
      "fire Name", "fire Overview", "url", "Incident Cooperator" and one "table_data_<i>"
      column per table row index <i>, holding the list of stripped cell texts of that row.

    Raises:
    - requests.exceptions.RequestException: Propagated from requests.get (including timeouts).
    """
    not_found_title = "404 - Page Not Found -  InciWeb the Incident Information System"
    all_data = []

    for tail_num in range(start_tail_num, start_tail_num + loop_count):
        url = f"https://inciweb.nwcg.gov/incident/{tail_num}/"
        # Without a timeout a single stalled connection would block the whole run.
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        # A missing <title> must not crash the run; treat it like a page that is not an incident.
        title_tag = soup.find('title')
        if title_tag is None or title_tag.text.strip() == not_found_title:
            continue

        # Extract fire name
        heading = soup.find('h1')
        fire_name = heading.text if heading is not None else "N/A"

        # Extract incident overview
        overview = soup.find(id="incidentOverview")
        fire_overview = overview.text if overview is not None else "N/A"

        row_data = {"fire Name": fire_name, "fire Overview": fire_overview, "url": url}

        # Extracting tables if available.
        # NOTE: keys depend only on the row index, so when a page has several tables a later
        # table overwrites rows of an earlier one that share the same index.
        for table in soup.find_all("table"):
            for row_index, row in enumerate(table.find_all('tr')):
                cols = [cell.text.strip() for cell in row.find_all('td')]
                if cols:
                    row_data[f"table_data_{row_index}"] = cols

        # Extract Incident Cooperators if available: the list items live in the first <div>
        # sibling that follows the title box whose <h2> reads "Incident Cooperators".
        inci_coop = ""
        for div in soup.find_all('div', {'class': 'ibox-title'}):
            if div.find('h2', string='Incident Cooperators'):
                sibling_divs = div.find_next_siblings('div')
                if sibling_divs:
                    inci_coop = ", ".join(item.text for item in sibling_divs[0].find_all('li'))
                else:
                    inci_coop = "N/A"
                break

        row_data["Incident Cooperator"] = inci_coop
        all_data.append(row_data)

    # Convert the collected data into a Pandas DataFrame
    return pd.DataFrame(all_data)

## Scrape 2000 consecutive incident ids starting at 6000 (one HTTP request per id).
df = scrape_inciweb_data(start_tail_num=6000, loop_count=2000)

In [28]:
def extract_data(allData):
    """
    Extracts and organizes attribute/value pairs from a nested list structure.

    Parameters:
    - allData (list of lists): One inner list per incident. Each entry of an inner list is a
      sequence whose first element is the attribute name (must be hashable) and which supplies
      a value only when it has exactly two elements. Empty entries are ignored.

    Returns:
    - Tuple: A tuple containing two lists:
        1. allAttri (list): Unique attribute names in first-seen order, excluding "Current as of".
        2. allRow (list of lists): One inner list per attribute (same order as allAttri) holding
           exactly one value per incident (same order as allData). When an incident lists an
           attribute more than once, the first two-element entry wins. Missing values are
           represented as "N/A".
    """
    allAttri = []
    seen = set()
    for incident in allData:
        for entry in incident:
            # Empty entries carry no attribute name; skip them instead of failing on entry[0].
            if len(entry) == 0:
                continue
            name = entry[0]
            if name != "Current as of" and name not in seen:
                seen.add(name)
                allAttri.append(name)

    allRow = [[] for _ in allAttri]
    for incident in allData:
        # Keep only the first value per attribute so every column receives exactly one value
        # per incident; appending every match would shift the rows of later incidents.
        values = {}
        for entry in incident:
            if len(entry) == 2:
                values.setdefault(entry[0], entry[1])
        for name, column in zip(allAttri, allRow):
            column.append(values.get(name, "N/A"))

    return allAttri, allRow

## extract_data expects one list of [attribute, value] rows per incident. Iterating the
## DataFrame directly only yields its column labels, so rebuild that structure from the
## "table_data_*" columns, which hold the scraped table rows as lists (NaN where absent).
incident_tables = [
    [cell for column, cell in record.items()
     if str(column).startswith("table_data_") and isinstance(cell, list)]
    for record in df.to_dict("records")
]
allAttri,allRow=extract_data(incident_tables)

In [29]:
## Write the extracted attributes to FireIncidentCommander.csv, one row per incident.
with open('FireIncidentCommander.csv', 'w', newline='') as file:
    # identifying header
    writer = csv.DictWriter(file, fieldnames=allAttri)
    writer.writeheader()

    # allRow holds one list per attribute; its first list gives the number of incidents.
    # Guard the empty case, where allRow[0] does not exist.
    incident_count = len(allRow[0]) if allRow else 0

    # writing data row-wise into the csv file
    for i in range(incident_count):
        writer.writerow({attribute: allRow[j][i] for j, attribute in enumerate(allAttri)})
    
      

def write_data_to_csv(filename, headers, data):
    """
    Writes column-oriented data to a CSV file.

    Parameters:
    - filename (str): The name of the CSV file to create or overwrite.
    - headers (list): List of header/column names.
    - data (list of lists): One inner list per header, in the same order as headers, so that
      data[j][i] is the value of column j in row i. The first column defines the number of
      rows. May be empty, in which case only the header line is written.

    Returns:
    - None
    """
    with open(filename, 'w', newline='') as file:
        # Identifying header
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()

        # With no columns there are no rows to write; indexing data[0] would raise IndexError.
        row_count = len(data[0]) if data else 0

        # Writing data row-wise into the CSV file
        for i in range(row_count):
            writer.writerow({header: data[j][i] for j, header in enumerate(headers)})
            
## Export the extracted attributes to FireIncidentCommander.csv.
write_data_to_csv(filename='FireIncidentCommander.csv', headers=allAttri, data=allRow)