In [2]:
# Import necessary libraries  
import fitz  # PyMuPDF  
import os  
import json  

# Define the PDF file path  
pdf_path = 'annual_report_2008_508.pdf'  
  
# Check if the file exists  
if not os.path.exists(pdf_path):  
    print("File not found. Please check the file path.")  
else:  
    print("File found. Starting PDF to JSON conversion...")  
  
    # Open the PDF file  
    pdf_document = fitz.open(pdf_path)  
    num_pages = len(pdf_document)  
    print("Number of pages:", num_pages)  
  
    # Create a dictionary where each page's text is stored  
    pdf_data = {}  
  
    for page_num in range(num_pages):  
        page = pdf_document[page_num]  
        text = page.get_text()  
        # Store the text in the dictionary with page number as key (starting at 1)  
        pdf_data[page_num + 1] = text  
  
    pdf_document.close()  
    print("Finished extracting text from all pages.")  
  
    # Specify the output JSON file path  
    json_output_path = 'annual_report_2008_508.json'  
      
    # Make sure the directory exists  
    output_dir = os.path.dirname(json_output_path)  
    if output_dir and not os.path.exists(output_dir):  
        os.makedirs(output_dir)  
      
    # Write the dictionary to a JSON file  
    with open(json_output_path, 'w', encoding='utf-8') as json_file:  
        json.dump(pdf_data, json_file, ensure_ascii=False, indent=4)  
      
    print("JSON file saved as:", json_output_path)    

File found. Starting PDF to JSON conversion...
Number of pages: 76
Finished extracting text from all pages.
JSON file saved as: annual_report_2008_508.json


In [6]:
# Let's first load the JSON file and examine its structure
import json

# Load the JSON file
with open('annual_report_2008_508.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Check the structure - how many pages and what the keys look like
print(f"Number of keys (pages): {len(data)}")
print(f"Keys (first 5): {list(data.keys())[:5]}")

# Let's look at the first page to understand the content
print("\
Sample content from first page (first 300 characters):")
first_page_key = list(data.keys())[0]
print(f"Page {first_page_key}: {data[first_page_key][:300]}...")

# Let's also look at a page from the middle of the document
middle_page_key = list(data.keys())[len(data)//2]
print(f"\
Sample content from middle page {middle_page_key} (first 300 characters):")
print(f"{data[middle_page_key][:300]}...")

# Let's check if there are any tables or structured data patterns
# by looking for common table indicators like rows of numbers
import re

# Function to check if a line might be part of a table
def might_be_table_row(line):
    # Check if line has multiple numbers or tabular patterns
    return bool(re.search(r'\d+\s+\d+', line)) or '|' in line or '\	' in line

# Check a few pages for potential table content
table_pages = []
for page_key in list(data.keys())[:20]:  # Check first 20 pages
    page_text = data[page_key]
    lines = page_text.split('\
')
    for line in lines:
        if might_be_table_row(line):
            table_pages.append(page_key)
            break

print(f"\
Pages that might contain tables: {table_pages[:5]}")

# Let's also look for key sections or headers that might indicate important data
def find_headers(text):
    # Look for capitalized lines that might be headers
    headers = []
    lines = text.split('\
')
    for line in lines:
        # If line is all caps and not too long, it might be a header
        if line.strip() and line.strip().isupper() and len(line.strip()) < 50:
            headers.append(line.strip())
    return headers

# Check for potential headers in the first few pages
all_headers = []
for page_key in list(data.keys())[:10]:  # Check first 10 pages
    headers = find_headers(data[page_key])
    if headers:
        all_headers.extend(headers)

print(f"\
Potential section headers found: {all_headers[:10]}")

# Let's also look for dates, which could be important for time dimensions
def find_dates(text):
    # Simple regex for dates in format like 2008, MM/DD/YYYY, or Month DD, YYYY
    date_patterns = [
        r'\\d{4}\',  # Year like 2008
        r'\\d{1,2}/\d{1,2}/\d{4}\',  # MM/DD/YYYY
        r'\(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\'  # Month DD, YYYY
    ]
    
    dates = []
    for pattern in date_patterns:
        matches = re.findall(pattern, text)
        dates.extend(matches)
    
    return dates

# Check for dates in the document
all_dates = []
for page_key in list(data.keys())[:10]:  # Check first 10 pages
    dates = find_dates(data[page_key])
    if dates:
        all_dates.extend(dates)

print(f"\
Dates found in the document: {all_dates[:10]}")

print("\
This initial analysis helps us understand what kind of data we're working with.")

Number of keys (pages): 76
Keys (first 5): ['1', '2', '3', '4', '5']
Sample content from first page (first 300 characters):
Page 1: National Interagency 
Coordination Center 
Wildland Fire 
Summary and Statistics 
Annual Report 
2008 
 
 
 
 
 
...
Sample content from middle page 39 (first 300 characters):
 
 38
Types 1 and 2 IMT Summary 2008 
Incident Management Team summary: The tables below depict total Type 1 and Type 2 Incident 
Management Teams requested through NICC. 
 
By Requesting Agency 
 
                    
By Requesting Geographic Area 
 
...


  return bool(re.search(r'\d+\s+\d+', line)) or '|' in line or '\	' in line
  return bool(re.search(r'\d+\s+\d+', line)) or '|' in line or '\	' in line


ValueError: empty separator

In [4]:
# Let's examine page 15 which might contain a table
import json

with open('annual_report_2008_508.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Print the content of page 15
print("Content of page 15:")
print(data['15'])

# Let's also look at a few more pages to find structured data
# Pages that might contain statistics or summary data
potential_data_pages = ['15', '20', '25', '30', '35', '40']

# Check for keywords that might indicate important data sections
keywords = ['summary', 'statistics', 'total', 'acres', 'fires', 'cost', 'budget', 'resources']

for page_key in potential_data_pages:
    if page_key in data:
        page_text = data[page_key].lower()
        found_keywords = [keyword for keyword in keywords if keyword in page_text]
        if found_keywords:
            print(f"\
Page {page_key} contains keywords: {found_keywords}")
            # Print a snippet around the first keyword found
            for keyword in found_keywords[:2]:  # Just check first two keywords
                index = page_text.find(keyword)
                if index >= 0:
                    start = max(0, index - 50)
                    end = min(len(page_text), index + 150)
                    print(f"\
Context around '{keyword}' on page {page_key}:")
                    print(f"...{page_text[start:end]}...")

Content of page 15:
 
 14
Significant Incidents Over 40,000 Acres 
 
Name 
Inc. 
Type 
GACC 
State 
Start Date 
Contain or 
Control 
Date 
Size 
(Acres) 
Cause 
Cost 
Glass Fire 
WF 
SA 
TX 
2/25/2008 
3/2/2008 
219,556 
H 
NR 
Klamath Theater 
WF 
NO 
CA 
6/21/2008 
9/26/2008 
192,038 
L 
$126,086,065 
Basin Complex 
WF 
SO 
CA 
6/21/2008 
7/29/2008 
162,818 
L 
$78,096,079 
Iron & Alps 
Complexes 
WF 
NO 
CA 
6/21/2008 
9/4/2008 
105,805 
L 
$73,974,917 
Dunn Mtn. 
Assist 
WF 
NR 
MT 
8/21/2008 
9/2/2008 
102,383 
L 
$2,900,000 
Lime Complex 
WF 
NO 
CA 
6/20/2008 
8/30/2008 
99,585 
L 
$59,329,698 
Huckabee 
WF 
SA 
TX 
4/30/2008 
5/8/2008 
98,200 
U 
NR 
SHU Lightning 
Complex 
WF 
NO 
CA 
6/21/2008 
7/30/2008 
86,500 
L 
$56,438,391 
Siskiyou / Blue 2 
Complex 
WF 
NO 
CA 
6/21/2008 
9/13/2008 
82,186 
L 
$65,692,836 
Indians 
WF 
SO 
CA 
6/8/2008 
7/12/2008 
76,554 
H 
$42,500,000 
Panther 
WF 
NO 
CA 
7/22/2008 
10/8/2008 
72,344 
L 
NR 
Gunbarrel 
WF 
RM 
WY 
7/26/2008 
9/8/200

In [12]:
# Let's look at more pages with potential tables and structured data
import json
import re

with open('annual_report_2008_508.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Let's identify key sections in the report that could form dimensions in our star schema
# We'll look for patterns that indicate tables, summaries, or categorized data

# First, let's identify potential fact tables (numerical data)
# We'll look for pages with tables containing numbers, which could be our measures

def has_table_structure(text):
    # Check if text has patterns that look like a table
    lines = text.split(' ')
    num_lines_with_numbers = 0
    for line in lines:
        # If line has multiple numbers, it might be a table row
        if re.search(r'\\d+\.*\\d+\', line):
            num_lines_with_numbers += 1
    
    # If we have several lines with numbers, it might be a table
    return num_lines_with_numbers >= 3

# Let's identify potential dimension tables
# 1. Time dimension - pages with year/date information
# 2. Geographic dimension - pages with state/region information
# 3. Incident type dimension - pages with fire type classifications

def extract_potential_dimensions(text):
    dimensions = {
        'time': False,
        'geographic': False,
        'incident_type': False
    }
    
    # Check for time dimension indicators
    if re.search(r'\(year|month|quarter|annual|2008|2007)\', text, re.IGNORECASE):
        dimensions['time'] = True
    
    # Check for geographic dimension indicators
    if re.search(r'\(state|region|area|county|GACC)\', text, re.IGNORECASE):
        dimensions['geographic'] = True
    
    # Check for incident type dimension indicators
    if re.search(r'\(wildfire|prescribed fire|fire use|incident type)\', text, re.IGNORECASE):
        dimensions['incident_type'] = True
    
    return dimensions

# Analyze each page for potential star schema components
schema_components = {}

for page_key in data.keys():
    page_text = data[page_key]
    
    # Check if page has table structure (potential fact table)
    has_table = has_table_structure(page_text)
    
    # Check for potential dimensions
    dimensions = extract_potential_dimensions(page_text)
    
    # Store the analysis
    schema_components[page_key] = {
        'has_table': has_table,
        'dimensions': dimensions
    }

# Let's summarize our findings
pages_with_tables = [page for page, info in schema_components.items() if info['has_table']]
pages_with_time_dim = [page for page, info in schema_components.items() 
                      if info['dimensions']['time']]
pages_with_geo_dim = [page for page, info in schema_components.items() 
                     if info['dimensions']['geographic']]
pages_with_incident_dim = [page for page, info in schema_components.items() 
                          if info['dimensions']['incident_type']]

print(f"Pages with potential tables (first 10): {pages_with_tables[:10]}")
print(f"Pages with time dimension (first 10): {pages_with_time_dim[:10]}")
print(f"Pages with geographic dimension (first 10): {pages_with_geo_dim[:10]}")
print(f"Pages with incident type dimension (first 10): {pages_with_incident_dim[:10]}")

# Let's examine a few pages that have both tables and multiple dimensions
# These are good candidates for our fact tables
good_candidates = [page for page, info in schema_components.items() 
                  if info['has_table'] and sum(info['dimensions'].values()) >= 2]

print(f"\
Best candidates for fact tables (pages with tables and multiple dimensions): {good_candidates[:10]}")

# Let's look at one of these good candidates in detail
if good_candidates:
    sample_page = good_candidates[0]
    print(f"\
Sample content from page {sample_page} (potential fact table):")
    print(data[sample_page])

Pages with potential tables (first 10): []
Pages with time dimension (first 10): []
Pages with geographic dimension (first 10): []
Pages with incident type dimension (first 10): []
Best candidates for fact tables (pages with tables and multiple dimensions): []


In [13]:
# Let's take a more careful approach to extract the table from page 15
import json
import re
import pandas as pd

# Load the JSON file
with open('annual_report_2008_508.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Get the content of page 15 which has the table of significant incidents
page_15_text = data['15']

# Print the raw text to better understand the structure
print("Raw text from page 15:")
print(page_15_text)

# The table structure is complex due to PDF formatting
# Let's try a different approach by identifying fire incident names and their associated data

# Based on visual inspection, we can see that each row starts with a fire name
# followed by data like WF, state code, dates, acres, etc.

# Let's define a pattern to identify fire names and extract rows
# Fire names are typically at the beginning of lines and are followed by WF (Wildfire)
fire_pattern = r'([A-Za-z\s&\.]+)\s+(WF)\s+([A-Z]{2})\s+([A-Z]{2})\s+(\d+/\d+/\d{4})\s+(\d+/\d+/\d{4})\s+(\d+,?\d*)\s+([A-Z])\s+(.*)'

# Extract all matches
matches = re.findall(fire_pattern, page_15_text)

# If we don't find matches with the complex pattern, try a simpler approach
if not matches:
    print("\
Complex pattern didn't match. Trying a simpler approach...")
    
    # Let's manually identify the rows by looking for patterns like fire names followed by WF
    lines = page_15_text.split('\
')
    
    # Define the expected columns based on visual inspection
    columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 
               'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']
    
    # Initialize a list to store our structured data
    structured_data = []
    
    # Process each line
    current_row = {}
    column_index = 0
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Skip header lines
        if "Significant Incidents" in line or "Name" in line or "Inc." in line:
            continue
            
        # If we find a line that looks like it starts a new fire entry
        # (typically starts with a name followed by WF)
        if "WF" in line and column_index == 0:
            # If we have a current row with data, add it to our structured data
            if current_row and len(current_row) > 0:
                structured_data.append(current_row)
                
            # Start a new row
            current_row = {}
            column_index = 0
            
            # Split the line into tokens
            tokens = line.split()
            
            # Process tokens to fill in columns
            i = 0
            while i < len(tokens):
                # Fire name might be multiple words
                if column_index == 0:
                    name_parts = []
                    while i < len(tokens) and tokens[i] != "WF":
                        name_parts.append(tokens[i])
                        i += 1
                    current_row['Name'] = " ".join(name_parts)
                
                # Incident type (WF)
                if i < len(tokens) and tokens[i] == "WF":
                    current_row['Inc_Type'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # GACC (Geographic Area Coordination Center)
                if column_index == 1 and i < len(tokens):
                    current_row['GACC'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # State
                if column_index == 2 and i < len(tokens):
                    current_row['State'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # Start Date
                if column_index == 3 and i < len(tokens):
                    current_row['Start_Date'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # Contain/Control Date
                if column_index == 4 and i < len(tokens):
                    current_row['Contain_Control_Date'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # Size (Acres)
                if column_index == 5 and i < len(tokens):
                    current_row['Size_Acres'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # Cause
                if column_index == 6 and i < len(tokens):
                    current_row['Cause'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # Cost
                if column_index == 7 and i < len(tokens):
                    current_row['Cost'] = tokens[i]
                    i += 1
                    column_index += 1
                
                # If we've processed all columns, break
                if column_index > 7:
                    break
        
        # If we're in the middle of processing a row and find additional data
        elif current_row and len(current_row) > 0:
            # This might be continuation data for the current row
            # For simplicity, we'll just append it to the notes
            if 'Notes' not in current_row:
                current_row['Notes'] = line
            else:
                current_row['Notes'] += " " + line
    
    # Add the last row if it has data
    if current_row and len(current_row) > 0:
        structured_data.append(current_row)
    
    # Create a DataFrame from the structured data
    df = pd.DataFrame(structured_data)
    
    # Display the first few rows
    print("\
Extracted table data (first 5 rows):")
    print(df.head())
    
    # Save to JSON
    with open('structured_incidents_data.json', 'w') as f:
        json.dump({"significant_incidents": structured_data}, f, indent=4)
    
    print("\
Structured data saved to 'structured_incidents_data.json'")
else:
    # If we found matches with the complex pattern
    print(f"\
Found {len(matches)} fire incidents using regex pattern")
    
    # Convert matches to a DataFrame
    columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 
               'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']
    df = pd.DataFrame(matches, columns=columns)
    
    # Display the first few rows
    print("\
Extracted table data (first 5 rows):")
    print(df.head())
    
    # Save to JSON
    structured_data = df.to_dict(orient='records')
    with open('structured_incidents_data.json', 'w') as f:
        json.dump({"significant_incidents": structured_data}, f, indent=4)
    
    print("\
Structured data saved to 'structured_incidents_data.json'")

Raw text from page 15:
 
 14
Significant Incidents Over 40,000 Acres 
 
Name 
Inc. 
Type 
GACC 
State 
Start Date 
Contain or 
Control 
Date 
Size 
(Acres) 
Cause 
Cost 
Glass Fire 
WF 
SA 
TX 
2/25/2008 
3/2/2008 
219,556 
H 
NR 
Klamath Theater 
WF 
NO 
CA 
6/21/2008 
9/26/2008 
192,038 
L 
$126,086,065 
Basin Complex 
WF 
SO 
CA 
6/21/2008 
7/29/2008 
162,818 
L 
$78,096,079 
Iron & Alps 
Complexes 
WF 
NO 
CA 
6/21/2008 
9/4/2008 
105,805 
L 
$73,974,917 
Dunn Mtn. 
Assist 
WF 
NR 
MT 
8/21/2008 
9/2/2008 
102,383 
L 
$2,900,000 
Lime Complex 
WF 
NO 
CA 
6/20/2008 
8/30/2008 
99,585 
L 
$59,329,698 
Huckabee 
WF 
SA 
TX 
4/30/2008 
5/8/2008 
98,200 
U 
NR 
SHU Lightning 
Complex 
WF 
NO 
CA 
6/21/2008 
7/30/2008 
86,500 
L 
$56,438,391 
Siskiyou / Blue 2 
Complex 
WF 
NO 
CA 
6/21/2008 
9/13/2008 
82,186 
L 
$65,692,836 
Indians 
WF 
SO 
CA 
6/8/2008 
7/12/2008 
76,554 
H 
$42,500,000 
Panther 
WF 
NO 
CA 
7/22/2008 
10/8/2008 
72,344 
L 
NR 
Gunbarrel 
WF 
RM 
WY 
7/26/2008 
9/8/

In [16]:
# Convert the 'Date' column in time_dim to string to avoid JSON serialization issues

time_dim['Date'] = time_dim['Date'].astype(str)

# Now update our dimensions object with converted time_dim

dimensions = {
    'state_dimension': state_dim.to_dict(orient='records'),
    'gacc_dimension': gacc_dim.to_dict(orient='records'),
    'time_dimension': time_dim.to_dict(orient='records')
}

with open('dimension_tables.json', 'w') as f:
    json.dump(dimensions, f, indent=4)

print("\
Dimension tables saved to 'dimension_tables.json' with cleaned fire names and formatted dates")

Dimension tables saved to 'dimension_tables.json' with cleaned fire names and formatted dates


In [18]:
# Let's fix the JSON serialization issues and clean up the fire names properly
import json
import pandas as pd
import re

# Load our structured data
with open('structured_incidents_data.json', 'r') as f:
    structured_data = json.load(f)

# Convert to DataFrame
df = pd.DataFrame(structured_data['significant_incidents'])

# Clean up the fire names by removing any leading/trailing underscores, spaces, and other artifacts
df['Name'] = df['Name'].str.replace('^[_\s]+', '', regex=True)  # Remove leading underscores and spaces
df['Name'] = df['Name'].str.replace('[_\s]+$', '', regex=True)  # Remove trailing underscores and spaces
df['Name'] = df['Name'].str.replace('_', ' ', regex=True)       # Replace remaining underscores with spaces

# Fix specific issues with the first row which appears to have "Cause Cost Glass Fire"
if 'Cause' in df.iloc[0]['Name'] and 'Cost' in df.iloc[0]['Name']:
    df.iloc[0, df.columns.get_loc('Name')] = 'Glass Fire'

# Display the cleaned fire names
print("Cleaned fire names:")
for i, name in enumerate(df['Name']):
    print(f"{i+1}. {name}")

# Update our structured data with the cleaned names
# Convert DataFrame to dict, ensuring all values are JSON serializable
cleaned_data = []
for _, row in df.iterrows():
    row_dict = {}
    for col in df.columns:
        # Convert any non-serializable types to strings
        row_dict[col] = str(row[col]) if not isinstance(row[col], (str, int, float, bool, type(None))) else row[col]
    cleaned_data.append(row_dict)

structured_data['significant_incidents'] = cleaned_data

# Save the updated structured data
with open('structured_incidents_data.json', 'w') as f:
    json.dump(structured_data, f, indent=4)

print("\
Final structured data saved to 'structured_incidents_data.json'")

# Let's also create a more complete view of the data for our star schema
print("\
Final structured data (first 5 rows):")
print(df[['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 'Contain_Control_Date', 'Size_Acres', 'Cause']].head())

# Let's also create a dimension table for the states
states = df['State'].unique()
state_dim = pd.DataFrame({
    'State_Code': states,
    'Region': ['Unknown'] * len(states)  # We would need additional data to fill this
})

# Create a dimension table for the GACCs (Geographic Area Coordination Centers)
gaccs = df['GACC'].unique()
gacc_dim = pd.DataFrame({
    'GACC_Code': gaccs,
    'GACC_Name': ['Unknown'] * len(gaccs)  # We would need additional data to fill this
})

# Create a time dimension from the start dates
# First, ensure Start_Date is a string to avoid serialization issues
if 'Start_Date' in df.columns:
    # Try to convert to datetime first, then back to string in a consistent format
    try:
        df['Start_Date'] = pd.to_datetime(df['Start_Date'], errors='coerce')
        # Extract month and year before converting back to string
        months = df['Start_Date'].dt.month.fillna(0).astype(int).tolist()
        years = df['Start_Date'].dt.year.fillna(0).astype(int).tolist()
        # Convert back to string in ISO format
        dates = df['Start_Date'].dt.strftime('%Y-%m-%d').fillna('Unknown').tolist()
    except:
        # If conversion fails, just use the original strings
        dates = df['Start_Date'].tolist()
        months = [0] * len(dates)
        years = [0] * len(dates)
else:
    dates = ['Unknown'] * len(df)
    months = [0] * len(df)
    years = [0] * len(df)

time_dim = pd.DataFrame({
    'Date_ID': range(1, len(df) + 1),
    'Date': dates,
    'Month': months,
    'Year': years
})

# Display our dimension tables
print("\
State Dimension Table:")
print(state_dim)

print("\
GACC Dimension Table:")
print(gacc_dim)

print("\
Time Dimension Table (first 5 rows):")
print(time_dim.head())

# Save these dimension tables to JSON
# Convert DataFrames to dicts, ensuring all values are JSON serializable
state_dim_dict = state_dim.to_dict(orient='records')
gacc_dim_dict = gacc_dim.to_dict(orient='records')
time_dim_dict = time_dim.to_dict(orient='records')

dimensions = {
    'state_dimension': state_dim_dict,
    'gacc_dimension': gacc_dim_dict,
    'time_dimension': time_dim_dict
}

with open('dimension_tables.json', 'w') as f:
    json.dump(dimensions, f, indent=4)

print("\
Dimension tables saved to 'dimension_tables.json'")


Cleaned fire names:
1. Glass Fire
2. Klamath Theater
3. Basin Complex
4. Iron & Alps 
Complexes
5. Dunn Mtn. 
Assist
6. Lime Complex
7. Huckabee
8. SHU Lightning 
Complex
9. Complex
10. Indians
11. Panther
12. Gunbarrel
13. 
14. Stiles Complex
15. BTU Lightning 
Complex
16. South 
Complex
17. MEU Lightning 
Complex
18. East Slide Rock 
Ridge
19. NR 
NR 
Rocky
20. Canyon 
Complex
21. Bridger Fire
22. Hughes Ranch
23. Evans Road
Final structured data saved to 'structured_incidents_data.json'
Final structured data (first 5 rows):
                      Name Inc_Type GACC State Start_Date  \
0               Glass Fire       WF   SA    TX  2/25/2008   
1          Klamath Theater       WF   NO    CA  6/21/2008   
2            Basin Complex       WF   SO    CA  6/21/2008   
3  Iron & Alps \nComplexes       WF   NO    CA  6/21/2008   
4       Dunn Mtn. \nAssist       WF   NR    MT  8/21/2008   

  Contain_Control_Date Size_Acres Cause  
0             3/2/2008    219,556     H  
1            9/2

  df['Name'] = df['Name'].str.replace('^[_\s]+', '', regex=True)  # Remove leading underscores and spaces
  df['Name'] = df['Name'].str.replace('[_\s]+$', '', regex=True)  # Remove trailing underscores and spaces


In [19]:
# Let's fix the JSON serialization issues one more time, ensuring all dates are properly converted to strings
import json
import pandas as pd

# Load our structured data
with open('structured_incidents_data.json', 'r') as f:
    structured_data = json.load(f)

# Let's also create a time dimension from the start dates
# Create a new time dimension table with proper string formatting
time_dim_data = []
for i, incident in enumerate(structured_data['significant_incidents']):
    date_str = incident.get('Start_Date', 'Unknown')
    
    # Try to parse the date
    try:
        date_obj = pd.to_datetime(date_str)
        month = date_obj.month
        year = date_obj.year
        formatted_date = date_obj.strftime('%Y-%m-%d')
    except:
        month = 0
        year = 0
        formatted_date = date_str
    
    time_dim_data.append({
        'Date_ID': i + 1,
        'Date': formatted_date,
        'Month': month,
        'Year': year
    })

# Load the other dimension tables
with open('dimension_tables.json', 'r') as f:
    dimensions = json.load(f)

# Update the time dimension
dimensions['time_dimension'] = time_dim_data

# Save the updated dimension tables
with open('dimension_tables.json', 'w') as f:
    json.dump(dimensions, f, indent=4)

print("Updated dimension tables saved to 'dimension_tables.json' with properly formatted dates")

# Let's verify the structure of our JSON files to ensure they're valid
try:
    with open('structured_incidents_data.json', 'r') as f:
        incidents_data = json.load(f)
    print("structured_incidents_data.json is valid JSON")
    
    with open('dimension_tables.json', 'r') as f:
        dim_tables = json.load(f)
    print("dimension_tables.json is valid JSON")
    
    # Display the first few records from each file to verify content
    print("\
Sample from structured_incidents_data.json:")
    print(json.dumps(incidents_data['significant_incidents'][0], indent=2))
    
    print("\
Sample from dimension_tables.json (time dimension):")
    print(json.dumps(dim_tables['time_dimension'][0], indent=2))
    
except json.JSONDecodeError as e:
    print(f"JSON error: {e}")
except Exception as e:
    print(f"Error: {e}")

Updated dimension tables saved to 'dimension_tables.json' with properly formatted dates
structured_incidents_data.json is valid JSON
dimension_tables.json is valid JSON
Sample from structured_incidents_data.json:
{
  "Name": "Glass Fire",
  "Inc_Type": "WF",
  "GACC": "SA",
  "State": "TX",
  "Start_Date": "2/25/2008",
  "Contain_Control_Date": "3/2/2008",
  "Size_Acres": "219,556",
  "Cause": "H",
  "Cost": "NR "
}
Sample from dimension_tables.json (time dimension):
{
  "Date_ID": 1,
  "Date": "2008-02-25",
  "Month": 2,
  "Year": 2008
}
