# FEMA Disaster Data Analysis

This notebook explores the FEMA Disaster Declarations dataset to understand patterns and inform our prediction model.

## Contents
1. Data Loading and Initial Exploration
2. Temporal Analysis
3. Geographic Analysis
4. Disaster Type Analysis
5. Feature Engineering Exploration
6. Model Testing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Set plotting style.
# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and 3.8 removed the old name, so plt.style.use('seaborn') raises OSError on
# current releases. Use whichever name this installation actually provides so
# the notebook survives Restart & Run All on both old and new Matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

## 1. Data Loading and Initial Exploration

In [None]:
# Read the complete FEMA declarations export into a DataFrame
df = pd.read_csv('../data/fema_disasters_complete.csv')

# Print a quick structural overview: size, column names, dtypes and the
# number of missing values per column. Each entry is (heading, value).
overview = [
    ("Dataset Shape:", df.shape),
    ("\nColumns:", df.columns.tolist()),
    ("\nData Types:\n", df.dtypes),
    ("\nMissing Values:\n", df.isnull().sum()),
]
for heading, value in overview:
    print(heading, value)

## 2. Temporal Analysis

In [None]:
# Parse the three date columns into pandas datetimes
date_columns = ['declarationDate', 'incidentBeginDate', 'incidentEndDate']
df[date_columns] = df[date_columns].apply(pd.to_datetime)

declared_on = df['declarationDate']

# Long-term trend: declarations (non-null incidentType) per calendar year
declarations_per_year = df.groupby(declared_on.dt.year)['incidentType'].count()
fig, ax = plt.subplots(figsize=(15, 6))
declarations_per_year.plot(kind='line', ax=ax)
ax.set_title('Number of Disaster Declarations by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Declarations')
plt.show()

# Seasonality: declarations per calendar month, pooled over all years
declarations_per_month = df.groupby(declared_on.dt.month)['incidentType'].count()
fig, ax = plt.subplots(figsize=(15, 6))
declarations_per_month.plot(kind='bar', ax=ax)
ax.set_title('Disaster Declarations by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Declarations')
plt.show()

## 3. Geographic Analysis

In [None]:
# Number of declarations recorded for each region
region_counts = df['region'].value_counts()
fig, ax = plt.subplots(figsize=(12, 8))
region_counts.plot(kind='bar', ax=ax)
ax.set_title('Disaster Declarations by Region')
ax.set_xlabel('Region')
ax.set_ylabel('Number of Declarations')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Region x incident-type contingency table, drawn as an annotated heatmap
pivot_table = pd.crosstab(df['region'], df['incidentType'])
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(pivot_table, annot=True, fmt='d', cmap='YlOrRd', ax=ax)
ax.set_title('Disaster Types by Region')
plt.show()

## 4. Disaster Type Analysis

In [None]:
# How often each incident type appears in the dataset
type_counts = df['incidentType'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
type_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Disaster Types')
ax.set_xlabel('Disaster Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Incident length in whole days (end date minus begin date), then its
# distribution per incident type as a boxplot
incident_span = df['incidentEndDate'].sub(df['incidentBeginDate'])
df['incident_duration'] = incident_span.dt.days

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='incidentType', y='incident_duration', ax=ax)
ax.set_title('Incident Duration by Disaster Type')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

## 5. Feature Engineering Exploration

In [None]:
# Calculate seasonal risk scores
def calculate_seasonal_risk(data):
    """Score each row by how busy its month is for its (region, incident type).

    For every (region, incidentType) pair the rows are counted per
    declaration month, and those monthly counts are min-max normalised::

        score = (count - min_count) / (max_count - min_count + 1e-6)

    Each row receives the score of its own (region, incidentType, month).

    Notes:
        * Only months that actually occur for a pair take part in the
          min/max; months with no declarations are not treated as zero.
        * When all observed months of a pair have the same count (including
          the single-month case) the numerator is 0, so every row of that
          pair scores 0.
        * Rows with a missing region, incident type or declaration date
          keep the default score of 0.
        * The score is derived from ``incidentType``; it must not be used
          as an input when ``incidentType`` is the prediction target.

    Args:
        data: DataFrame with ``region``, ``incidentType`` and a
            datetime-typed ``declarationDate`` column.

    Returns:
        numpy.ndarray of floats with one score per row, in row order.
    """
    risk_scores = np.zeros(len(data))

    # Work on plain arrays so the computation is purely positional and does
    # not depend on the caller's index being unique or sorted.
    keys = pd.DataFrame({
        'region': data['region'].to_numpy(),
        'incident_type': data['incidentType'].to_numpy(),
        'month': data['declarationDate'].dt.month.to_numpy(),
    })
    valid = keys.notna().all(axis=1).to_numpy()
    if not valid.any():
        return risk_scores

    observed = keys.loc[valid].assign(rows=1)

    # Number of rows sharing each row's (region, type, month).
    month_counts = observed.groupby(
        ['region', 'incident_type', 'month'], sort=False
    )['rows'].transform('sum')

    # Smallest / largest monthly count within each row's (region, type).
    per_pair = month_counts.groupby(
        [observed['region'], observed['incident_type']], sort=False
    )
    lowest = per_pair.transform('min')
    highest = per_pair.transform('max')

    # The epsilon avoids division by zero when all monthly counts are equal.
    scores = (month_counts - lowest) / (highest - lowest + 1e-6)
    risk_scores[valid] = scores.to_numpy()

    return risk_scores

df['seasonal_risk'] = calculate_seasonal_risk(df)

# One line per disaster type: mean seasonal risk score for each calendar month
fig, ax = plt.subplots(figsize=(15, 6))
for disaster_type in df['incidentType'].unique():
    type_rows = df.loc[df['incidentType'] == disaster_type]
    declaration_month = type_rows['declarationDate'].dt.month
    mean_risk_by_month = type_rows.groupby(declaration_month)['seasonal_risk'].mean()
    ax.plot(mean_risk_by_month, label=disaster_type)

ax.set_title('Seasonal Risk Patterns by Disaster Type')
ax.set_xlabel('Month')
ax.set_ylabel('Risk Score')
# Anchor the legend just outside the right edge of the axes
ax.legend(bbox_to_anchor=(1.05, 1))
plt.show()

## 6. Model Testing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Prepare features.
# 'seasonal_risk' is intentionally NOT a model input. It is computed per
# (region, incidentType, month) over the whole dataset, so it is a function of
# the very label we predict here and has already seen the test rows. Feeding it
# to the classifier leaks the target: metrics are inflated and the importance
# ranking is dominated by the leaked signal. It also cannot be computed at
# prediction time, when the incident type is still unknown.
features = ['month', 'day_of_year', 'region', 'fipsStateCode']
df['month'] = df['declarationDate'].dt.month
df['day_of_year'] = df['declarationDate'].dt.dayofyear

X = df[features].copy()
y = df['incidentType']

# Encode categorical variables: every string-typed feature column is replaced
# by integer codes; the fitted encoders are kept so codes can be mapped back.
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Split data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Test different numbers of trees
n_trees = [50, 100, 200]
results = {}
models = {}

for n in n_trees:
    clf = RandomForestClassifier(n_estimators=n, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    results[n] = classification_report(y_test, y_pred)
    models[n] = clf
    print(f"\nResults for {n} trees:")
    print(results[n])

# Feature importance, read explicitly from the last (largest) forest rather
# than from whatever the loop variable happens to hold.
largest_forest = models[n_trees[-1]]
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': largest_forest.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance in Disaster Type Prediction')
plt.show()

## Conclusions

From this analysis, we can draw conclusions about:
1. Temporal patterns in disaster occurrences
2. Geographic distribution of different disaster types
3. Effectiveness of our seasonal risk scoring
4. Most important features for prediction

These insights have been incorporated into our prediction model in the main application.