# Nigeria IBF: Data Exploration

This notebook explores the datasets available for multi-hazard impact-based forecasting (IBF) in the Borno, Adamawa, and Yobe (BAY) states of Nigeria.

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_processing.data_loader import DataLoader
from data_processing.preprocessor import DataPreprocessor

# Set plotting style
# NOTE(review): the 'seaborn-v0_8-*' style names appear to exist only in recent
# matplotlib releases (3.6+, as far as I know) — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): selects the inline backend so that figures
# are rendered in the cell output.
%matplotlib inline

## 1. Load Data

In [None]:
# Initialize data loader
# The raw-data directory is passed as a relative path, so this cell depends on
# the kernel's working directory.
loader = DataLoader(data_dir='../data/raw')

# Load all datasets
# `datasets` is used as a name -> dataset mapping by the cells below.
# NOTE(review): presumably the values are pandas DataFrames — confirm against DataLoader.
datasets = loader.load_all_data()

# Display summary
# Kept as the last expression of the cell so that any return value is shown as
# the cell output.
loader.get_data_summary()

## 2. Explore Displacement Data

In [None]:
# Inspect the displacement dataset: row count, schema, a sample of rows, and
# summary statistics. The section is skipped (with a notice) if the dataset
# is not present in `datasets`.
if 'displacement' in datasets:
    disp_df = datasets['displacement']

    print(f"Total displacement events: {len(disp_df)}")
    print(f"\nColumns: {disp_df.columns.tolist()}")
    print(f"\nData types:\n{disp_df.dtypes}")
    print("\nFirst few rows:")
    display(disp_df.head())

    # Basic statistics
    print("\nBasic statistics:")
    display(disp_df.describe())
else:
    # Make the skip visible; otherwise a "Run All" leaves this section empty
    # with no hint as to why.
    print("Dataset 'displacement' was not loaded; skipping this section.")

## 3. Explore Flood Data

In [None]:
# Inspect the flood-event dataset and chart the ten states with the most
# recorded events. Each skipped branch prints a short notice so that an empty
# section is never ambiguous.
if 'flood_events' in datasets:
    flood_df = datasets['flood_events']

    print(f"Total flood events: {len(flood_df)}")
    print(f"\nColumns: {flood_df.columns.tolist()}")
    print("\nFirst few rows:")
    display(flood_df.head())

    # Visualize flood frequency by state
    # The column may be spelled 'State' or 'state'; 'State' wins if both exist.
    state_col = next(
        (col for col in ('State', 'state') if col in flood_df.columns),
        None,
    )

    if state_col is not None:
        top_states = flood_df[state_col].value_counts().head(10)

        fig, ax = plt.subplots(figsize=(10, 6))
        top_states.plot(kind='barh', color='steelblue', ax=ax)
        # barh draws the first row at the bottom; flip the axis so that the
        # state with the most events is at the top of the ranking.
        ax.invert_yaxis()
        ax.set_title('Top 10 States by Flood Events')
        ax.set_xlabel('Number of Events')
        ax.set_ylabel('State')
        fig.tight_layout()
        plt.show()
    else:
        print("No 'State'/'state' column found; skipping the per-state chart.")
else:
    print("Dataset 'flood_events' was not loaded; skipping this section.")

## 4. Filter for BAY States

In [None]:
# Build a BAY-only counterpart of every tabular dataset.
# Entries of `datasets` that are not DataFrames are left out of `bay_datasets`.
bay_datasets = {}

for name, df in datasets.items():
    if not isinstance(df, pd.DataFrame):
        continue

    bay_df = loader.filter_bay_states(df)
    bay_datasets[name] = bay_df
    # Report how many records remain after filtering.
    print(f"{name}: {len(df)} -> {len(bay_df)} records (BAY states only)")

## 5. Time Series Analysis

In [None]:
# Plot the monthly displacement series, provided the dataset exists and both
# a date-like and a displacement-count column can be found by name.
if 'displacement_monthly' in datasets:
    monthly_df = datasets['displacement_monthly']

    # Identify date and value columns by name (first match wins).
    # str() keeps the lookup from failing on non-string column labels.
    date_cols = [
        col for col in monthly_df.columns
        if 'month' in str(col).lower() or 'date' in str(col).lower()
    ]
    value_cols = [col for col in monthly_df.columns if 'displaced' in str(col).lower()]

    if date_cols and value_cols:
        date_col = date_cols[0]
        value_col = value_cols[0]

        # Work on a copy: `monthly_df` is the very object stored in `datasets`,
        # so assigning the parsed column to it would alter the loaded data for
        # every other cell.
        monthly_ts = monthly_df.copy()
        monthly_ts[date_col] = pd.to_datetime(monthly_ts[date_col], errors='coerce')

        # errors='coerce' turns unparseable dates into NaT; report and drop
        # them instead of letting them vanish from the chart unnoticed.
        n_unparsed = int(monthly_ts[date_col].isna().sum())
        if n_unparsed:
            print(f"Dropping {n_unparsed} row(s) with a missing or unparseable '{date_col}' value.")
        monthly_ts = monthly_ts.dropna(subset=[date_col]).sort_values(date_col)

        # Plot time series
        fig, ax = plt.subplots(figsize=(14, 6))
        ax.plot(monthly_ts[date_col], monthly_ts[value_col], linewidth=2, marker='o')
        ax.set_title('Displacement Trends Over Time')
        ax.set_xlabel('Date')
        ax.set_ylabel('Number Displaced')
        ax.grid(alpha=0.3)
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        plt.show()
    else:
        print("No date-like and/or 'displaced' column found; skipping the time-series plot.")
else:
    print("Dataset 'displacement_monthly' was not loaded; skipping this section.")

## 6. LGA-Level Analysis

In [None]:
# Inspect the displacement statistics table: a sample of rows, the row count,
# and summary statistics. The section is skipped (with a notice) if the
# dataset is not present in `datasets`.
if 'displacement_stats' in datasets:
    stats_df = datasets['displacement_stats']

    print("LGA-level statistics:")
    display(stats_df.head())

    # NOTE(review): this is the row count; it equals the number of LGAs only if
    # the table holds exactly one row per LGA — confirm against the data.
    print(f"\nNumber of LGAs: {len(stats_df)}")
    print("\nColumn summary:")
    display(stats_df.describe())
else:
    # Make the skip visible; otherwise a "Run All" leaves this section empty
    # with no hint as to why.
    print("Dataset 'displacement_stats' was not loaded; skipping this section.")

## 7. Next Steps

1. Proceed to `02_hazard_analysis.ipynb` for hazard modeling
2. Run `03_impact_assessment.ipynb` for impact calculations
3. Use `04_forecast_generation.ipynb` to generate forecasts