In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
import folium
from folium.plugins import HeatMap
import geopandas as gpd
import contextily as ctx
from shapely.geometry import Point
from pathlib import Path

In [None]:
# Load the datasets.
hpai_mammals = pd.read_csv('./data/hpai_mammals.csv')
hpai_wild_birds = pd.read_csv('./data/hpai_wild_birds.csv')
hpai_livestock = pd.read_csv('./data/hpai_livestock.csv')
hpai_flocks = pd.read_csv('./data/hpai_flocks.csv')

# Display basic information about each dataset
print("Mammals dataset info:")
print(hpai_mammals.info())
print("Wild birds dataset info:")
print(hpai_wild_birds.info())
print("Livestock dataset info:")
print(hpai_livestock.info())
print("Flocks dataset info:")
print(hpai_flocks.info())

FileNotFoundError: [Errno 2] No such file or directory: '/data/hpai_mammals.csv'

In [None]:
# Clean and preprocess the data
def clean_dataset(df):
    df = df.copy()
    # Clean state and county names
    df['State'] = df['State'].str.strip()
    df['County'] = df['County'].str.strip()
    # Convert dates
    df['Outbreak Date'] = pd.to_datetime(df['Outbreak Date'])
    return df

# Clean all datasets
hpai_mammals = clean_dataset(hpai_mammals)
hpai_wild_birds = clean_dataset(hpai_wild_birds)
hpai_livestock = clean_dataset(hpai_livestock)
hpai_flocks = clean_dataset(hpai_flocks)

# Clean the 'Birds Affected' column in hpai_flocks
hpai_flocks['Birds Affected'] = pd.to_numeric(hpai_flocks['Birds Affected'], errors='coerce')

KeyError: 'State'

In [None]:
# Basic statistics and summary
print("Summary Statistics:")
print("Total cases by category:")
print("Mammal cases:", len(hpai_mammals))
print("Wild bird cases:", len(hpai_wild_birds))
print("Livestock cases:", len(hpai_livestock))
print("Flock cases:", len(hpai_flocks))

# Calculate total affected birds in flocks
total_birds = hpai_flocks['Birds Affected'].sum()
print("Total birds affected in flocks:", format(total_birds, ",.0f"))

# Show date range
all_dates = pd.concat([
    hpai_mammals['Outbreak Date'],
    hpai_wild_birds['Outbreak Date'],
    hpai_livestock['Outbreak Date'],
    hpai_flocks['Outbreak Date']
])

print("Date range of outbreaks:")
print("Earliest:", all_dates.min())
print("Latest:", all_dates.max())

In [None]:
# Create time series data
def create_daily_cases(df):
    return df['Outbreak Date'].value_counts().sort_index()

# Combine all cases into one time series
mammals_cases = create_daily_cases(hpai_mammals)
birds_cases = create_daily_cases(hpai_wild_birds)
livestock_cases = create_daily_cases(hpai_livestock)
flocks_cases = create_daily_cases(hpai_flocks)

# Create a date range covering all dates
date_range = pd.date_range(start=all_dates.min(), end=all_dates.max())
total_cases = pd.Series(0, index=date_range)

# Add cases from each category
for cases in [mammals_cases, birds_cases, livestock_cases, flocks_cases]:
    total_cases = total_cases.add(cases, fill_value=0)

# Save the processed data
total_cases.to_csv('processed_outbreak_data.csv', header=True, index=True)

In [None]:
# Time series decomposition
decomposition = seasonal_decompose(total_cases, period=7)

# Plot the decomposition
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(15, 12))

decomposition.observed.plot(ax=ax1)
ax1.set_title('Observed')
decomposition.trend.plot(ax=ax2)
ax2.set_title('Trend')
decomposition.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
decomposition.resid.plot(ax=ax4)
ax4.set_title('Residual')

plt.tight_layout()
plt.show()

In [None]:
# Perform Augmented Dickey-Fuller test
adf_result = adfuller(total_cases.dropna())
print('Augmented Dickey-Fuller Test Results:')
print('ADF Statistic:', adf_result[0])
print('p-value:', adf_result[1])

In [None]:
# ARIMA modeling
model = pm.auto_arima(total_cases,
                     start_p=0, start_q=0, max_p=3, max_q=3,
                     m=7,  # Weekly seasonality
                     start_P=0, start_Q=0, max_P=2, max_Q=2,
                     seasonal=True,
                     d=None, D=None, trace=True,
                     error_action='ignore',
                     suppress_warnings=True,
                     stepwise=True)

print('Best ARIMA Model:')
print(model.summary())

In [None]:
# Forecast future outbreaks
forecast_steps = 30  # Forecast for the next 30 days
forecast, conf_int = model.predict(n_periods=forecast_steps, return_conf_int=True)

# Create a DataFrame for the forecast
forecast_index = pd.date_range(start=total_cases.index[-1] + pd.Timedelta(days=1), 
                             periods=forecast_steps, freq='D')
forecast_df = pd.DataFrame({
    'Forecast': forecast,
    'Lower CI': conf_int[:, 0],
    'Upper CI': conf_int[:, 1]
}, index=forecast_index)

# Plot the forecast
plt.figure(figsize=(15, 7))
plt.plot(total_cases.index, total_cases, label='Observed', alpha=0.6)
plt.plot(forecast_df.index, forecast_df['Forecast'], label='Forecast', color='orange')
plt.fill_between(forecast_df.index, 
                 forecast_df['Lower CI'],
                 forecast_df['Upper CI'],
                 color='orange', alpha=0.2,
                 label='Confidence Interval')
plt.title('Bird Flu Outbreak Forecast')
plt.xlabel('Date')
plt.ylabel('Number of Cases')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Save the forecast
forecast_df.to_csv('bird_flu_forecast.csv')

In [None]:
# Create interactive map
m = folium.Map(location=[39.8283, -98.5795], zoom_start=4)

# Add markers for different types of outbreaks
def add_outbreak_markers(data, color, group_name):
    feature_group = folium.FeatureGroup(name=group_name)
    
    for _, row in data.iterrows():
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=5,
            color=color,
            fill=True,
            popup=f"{row['State']}, {row['County']}<br>Date: {row['Outbreak Date']}",
        ).add_to(feature_group)
    
    feature_group.add_to(m)

# Add markers for each category
add_outbreak_markers(hpai_mammals, 'red', 'Mammals')
add_outbreak_markers(hpai_wild_birds, 'blue', 'Wild Birds')
add_outbreak_markers(hpai_livestock, 'green', 'Livestock')
add_outbreak_markers(hpai_flocks, 'purple', 'Flocks')

# Add layer control
folium.LayerControl().add_to(m)

# Save the map
m.save('outbreak_map.html')

In [None]:
# Monthly trend analysis
monthly_counts = pd.DataFrame(total_cases.resample('M').sum())
monthly_counts.columns = ['Cases']

# Plot monthly trends
plt.figure(figsize=(15, 7))
plt.plot(monthly_counts.index, monthly_counts['Cases'], marker='o')
plt.title('Monthly Bird Flu Outbreaks')
plt.xlabel('Date')
plt.ylabel('Number of Cases')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Calculate seasonal statistics
monthly_counts['Month'] = monthly_counts.index.month
seasonal_pattern = monthly_counts.groupby('Month')['Cases'].mean()

print("Average cases by month:")
for month, cases in seasonal_pattern.items():
    print(f"Month {month}: {cases:.1f} cases")

In [None]:
# State-level analysis
state_counts = pd.DataFrame({
    'Mammals': hpai_mammals.groupby('State').size(),
    'Wild Birds': hpai_wild_birds.groupby('State').size(),
    'Livestock': hpai_livestock.groupby('State').size(),
    'Flocks': hpai_flocks.groupby('State').size()
}).fillna(0)

state_counts['Total'] = state_counts.sum(axis=1)

# Display top 10 states by total cases
print("Top 10 states by total cases:")
print(state_counts.sort_values('Total', ascending=False).head(10))

# Create a bar plot of top 10 states
plt.figure(figsize=(15, 7))
top_10_states = state_counts.sort_values('Total', ascending=False).head(10)
top_10_states.drop('Total', axis=1).plot(kind='bar', stacked=True)
plt.title('Top 10 States by Outbreak Category')
plt.xlabel('State')
plt.ylabel('Number of Cases')
plt.legend(title='Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()