## NOTE: Includes: ##
backyard_flock.csv, 
hpai-wild-birds.csv, 
nst-est2023-pop.xlsx (now "census2023.csv")

In [115]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
# Needed for decision tree visualization
import pydotplus
from IPython.display import Image
from sklearn.preprocessing import LabelEncoder

In [116]:
wildbirds = pd.read_csv('./data/hpai-wild-birds.csv')
wildbirds = wildbirds.rename(columns={'Date Detected': 'Outbreak Date'})
wildbirds.head()

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [117]:
wildbirds = wildbirds.drop(columns = ['Submitting Agency'])
wildbirds.head()

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest


In [118]:
census = pd.read_csv('./data/census2023.csv')
census.rename(columns={"Unnamed: 0": "State"}, inplace=True)
census["State"] = census['State'].str.lstrip('.')
census.head()

Unnamed: 0,State,2020,2021,2022,2023
0,Alabama,5031864,5050380,5073903,5108468
1,Alaska,732964,734923,733276,733406
2,Arizona,7186683,7272487,7365684,7431344
3,Arkansas,3014348,3028443,3046404,3067732
4,California,39503200,39145060,39040616,38965193


In [119]:
backyardflock = pd.read_csv('./data/backyard_flock.csv')
backyardflock.head()

Unnamed: 0,County,State,Outbreak Date,Flock Type,Flock Size
0,Ottawa,Michigan,12-31-2024,Commercial Turkey Meat Bird,29400
1,Riverside,California,12-31-2024,Commercial Table Egg Layer,181300
2,Spartanburg,South Carolina,12-31-2024,Commercial Upland Gamebird Producer,920
3,Butte,California,12-31-2024,WOAH Non-Poultry,70
4,Miner,South Dakota,12-31-2024,WOAH Poultry,1500


In [120]:
df_combined = pd.concat([wildbirds, census, backyardflock], ignore_index=True)
df_combined.head(1000000)

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Flock Type,Flock Size
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,,,,,,
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,,,,,,
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13228,California,Butte,,01-03-2025,,,,,,,,,Commercial Raised for Release Upland Game Bird,44700.0
13229,California,Merced,,01-03-2024,,,,,,,,,Commercial Table Egg Layer,764300.0
13230,California,Sonoma,,01-03-2024,,,,,,,,,Commercial Broiler Production,77900.0
13231,California,Merced,,01-03-2024,,,,,,,,,Commercial Broiler Production,534800.0


In [121]:
df_combined = df_combined.sort_values('Flock Type')
df_combined.head(10000000)

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Flock Type,Flock Size
12675,New Mexico,Roosevelt,,04-17-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),51800.0
12682,New Mexico,Roosevelt,,04-15-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),15900.0
12135,Iowa,Hamilton,,11-06-2023,,,,,,,,,Commercial Breeder (Multiple Bird Species),15000.0
12707,New Mexico,Roosevelt,,04-12-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),61500.0
12239,Minnesota,Becker,,10-19-2023,,,,,,,,,Commercial Breeder Operation,20200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11674,Washington,,,,,,,,7724566,7741433,7784477,7812880,,
11675,West Virginia,,,,,,,,1791562,1785249,1774035,1770071,,
11676,Wisconsin,,,,,,,,5896700,5879978,5890543,5910955,,
11677,Wyoming,,,,,,,,577664,579548,581629,584057,,


In [122]:
df_combined.drop(columns=['Collection Date'], inplace=True)
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13233 entries, 12675 to 11678
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   State                13233 non-null  object 
 1   County               13181 non-null  object 
 2   Outbreak Date        13158 non-null  object 
 3   HPAI Strain          11626 non-null  object 
 4   Bird Species         11627 non-null  object 
 5   WOAH Classification  11627 non-null  object 
 6   Sampling Method      11627 non-null  object 
 7   2020                 52 non-null     object 
 8   2021                 52 non-null     object 
 9   2022                 52 non-null     object 
 10  2023                 52 non-null     object 
 11  Flock Type           1554 non-null   object 
 12  Flock Size           1554 non-null   float64
dtypes: float64(1), object(12)
memory usage: 1.4+ MB


In [123]:
df_combined.dropna(subset=['County'], inplace=True)

In [124]:
label_encoder = LabelEncoder()
df_combined['County Encoded'] = label_encoder.fit_transform(df_combined['County'])
df_combined.head(1000)

Unnamed: 0,State,County,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Flock Type,Flock Size,County Encoded
12675,New Mexico,Roosevelt,04-17-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),51800.0,806
12682,New Mexico,Roosevelt,04-15-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),15900.0,806
12135,Iowa,Hamilton,11-06-2023,,,,,,,,,Commercial Breeder (Multiple Bird Species),15000.0,381
12707,New Mexico,Roosevelt,04-12-2024,,,,,,,,,Commercial Breeder (Multiple Bird Species),61500.0,806
12239,Minnesota,Becker,10-19-2023,,,,,,,,,Commercial Breeder Operation,20200.0,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12912,Virginia,Rockingham,03-07-2023,,,,,,,,,WOAH Non-Poultry,250.0,799
13113,Vermont,Windsor,01-24-2025,,,,,,,,,WOAH Non-Poultry,20.0,1032
11753,Idaho,Gem,12-20-2022,,,,,,,,,WOAH Non-Poultry,80.0,340
12895,Maine,York,03-14-2022,,,,,,,,,WOAH Non-Poultry,180.0,1053


In [125]:
df_combined["Flock Type"].unique()

array(['Commercial Breeder (Multiple Bird Species)',
       'Commercial Breeder Operation', 'Commercial Broiler Breeder',
       'Commercial Broiler Breeder Pullets',
       'Commercial Broiler Production', 'Commercial Duck Breeder',
       'Commercial Duck Meat Bird',
       'Commercial Raised for Release Upland Game Bird',
       'Commercial Raised for Release Waterfowl',
       'Commercial Table Egg Breeder', 'Commercial Table Egg Layer',
       'Commercial Table Egg Pullets', 'Commercial Turkey Breeder Hens',
       'Commercial Turkey Breeder Replacement Hens',
       'Commercial Turkey Breeder Toms', 'Commercial Turkey Meat Bird',
       'Commercial Turkey Poult Supplier',
       'Commercial Upland Gamebird Producer', 'Live Bird Market',
       'Live Bird Sales  (non-slaughter)',
       'Primary Broiler Breeder Pedigree Farm', 'WOAH Non-Poultry',
       'WOAH Poultry', nan], dtype=object)

In [126]:
df_combined["Bird Species"].value_counts()

Bird Species
Mallard                  1881
Green-winged teal         997
Canada goose              823
Snow goose                719
Bald eagle                625
                         ... 
Brandt's cormorant          1
Teal (unidentified)         1
African crowned crane       1
Brazilian teal              1
Ruffed grouse               1
Name: count, Length: 231, dtype: int64

In [127]:
label_encoder = LabelEncoder()
df_combined['Flock Type Encoded'] = label_encoder.fit_transform(df_combined['Flock Type'])
df_combined['State Encoded'] = label_encoder.fit_transform(df_combined['State'])

# df_combined['Outbreak Date'] = pd.to_datetime(df_combined['Outbreak Date'], errors='coerce')
# df_combined['Outbreak Date'] = df_combined['Outbreak Date'].fillna(0).dt.strftime('%Y%m%d').astype(int)

# df_combined['Outbreak Date'] = pd.to_datetime(df_combined['Outbreak Date'], errors='coerce')
df_combined['Outbreak Date'] = pd.to_datetime(df_combined['Outbreak Date'], format='mixed')

# # Convert datetime to string format first, keeping NaT values
# df_combined['Outbreak Date'] = df_combined['Outbreak Date'].dt.strftime('%Y%m%d')

# # Replace NaN (from NaT conversion) with '0', then convert to integer
# df_combined['Outbreak Date'] = df_combined['Outbreak Date'].fillna('0').astype(int)


df_combined.head(1000)

Unnamed: 0,State,County,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Flock Type,Flock Size,County Encoded,Flock Type Encoded,State Encoded
12675,New Mexico,Roosevelt,2024-04-17,,,,,,,,,Commercial Breeder (Multiple Bird Species),51800.0,806,0,31
12682,New Mexico,Roosevelt,2024-04-15,,,,,,,,,Commercial Breeder (Multiple Bird Species),15900.0,806,0,31
12135,Iowa,Hamilton,2023-11-06,,,,,,,,,Commercial Breeder (Multiple Bird Species),15000.0,381,0,15
12707,New Mexico,Roosevelt,2024-04-12,,,,,,,,,Commercial Breeder (Multiple Bird Species),61500.0,806,0,31
12239,Minnesota,Becker,2023-10-19,,,,,,,,,Commercial Breeder Operation,20200.0,63,1,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12912,Virginia,Rockingham,2023-03-07,,,,,,,,,WOAH Non-Poultry,250.0,799,21,47
13113,Vermont,Windsor,2025-01-24,,,,,,,,,WOAH Non-Poultry,20.0,1032,21,46
11753,Idaho,Gem,2022-12-20,,,,,,,,,WOAH Non-Poultry,80.0,340,21,12
12895,Maine,York,2022-03-14,,,,,,,,,WOAH Non-Poultry,180.0,1053,21,19


In [128]:
df_combined.dtypes

State                          object
County                         object
Outbreak Date          datetime64[ns]
HPAI Strain                    object
Bird Species                   object
WOAH Classification            object
Sampling Method                object
2020                           object
2021                           object
2022                           object
2023                           object
Flock Type                     object
Flock Size                    float64
County Encoded                  int32
Flock Type Encoded              int32
State Encoded                   int32
dtype: object

In [129]:
df_combined.drop(columns=['Sampling Method', 'County'], inplace=True)
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13181 entries, 12675 to 11626
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   State                13181 non-null  object        
 1   Outbreak Date        13158 non-null  datetime64[ns]
 2   HPAI Strain          11626 non-null  object        
 3   Bird Species         11627 non-null  object        
 4   WOAH Classification  11627 non-null  object        
 5   2020                 0 non-null      object        
 6   2021                 0 non-null      object        
 7   2022                 0 non-null      object        
 8   2023                 0 non-null      object        
 9   Flock Type           1554 non-null   object        
 10  Flock Size           1554 non-null   float64       
 11  County Encoded       13181 non-null  int32         
 12  Flock Type Encoded   13181 non-null  int32         
 13  State Encoded        13181 non-n

In [130]:
y = df_combined['State']
X = df_combined.drop(columns=['State'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [131]:
# Since the target column is an object, we need to convert the data to numerical classes
# Encode the y data
# Create an instance of the label encoder
le = LabelEncoder()
# Fit and transform the y training and testing data using the label encoder
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)
y_train_encoded

array([17,  4, 17, ...,  1, 25, 14])