## NOTE: Data files used in this notebook ##
- `backyard_flock.csv`
- `hpai-wild-birds.csv`
- `nst-est2023-pop.xlsx` (now `census2023.csv`)

In [44]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [45]:
# Wild-bird HPAI detections, read from the local data folder.
wildbirds = pd.read_csv("./data/hpai-wild-birds.csv")
# Preview the first five rows (the default row count for head()).
wildbirds.head(n=5)

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [46]:
# Remove the 'Submitting Agency' column from the wild-bird frame.
# NOTE(review): this cell rebinds `wildbirds`, so running it a second time
# without re-reading the CSV will fail because the column is already gone.
wildbirds = wildbirds.drop('Submitting Agency', axis='columns')
wildbirds.head(n=5)

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest


In [47]:
# State population estimates. The CSV's unnamed first column holds the state
# names, so it is renamed to "State" and any leading '.' characters are
# stripped from each name.
census = (
    pd.read_csv('./data/census2023.csv')
    .rename(columns={"Unnamed: 0": "State"})
    .assign(State=lambda frame: frame['State'].str.lstrip('.'))
)
census.head(n=5)

Unnamed: 0,State,2020,2021,2022,2023
0,Alabama,5031864,5050380,5073903,5108468
1,Alaska,732964,734923,733276,733406
2,Arizona,7186683,7272487,7365684,7431344
3,Arkansas,3014348,3028443,3046404,3067732
4,California,39503200,39145060,39040616,38965193


In [48]:
# Flock outbreak records (county, state, outbreak date, flock type and size).
backyardflock = pd.read_csv("./data/backyard_flock.csv")
# Preview the first five rows (the default row count for head()).
backyardflock.head(n=5)

Unnamed: 0,County,State,Outbreak Date,Flock Type,Flock Size
0,Ottawa,Michigan,12-31-2024,Commercial Turkey Meat Bird,29400
1,Riverside,California,12-31-2024,Commercial Table Egg Layer,181300
2,Spartanburg,South Carolina,12-31-2024,Commercial Upland Gamebird Producer,920
3,Butte,California,12-31-2024,WOAH Non-Poultry,70
4,Miner,South Dakota,12-31-2024,WOAH Poultry,1500


In [49]:
# Stack the three datasets row-wise into one frame with a fresh 0..n-1 index.
# NOTE(review): the inputs share only some columns (e.g. State), so every row
# is NaN in the columns that belong to the other two sources. Confirm that
# stacking, rather than merging the census figures onto the other rows by
# State, is what the downstream analysis expects.
df_combined = pd.concat([wildbirds, census, backyardflock], ignore_index=True)
# Display the frame itself: pandas already truncates long frames to their
# first and last rows, so no oversized head(...) row count is needed.
df_combined

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Outbreak Date,Flock Type,Flock Size
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,,
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,,,,,,,
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,,,,,,,
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,,
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13228,California,Butte,,,,,,,,,,,01-03-2025,Commercial Raised for Release Upland Game Bird,44700.0
13229,California,Merced,,,,,,,,,,,01-03-2024,Commercial Table Egg Layer,764300.0
13230,California,Sonoma,,,,,,,,,,,01-03-2024,Commercial Broiler Production,77900.0
13231,California,Merced,,,,,,,,,,,01-03-2024,Commercial Broiler Production,534800.0


In [50]:
# Order rows by flock type. Rows with no 'Flock Type' value (the wild-bird and
# census rows) are placed last, which is the sort_values default.
df_combined = df_combined.sort_values(by='Flock Type')
# Display the frame itself: pandas already truncates long frames to their
# first and last rows, so no oversized head(...) row count is needed.
df_combined

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Outbreak Date,Flock Type,Flock Size
12675,New Mexico,Roosevelt,,,,,,,,,,,04-17-2024,Commercial Breeder (Multiple Bird Species),51800.0
12682,New Mexico,Roosevelt,,,,,,,,,,,04-15-2024,Commercial Breeder (Multiple Bird Species),15900.0
12135,Iowa,Hamilton,,,,,,,,,,,11-06-2023,Commercial Breeder (Multiple Bird Species),15000.0
12707,New Mexico,Roosevelt,,,,,,,,,,,04-12-2024,Commercial Breeder (Multiple Bird Species),61500.0
12239,Minnesota,Becker,,,,,,,,,,,10-19-2023,Commercial Breeder Operation,20200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11674,Washington,,,,,,,,7724566,7741433,7784477,7812880,,,
11675,West Virginia,,,,,,,,1791562,1785249,1774035,1770071,,,
11676,Wisconsin,,,,,,,,5896700,5879978,5890543,5910955,,,
11677,Wyoming,,,,,,,,577664,579548,581629,584057,,,


In [52]:
# Summarise the combined frame: index range, per-column non-null counts and dtypes.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13233 entries, 12675 to 11678
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   State                13233 non-null  object 
 1   County               13181 non-null  object 
 2   Collection Date      11627 non-null  object 
 3   Date Detected        11604 non-null  object 
 4   HPAI Strain          11626 non-null  object 
 5   Bird Species         11627 non-null  object 
 6   WOAH Classification  11627 non-null  object 
 7   Sampling Method      11627 non-null  object 
 8   2020                 52 non-null     object 
 9   2021                 52 non-null     object 
 10  2022                 52 non-null     object 
 11  2023                 52 non-null     object 
 12  Outbreak Date        1554 non-null   object 
 13  Flock Type           1554 non-null   object 
 14  Flock Size           1554 non-null   float64
dtypes: float64(1), object(14)
memory usag