## NOTE: Includes: ##
backyard_flock.csv, 
hpai-wild-birds.csv, 
nst-est2023-pop.xlsx (now "census2023.csv")

In [175]:
# Initial imports
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
# Needed for decision tree visualization
import pydotplus
from IPython.display import Image
from sklearn.preprocessing import LabelEncoder

In [176]:
wildbirds = pd.read_csv('./data/hpai-wild-birds.csv')
wildbirds = wildbirds.rename(columns={'Date Detected': 'Outbreak Date'})
wildbirds.head()

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [177]:
wildbirds = wildbirds.drop(columns = ['Submitting Agency'])
wildbirds.head()

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest


In [178]:
census = pd.read_csv('./data/census2023.csv')
census.rename(columns={"Unnamed: 0": "State"}, inplace=True)
census["State"] = census['State'].str.lstrip('.')
census.head()

Unnamed: 0,State,2020,2021,2022,2023
0,Alabama,5031864,5050380,5073903,5108468
1,Alaska,732964,734923,733276,733406
2,Arizona,7186683,7272487,7365684,7431344
3,Arkansas,3014348,3028443,3046404,3067732
4,California,39503200,39145060,39040616,38965193


In [179]:
backyardflock = pd.read_csv('./data/backyard_flock.csv')
backyardflock.head()

Unnamed: 0,County,State,Outbreak Date,Flock Type,Flock Size
0,Ottawa,Michigan,12-31-2024,Commercial Turkey Meat Bird,29400
1,Riverside,California,12-31-2024,Commercial Table Egg Layer,181300
2,Spartanburg,South Carolina,12-31-2024,Commercial Upland Gamebird Producer,920
3,Butte,California,12-31-2024,WOAH Non-Poultry,70
4,Miner,South Dakota,12-31-2024,WOAH Poultry,1500


In [180]:
df_combined = pd.concat([wildbirds, census, backyardflock], ignore_index=True)
df_combined.head(1000000)

Unnamed: 0,State,County,Collection Date,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,2020,2021,2022,2023,Flock Type,Flock Size
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,,,,,,
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,,,,,,
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,,,,,,
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13228,California,Butte,,01-03-2025,,,,,,,,,Commercial Raised for Release Upland Game Bird,44700.0
13229,California,Merced,,01-03-2024,,,,,,,,,Commercial Table Egg Layer,764300.0
13230,California,Sonoma,,01-03-2024,,,,,,,,,Commercial Broiler Production,77900.0
13231,California,Merced,,01-03-2024,,,,,,,,,Commercial Broiler Production,534800.0


In [181]:
df_combined.drop(columns=['Collection Date', 'Sampling Method'], inplace=True)
df_combined.dropna(subset=['County'], inplace=True)
df_combined['Outbreak Date'] = pd.to_datetime(df_combined['Outbreak Date'], format='mixed')

df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13181 entries, 0 to 13232
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   State                13181 non-null  object        
 1   County               13181 non-null  object        
 2   Outbreak Date        13158 non-null  datetime64[ns]
 3   HPAI Strain          11626 non-null  object        
 4   Bird Species         11627 non-null  object        
 5   WOAH Classification  11627 non-null  object        
 6   2020                 0 non-null      object        
 7   2021                 0 non-null      object        
 8   2022                 0 non-null      object        
 9   2023                 0 non-null      object        
 10  Flock Type           1554 non-null   object        
 11  Flock Size           1554 non-null   float64       
dtypes: datetime64[ns](1), float64(1), object(10)
memory usage: 1.3+ MB


In [182]:
label_encoder1 = LabelEncoder()

birdflu_data_encoded = df_combined.copy()

columns_to_encode = ['County', 'Outbreak Date', 'HPAI Strain', 'Bird Species', 
                     'WOAH Classification', '2020', '2021', '2022', '2023', 'Flock Type']


for column in columns_to_encode:
    birdflu_data_encoded[column] = label_encoder1.fit_transform(birdflu_data_encoded[column])

birdflu_data_encoded.head(10)


Unnamed: 0,State,County,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,2020,2021,2022,2023,Flock Type,Flock Size
0,South Carolina,204,1,2,8,2,0,0,0,0,23,
1,South Carolina,204,1,2,31,2,0,0,0,0,23,
2,North Carolina,428,0,2,141,2,0,0,0,0,23,
3,North Carolina,428,3,2,8,2,0,0,0,0,23,
4,North Carolina,428,3,0,85,2,0,0,0,0,23,
5,North Carolina,428,3,0,85,2,0,0,0,0,23,
6,North Carolina,711,3,2,8,2,0,0,0,0,23,
7,North Carolina,711,3,0,8,2,0,0,0,0,23,
8,North Carolina,711,3,0,127,2,0,0,0,0,23,
9,North Carolina,711,3,2,85,2,0,0,0,0,23,


In [183]:
birdflu_data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13181 entries, 0 to 13232
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   State                13181 non-null  object 
 1   County               13181 non-null  int32  
 2   Outbreak Date        13181 non-null  int64  
 3   HPAI Strain          13181 non-null  int32  
 4   Bird Species         13181 non-null  int32  
 5   WOAH Classification  13181 non-null  int32  
 6   2020                 13181 non-null  int32  
 7   2021                 13181 non-null  int32  
 8   2022                 13181 non-null  int32  
 9   2023                 13181 non-null  int32  
 10  Flock Type           13181 non-null  int32  
 11  Flock Size           1554 non-null   float64
dtypes: float64(1), int32(9), int64(1), object(1)
memory usage: 875.3+ KB


In [184]:
from sklearn.impute import SimpleImputer

# Replace NaN with Mean
imputer = SimpleImputer(strategy="mean")
birdflu_data_encoded["Flock Size"] = imputer.fit_transform(birdflu_data_encoded[["Flock Size"]])

# Alternative: Use Median
birdflu_data_encoded["Flock Size"] = SimpleImputer(strategy="median").fit_transform(birdflu_data_encoded[["Flock Size"]])

birdflu_data_encoded

Unnamed: 0,State,County,Outbreak Date,HPAI Strain,Bird Species,WOAH Classification,2020,2021,2022,2023,Flock Type,Flock Size
0,South Carolina,204,1,2,8,2,0,0,0,0,23,101528.089447
1,South Carolina,204,1,2,31,2,0,0,0,0,23,101528.089447
2,North Carolina,428,0,2,141,2,0,0,0,0,23,101528.089447
3,North Carolina,428,3,2,8,2,0,0,0,0,23,101528.089447
4,North Carolina,428,3,0,85,2,0,0,0,0,23,101528.089447
...,...,...,...,...,...,...,...,...,...,...,...,...
13228,California,124,616,12,231,3,0,0,0,0,7,44700.000000
13229,California,593,429,12,231,3,0,0,0,0,10,764300.000000
13230,California,886,429,12,231,3,0,0,0,0,4,77900.000000
13231,California,593,429,12,231,3,0,0,0,0,4,534800.000000


In [185]:
y = birdflu_data_encoded['State']
X = birdflu_data_encoded.drop(columns=['State'], axis=1)


In [186]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [187]:
# Train the Random Forest model
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)

# Evaluate the model
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 0.9998988366211431
Testing Score: 0.710254854368932


In [188]:
# Get the feature importance array
feature_importances = clf.feature_importances_

In [189]:
# List the top 10 most important features
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:5]

[(0.4152771571999851, 'County'),
 (0.3385939308043085, 'Outbreak Date'),
 (0.15963724961508868, 'Bird Species'),
 (0.029205680486109154, 'Flock Size'),
 (0.027250968012005397, 'HPAI Strain')]

In [190]:
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(X_train, y_train)

print(f"Training Data Score: {logistic_regression_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test, y_test)}")

Training Data Score: 0.06555386949924127
Testing Data Score: 0.06674757281553398


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
