<a href="https://colab.research.google.com/github/MaryPulley/Project_2/blob/Katie---Branch/Data_Preprocessing_Round_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import the dependencies

In [133]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Preprocess the census data

In [134]:
# Import the data for county census from xlsx
""" County Census Data that is 2020 to 2023 """
county_census_df = pd.read_csv('co-est2023-pop-updated - CO-EST2023-POP.csv', header=1)
county_census_df.head(15)


Unnamed: 0,Geography,2020,2021,2022,2023
0,United States,331526933,332048977,333271411,334914895
1,".Autauga County, Alabama",58915,59203,59726,60342
2,".Baldwin County, Alabama",233227,239439,246531,253507
3,".Barbour County, Alabama",24969,24533,24700,24585
4,".Bibb County, Alabama",22188,22359,21986,21868
5,".Blount County, Alabama",59107,59079,59516,59816
6,".Bullock County, Alabama",10229,10143,10143,9897
7,".Butler County, Alabama",19025,18890,18668,18382
8,".Calhoun County, Alabama",116243,115678,115780,116429
9,".Chambers County, Alabama",34651,34488,34164,34079


In [135]:
# Separate the "Geography" column by the comma to make 2 new columns "County" and "State"
county_census_df[["County", "State"]] = county_census_df["Geography"].str.split(",", n=1, expand=True)
county_census_df.head()

Unnamed: 0,Geography,2020,2021,2022,2023,County,State
0,United States,331526933,332048977,333271411,334914895,United States,
1,".Autauga County, Alabama",58915,59203,59726,60342,.Autauga County,Alabama
2,".Baldwin County, Alabama",233227,239439,246531,253507,.Baldwin County,Alabama
3,".Barbour County, Alabama",24969,24533,24700,24585,.Barbour County,Alabama
4,".Bibb County, Alabama",22188,22359,21986,21868,.Bibb County,Alabama


In [136]:
# Remove the "Geography" column from the county_census dataframe
county_census_df = county_census_df.drop(columns=["Geography"])
county_census_df.head()

Unnamed: 0,2020,2021,2022,2023,County,State
0,331526933,332048977,333271411,334914895,United States,
1,58915,59203,59726,60342,.Autauga County,Alabama
2,233227,239439,246531,253507,.Baldwin County,Alabama
3,24969,24533,24700,24585,.Barbour County,Alabama
4,22188,22359,21986,21868,.Bibb County,Alabama


In [137]:
# Remove the "." from the front of each value in the "County" column
county_census_df["County"] = county_census_df["County"].str.lstrip(".")
county_census_df.head()

Unnamed: 0,2020,2021,2022,2023,County,State
0,331526933,332048977,333271411,334914895,United States,
1,58915,59203,59726,60342,Autauga County,Alabama
2,233227,239439,246531,253507,Baldwin County,Alabama
3,24969,24533,24700,24585,Barbour County,Alabama
4,22188,22359,21986,21868,Bibb County,Alabama


In [138]:
# Reset the index of the county_census_df
county_census_df = county_census_df.reset_index(drop=True)
county_census_df.head()

Unnamed: 0,2020,2021,2022,2023,County,State
0,331526933,332048977,333271411,334914895,United States,
1,58915,59203,59726,60342,Autauga County,Alabama
2,233227,239439,246531,253507,Baldwin County,Alabama
3,24969,24533,24700,24585,Barbour County,Alabama
4,22188,22359,21986,21868,Bibb County,Alabama


In [139]:
# Index the county_census_df by State
county_census_df = county_census_df.set_index("State")
county_census_df.head()

Unnamed: 0_level_0,2020,2021,2022,2023,County
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,331526933,332048977,333271411,334914895,United States
Alabama,58915,59203,59726,60342,Autauga County
Alabama,233227,239439,246531,253507,Baldwin County
Alabama,24969,24533,24700,24585,Barbour County
Alabama,22188,22359,21986,21868,Bibb County


# Preprocess the Wild Birds Data

In [140]:
# Import the data for Wild birds
""" HPAI Data"""
wild_birds_df = pd.read_csv('hpai-wild-birds.csv')
wild_birds_df.head()

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [141]:
# Reset the index
wild_birds_df = wild_birds_df.reset_index(drop=True)
wild_birds_df.head()

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [142]:
# Index the county_census_df by State
wild_birds_df = wild_birds_df.set_index("State")
wild_birds_df.head()

Unnamed: 0_level_0,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


# Preprocess the Domestic Birds Data

In [143]:
# Import the data for Domestic birds
""" NEED SOURCE Data"""
domestic_birds_df = pd.read_csv('commercial-backyard-flocks - commercial-backyard-flocks.csv.csv')
domestic_birds_df.head()

Unnamed: 0,County,State,Outbreak Date,Flock Type,Flock Size
0,Ottawa,Michigan,12-31-2024,Commercial Turkey Meat Bird,29400
1,Riverside,California,12-31-2024,Commercial Table Egg Layer,181300
2,Spartanburg,South Carolina,12-31-2024,Commercial Upland Gamebird Producer,920
3,Butte,California,12-31-2024,WOAH Non-Poultry,70
4,Miner,South Dakota,12-31-2024,WOAH Poultry,1500


In [144]:
# Reset the index
domestic_birds_df = domestic_birds_df.reset_index(drop=True)
domestic_birds_df.head()

Unnamed: 0,County,State,Outbreak Date,Flock Type,Flock Size
0,Ottawa,Michigan,12-31-2024,Commercial Turkey Meat Bird,29400
1,Riverside,California,12-31-2024,Commercial Table Egg Layer,181300
2,Spartanburg,South Carolina,12-31-2024,Commercial Upland Gamebird Producer,920
3,Butte,California,12-31-2024,WOAH Non-Poultry,70
4,Miner,South Dakota,12-31-2024,WOAH Poultry,1500


In [145]:
# Index the dataframe by State
domestic_birds_df = domestic_birds_df.set_index("State")
domestic_birds_df.head()

Unnamed: 0_level_0,County,Outbreak Date,Flock Type,Flock Size
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Michigan,Ottawa,12-31-2024,Commercial Turkey Meat Bird,29400
California,Riverside,12-31-2024,Commercial Table Egg Layer,181300
South Carolina,Spartanburg,12-31-2024,Commercial Upland Gamebird Producer,920
California,Butte,12-31-2024,WOAH Non-Poultry,70
South Dakota,Miner,12-31-2024,WOAH Poultry,1500


In [146]:
# Rename "County Name" to "County"
domestic_birds_df = domestic_birds_df.rename(columns={"County Name": "County"})
domestic_birds_df.head()

Unnamed: 0_level_0,County,Outbreak Date,Flock Type,Flock Size
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Michigan,Ottawa,12-31-2024,Commercial Turkey Meat Bird,29400
California,Riverside,12-31-2024,Commercial Table Egg Layer,181300
South Carolina,Spartanburg,12-31-2024,Commercial Upland Gamebird Producer,920
California,Butte,12-31-2024,WOAH Non-Poultry,70
South Dakota,Miner,12-31-2024,WOAH Poultry,1500


# Create the Master Dataframe

In [147]:
# Reset the index for county_census
county_census_df = county_census_df.reset_index()
county_census_df

Unnamed: 0,State,2020,2021,2022,2023,County
0,,331526933,332048977,333271411,334914895,United States
1,Alabama,58915,59203,59726,60342,Autauga County
2,Alabama,233227,239439,246531,253507,Baldwin County
3,Alabama,24969,24533,24700,24585,Barbour County
4,Alabama,22188,22359,21986,21868,Bibb County
...,...,...,...,...,...,...
3140,Wyoming,42197,41626,41374,41249,Sweetwater County
3141,Wyoming,23379,23605,23297,23232,Teton County
3142,Wyoming,20457,20681,20727,20745,Uinta County
3143,Wyoming,7657,7719,7724,7710,Washakie County


In [148]:
# Drop the first row of the dataframe
county_census_df = county_census_df.drop(0)
county_census_df

Unnamed: 0,State,2020,2021,2022,2023,County
1,Alabama,58915,59203,59726,60342,Autauga County
2,Alabama,233227,239439,246531,253507,Baldwin County
3,Alabama,24969,24533,24700,24585,Barbour County
4,Alabama,22188,22359,21986,21868,Bibb County
5,Alabama,59107,59079,59516,59816,Blount County
...,...,...,...,...,...,...
3140,Wyoming,42197,41626,41374,41249,Sweetwater County
3141,Wyoming,23379,23605,23297,23232,Teton County
3142,Wyoming,20457,20681,20727,20745,Uinta County
3143,Wyoming,7657,7719,7724,7710,Washakie County


In [149]:
# For wild birds Dataframe, reset the index
wild_birds_df = wild_birds_df.reset_index()
wild_birds_df.head()

Unnamed: 0,State,County,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
1,South Carolina,Colleton,12/30/2021,1/13/2022,EA H5N1,Blue-winged teal,Wild bird,Hunter harvest,NWDP
2,North Carolina,Hyde,12/30/2021,1/12/2022,EA H5N1,Northern shoveler,Wild bird,Hunter harvest,NWDP
3,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5N1,American wigeon,Wild bird,Hunter harvest,NWDP
4,North Carolina,Hyde,1/8/2022,1/20/2022,EA H5,Gadwall,Wild bird,Hunter harvest,NWDP


In [150]:
# For domestic birds Dataframe, reset the index
domestic_birds_df = domestic_birds_df.reset_index()
domestic_birds_df.head()

Unnamed: 0,State,County,Outbreak Date,Flock Type,Flock Size
0,Michigan,Ottawa,12-31-2024,Commercial Turkey Meat Bird,29400
1,California,Riverside,12-31-2024,Commercial Table Egg Layer,181300
2,South Carolina,Spartanburg,12-31-2024,Commercial Upland Gamebird Producer,920
3,California,Butte,12-31-2024,WOAH Non-Poultry,70
4,South Dakota,Miner,12-31-2024,WOAH Poultry,1500


In [151]:
# Combine all the 2 dataframes into 1 using "County" as the combining column
combined_df = pd.merge(county_census_df, wild_birds_df, on="County")
combined_df.head()

Unnamed: 0,State_x,2020,2021,2022,2023,County,State_y,Collection Date,Date Detected,HPAI Strain,Bird Species,WOAH Classification,Sampling Method,Submitting Agency
0,District of Columbia,670839,669037,670949,678972,District of Columbia,DC,5/25/2022,6/3/2022,EA/AM H5N1,Mallard,Wild bird,Morbidity/Mortality,NPS/USGS
1,District of Columbia,670839,669037,670949,678972,District of Columbia,DC,5/25/2022,6/3/2022,EA H5N1,Mallard,Wild bird,Morbidity/Mortality,NPS/USGS
2,District of Columbia,670839,669037,670949,678972,District of Columbia,DC,10/19/2022,10/25/2022,EA H5N1,Warbler (unidentified),Captive wild bird,Morbidity/Mortality,Private (non-government) submission
3,Louisiana,142834,146121,148056,150145,Livingston Parish,Louisiana,11/21/2023,8/14/2024,EA/AM H5N1,Canada goose,Wild bird,Morbidity/Mortality,SCWDS
4,Louisiana,160111,158866,157704,157568,Ouachita Parish,Louisiana,12/11/2023,8/14/2024,EA/AM H5N1,Snow goose,Wild bird,Morbidity/Mortality,SCWDS
