In [1]:
# Import library
import pandas as pd
from sqlalchemy import create_engine

# EXTRACT & TRANSFORM

Bees Colony Census Data By County

In [2]:
LandAreaCounty_file = "data/CountyLandArea.csv"
countyLandArea_df = pd.read_csv(LandAreaCounty_file)
countyLandArea_df = countyLandArea_df.rename(columns={'ST':'StateID', 'Unnamed: 2': 'CountyID', 'LND110210D': 'LandArea'})
countyLandArea_df

Unnamed: 0,Areaname,StateID,CountyID,LandArea
0,UNITED STATES,0,0,3531905.43
1,ALABAMA,1,0,50645.33
2,"Autauga, AL",1,1,594.44
3,"Baldwin, AL",1,3,1589.78
4,"Barbour, AL",1,5,884.88
...,...,...,...,...
3193,"Sweetwater, WY",56,37,10426.65
3194,"Teton, WY",56,39,3995.38
3195,"Uinta, WY",56,41,2081.26
3196,"Washakie, WY",56,43,2238.55


In [3]:
# Store beeColonyCensusDataByCounty CSV into DataFrame
beeColonyCensusDataByCounty_file = "data/beeColonyCensusDataByCounty.csv"
beeColonyCensusDataByCounty_df = pd.read_csv(beeColonyCensusDataByCounty_file)
beeColonyCensusDataByCounty_df

Unnamed: 0,Year,Period,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,CV (%)
0,2012,END OF DEC,ALABAMA,1,BLACK BELT,40,AUTAUGA,1.0,119,27.7
1,2012,END OF DEC,ALABAMA,1,BLACK BELT,40,DALLAS,47.0,65,27.7
2,2012,END OF DEC,ALABAMA,1,BLACK BELT,40,ELMORE,51.0,190,27.7
3,2012,END OF DEC,ALABAMA,1,BLACK BELT,40,GREENE,63.0,14,27.7
4,2012,END OF DEC,ALABAMA,1,BLACK BELT,40,HALE,65.0,10,27.7
...,...,...,...,...,...,...,...,...,...,...
7825,2002,END OF DEC,WYOMING,56,SOUTHEAST,50,GOSHEN,15.0,(D),
7826,2002,END OF DEC,WYOMING,56,SOUTHEAST,50,LARAMIE,21.0,(D),
7827,2002,END OF DEC,WYOMING,56,SOUTHEAST,50,PLATTE,31.0,(D),
7828,2002,END OF DEC,WYOMING,56,WEST,30,LINCOLN,23.0,(D),


Merging LandArea Data source to County tables

In [4]:
new_df = pd.merge(countyLandArea_df, beeColonyCensusDataByCounty_df,  how='left', left_on=['StateID','CountyID'], 
                  right_on = ['State ANSI','County ANSI'])
new_df

Unnamed: 0,Areaname,StateID,CountyID,LandArea,Year,Period,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,CV (%)
0,UNITED STATES,0,0,3531905.43,,,,,,,,,,
1,ALABAMA,1,0,50645.33,,,,,,,,,,
2,"Autauga, AL",1,1,594.44,2012.0,END OF DEC,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,27.7
3,"Autauga, AL",1,1,594.44,2007.0,END OF DEC,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,
4,"Autauga, AL",1,1,594.44,2002.0,END OF DEC,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8115,"Uinta, WY",56,41,2081.26,2002.0,END OF DEC,WYOMING,56.0,WEST,30.0,UINTA,41.0,(D),
8116,"Washakie, WY",56,43,2238.55,2012.0,END OF DEC,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),(D)
8117,"Washakie, WY",56,43,2238.55,2007.0,END OF DEC,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),
8118,"Washakie, WY",56,43,2238.55,2002.0,END OF DEC,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,6,


In [5]:
# Create new data for CensusDataByCounty_df w selected columns - Removing 'Period' since all rows have the same value
CensusDataByCounty_df = new_df[[
    'Year', 'State', 'State ANSI', 'Ag District', 'Ag District Code', 'County', 'County ANSI', 'Value',
    'LandArea', 'Areaname']].copy()

CensusDataByCounty_rename_df = CensusDataByCounty_df.rename(columns={'CV (%)':'CV_pct'})
CensusDataByCounty_rename_df

Unnamed: 0,Year,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,LandArea,Areaname
0,,,,,,,,,3531905.43,UNITED STATES
1,,,,,,,,,50645.33,ALABAMA
2,2012.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,594.44,"Autauga, AL"
3,2007.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,594.44,"Autauga, AL"
4,2002.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,594.44,"Autauga, AL"
...,...,...,...,...,...,...,...,...,...,...
8115,2002.0,WYOMING,56.0,WEST,30.0,UINTA,41.0,(D),2081.26,"Uinta, WY"
8116,2012.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),2238.55,"Washakie, WY"
8117,2007.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),2238.55,"Washakie, WY"
8118,2002.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,6,2238.55,"Washakie, WY"


In [6]:
# Drop rows with NaN values
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.dropna(how='any')

CensusDataByCounty_rename_df

Unnamed: 0,Year,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,LandArea,Areaname
2,2012.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,594.44,"Autauga, AL"
3,2007.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,594.44,"Autauga, AL"
4,2002.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,594.44,"Autauga, AL"
5,2012.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,256,1589.78,"Baldwin, AL"
6,2007.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,275,1589.78,"Baldwin, AL"
...,...,...,...,...,...,...,...,...,...,...
8115,2002.0,WYOMING,56.0,WEST,30.0,UINTA,41.0,(D),2081.26,"Uinta, WY"
8116,2012.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),2238.55,"Washakie, WY"
8117,2007.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,(D),2238.55,"Washakie, WY"
8118,2002.0,WYOMING,56.0,NORTHWEST,10.0,WASHAKIE,43.0,6,2238.55,"Washakie, WY"


In [13]:
# Exclude data rows with '(D)' or '(H)' in Value & CV (%) columns
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.loc[(CensusDataByCounty_rename_df['Value'] != ' (D)') ]
CensusDataByCounty_rename_df
# & (CensusDataByCounty_rename_df['CV_pct'] != '(H)') & (CensusDataByCounty_rename_df['CV_pct'] != '(D)')]

Unnamed: 0,Year,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,LandArea,Areaname
2,2012.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,594.44,"Autauga, AL"
3,2007.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,594.44,"Autauga, AL"
4,2002.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,594.44,"Autauga, AL"
5,2012.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,256,1589.78,"Baldwin, AL"
6,2007.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,275,1589.78,"Baldwin, AL"
...,...,...,...,...,...,...,...,...,...,...
8104,2012.0,WYOMING,56.0,NORTHEAST,20.0,SHERIDAN,33.0,4632,2523.99,"Sheridan, WY"
8108,2012.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,244,10426.65,"Sweetwater, WY"
8109,2007.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,765,10426.65,"Sweetwater, WY"
8113,2012.0,WYOMING,56.0,WEST,30.0,UINTA,41.0,136,2081.26,"Uinta, WY"


In [14]:
# Convert columns with string of numbers to float/int
CensusDataByCounty_rename_df['Value'] = CensusDataByCounty_rename_df.Value.str.replace(',', '').astype(int)
# CensusDataByCounty_rename_df['CV_pct'] = CensusDataByCounty_rename_df['CV_pct'].astype(float)

# CensusDataByCounty_df.dtypes
CensusDataByCounty_rename_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Year,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,LandArea,Areaname
2,2012.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,594.44,"Autauga, AL"
3,2007.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,594.44,"Autauga, AL"
4,2002.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,594.44,"Autauga, AL"
5,2012.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,256,1589.78,"Baldwin, AL"
6,2007.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,275,1589.78,"Baldwin, AL"
...,...,...,...,...,...,...,...,...,...,...
8104,2012.0,WYOMING,56.0,NORTHEAST,20.0,SHERIDAN,33.0,4632,2523.99,"Sheridan, WY"
8108,2012.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,244,10426.65,"Sweetwater, WY"
8109,2007.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,765,10426.65,"Sweetwater, WY"
8113,2012.0,WYOMING,56.0,WEST,30.0,UINTA,41.0,136,2081.26,"Uinta, WY"


In [16]:
CensusDataByCounty_rename_df['Density'] =  CensusDataByCounty_rename_df['Value'] / CensusDataByCounty_rename_df['LandArea']
CensusDataByCounty_rename_df

Unnamed: 0_level_0,Year,State,State ANSI,Ag District,Ag District Code,County,County ANSI,Value,LandArea,Areaname,Density
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,2012.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,119,594.44,"Autauga, AL",0.200188
3,2007.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,201,594.44,"Autauga, AL",0.338133
4,2002.0,ALABAMA,1.0,BLACK BELT,40.0,AUTAUGA,1.0,212,594.44,"Autauga, AL",0.356638
5,2012.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,256,1589.78,"Baldwin, AL",0.161029
6,2007.0,ALABAMA,1.0,COASTAL PLAINS & GULF COAST,50.0,BALDWIN,3.0,275,1589.78,"Baldwin, AL",0.172980
...,...,...,...,...,...,...,...,...,...,...,...
8104,2012.0,WYOMING,56.0,NORTHEAST,20.0,SHERIDAN,33.0,4632,2523.99,"Sheridan, WY",1.835190
8108,2012.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,244,10426.65,"Sweetwater, WY",0.023402
8109,2007.0,WYOMING,56.0,SOUTH CENTRAL,40.0,SWEETWATER,37.0,765,10426.65,"Sweetwater, WY",0.073370
8113,2012.0,WYOMING,56.0,WEST,30.0,UINTA,41.0,136,2081.26,"Uinta, WY",0.065345


In [15]:
# Add an id column
CensusDataByCounty_rename_df.reset_index(drop=False, inplace=True)
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.rename(columns={'index':'id'})
CensusDataByCounty_rename_df.set_index('id', inplace=True)

CensusDataByCounty_rename_df.dtypes

Year                float64
State                object
State ANSI          float64
Ag District          object
Ag District Code    float64
County               object
County ANSI         float64
Value                 int32
LandArea            float64
Areaname             object
dtype: object

In [None]:
CensusDataByCounty_rename_df.to_csv (r'data/CountyDataForJson.csv', index = False, header=True)

Bees Colony Loss Data

In [None]:
# Store beeColonyLoss xlsx into DataFrame
beeColonyLoss_file = "data/beeColonyLossCSV.csv"
beeColonyLoss_df = pd.read_csv(beeColonyLoss_file)
beeColonyLoss_df

In [None]:
# Create new data for ColonyLoss_df w selected columns - Removing 'Season' since all rows have the same value
ColonyLoss_df = beeColonyLoss_df[['Year', 'State', 'Total Annual Loss', 'Beekeepers',
       'Beekeepers Exclusive to State', 'Colonies',
       'Colonies Exclusive to State']].copy()

ColonyLoss_renamed_df = ColonyLoss_df.rename(columns={
    'Total Annual Loss':'Total Annual Loss_pct',
    'Beekeepers Exclusive to State': 'Beekeepers Exclusive to State_pct',
    'Colonies Exclusive to State': 'Colonies Exclusive to State_pct'
})

In [None]:
# Drop rows with NaN values
ColonyLoss_renamed_df = ColonyLoss_renamed_df.dropna(how='any')
ColonyLoss_renamed_df

In [None]:
# Convert columns with string of numbers to float
ColonyLoss_renamed_df['Total Annual Loss_pct'] = ColonyLoss_renamed_df['Total Annual Loss_pct'].str.replace('%', '').astype(float)
ColonyLoss_renamed_df['Beekeepers Exclusive to State_pct'] = ColonyLoss_renamed_df['Beekeepers Exclusive to State_pct'].str.replace('%', '').astype(float)
ColonyLoss_renamed_df['Colonies Exclusive to State_pct'] = ColonyLoss_renamed_df['Colonies Exclusive to State_pct'].str.replace('%', '').astype(float)

# ColonyLoss_renamed_df.dtypes
ColonyLoss_renamed_df

In [None]:
# Cleaning up the year column to show a single year
ColonyLoss_renamed_df['Year'] = ColonyLoss_renamed_df['Year'].str.slice(0, 4)
ColonyLoss_renamed_df['Year'] = ColonyLoss_renamed_df['Year'].astype(int)
ColonyLoss_renamed_df

In [None]:
# Add an id column
ColonyLoss_renamed_df.reset_index(drop=False, inplace=True)
ColonyLoss_renamed_df = ColonyLoss_renamed_df.rename(columns={'index':'id'})
ColonyLoss_renamed_df.set_index('id', inplace=True)

ColonyLoss_renamed_df.dtypes

Bee Colony Survey Data By State

In [None]:
# Store beeColonySurveyDataByState csv into DataFrame
beeColonySurveyDataByState = "data/beeColonySurveyDataByState.csv"
beeColonySurveyDataByState_df = pd.read_csv(beeColonySurveyDataByState)
beeColonySurveyDataByState_df

In [None]:
# Create new data for SurveyDataByState_df w selected columns - removing 'Week Ending', 'Watershed', 'CV (%)' b/c none of the rows have data
SurveyDataByState_df = beeColonySurveyDataByState_df[[
    'Year', 'Period', 'State', 'State ANSI', 'Data Item', 'Value']].copy()

# Note: There's no NaN values
SurveyDataByState_df = SurveyDataByState_df.dropna(how='any') 
SurveyDataByState_df

In [None]:
# Convert columns with string of numbers to float
SurveyDataByState_df['Value'] = SurveyDataByState_df.Value.str.replace(',', '').astype(int)

SurveyDataByState_df

In [None]:
# Add an id column
SurveyDataByState_df.reset_index(drop=False, inplace=True)
SurveyDataByState_df = SurveyDataByState_df.rename(columns={'index':'id'})
SurveyDataByState_df.set_index('id', inplace=True)

SurveyDataByState_df.dtypes

# LOAD

In [None]:
# Create database connection
engine = create_engine("sqlite:///data/bee_colony.sqlite")
conn = engine.connect()

In [None]:
# Confirm tables
engine.table_names()

In [None]:
# Load SurveyDataByState_df into database
SurveyDataByState_df.to_sql(name='census_state', con=engine, if_exists='append', index=True)

In [None]:
# Load CensusDataByCounty_df into database
CensusDataByCounty_rename_df.to_sql(name='census_county', con=engine, if_exists='append', index=True)

In [None]:
# Load ColonyLoss_renamed_df into database
ColonyLoss_renamed_df.to_sql(name='colonyloss', con=engine, if_exists='append', index=True)

In [None]:
# Verify we can query from the database
pd.read_sql_query('select * from census_state', con=engine).head()

In [None]:
pd.read_sql_query('select * from census_county', con=engine).head()

In [None]:
pd.read_sql_query('select * from colonyloss', con=engine).head()