In [None]:
# Import library
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import username, password

# EXTRACT & TRANSFORM

Bee Colony Census Data By County

In [None]:
# Read the county-level bee colony census CSV into a DataFrame and display it
beeColonyCensusDataByCounty_file = "data/beeColonyCensusDataByCounty.csv"
beeColonyCensusDataByCounty_df = pd.read_csv(
    filepath_or_buffer=beeColonyCensusDataByCounty_file
)
beeColonyCensusDataByCounty_df

In [None]:
# Keep only the columns used downstream. 'Period' is left out because, per the
# original note, every row carries the same value.
census_county_columns = [
    'Year',
    'State',
    'State ANSI',
    'Ag District',
    'Ag District Code',
    'County',
    'County ANSI',
    'Value',
    'CV (%)',
]
CensusDataByCounty_df = beeColonyCensusDataByCounty_df.loc[:, census_county_columns].copy()

# Rename 'CV (%)' to 'CV_pct'
CensusDataByCounty_rename_df = CensusDataByCounty_df.rename(
    columns={'CV (%)': 'CV_pct'}
)
CensusDataByCounty_rename_df

In [None]:
# Drop every row that has a NaN in any of the selected columns
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.dropna(how='any')

# Exclude rows whose 'Value' is '(D)' or whose 'CV_pct' is '(H)' or '(D)';
# these placeholder strings cannot be cast to int/float in the next step.
keep_rows = (
    (CensusDataByCounty_rename_df['Value'] != '(D)')
    & ~CensusDataByCounty_rename_df['CV_pct'].isin(['(H)', '(D)'])
)

# .copy() makes the filtered result an independent frame, so the column
# assignments that follow do not raise SettingWithCopyWarning.
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.loc[keep_rows].copy()

In [None]:
# Convert the string columns that hold numbers to int ('Value') and float ('CV_pct').
# - Commas in 'Value' (presumably thousands separators) are stripped before the int cast.
# - astype(str) first keeps this cell re-runnable: once 'Value' is already int,
#   using the .str accessor directly would raise AttributeError.
# - assign() builds a new frame instead of writing into a filtered slice, which
#   avoids SettingWithCopyWarning.
CensusDataByCounty_rename_df = CensusDataByCounty_rename_df.assign(
    Value=lambda frame: (
        frame['Value'].astype(str).str.replace(',', '', regex=False).astype(int)
    ),
    CV_pct=lambda frame: frame['CV_pct'].astype(float),
)

CensusDataByCounty_rename_df

In [None]:
# Turn the existing row labels into an index named 'id'.
# The labels are kept as-is, so gaps left by previously dropped rows remain.
CensusDataByCounty_rename_df = (
    CensusDataByCounty_rename_df
    .reset_index(drop=False)
    .rename(columns={'index': 'id'})
    .set_index('id')
)

CensusDataByCounty_rename_df.dtypes

Bee Colony Loss Data

In [None]:
# Read the bee colony loss CSV into a DataFrame and display it
beeColonyLoss_file = "data/beeColonyLossCSV.csv"
beeColonyLoss_df = pd.read_csv(filepath_or_buffer=beeColonyLoss_file)
beeColonyLoss_df

In [None]:
# Keep only the columns used downstream. 'Season' is left out because, per the
# original note, every row carries the same value.
colony_loss_columns = [
    'Year',
    'State',
    'Total Annual Loss',
    'Beekeepers',
    'Beekeepers Exclusive to State',
    'Colonies',
    'Colonies Exclusive to State',
]
ColonyLoss_df = beeColonyLoss_df.loc[:, colony_loss_columns].copy()

# Give the three columns that are later parsed as percentages a '_pct' suffix
pct_column_names = {
    'Total Annual Loss': 'Total Annual Loss_pct',
    'Beekeepers Exclusive to State': 'Beekeepers Exclusive to State_pct',
    'Colonies Exclusive to State': 'Colonies Exclusive to State_pct',
}
ColonyLoss_renamed_df = ColonyLoss_df.rename(columns=pct_column_names)

In [None]:
# Discard every row that has a missing value in any of the kept columns
ColonyLoss_renamed_df = ColonyLoss_renamed_df.dropna(axis=0, how='any')
ColonyLoss_renamed_df

In [None]:
# Convert the percentage columns from strings such as '12.5%' to float.
# - One loop-style expression replaces three copy-pasted statements.
# - astype(str) first keeps this cell re-runnable: once a column is already
#   float, using the .str accessor directly would raise AttributeError.
# - assign() builds a new frame instead of writing into the dropna() result,
#   which avoids SettingWithCopyWarning.
pct_columns = [
    'Total Annual Loss_pct',
    'Beekeepers Exclusive to State_pct',
    'Colonies Exclusive to State_pct',
]
ColonyLoss_renamed_df = ColonyLoss_renamed_df.assign(**{
    column: (
        ColonyLoss_renamed_df[column]
        .astype(str)
        .str.replace('%', '', regex=False)
        .astype(float)
    )
    for column in pct_columns
})

ColonyLoss_renamed_df

In [None]:
# Turn the existing row labels into an index named 'id'.
# The labels are kept as-is, so gaps left by previously dropped rows remain.
ColonyLoss_renamed_df = (
    ColonyLoss_renamed_df
    .reset_index(drop=False)
    .rename(columns={'index': 'id'})
    .set_index('id')
)

ColonyLoss_renamed_df.dtypes

Bee Colony Survey Data By State

In [None]:
# Read the state-level bee colony survey CSV into a DataFrame and display it
beeColonySurveyDataByState = "data/beeColonySurveyDataByState.csv"
beeColonySurveyDataByState_df = pd.read_csv(
    filepath_or_buffer=beeColonySurveyDataByState
)
beeColonySurveyDataByState_df

In [None]:
# Keep only the populated columns. Per the original note, 'Week Ending',
# 'Watershed' and 'CV (%)' hold no data in any row, so they are left out.
survey_columns = ['Year', 'Period', 'State', 'State ANSI', 'Data Item', 'Value']
SurveyDataByState_df = beeColonySurveyDataByState_df.loc[:, survey_columns].copy()

# Defensive only: the original note says this data contains no NaN values
SurveyDataByState_df = SurveyDataByState_df.dropna(axis=0, how='any')
SurveyDataByState_df

In [None]:
# Convert 'Value' from comma-formatted strings to int.
# - astype(str) first keeps this cell re-runnable: once 'Value' is already int,
#   using the .str accessor directly would raise AttributeError.
# - assign() builds a new frame instead of writing into the dropna() result,
#   which avoids SettingWithCopyWarning.
SurveyDataByState_df = SurveyDataByState_df.assign(
    Value=lambda frame: (
        frame['Value'].astype(str).str.replace(',', '', regex=False).astype(int)
    ),
)

SurveyDataByState_df

In [None]:
# Turn the existing row labels into an index named 'id'.
# The labels are kept as-is, so gaps left by previously dropped rows remain.
SurveyDataByState_df = (
    SurveyDataByState_df
    .reset_index(drop=False)
    .rename(columns={'index': 'id'})
    .set_index('id')
)

SurveyDataByState_df.dtypes

# LOAD

In [None]:
# Create database connection.
# The credentials are percent-encoded before being placed in the URL so that
# characters such as '@', ':', '/' or '#' in the username or password are not
# misread as URL delimiters. safe='' encodes every reserved character, and
# str() keeps non-string credentials (e.g. a numeric password) working as before.
connection_string = (
    f"{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    '@localhost:5432/bees_colonies_db'
)
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names on old and new versions.
inspect(engine).get_table_names()

In [None]:
# Append SurveyDataByState_df to the 'census_state' table; index=True also
# writes the DataFrame index as a column.
SurveyDataByState_df.to_sql(
    name='census_state',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Append CensusDataByCounty_rename_df to the 'census_county' table; index=True
# also writes the DataFrame index as a column.
CensusDataByCounty_rename_df.to_sql(
    name='census_county',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Append ColonyLoss_renamed_df to the 'colonyloss' table; index=True also
# writes the DataFrame index as a column.
ColonyLoss_renamed_df.to_sql(
    name='colonyloss',
    con=engine,
    if_exists='append',
    index=True,
)

In [None]:
# Read 'census_state' back from the database and show its first rows
pd.read_sql_query(
    sql='select * from census_state',
    con=engine,
).head()

In [None]:
# Read 'census_county' back from the database and show its first rows
pd.read_sql_query(
    sql='select * from census_county',
    con=engine,
).head()

In [None]:
# Read 'colonyloss' back from the database and show its first rows
pd.read_sql_query(
    sql='select * from colonyloss',
    con=engine,
).head()