# FIPS Code + Shipment Data

In [51]:
# Importing required packages
import pandas as pd
import numpy as np
import warnings

# Set default option
pd.set_option("mode.copy_on_write", True)
warnings.simplefilter(action="ignore", category=FutureWarning)

In [52]:
# Load dataset
shipment = pd.read_parquet("../../Data/processed/shipment_eda.parquet")
fips_codes = pd.read_csv("../../Data/raw/county_fips.csv")

In [53]:
# Perform the merge operation directly since the columns are already in all caps
shipment_fips = pd.merge(
    shipment,
    fips_codes[["BUYER_COUNTY", "BUYER_STATE", "countyfips"]],
    on=["BUYER_COUNTY", "BUYER_STATE"],
    how="left",
)

In [54]:
# Fill NaN values with a placeholder (0) before converting to integers
shipment_fips["countyfips"] = (
    shipment_fips["countyfips"].fillna(0).astype(int).astype(str).str.zfill(5)
)

# Replace the placeholder '00000' back to NaN
shipment_fips["countyfips"] = shipment_fips["countyfips"].replace("00000", np.nan)

# Check the first few rows of the adjusted dataset
print(shipment_fips.sample(5))

      BUYER_STATE BUYER_COUNTY  YEAR           MME countyfips
41098          WI  FOND DU LAC  2016  2.347673e+07      55039
15812          MA    HAMPSHIRE  2011  7.414830e+07      25015
36627          TX         LYNN  2013  3.577728e+05      48305
20517          MS      CALHOUN  2014  2.801195e+06      28013
13820          KY       HARLAN  2011  1.971263e+07      21095


In [55]:
# Drop the rows where 'BUYER_STATE' is 'AR' from the shipment_fips DataFrame, Montgomery changed name. 
shipment_fips = shipment_fips[shipment_fips['BUYER_COUNTY'] != 'MONTGOMERY']

In [56]:
# Checking for NaNs
nan_fips = shipment_fips[shipment_fips["countyfips"].isna()]

# Group by 'BUYER_STATE' and 'BUYER_COUNTY' and count the NaN values
nan_counts = (
    nan_fips.groupby(["BUYER_STATE", "BUYER_COUNTY"])
    .size()
    .reset_index(name="NaN_count")
)

# Print the result
print(nan_counts.head(5))

  BUYER_STATE              BUYER_COUNTY  NaN_count
0          GU                      GUAM         14
1          MP  NORTHERN MARIANA ISLANDS          9
2          MP                    SAIPAN          5
3          PR                  ADJUNTAS         14
4          PR                    AGUADA         14


# Merged shipment mapped with Population

In [57]:
# Load dataset
population = pd.read_parquet("../../Data/processed/population.parquet")
state_codes = pd.read_csv("../../Data/raw/us_states-ab.csv")

In [58]:
# Merge the population dataset with the state abbreviations dataset
population_state_code = pd.merge(
    population, state_codes, left_on="State", right_on="state", how="left"
)

# Drop the columns we don't need
population_state_code = population_state_code.drop(
    columns=["state", "abbrev", "State_Code"]
)

# Rename the 'code' column from the state_codes DataFrame to 'State_Code'
population_state_code.rename(columns={"code": "State_Code"}, inplace=True)

In [59]:
# Convert 'year' column to nullable integer if it's in 'shipment_fips'
shipment_fips["YEAR"] = shipment_fips["YEAR"].astype("Int64")

# Ensure county names and state codes are in the same format
shipment_fips["BUYER_COUNTY"] = shipment_fips["BUYER_COUNTY"].str.upper().str.strip()
population_state_code["County"] = (
    population_state_code["County"].str.upper().str.strip()
)

# Convert 'population' column to nullable integer if it's in 'population_state_code'
population_state_code["Population"] = population_state_code["Population"].astype(
    "Int64"
)


# Merge datasets on BUYER_STATE with State_Code, BUYER_COUNTY with County, and YEAR with Year
shipment_with_population = pd.merge(
    shipment_fips,
    population_state_code,
    left_on=["BUYER_STATE", "BUYER_COUNTY", "YEAR"],
    right_on=["State_Code", "County", "Year"],
    how="left",
)

# Check the first few rows 
print(shipment_with_population.head(2))

  BUYER_STATE BUYER_COUNTY  YEAR           MME countyfips    State   County  \
0          AL      AUTAUGA  2006  1.710054e+07      01001  Alabama  AUTAUGA   
1          AL      AUTAUGA  2007  1.915784e+07      01001  Alabama  AUTAUGA   

  County_Code    Year  Population State_Code  
0       01001  2006.0       51328         AL  
1       01001  2007.0       52405         AL  


In [60]:
# Drop redundant columns
final_shipment_data = shipment_with_population.drop(
    columns=["State", "County", "County_Code", "Year", "State_Code"]
)

# Check the first few rows of the resulting DataFrame
print(final_shipment_data.sample(10))

      BUYER_STATE BUYER_COUNTY  YEAR           MME countyfips  Population
7985           IA       MONONA  2015  2.536995e+06      19133        8865
10540          IN         CLAY  2007  7.601827e+06      18021       26983
6918           GA    WHITFIELD  2012  4.254906e+07      13313      102926
41860          WV      LINCOLN  2008  7.543433e+06      54043       21881
35796          TX       HARRIS  2009  1.014596e+09      48201     4034866
12723          KS         RICE  2006  1.338461e+06      20159       10224
17149          MI     MONTCALM  2018  1.670766e+07      26117        <NA>
8187           IA       SHELBY  2007  1.645765e+06      19165       12496
41494          WI     WAUKESHA  2006  1.110510e+08      55133      380799
13992          KY      LETCHER  2015  1.674314e+07      21133       23043


In [61]:
# Filter the DataFrame for years up to 2015
shipment_data_up_to_2015 = final_shipment_data[final_shipment_data["YEAR"] <= 2015]

# Filter rows in the filtered DataFrame where Population is NaN or zero
nan_or_zero_population_up_to_2015 = shipment_data_up_to_2015[
    (shipment_data_up_to_2015["Population"].isna())
    | (shipment_data_up_to_2015["Population"] == 0)
]

# Drop duplicates based on state, county, and year
unique_nan_or_zero_population_up_to_2015 = (
    nan_or_zero_population_up_to_2015.drop_duplicates(
        subset=["BUYER_STATE", "BUYER_COUNTY", "YEAR"]
    )
)

# Print the number of unique rows by year
print(unique_nan_or_zero_population_up_to_2015["YEAR"].value_counts())

YEAR
2006    83
2007    83
2008    83
2009    83
2013    83
2010    82
2011    82
2012    82
2014    82
2015    81
Name: count, dtype: Int64


In [62]:
# Select state and county columns
unique_states_counties_with_nan_population = unique_nan_or_zero_population_up_to_2015[
    ["BUYER_STATE", "BUYER_COUNTY"]
]

# Drop duplicates to get unique combinations
unique_state_county_combinations = (
    unique_states_counties_with_nan_population.drop_duplicates()
)

In [63]:
# Get unique state names for NaN population values
unique_states_with_nan_population = unique_nan_or_zero_population_up_to_2015[
    "BUYER_STATE"
].unique()

# Print the unique state names
print(unique_states_with_nan_population)

['GU' 'MP' 'PR' 'PW' 'VA' 'VI']


In [64]:
# Extract the unique states with NaN or zero population
states_to_drop = unique_nan_or_zero_population_up_to_2015['BUYER_STATE'].unique()

# Filter out data from these states in the 'shipment_data_up_to_2015' DataFrame
filtered_shipment_data = shipment_data_up_to_2015[~shipment_data_up_to_2015['BUYER_STATE'].isin(states_to_drop)]

# Check the resulting DataFrame
print(filtered_shipment_data.sample(5))

      BUYER_STATE BUYER_COUNTY  YEAR           MME countyfips  Population
32347          SC    MCCORMICK  2012  1.364684e+06      45065        9926
24046          ND        WELLS  2007  2.986549e+05      38103        4317
27262          OH     CUYAHOGA  2007  2.843334e+08      39035     1301540
13570          KY     FRANKLIN  2013  1.801955e+07      21073       49482
30091          PA   CUMBERLAND  2014  1.084666e+08      42041      243400


In [65]:
# Check for NaN values in all columns of the filtered dataset
nan_columns_exist = filtered_shipment_data.isna().any()

# Print the result
print(nan_columns_exist)

BUYER_STATE     False
BUYER_COUNTY    False
YEAR            False
MME             False
countyfips      False
Population      False
dtype: bool


In [None]:
# Define the file path and name for the parquet file
file_path = '../../Data/processed/shipment_corrected.parquet'

# Save the DataFrame to a parquet file
filtered_shipment_data.to_parquet(file_path)