In [30]:
# Importing required packages
import pandas as pd
import numpy as np

# Turn on pandas copy-on-write mode for the whole session
pd.options.mode.copy_on_write = True

In [31]:
# Read the raw population file; read_table splits on tabs by default
df = pd.read_table("../../Data/raw/raw_population.txt")

In [3]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,Notes,State,State Code,County,County Code,Yearly July 1st Estimates,Yearly July 1st Estimates Code,Population
0,,Alabama,1.0,"Autauga County, AL",1001.0,2003.0,2003.0,46800
1,,Alabama,1.0,"Autauga County, AL",1001.0,2004.0,2004.0,48366
2,,Alabama,1.0,"Autauga County, AL",1001.0,2005.0,2005.0,49676
3,,Alabama,1.0,"Autauga County, AL",1001.0,2006.0,2006.0,51328
4,,Alabama,1.0,"Autauga County, AL",1001.0,2007.0,2007.0,52405


In [4]:
# Show the rows that carry text in the "Notes" column
df.loc[df["Notes"].notna()]

Unnamed: 0,Notes,State,State Code,County,County Code,Yearly July 1st Estimates,Yearly July 1st Estimates Code,Population
40937,---,,,,,,,
40938,Dataset: Bridged-Race Population Estimates 199...,,,,,,,
40939,Query Parameters:,,,,,,,
40940,Yearly July 1st Estimates: 2003; 2004; 2005; 2...,,,,,,,
40941,Group By: State; County; Yearly July 1st Estim...,,,,,,,
...,...,...,...,...,...,...,...,...
41032,City are available only for the years prior to...,,,,,,,
41033,1999 and 2000 due to the addition of population.,,,,,,,
41034,"20. South Boston City, Virginia (FIPS code 517...",,,,,,,
41035,"June 30, 1995. This change was made retroactiv...",,,,,,,


In [5]:
# Dropping the Notes column
df1 = df.drop("Notes", axis="columns")

In [6]:
# Removing the NA rows produced by the Notes section, using the State column as the reference
df1 = df1[df1["State"].notna()]

In [7]:
# Exclude every Alaska row
df1 = df1.loc[df1["State"].ne("Alaska")]

In [8]:
# Spot-check five random rows (no seed is set, so the rows differ between runs)
df1.sample(n=5)

Unnamed: 0,State,State Code,County,County Code,Yearly July 1st Estimates,Yearly July 1st Estimates Code,Population
10997,Iowa,19.0,"Johnson County, IA",19103.0,2015.0,2015.0,145114
20623,Missouri,29.0,"Scotland County, MO",29199.0,2008.0,2008.0,4818
27364,Ohio,39.0,"Montgomery County, OH",39113.0,2015.0,2015.0,532059
39390,West Virginia,54.0,"Monroe County, WV",54063.0,2003.0,2003.0,13349
16020,Massachusetts,25.0,"Norfolk County, MA",25021.0,2007.0,2007.0,656582


In [9]:
# Print column dtypes and non-null counts of the filtered frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40495 entries, 0 to 40936
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   State                           40495 non-null  object 
 1   State Code                      40495 non-null  float64
 2   County                          40495 non-null  object 
 3   County Code                     40495 non-null  float64
 4   Yearly July 1st Estimates       40495 non-null  float64
 5   Yearly July 1st Estimates Code  40495 non-null  float64
 6   Population                      40495 non-null  object 
dtypes: float64(4), object(3)
memory usage: 2.5+ MB


In [10]:
# Validate that the yearly estimate column and its "Code" twin hold identical values
df1["Yearly July 1st Estimates Code"].equals(df1["Yearly July 1st Estimates"])

True

In [11]:
# Correct the data types in one pass; assign() returns a new frame, so df1 is left untouched
df2 = df1.assign(
    **{
        # State code: float -> int -> 2-character zero-padded string
        "State Code": lambda frame: frame["State Code"]
        .astype(int)
        .astype(str)
        .str.zfill(2),
        # County code: float -> int -> 5-character zero-padded string
        "County Code": lambda frame: frame["County Code"]
        .astype(int)
        .astype(str)
        .str.zfill(5),
        # Year: float -> integer
        "Yearly July 1st Estimates": lambda frame: frame[
            "Yearly July 1st Estimates"
        ].astype(int),
        # Population: the "Missing" marker is swapped for 0 for now, then cast to integer
        # NOTE(review): 0 is a placeholder, not a real count — confirm how downstream code treats it
        "Population": lambda frame: frame["Population"]
        .replace("Missing", 0)
        .astype(int),
    }
)

In [12]:
# Spot-check five random rows after the type conversion (unseeded)
df2.sample(n=5)

Unnamed: 0,State,State Code,County,County Code,Yearly July 1st Estimates,Yearly July 1st Estimates Code,Population
25656,North Carolina,37,"Rockingham County, NC",37157,2010,2010.0,93681
22002,Nebraska,31,"Gage County, NE",31067,2009,2009.0,22364
15506,Maine,23,"Piscataquis County, ME",23021,2013,2013.0,17176
88,Alabama,1,"Butler County, AL",1013,2013,2013.0,20359
37566,Virginia,51,"Nottoway County, VA",51135,2012,2012.0,15758


In [13]:
# Print column dtypes and non-null counts to confirm the type conversion
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40495 entries, 0 to 40936
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   State                           40495 non-null  object 
 1   State Code                      40495 non-null  object 
 2   County                          40495 non-null  object 
 3   County Code                     40495 non-null  object 
 4   Yearly July 1st Estimates       40495 non-null  int64  
 5   Yearly July 1st Estimates Code  40495 non-null  float64
 6   Population                      40495 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 2.5+ MB


In [14]:
# Rename the columns, then keep the six listed columns in this order
# (the "Yearly July 1st Estimates Code" column is not selected and so is dropped)
df3 = df2.rename(
    columns={
        "Yearly July 1st Estimates": "Year",
        "State Code": "State_Code",
        "County Code": "County_Code",
    }
).loc[:, ["State", "State_Code", "County", "County_Code", "Year", "Population"]]

In [15]:
# View five random rows of the result (unseeded)
df3.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Population
12871,Kansas,20,"Trego County, KS",20195,2004,3120
19749,Missouri,29,"DeKalb County, MO",29063,2005,12727
19927,Missouri,29,"Howard County, MO",29089,2014,10131
25366,North Carolina,37,"Macon County, NC",37113,2006,32736
20394,Missouri,29,"Phelps County, MO",29161,2013,45030


In [16]:
# Load the county FIPS lookup that is used to attach proper county names
fips = pd.read_csv("../../Data/raw/county_fips.csv", sep=",")

In [17]:
# Preview the first five rows of the FIPS lookup
fips.head(n=5)

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,countyfips
0,AUTAUGA,AL,1001
1,BALDWIN,AL,1003
2,BARBOUR,AL,1005
3,BIBB,AL,1007
4,BLOUNT,AL,1009


In [18]:
# Zero-pad the FIPS code to a 5-character string so it is consistent with County_Code
fips = fips.assign(countyfips=fips["countyfips"].astype(str).str.zfill(5))

In [19]:
# Left join onto the FIPS lookup to get the county names.
# validate raises if a FIPS code appears more than once in the lookup;
# indicator adds a "_merge" column recording where each row matched.
df4 = df3.merge(
    fips,
    how="left",
    left_on="County_Code",
    right_on="countyfips",
    validate="many_to_one",
    indicator=True,
)

In [20]:
# Count matched ("both") versus unmatched ("left_only") rows from the join
df4.loc[:, "_merge"].value_counts()

_merge
both          40456
left_only        39
right_only        0
Name: count, dtype: int64

In [21]:
# Rows that found no partner in the FIPS lookup
df4.loc[df4["_merge"].eq("left_only")]

Unnamed: 0,State,State_Code,County,County_Code,Year,Population,BUYER_COUNTY,BUYER_STATE,countyfips,_merge
1690,Arkansas,5,"Montgomery County, AR",5097,2003,9239,,,,left_only
1691,Arkansas,5,"Montgomery County, AR",5097,2004,9334,,,,left_only
1692,Arkansas,5,"Montgomery County, AR",5097,2005,9358,,,,left_only
1693,Arkansas,5,"Montgomery County, AR",5097,2006,9437,,,,left_only
1694,Arkansas,5,"Montgomery County, AR",5097,2007,9478,,,,left_only
1695,Arkansas,5,"Montgomery County, AR",5097,2008,9573,,,,left_only
1696,Arkansas,5,"Montgomery County, AR",5097,2009,9490,,,,left_only
1697,Arkansas,5,"Montgomery County, AR",5097,2010,9515,,,,left_only
1698,Arkansas,5,"Montgomery County, AR",5097,2011,9404,,,,left_only
1699,Arkansas,5,"Montgomery County, AR",5097,2012,9344,,,,left_only


In [22]:
# Distinct county labels among the unmatched rows
df4.loc[df4["_merge"].eq("left_only"), "County"].unique()

array(['Montgomery County, AR', 'Kalawao County, HI',
       'Oglala Lakota County, SD'], dtype=object)

In [23]:
# Counties that the FIPS lookup did not match, mapped to the upper-case
# name to store in BUYER_COUNTY.
missing_county_names = {
    "Montgomery County, AR": "MONTGOMERY",
    "Kalawao County, HI": "KALAWAO",
    "Oglala Lakota County, SD": "OGLALA LAKOTA",
}

# Rows whose County is in the mapping take the mapped name; every other row
# keeps the BUYER_COUNTY value it already has.
df4["BUYER_COUNTY"] = (
    df4["County"].map(missing_county_names).fillna(df4["BUYER_COUNTY"])
)

# Fail loudly if any row still lacks a county name, instead of letting a
# NaN flow into the final County column.
still_unnamed = df4.loc[df4["BUYER_COUNTY"].isna(), "County"].unique()
assert len(still_unnamed) == 0, (
    f"counties still missing a BUYER_COUNTY name: {list(still_unnamed)}"
)

In [24]:
# Re-inspect the unmatched rows to confirm BUYER_COUNTY is now populated
df4.loc[df4["_merge"].eq("left_only")]

Unnamed: 0,State,State_Code,County,County_Code,Year,Population,BUYER_COUNTY,BUYER_STATE,countyfips,_merge
1690,Arkansas,5,"Montgomery County, AR",5097,2003,9239,MONTGOMERY,,,left_only
1691,Arkansas,5,"Montgomery County, AR",5097,2004,9334,MONTGOMERY,,,left_only
1692,Arkansas,5,"Montgomery County, AR",5097,2005,9358,MONTGOMERY,,,left_only
1693,Arkansas,5,"Montgomery County, AR",5097,2006,9437,MONTGOMERY,,,left_only
1694,Arkansas,5,"Montgomery County, AR",5097,2007,9478,MONTGOMERY,,,left_only
1695,Arkansas,5,"Montgomery County, AR",5097,2008,9573,MONTGOMERY,,,left_only
1696,Arkansas,5,"Montgomery County, AR",5097,2009,9490,MONTGOMERY,,,left_only
1697,Arkansas,5,"Montgomery County, AR",5097,2010,9515,MONTGOMERY,,,left_only
1698,Arkansas,5,"Montgomery County, AR",5097,2011,9404,MONTGOMERY,,,left_only
1699,Arkansas,5,"Montgomery County, AR",5097,2012,9344,MONTGOMERY,,,left_only


In [25]:
# Final verifications: print column dtypes and non-null counts of the merged frame
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40495 entries, 0 to 40494
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   State         40495 non-null  object  
 1   State_Code    40495 non-null  object  
 2   County        40495 non-null  object  
 3   County_Code   40495 non-null  object  
 4   Year          40495 non-null  int64   
 5   Population    40495 non-null  int64   
 6   BUYER_COUNTY  40495 non-null  object  
 7   BUYER_STATE   40456 non-null  object  
 8   countyfips    40456 non-null  object  
 9   _merge        40495 non-null  category
dtypes: category(1), int64(2), object(7)
memory usage: 2.8+ MB


In [26]:
# Spot-check ten random rows of the merged frame (unseeded)
df4.sample(n=10)

Unnamed: 0,State,State_Code,County,County_Code,Year,Population,BUYER_COUNTY,BUYER_STATE,countyfips,_merge
13201,Kentucky,21,"Henry County, KY",21103,2009,15530,HENRY,KY,21103,both
31452,Tennessee,47,"DeKalb County, TN",47041,2008,18693,DEKALB,TN,47041,both
19550,Missouri,29,"Jefferson County, MO",29099,2014,222066,JEFFERSON,MO,29099,both
11225,Kansas,20,"Barton County, KS",20009,2009,27526,BARTON,KS,20009,both
28365,Oregon,41,"Clackamas County, OR",41005,2015,399852,CLACKAMAS,OR,41005,both
15299,Maryland,24,"Howard County, MD",24027,2014,306989,HOWARD,MD,24027,both
3138,Colorado,8,"Gunnison County, CO",8051,2008,15250,GUNNISON,CO,8051,both
22363,Nevada,32,"Douglas County, NV",32005,2006,46855,DOUGLAS,NV,32005,both
33122,Texas,48,"Crosby County, TX",48107,2014,5826,CROSBY,TX,48107,both
36287,Virginia,51,"Accomack County, VA",51001,2007,34553,ACCOMACK,VA,51001,both


In [27]:
# Keep the required columns, then expose BUYER_COUNTY under the name "County".
# The selection must come first: df4 still holds the original "County" column,
# so renaming before selecting would create two columns with that name.
df5 = df4.loc[
    :, ["State", "State_Code", "BUYER_COUNTY", "County_Code", "Year", "Population"]
].rename(columns={"BUYER_COUNTY": "County"})

In [28]:
# Spot-check five random rows of the final frame (unseeded)
df5.sample(n=5)

Unnamed: 0,State,State_Code,County,County_Code,Year,Population
31265,Tennessee,47,CAMPBELL,47013,2003,39957
313,Alabama,1,DE KALB,1049,2004,67260
38919,West Virginia,54,MINERAL,54057,2013,27700
21512,Nebraska,31,FILLMORE,31059,2013,5635
18625,Mississippi,28,RANKIN,28121,2012,145833


In [29]:
# write to parquet in main file