In [1]:
"""
The population data has been taken from https://wonder.cdc.gov/bridged-race-population.html
The data is in the form of tab-separated values, with additional info towards the end of the document, starting with the text "---"
We have downloaded the data from 1990 to 2010 separately, and then 2010 to 2020 separately, to maintain the file sizes and ensure complete download of data. We will combine the two in this code.
We will also remove the additional info at the end of the document while reading the file.
The raw downloaded data has been saved in the folder "00_raw_data" as "pop_1990_2010.txt" and "pop_2010_2020.txt" 
"""

'\nThe population data has been taken from https://wonder.cdc.gov/bridged-race-population.html\nThe data is in the form of tab-separated values, with additional info towards the end of the document, starting with the text "---"\nWe have downloaded the data from 1990 to 2010 separately, and then 2010 to 2020 separately, to maintain the file sizes and ensure complete download of data. We will combine the two in this code.\nWe will also remove the additional info at the end of the document while reading the file.\nThe raw downloaded data has been saved in the folder "00_raw_data" as "pop_1990_2010.txt" and "pop_2010_2020.txt" \n'

In [2]:
# Importing libraries
import pandas as pd
import numpy as np

In [3]:
# Individual file links in the 00_raw_data folder
data_1990_2010 = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/population_cleaning/00_Source_Data/US_Pop_1990_2010.txt"
data_2010_2020 = "https://github.com/MIDS-at-Duke/opioid-2023-group-8-final-opioid/raw/population_cleaning/00_Source_Data/US_Pop_2010_2020.txt"

In [4]:
# Reading the file
# We will read the files in chunks of one line, and stop reading when we encounter the text "---"
df_1990_2010 = pd.DataFrame()
df_2010_2020 = pd.DataFrame()


def stop_reading(line):
    return "---" not in line


reader = pd.read_csv(data_1990_2010, iterator=True, chunksize=1, sep="\t")
for chunk in reader:
    if stop_reading(chunk.to_string()):
        df_1990_2010 = pd.concat([df_1990_2010, chunk])
    else:
        break

reader = pd.read_csv(data_2010_2020, iterator=True, chunksize=1, sep="\t")
for chunk in reader:
    if stop_reading(chunk.to_string()):
        df_2010_2020 = pd.concat([df_2010_2020, chunk])
    else:
        break

In [5]:
df_1990_2010.shape
# (66129, 8)

(66129, 8)

In [6]:
df_2010_2020.shape
# (31490, 8)

(31490, 8)

In [7]:
# Combining the two dataframes to get the entire 1990 to 2020 data in one dataframe
df_1990_2020 = pd.concat([df_1990_2010, df_2010_2020], ignore_index=True)

In [8]:
df_1990_2020.shape
# (97619, 8)

(97619, 8)

In [9]:
df_1990_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97619 entries, 0 to 97618
Data columns (total 8 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Notes                           0 non-null      float64
 1   Yearly July 1st Estimates       97619 non-null  int64  
 2   Yearly July 1st Estimates Code  97619 non-null  int64  
 3   State                           97619 non-null  object 
 4   State Code                      97619 non-null  int64  
 5   County                          97619 non-null  object 
 6   County Code                     97619 non-null  int64  
 7   Population                      97619 non-null  object 
dtypes: float64(1), int64(4), object(3)
memory usage: 6.0+ MB


In [10]:
# Dropping the unnecessary Notes column
df_1990_2020 = df_1990_2020.drop("Notes", axis=1)
# Dropping the duplicated Yearly July 1st Estimates column
df_1990_2020 = df_1990_2020.drop("Yearly July 1st Estimates", axis=1)

# Renaming the Year column
df_1990_2020 = df_1990_2020.rename(columns={"Yearly July 1st Estimates Code": "Year"})

In [11]:
df_1990_2020.head()

Unnamed: 0,Year,State,State Code,County,County Code,Population
0,1990,Alabama,1,"Autauga County, AL",1001,34353
1,1990,Alabama,1,"Baldwin County, AL",1003,98955
2,1990,Alabama,1,"Barbour County, AL",1005,25505
3,1990,Alabama,1,"Bibb County, AL",1007,16697
4,1990,Alabama,1,"Blount County, AL",1009,39473


In [12]:
# Checking for duplicates at a state-county-year level
df_1990_2020[df_1990_2020[["State Code", "County Code", "Year"]].duplicated()].size
# Since the size is 0, there are no duplicates at a state-county-year level

0

In [20]:
# Changing data types according to requirements
df_1990_2020["Population"] = df_1990_2020["Population"].replace(
    "Missing", np.nan
)  # Replacing "Missing" with NaN
df_1990_2020["Population"] = df_1990_2020["Population"].astype(
    "float64"
)  # Changing the data type to float
df_1990_2020["State Code"] = df_1990_2020["State Code"].astype(
    "object"
)  # Changing the data type to object
df_1990_2020["County Code"] = df_1990_2020["County Code"].astype(
    "object"
)  # Changing the data type to object
df_1990_2020["Year"] = pd.to_datetime(
    df_1990_2020["Year"], format="%Y"
)  # Changing the data type to datetime

In [14]:
# Checking if there is missing data in Population for more than 1 year in a row for a state-county combination
grouped = df_1990_2020.sort_values("Year").groupby(["State Code", "County Code"])

missing_population = grouped.apply(
    lambda x: x["Population"].isnull().astype(int).diff().abs().max() > 1
)

missing_population[missing_population].shape
# (0,)

(0,)

In [15]:
# Filling NA values in population based on interpolation of values from the same state-county from other years
df_1990_2020.sort_values("Year", inplace=True)
df_1990_2020["Population_filled"] = df_1990_2020["Population"]
df_1990_2020["Population_filled"] = df_1990_2020.groupby(["State Code", "County Code"])[
    "Population_filled"
].transform(lambda group: group.interpolate())
df_1990_2020.dropna(subset=["Population_filled"], inplace=True)

In [16]:
df_1990_2020

Unnamed: 0,Year,State,State Code,County,County Code,Population,Population_filled
0,1990-01-01,Alabama,1,"Autauga County, AL",1001,34353.0,34353.0
2094,1990-01-01,Ohio,39,"Lorain County, OH",39093,271711.0,271711.0
2095,1990-01-01,Ohio,39,"Lucas County, OH",39095,462634.0,462634.0
2096,1990-01-01,Ohio,39,"Madison County, OH",39097,37111.0,37111.0
2097,1990-01-01,Ohio,39,"Mahoning County, OH",39099,265095.0,265095.0
...,...,...,...,...,...,...,...
95522,2020-01-01,Kentucky,21,"Jackson County, KY",21109,13340.0,13340.0
95523,2020-01-01,Kentucky,21,"Jefferson County, KY",21111,767452.0,767452.0
95524,2020-01-01,Kentucky,21,"Jessamine County, KY",21113,54057.0,54057.0
95526,2020-01-01,Kentucky,21,"Kenton County, KY",21117,167949.0,167949.0


In [17]:
df_1990_2020[df_1990_2020["Population"].isnull()]

Unnamed: 0,Year,State,State Code,County,County Code,Population,Population_filled
34416,2000-01-01,Virginia,51,"Clifton Forge city, VA",51560,,4378.0
31588,2000-01-01,Alaska,2,"Wrangell-Petersburg Census Area, AK",2280,,6681.0
31581,2000-01-01,Alaska,2,"Prince of Wales-Outer Ketchikan Census Area, AK",2201,,6210.0
31584,2000-01-01,Alaska,2,"Skagway-Hoonah-Angoon Census Area, AK",2232,,3453.0
37565,2001-01-01,Virginia,51,"Clifton Forge city, VA",51560,,4378.0
...,...,...,...,...,...,...,...
97391,2020-01-01,Virginia,51,"Bedford city, VA",51515,,6174.0
94568,2020-01-01,Alaska,2,"Wrangell-Petersburg Census Area, AK",2280,,6681.0
94566,2020-01-01,Alaska,2,"Valdez-Cordova Census Area, AK",2261,,9492.0
94564,2020-01-01,Alaska,2,"Skagway-Hoonah-Angoon Census Area, AK",2232,,3453.0


In [18]:
df_1990_2020[df_1990_2020["Population"].isnull()]["State"].value_counts()

State
Alaska      74
Virginia    32
Name: count, dtype: int64

In [19]:
df_1990_2020.info()

<class 'pandas.core.frame.DataFrame'>
Index: 97519 entries, 0 to 97618
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Year               97519 non-null  datetime64[ns]
 1   State              97519 non-null  object        
 2   State Code         97519 non-null  object        
 3   County             97519 non-null  object        
 4   County Code        97519 non-null  object        
 5   Population         97413 non-null  float64       
 6   Population_filled  97519 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 6.0+ MB
