## PDS Group 7

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import warnings
warnings.simplefilter("ignore", UserWarning)


pd.set_option("mode.copy_on_write", True)

In [3]:
folders = {
    "2000-2010": "population_data/2000-2010/",
    "2010-2019": "population_data/2010-2019/",
    "2020-2024": "population_data/2020-2024/"
}

# Loop through each period and list Excel files
for period_label, folder_path in folders.items():
    file_list = glob.glob(os.path.join(folder_path, "*.xlsx"))
    print(f"\nPeriod: {period_label} — {len(file_list)} files found")
    for f in file_list:
        print(f"  {os.path.basename(f)}")



Period: 2000-2010 — 6 files found
  co-est00int-01-florida.xlsx
  co-est00int-01-georgia.xlsx
  co-est00int-01-northcarolina.xlsx
  co-est00int-01-southcarolina.xlsx
  co-est00int-01-washington.xlsx
  co-est00int-01_alabama.xlsx

Period: 2010-2019 — 6 files found
  co-est2019-annres-alabama.xlsx
  co-est2019-annres-florida.xlsx
  co-est2019-annres-georgia.xlsx
  co-est2019-annres-northcarolina.xlsx
  co-est2019-annres-southcarolina.xlsx
  co-est2019-annres-washington.xlsx

Period: 2020-2024 — 6 files found
  co-est2024-pop-alabama.xlsx
  co-est2024-pop-florida.xlsx
  co-est2024-pop-georgia.xlsx
  co-est2024-pop-northcarolina.xlsx
  co-est2024-pop-southcarolina.xlsx
  co-est2024-pop-washington.xlsx


In [4]:
# Pick a sample file from 2000-2010 folder
sample_file = "population_data/2000-2010/co-est00int-01_alabama.xlsx"  # replace with one from print list

# Read Excel file, skip first row if it contains title info (adjust header if needed)
temp_df = pd.read_excel(sample_file, header=2)  # header=1 usually works for Census files
temp_df.head()


Unnamed: 0,Geographic Area,"April 1, 20001",Intercensal Estimates (as of July 1),Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,"April 1, 20102","July 1, 20103"
0,,,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
1,Alabama,4447207.0,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736.0,4785298.0
2,.Autauga County,43751.0,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54571.0,54632.0
3,.Baldwin County,140416.0,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,182265.0,183195.0
4,.Barbour County,29042.0,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27457.0,27411.0


In [5]:
temp_df.columns = temp_df.columns.str.strip()\
                                 .str.lower()\
                                 .str.replace(" ", "_")\
                                 .str.replace(r"[^a-z0-9_]", "", regex=True)
temp_df.head()



Unnamed: 0,geographic_area,april_1_20001,intercensal_estimates_as_of_july_1,unnamed_3,unnamed_4,unnamed_5,unnamed_6,unnamed_7,unnamed_8,unnamed_9,unnamed_10,unnamed_11,april_1_20102,july_1_20103
0,,,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
1,Alabama,4447207.0,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4779736.0,4785298.0
2,.Autauga County,43751.0,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54571.0,54632.0
3,.Baldwin County,140416.0,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,182265.0,183195.0
4,.Barbour County,29042.0,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27457.0,27411.0


In [6]:
temp_df['2010'] = temp_df['july_1_20103']  # use the column that has the correct 2010 data

In [7]:
cols_to_drop = ['april_1_20102', 'april_1_20001','july_1_20103']  
temp_df = temp_df.drop(columns=cols_to_drop, errors='ignore')

In [8]:
temp_df

Unnamed: 0,geographic_area,intercensal_estimates_as_of_july_1,unnamed_3,unnamed_4,unnamed_5,unnamed_6,unnamed_7,unnamed_8,unnamed_9,unnamed_10,unnamed_11,2010
0,,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,
1,Alabama,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4785298.0
2,.Autauga County,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54632.0
3,.Baldwin County,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,183195.0
4,.Barbour County,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27411.0
...,...,...,...,...,...,...,...,...,...,...,...,...
72,Note: All geographic boundaries for the 2000-2...,,,,,,,,,,,
73,Suggested Citation:,,,,,,,,,,,
74,Table 1. Intercensal Estimates of the Resident...,,,,,,,,,,,
75,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,


In [9]:
temp_df.columns


Index(['geographic_area', 'intercensal_estimates_as_of_july_1', 'unnamed_3',
       'unnamed_4', 'unnamed_5', 'unnamed_6', 'unnamed_7', 'unnamed_8',
       'unnamed_9', 'unnamed_10', 'unnamed_11', '2010'],
      dtype='object')

In [10]:
# Current columns
print(temp_df.columns)

# Create a proper column list
cols = ['geographic_area', '2000','2001','2002','2003','2004',
        '2005','2006','2007','2008','2009','2010']

# Assign to DataFrame
temp_df.columns = cols


Index(['geographic_area', 'intercensal_estimates_as_of_july_1', 'unnamed_3',
       'unnamed_4', 'unnamed_5', 'unnamed_6', 'unnamed_7', 'unnamed_8',
       'unnamed_9', 'unnamed_10', 'unnamed_11', '2010'],
      dtype='object')


In [11]:
temp_df

Unnamed: 0,geographic_area,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,
1,Alabama,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4785298.0
2,.Autauga County,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54632.0
3,.Baldwin County,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,183195.0
4,.Barbour County,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27411.0
...,...,...,...,...,...,...,...,...,...,...,...,...
72,Note: All geographic boundaries for the 2000-2...,,,,,,,,,,,
73,Suggested Citation:,,,,,,,,,,,
74,Table 1. Intercensal Estimates of the Resident...,,,,,,,,,,,
75,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,


In [None]:
# Drop the first row by index
temp_long = temp_df.iloc[1:].reset_index(drop=True)

In [15]:
temp_long

Unnamed: 0,geographic_area,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,Alabama,4452173.0,4467634.0,4480089.0,4503491.0,4530729.0,4569805.0,4628981.0,4672840.0,4718206.0,4757938.0,4785298.0
1,.Autauga County,44021.0,44889.0,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54632.0
2,.Baldwin County,141342.0,144875.0,147957.0,151509.0,156266.0,162183.0,168121.0,172404.0,175827.0,179406.0,183195.0
3,.Barbour County,29015.0,28863.0,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27411.0
4,.Bibb County,19913.0,21028.0,21199.0,21399.0,21721.0,22042.0,22099.0,22438.0,22705.0,22941.0,22867.0
...,...,...,...,...,...,...,...,...,...,...,...,...
71,Note: All geographic boundaries for the 2000-2...,,,,,,,,,,,
72,Suggested Citation:,,,,,,,,,,,
73,Table 1. Intercensal Estimates of the Resident...,,,,,,,,,,,
74,"Source: U.S. Census Bureau, Population Division",,,,,,,,,,,


In [20]:
temp_long.columns


Index(['geographic_area', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010'],
      dtype='object')

In [None]:

# 1. Add state + period (they do NOT exist yet)
state_name = os.path.basename(sample_file).replace(".xlsx", "")
temp_df['state'] = state_name
temp_df['period'] = '2000-2010'

# 2. Melt long
temp_long = temp_df.melt(
    id_vars=['state', 'period', 'geographic_area'],
    var_name='year',
    value_name='population'
)

# 3. Drop NA population rows
temp_long = temp_long.dropna(subset=['population'])

# 4. Clean year: extract only valid 4-digit numbers
temp_long['year'] = temp_long['year'].astype(str).str.extract(r'(\d{4})').astype(int)

# 5. Clean population (remove commas, ignore bad values)
temp_long['population'] = (
    temp_long['population']
    .astype(str)
    .str.replace(",", "", regex=False)
)

# 6. Remove rows where population accidentally equals a year (e.g., '2000.0')
temp_long = temp_long[~temp_long['population'].str.contains(r'^\d{4}\.0$')]

# 7. Convert population to int safely
temp_long['population'] = temp_long['population'].astype(float).astype(int)

temp_long.head()


Unnamed: 0,state,period,geographic_area,year,population
1,co-est00int-01_alabama,2000-2010,Alabama,2000,4452173
2,co-est00int-01_alabama,2000-2010,.Autauga County,2000,44021
3,co-est00int-01_alabama,2000-2010,.Baldwin County,2000,141342
4,co-est00int-01_alabama,2000-2010,.Barbour County,2000,29015
5,co-est00int-01_alabama,2000-2010,.Bibb County,2000,19913


In [26]:
temp_long = temp_long.dropna(subset=['population'])

In [27]:
temp_long

Unnamed: 0,state,period,geographic_area,year,population
1,co-est00int-01_alabama,2000-2010,Alabama,2000,4452173
2,co-est00int-01_alabama,2000-2010,.Autauga County,2000,44021
3,co-est00int-01_alabama,2000-2010,.Baldwin County,2000,141342
4,co-est00int-01_alabama,2000-2010,.Barbour County,2000,29015
5,co-est00int-01_alabama,2000-2010,.Bibb County,2000,19913
...,...,...,...,...,...
834,co-est00int-01_alabama,2000-2010,.Tuscaloosa County,2010,195036
835,co-est00int-01_alabama,2000-2010,.Walker County,2010,66947
836,co-est00int-01_alabama,2000-2010,.Washington County,2010,17597
837,co-est00int-01_alabama,2000-2010,.Wilcox County,2010,11615


In [46]:
sample = "population_data/2000-2010/co-est00int-01-florida.xlsx"

raw = pd.read_excel(sample, header=2)
raw.columns = (
    raw.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace(r"[^a-z0-9_]", "", regex=True)
)

print(raw.columns)
raw.head(10)


Index(['geographic_area', 'april_1_20001',
       'intercensal_estimates_as_of_july_1', 'unnamed_3', 'unnamed_4',
       'unnamed_5', 'unnamed_6', 'unnamed_7', 'unnamed_8', 'unnamed_9',
       'unnamed_10', 'unnamed_11', 'april_1_20102', 'july_1_20103'],
      dtype='object')


Unnamed: 0,geographic_area,april_1_20001,intercensal_estimates_as_of_july_1,unnamed_3,unnamed_4,unnamed_5,unnamed_6,unnamed_7,unnamed_8,unnamed_9,unnamed_10,unnamed_11,april_1_20102,july_1_20103
0,,,2000.0,2001.0,2002.0,2003.0,2004.0,2005.0,2006.0,2007.0,2008.0,2009.0,,
1,Florida,15982571.0,16047515.0,16356966.0,16689370.0,17004085.0,17415318.0,17842038.0,18166990.0,18367842.0,18527305.0,18652644.0,18801310.0,18843326.0
2,.Alachua County,217964.0,218611.0,221718.0,224614.0,227022.0,229867.0,233756.0,239506.0,242685.0,244888.0,246657.0,247336.0,247497.0
3,.Baker County,22255.0,22374.0,22620.0,23298.0,23555.0,24142.0,24832.0,25571.0,26212.0,26725.0,27124.0,27115.0,27106.0
4,.Bay County,148270.0,148393.0,150207.0,152741.0,155044.0,158804.0,162917.0,165644.0,165345.0,166267.0,167464.0,168852.0,169272.0
5,.Bradford County,26096.0,26064.0,26083.0,26306.0,27035.0,27703.0,28098.0,28506.0,28825.0,28961.0,28979.0,28520.0,28480.0
6,.Brevard County,476299.0,477819.0,486429.0,495425.0,504847.0,518722.0,529907.0,535138.0,539719.0,542378.0,542109.0,543376.0,543573.0
7,.Broward County,1622974.0,1630600.0,1662848.0,1690118.0,1707543.0,1725461.0,1746896.0,1739348.0,1720825.0,1723633.0,1733310.0,1748066.0,1753578.0
8,.Calhoun County,13002.0,13040.0,12847.0,12947.0,13293.0,13445.0,13850.0,13997.0,14222.0,14351.0,14692.0,14625.0,14599.0
9,.Charlotte County,141620.0,142266.0,146311.0,150123.0,153235.0,157755.0,155262.0,157099.0,159742.0,160467.0,159629.0,159978.0,159990.0
