In [1]:
import pandas as pd

In [18]:
# pythonically retrieving stores csv
pop_2010 = pd.read_csv('pop_2010.csv')
pop_2011 = pd.read_csv('pop_2011.csv')
pop_2012 = pd.read_csv('pop_2012.csv')
pop_2013 = pd.read_csv('pop_2013.csv')
pop_2014 = pd.read_csv('pop_2014.csv')
pop_2015 = pd.read_csv('pop_2015.csv')
pop_2016 = pd.read_csv('pop_2016.csv')
pop_2017 = pd.read_csv('pop_2017.csv')
pop_2018 = pd.read_csv('pop_2018.csv')
pop_2019 = pd.read_csv('pop_2019.csv')
pop_2020 = pd.read_csv('pop_2020.csv')
pop_2021 = pd.read_csv('pop_2021.csv')

In [29]:
def pop_group(df, title):
    """
    Process and clean population data in DataFrame.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing population data.

    Returns:
    pd.DataFrame: Processed DataFrame with columns representing different age groups and demographic information.

    This function takes a DataFrame containing population data and performs the following tasks:
    1. Transpose the DataFrame and reset index.
    2. Rename columns for better readability.
    3. Clean up column values and remove unnecessary characters.
    4. Select relevant columns and handle duplicates.
    5. Convert numeric columns to integer and handle missing values.
    6. Calculate additional columns such as child_pop, adult_pop, young_adult_pop, and senior_pop.
    7. Return a cleaned and processed DataFrame with relevant demographic information.
    """
    df = df.T
    df.reset_index(inplace=True)
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    df.rename(columns={'Label (Grouping)':'county',
                            '\xa0\xa0\xa0\xa0Total population': 'total_pop',
                            '\xa0\xa0\xa0\xa0Under 5 years': 'under_5',
                            '\xa0\xa0\xa0\xa05 to 9 years': '5-9',
                            '\xa0\xa0\xa0\xa010 to 14 years': '10-14',
                            '\xa0\xa0\xa0\xa015 to 19 years': '15-19',
                            '\xa0\xa0\xa0\xa020 to 24 years': '20-24',
                            '\xa0\xa0\xa0\xa025 to 34 years': '25-34',
                            '\xa0\xa0\xa0\xa035 to 44 years': '35-44',
                            '\xa0\xa0\xa0\xa045 to 54 years': '45-54',
                            '\xa0\xa0\xa0\xa055 to 59 years': '55-59',
                            '\xa0\xa0\xa0\xa060 to 64 years': '60-64',
                            '\xa0\xa0\xa0\xa065 to 74 years': '65-74',
                            '\xa0\xa0\xa0\xa075 to 84 years': '75-84',
                            '\xa0\xa0\xa0\xa085 years and over': '85+',
                            '\xa0\xa0\xa0\xa018 years and over': '18+',
                            '\xa0\xa0\xa0\xa065 years and over': '65+',}, inplace=True)
    df.county = df.county.str.replace('!!Estimate', '')
    cat = ['county', 'total_pop','under_5', '5-9', '10-14', '15-19', '20-24', '25-34', '35-44', '45-54',
           '55-59', '60-64', '65-74', '75-84', '85+', '65+', '18+']
    df = df[cat]
    df = df.loc[:, ~df.columns.duplicated()]
    for col in df.drop(columns = 'county'):
        df[col] = df[col].str.replace(',','')
    df = df.dropna()
    df = df.reset_index().drop(columns = 'index')
    num = df.drop(columns='county').astype(int)
    df = pd.concat([df.county, num], axis = 1)
    df['child_pop'] = (df['total_pop'] - df['18+'])
    df['adult_pop'] = (df['25-34'] + df['35-44'] + df['45-54'] + df['55-59'] + df['60-64'])
    df['young_adult_pop'] = (df['18+'] - df['25-34'] - df['35-44'] - df['45-54'] - df['55-59'] - df['60-64'] - df['65+'])
    df = df.rename(columns = {'65+': 'senior_pop'})
    pop = ['county', 'child_pop', 'young_adult_pop', 'adult_pop', 'senior_pop', 'total_pop']
    df = df[pop]
    df = df.rename(columns = {'child_pop': f'child_{title}', 'young_adult_pop': f'yound_adult_{title}', 'adult_pop': f'adult_{title}', 'senior_pop': f'senior_{title}', 'total_pop': f'total_{title}'})
    return df

In [26]:
def pop_group2(df, title):
    """
    Process and clean population data in DataFrame.

    Parameters:
    - df (pd.DataFrame): Input DataFrame containing population data.

    Returns:
    pd.DataFrame: Processed DataFrame with columns representing different age groups and demographic information.

    This function takes a DataFrame containing population data and performs the following tasks:
    1. Transpose the DataFrame and reset index.
    2. Rename columns for better readability.
    3. Clean up column values and remove unnecessary characters.
    4. Select relevant columns and handle duplicates.
    5. Convert numeric columns to integer and handle missing values.
    6. Calculate additional columns such as child_pop, adult_pop, young_adult_pop, and senior_pop.
    7. Return a cleaned and processed DataFrame with relevant demographic information.
    """
    df = df.T
    df.reset_index(inplace=True)
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    df.rename(columns={'Label (Grouping)':'county',
                            '\xa0\xa0\xa0\xa0Total population': 'total_pop',
                            '        Under 5 years': 'under_5',
                            '        5 to 9 years': '5-9',
                            '        10 to 14 years': '10-14',
                            '        15 to 19 years': '15-19',
                            '        20 to 24 years': '20-24',
                            '        25 to 34 years': '25-34',
                            '        35 to 44 years': '35-44',
                            '        45 to 54 years': '45-54',
                            '        55 to 59 years': '55-59',
                            '        60 to 64 years': '60-64',
                            '        65 to 74 years': '65-74',
                            '        75 to 84 years': '75-84',
                            '        85 years and over': '85+',
                            '        18 years and over': '18+',
                            '        65 years and over': '65+',}, inplace=True)
    df.county = df.county.str.replace('!!Estimate', '')
    cat = ['county', 'total_pop','under_5', '5-9', '10-14', '15-19', '20-24', '25-34', '35-44', '45-54',
           '55-59', '60-64', '65-74', '75-84', '85+', '65+', '18+']
    df = df[cat]
    df = df.loc[:, ~df.columns.duplicated()]
    for col in df.drop(columns = 'county'):
        df[col] = df[col].str.replace(',','')
    df = df.dropna()
    df = df.reset_index().drop(columns = 'index')
    num = df.drop(columns='county').astype(int)
    df = pd.concat([df.county, num], axis = 1)
    df['child_pop'] = (df['total_pop'] - df['18+'])
    df['adult_pop'] = (df['25-34'] + df['35-44'] + df['45-54'] + df['55-59'] + df['60-64'])
    df['young_adult_pop'] = (df['18+'] - df['25-34'] - df['35-44'] - df['45-54'] - df['55-59'] - df['60-64'] - df['65+'])
    df = df.rename(columns = {'65+': 'senior_pop'})
    pop = ['county', 'child_pop', 'young_adult_pop', 'adult_pop', 'senior_pop', 'total_pop']
    df = df[pop]
    df = df.rename(columns = {'child_pop': f'child_{title}', 'young_adult_pop': f'yound_adult_{title}', 'adult_pop': f'adult_{title}', 'senior_pop': f'senior_{title}', 'total_pop': f'total_{title}'})
    return df

In [13]:
pop_2010 = pop_2010.T
pop_2010.reset_index(inplace=True)
pop_2010.columns = pop_2010.iloc[0]
pop_2010 = pop_2010.iloc[1:]

In [19]:
pop_2017 = pop_2017.T
pop_2017.reset_index(inplace=True)
pop_2017.columns = pop_2017.iloc[0]
pop_2017 = pop_2017.iloc[1:]

In [22]:
pop_2018 = pop_2018.T
pop_2018.reset_index(inplace=True)
pop_2018.columns = pop_2018.iloc[0]
pop_2018 = pop_2018.iloc[1:]

In [24]:
pop_2019 = pop_2019.T
pop_2019.reset_index(inplace=True)
pop_2019.columns = pop_2019.iloc[0]
pop_2019 = pop_2019.iloc[1:]

In [25]:
pop_2020 = pop_2020.T
pop_2020.reset_index(inplace=True)
pop_2020.columns = pop_2020.iloc[0]
pop_2020 = pop_2020.iloc[1:]

In [26]:
pop_2021 = pop_2021.T
pop_2021.reset_index(inplace=True)
pop_2021.columns = pop_2021.iloc[0]
pop_2021 = pop_2021.iloc[1:]

In [None]:
'    Under 5 years'

In [None]:
'12345678Under 5 years'

In [None]:
'5 to 9 years'

In [21]:
column_name_pattern = '        '

# Use the filter method to select columns with the specified pattern
pop_2017 = pop_2017.filter(like=column_name_pattern, axis=1)

In [21]:
pop_2010.columns[0:20]

Index(['Label (Grouping)', 'SEX AND AGE', '    Total population', '    Male',
       '    Female', '    Under 5 years', '    5 to 9 years',
       '    10 to 14 years', '    15 to 19 years', '    20 to 24 years',
       '    25 to 34 years', '    35 to 44 years', '    45 to 54 years',
       '    55 to 59 years', '    60 to 64 years', '    65 to 74 years',
       '    75 to 84 years', '    85 years and over', '    Median age (years)',
       '    18 years and over'],
      dtype='object', name=0)

In [22]:
pop_2017.columns[0:20]

Index([], dtype='object', name=0)

In [23]:
pop_2018.columns[0:20]

Index(['Label (Grouping)', 'SEX AND AGE', '    Total population',
       '        Male', '        Female',
       '        Sex ratio (males per 100 females)', '        Under 5 years',
       '        5 to 9 years', '        10 to 14 years',
       '        15 to 19 years', '        20 to 24 years',
       '        25 to 34 years', '        35 to 44 years',
       '        45 to 54 years', '        55 to 59 years',
       '        60 to 64 years', '        65 to 74 years',
       '        75 to 84 years', '        85 years and over',
       '        Median age (years)'],
      dtype='object', name=0)

In [27]:
pop_2019.columns[0:20]

Index(['Label (Grouping)', 'SEX AND AGE', '    Total population',
       '        Male', '        Female',
       '        Sex ratio (males per 100 females)', '        Under 5 years',
       '        5 to 9 years', '        10 to 14 years',
       '        15 to 19 years', '        20 to 24 years',
       '        25 to 34 years', '        35 to 44 years',
       '        45 to 54 years', '        55 to 59 years',
       '        60 to 64 years', '        65 to 74 years',
       '        75 to 84 years', '        85 years and over',
       '        Median age (years)'],
      dtype='object', name=0)

In [28]:
pop_2020.columns[0:20]

Index(['Label (Grouping)', 'SEX AND AGE', '    Total population',
       '        Male', '        Female',
       '        Sex ratio (males per 100 females)', '        Under 5 years',
       '        5 to 9 years', '        10 to 14 years',
       '        15 to 19 years', '        20 to 24 years',
       '        25 to 34 years', '        35 to 44 years',
       '        45 to 54 years', '        55 to 59 years',
       '        60 to 64 years', '        65 to 74 years',
       '        75 to 84 years', '        85 years and over',
       '        Median age (years)'],
      dtype='object', name=0)

In [29]:
pop_2021.columns[0:20]

Index(['Label (Grouping)', 'SEX AND AGE', '    Total population',
       '        Male', '        Female',
       '        Sex ratio (males per 100 females)', '        Under 5 years',
       '        5 to 9 years', '        10 to 14 years',
       '        15 to 19 years', '        20 to 24 years',
       '        25 to 34 years', '        35 to 44 years',
       '        45 to 54 years', '        55 to 59 years',
       '        60 to 64 years', '        65 to 74 years',
       '        75 to 84 years', '        85 years and over',
       '        Median age (years)'],
      dtype='object', name=0)

In [30]:
pop_2010 = pop_group(pop_2010, 'pop_2010')
pop_2011 = pop_group(pop_2011, 'pop_2011')
pop_2012 = pop_group(pop_2012, 'pop_2012')
pop_2013 = pop_group(pop_2013, 'pop_2013')
pop_2014 = pop_group(pop_2014, 'pop_2014')
pop_2015 = pop_group(pop_2015, 'pop_2015')
pop_2016 = pop_group(pop_2016, 'pop_2016')

KeyError: "['total_pop', 'under_5', '5-9', '10-14', '15-19', '20-24', '25-34', '35-44', '45-54', '55-59', '60-64', '65-74', '75-84', '85+', '65+', '18+'] not in index"

In [27]:
pop_2017 = pop_group2(pop_2017, 'pop_2017')
pop_2018 = pop_group2(pop_2018, 'pop_2018')
pop_2019 = pop_group2(pop_2019, 'pop_2019')
pop_2020 = pop_group2(pop_2020, 'pop_2020')
pop_2021 = pop_group2(pop_2021, 'pop_2021')

IndexError: single positional indexer is out-of-bounds