In [1]:
import pandas as pd

In [2]:
# Load columns B:P of the 'Population' sheet, skipping the first three
# spreadsheet rows. Cells containing 'n/a' or 'N/A' are read as missing values.
population_data_file_path = '../data/raw/Population by borough 1939 to 2039.xlsx'
population_data = pd.read_excel(
    population_data_file_path,
    sheet_name='Population',
    skiprows=3,
    usecols='B:P',
    na_values=['n/a', 'N/A'],  # each missing-value marker listed once
    engine='openpyxl'
)

# Combine the first two rows into a single header.
# When both rows carry a label they are joined with a space; otherwise the
# one that is present is used (NaN is kept if both are missing).
population_data.columns = [
    f"{col1} {col2}" if pd.notna(col1) and pd.notna(col2) else (col1 if pd.notna(col1) else col2)
    for col1, col2 in zip(population_data.iloc[0], population_data.iloc[1])
]

# Tidy the frame in a single chain:
#   1. remove the three leading rows (index labels 0, 1 and 2),
#   2. discard columns that are entirely NaN,
#   3. discard rows that are entirely NaN,
#   4. trim the six trailing rows,
#   5. renumber the index from zero.
population_data = (
    population_data
    .drop(index=[0, 1, 2])
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
    .iloc[:-6]
    .reset_index(drop=True)
)

# Multiply the values in the population columns by 1000 and store them as
# integers (presumably the sheet reports thousands -- confirm against the source).
population_columns = [
    'mid-year estimate 1939',
    'mid-year estimate 1988',
    'Census 2011',
    'Projection 2015',
    'Projection 2021',
    'Projection 2031',
    'Projection 2039',
]
for col in population_columns:
    population_data[col] = (
        population_data[col]
        .astype(str)
        # Strip spaces and comma separators as literal text before parsing.
        .str.replace(' ', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
        .mul(1000)
        # Round before the integer cast: a float product can land a hair below
        # the intended whole number, and astype(int) truncates rather than rounds.
        .round()
        # NOTE: a missing value in any of these columns makes this cast raise.
        .astype(int)
    )

# Keep only the columns whose header label is not NaN
population_data = population_data.loc[:, population_data.columns.notna()]

In [3]:
# Save the cleaned table as a UTF-8 CSV with a header row and without the
# DataFrame index column.
population_data.to_csv(
    path_or_buf='../data/processed/population_by_borough.csv',
    encoding='utf-8',
    header=True,
    index=False,
)