In [1]:
import pandas as pd
import re

In [2]:
def get_suburb_names(df):
    
    # Set up cases for extracting suburb names
    pattern_one = r'\(.*?\)'
    pattern_two = r'\-.*$'

    for i in range(len(df)):
        df["suburb"].loc[i] = re.sub(pattern_one, '', df["suburb"].loc[i])
        df["suburb"].loc[i] = re.sub(pattern_two, '', df["suburb"].loc[i])
        df["suburb"].loc[i] = df["suburb"].loc[i].lower()
        df["suburb"].loc[i] = df["suburb"].loc[i].replace("region", "")
        df["suburb"].loc[i] = df["suburb"].loc[i].strip()
    
    return df

In [3]:
# Read in datasets
income_one_df = pd.read_csv("../data/raw/datasource-AU_Govt_ABS-UoM_AURIN_DB_3_abs_personal_income_total_income_sa2_2011_2018_file_1.zip")
income_two_df = pd.read_csv("../data/raw/datasource-AU_Govt_ABS-UoM_AURIN_DB_3_abs_personal_income_total_income_sa2_2011_2018_file_2.zip")
income_df = pd.concat([income_one_df, income_two_df]).reset_index(drop=True)
population_df = pd.read_csv("../data/raw/datasource-AU_Govt_ABS-UoM_AURIN_DB_3_abs_regional_population_sa2_2001_2021.zip")

# Drop irrelevant attributes for income dataset
suburb_info = income_df[["sa2_code", "sa2_name"]]
income_df = income_df.loc[:, "mean_aud_2011_12":]
income_df = income_df.join(suburb_info)

# Rename attributes for income dataset
income_df.columns = ["mean_income_2011_12", "mean_income_2012_13", "mean_income_2013_14", "mean_income_2014_15", 
                     "mean_income_2015_16", "mean_income_2016_17", "mean_income_2017_18", "geometries", "suburb_code", 
                     "suburb"]

# Drop irrelevant attributes for population dataset
estimated_population = population_df.loc[:, "erp_2001":"erp_2021"]
population_df = population_df[["state_name_2016", "sa2_maincode_2016", "sa2_name_2016", "area_km2", "geom", 
                               "overseas_arrivals_2016_17", "overseas_arrivals_2017_18", "overseas_arrivals_2018_19", 
                               "overseas_arrivals_2019_20", "overseas_arrivals_2020_21"]]
population_df = population_df.join(estimated_population)

# Rename attributes for population dataset
population_df.rename(columns={"state_name_2016": "state", "sa2_maincode_2016": "suburb_code", "sa2_name_2016": 
                              "suburb", "area_km2": "suburb_area_km2", 
                              "erp_2001": "population_2001", "erp_2002": "population_2002", "erp_2003": "population_2003", 
                              "erp_2004": "population_2004", "erp_2005": "population_2005", "erp_2006": "population_2006", 
                              "erp_2007": "population_2007", "erp_2008": "population_2008", "erp_2009": "population_2009", 
                              "erp_2010": "population_2010", "erp_2011": "population_2011", "erp_2012": "population_2012", 
                              "erp_2013": "population_2013", "erp_2014": "population_2014", "erp_2015": "population_2015", 
                              "erp_2016": "population_2016", "erp_2017": "population_2017", "erp_2018": "population_2018", 
                              "erp_2019": "population_2019", "erp_2020": "population_2020", "erp_2021": "population_2021", 
                              "overseas_arrivals_2016_17": "immigration_2016", 
                              "overseas_arrivals_2017_18": "immigration_2017", 
                              "overseas_arrivals_2018_19": "immigration_2018", 
                              "overseas_arrivals_2019_20": "immigration_2019", 
                              "overseas_arrivals_2020_21": "immigration_2020"}, inplace=True)

# Read in sa2 suburb codes
suburb_info = pd.read_excel("../data/raw/list_of_SA2_and_their_SA3_and_state.xlsx")
suburb_info = suburb_info[suburb_info["STATE_NAME_2016"] == "Victoria"].reset_index(drop=True)
suburb_info = suburb_info[["SA2_MAINCODE_2016", "SA2_NAME_2016"]]

# Filter instances in Victoria for population dataset
population_df = population_df[population_df["state"] == "Victoria"].reset_index(drop=True)

# Filter instances in Victoria for income dataset
victoria_suburb_codes = suburb_info["SA2_MAINCODE_2016"]
income_df = pd.merge(income_df, victoria_suburb_codes, left_on="suburb_code", right_on="SA2_MAINCODE_2016")

# Extract suburb names from each instance in both income and population datasets
income_df = get_suburb_names(income_df)
population_df = get_suburb_names(population_df)

# Compute population density from 2001 to 2021 (people/km2)
j = 2001
for i in range(10,31):
    curr_col_name = "population_density_{year}".format(year=j)
    population_df[curr_col_name] = population_df.iloc[:, i]/population_df["suburb_area_km2"] 
    j += 1

# Drop rows containing NaN values for income dataset
income_df = income_df.dropna().reset_index(drop=True)

# Drop rows containing NaN values for population dataset
population_df = population_df.dropna().reset_index(drop=True)
# Save dataframes as csv files
income_df.to_csv("../data/curated/Income.csv", index=False)
population_df.to_csv("../data/curated/Population.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
