In [33]:
import pandas as pd
import os

In [34]:
os.chdir('/home/malachy/ADS (2024)/project-2-group-real-estate-industry-project-11/notebooks')

## Functions

### Preparing suburb name

In [46]:
import numpy as np

DIRECTIONS = ["north", "south", "west", "east"]

"""`df` should have unique indicies and"""
def suburb_name_split(df):
    indicies = list(df.index)
    final_indicies = []
    repeated_indicies = []
    counter = 0

    print(len(indicies))

    # for each index, gather how many indicies to repeat and the final product
    for index in indicies:
        # remove (.vic) if present
        clean_index = index.replace("(Vic.)", "")

        # replace the brackets around "west"
        clean_index = clean_index.replace("(", "").replace(")", "")

        # make sure lower case, no confusion
        clean_index = clean_index.lower()

        # nothing fancy
        if (not "-" in clean_index):
            final_indicies.append(clean_index)
            repeated_indicies.append(index)
            continue
        
        counter += 1

        # need to split
        index_split = clean_index.split(" - ")

        if (len(index_split) != 2):
            print("length of index split is not 2?")
            exit()
        
        # if only one direction, reintroduce without the weird thing
        if (index_split[1].lower() in DIRECTIONS):
            final_indicies.append(" ".join(index_split))
            repeated_indicies.append(index)
        
        # else will need to strip everything
        else:
            final_indicies.extend(index_split)
            repeated_indicies.extend(np.repeat(index, 2))
    
    print(counter / len(indicies))

    # duplicate
    df = df.loc[repeated_indicies]

    print(df.shape)
    print(len(final_indicies))

    # change the names accordingly
    df.index = final_indicies
    
    return df


### Column and imputation

In [36]:
def impute_previous(columns, na):
    flat_column_list = [[group[i] for group in columns] for i in range(len(columns[0]))]

    column_dataframe = pd.DataFrame(flat_column_list)

    column_dataframe = column_dataframe.applymap(lambda x: None if (not x) or (na in str(x)) else x)

    column_dataframe = column_dataframe.T.ffill().T

    flat_column_list = column_dataframe.to_numpy()

    return flat_column_list

In [37]:
def fill_columns(df, column_dict, na="Unnamed"):
    columns = list(df.columns)

    for index, new_column in column_dict.items():
        columns[index] = new_column

    if (type(df.columns) == pd.MultiIndex):
        flat_column_list = impute_previous(columns, na)

        df.columns = pd.MultiIndex.from_arrays(flat_column_list)
    
    else:
        df.columns = columns

    return df

## Economic by Region

In [38]:
def filter_victoria(df, column_name):
    region_mask = ~df[column_name].str.isnumeric()

    # get the region indexes and names
    regions_index, regions_names = list(df[region_mask].index), list(df[region_mask][column_name].values)

    # find the index for victoria in the list
    index_victoria = regions_names.index("Victoria")

    # reference the mask between victoria and the next index
    df = df.loc[regions_index[index_victoria] + 1: regions_index[index_victoria + 1] - 1]

    return df

### Geography summary

In [39]:
COLUMNS_DICT_INCOME_GEOGRAPHY = {
    0: ("SA2 code",None),
    1: ("SA2 name",None)
}

In [40]:
df = pd.read_csv("../data/landing/economic_by_region/income_by_geography_b2022.csv", header=[0, 1])

df = fill_columns(df, COLUMNS_DICT_INCOME_GEOGRAPHY)

df = filter_victoria(df, ("SA2 code", None))

In [41]:
df = df.melt(id_vars=df.columns.to_list()[:2], 
             value_vars = df.columns.to_list()[2:],
             var_name=["Measure", "Year"],
             value_name="Value")

In [42]:
new_df = df.pivot(index=list(df.columns[:2]) + ["Year"], columns="Measure", values="Value")
new_df = new_df.reset_index()
new_df.columns = [x[0] for x in new_df.columns[:2]] + list(new_df.columns[2:])
new_df = new_df.set_index("SA2 name")
new_df

Unnamed: 0_level_0,SA2 code,Year,Earners (persons),Mean ($),Median ($),Median age of earners (years),Sum ($)
SA2 name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alfredton,201011001,2016-17,7117,60937,50596,42,433690088
Alfredton,201011001,2017-18,7558,62343,52448,42,471188331
Alfredton,201011001,2018-19,7987,63731,53932,42,509017733
Alfredton,201011001,2019-20,8665,65781,55204,41,569994001
Alfredton,201011001,2020-21,9438,69111,58036,41,652268973
...,...,...,...,...,...,...,...
Warrnambool - South,217041480,2016-17,7614,54442,43149,44,414520667
Warrnambool - South,217041480,2017-18,7737,54826,45726,44,424190279
Warrnambool - South,217041480,2018-19,8051,58145,47242,44,468127252
Warrnambool - South,217041480,2019-20,8096,58648,49080,44,474817395


In [27]:
new_df.index.values[:20]

array(['Alfredton', 'Alfredton', 'Alfredton', 'Alfredton', 'Alfredton',
       'Ballarat', 'Ballarat', 'Ballarat', 'Ballarat', 'Ballarat',
       'Buninyong', 'Buninyong', 'Buninyong', 'Buninyong', 'Buninyong',
       'Delacombe', 'Delacombe', 'Delacombe', 'Delacombe', 'Delacombe'],
      dtype=object)

In [47]:
new_df = suburb_name_split(new_df)
new_df

2610
0.45977011494252873
(16150, 7)
3230


ValueError: Length mismatch: Expected axis has 16150 elements, new values have 3230 elements

### Distribution

In [31]:
COLUMNS_DICT_INCOME_DISTRIBUTION = {
    0: "SA2 code",
    1: "SA2 name"
}

In [32]:
df = pd.read_csv("../data/landing/economic_by_region/income_distribution_by_geography_2021.csv", header=0)
df = df.drop(0)

df = fill_columns(df, COLUMNS_DICT_INCOME_DISTRIBUTION)

df = filter_victoria(df, "SA2 code")

df = df.set_index("SA2 name")

df = suburb_name_split(df)

522


UnboundLocalError: local variable 'count' referenced before assignment

## Economic

- Come back to

## Housing