# Income 

In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read sheet 'Table 2.4' of the income-distribution workbook.
# header=5 makes pandas take the column labels from row index 5 (0-based) of the sheet.
# The path is relative to the directory the notebook is run from.
income = pd.read_excel(
    '../data/landing/Table 2 - Total income distribution by geography, 2020-21.xlsx',
    sheet_name='Table 2.4',
    header=5,
)

FileNotFoundError: [Errno 2] No such file or directory: '../data/landing/Table 2 - Total income distribution by geography, 2020-21.xlsx'

In [None]:
# preview the first five rows of the freshly loaded sheet (head() defaults to 5 rows)
income.head()

In [None]:
# collect every row that is missing a value in at least one column
has_missing = income.isna().any(axis='columns')
nan_rows = income[has_missing]
nan_rows

In [None]:
# Keep the rows at integer positions 646 through 1167 (the iloc end bound, 1168, is exclusive).
# NOTE(review): these positions are hard-coded for this particular sheet layout, and the
# original note read "between 645 and 1168" — confirm which first/last rows are intended.
income = income.iloc[646:1168] 

In [None]:
# Rename the two columns pandas labelled 'Unnamed: 0' / 'Unnamed: 1' to 'SA2 code' / 'SA2 name'.
# The result is rebound instead of using inplace=True: `income` is a slice of the loaded
# sheet at this point, and returning a new frame avoids mutating that slice in place.
income = income.rename(columns={'Unnamed: 0': 'SA2 code', 'Unnamed: 1': 'SA2 name'})

In [None]:
# Keep only Melbourne SA2s: rows whose SA2 code begins with one of the prefixes 206 ... 214
melbourne_prefixes = tuple(str(prefix) for prefix in range(206, 215))
is_melbourne = income['SA2 code'].astype(str).str.startswith(melbourne_prefixes)
income = income[is_melbourne]

In [None]:
# preview the first five rows after the Melbourne filter (head() defaults to 5 rows)
income.head()

In [None]:
# count the missing values in each column
income.isna().sum()

In [None]:
# show the row(s) whose SA2 name mentions Essendon Airport
is_essendon_airport = income['SA2 name'].str.contains('Essendon Airport')
income[is_essendon_airport]


In [None]:
# Remove the Essendon Airport SA2 by name instead of by the hard-coded index label 907.
# Dropping a fixed label raises KeyError when the cell is re-run (the row is already gone)
# or when the sheet layout shifts; filtering on the name is idempotent.
# na=False keeps rows whose name is missing rather than failing on them.
income = income[~income['SA2 name'].str.contains('Essendon Airport', regex=False, na=False)]

In [None]:
# Ratio, Gini, top-share and quartile columns that should be numeric
columns_to_convert = [
    'P80/P20', 'P80/P50', 'P20/P50', 'P10/P50',
    'Gini coefficient',
    'Top 1%', 'Top 5%', 'Top 10%',
    'Lowest Quartile', 'Second Quartile', 'Third Quartile', 'Highest Quartile',
]

# Parse each column as a number; values that cannot be parsed become NaN (errors='coerce')
numeric_block = income[columns_to_convert].apply(
    lambda col: pd.to_numeric(col, errors='coerce')
)
income[columns_to_convert] = numeric_block

# Show the dtypes after conversion
print(income.dtypes)

In [None]:
# Count, dollar-amount and age columns that may carry comma thousands separators
columns_to_convert = ['Earners', 'Mean', 'Median', 'Sum', 'Median age of earners']

# Strip the commas out of any text values
without_commas = income[columns_to_convert].replace({',': ''}, regex=True)

# Parse as numbers: unparseable values become NaN, and downcast='integer' asks pandas
# for the smallest integer dtype that can hold each column where that is possible
income[columns_to_convert] = without_commas.apply(
    lambda col: pd.to_numeric(col, errors='coerce', downcast='integer')
)

# Show the dtypes after conversion
print(income.dtypes)

In [None]:
# Look for cells still holding the literal string 'np'.
# Cells that are not 'np' are blanked to NaN, then rows with no 'np' at all are discarded.
# Note: the columns converted with errors='coerce' in the cells above have already turned
# any 'np' into NaN, so only the remaining columns can still match here.
np_values = income.where(income == 'np').dropna(how='all')

# Print whatever rows remain
print(np_values)

In [None]:
# preview the first five rows of the cleaned frame (head() defaults to 5 rows)
income.head()

In [ ]:
# Persist the cleaned income table as parquet; the path is relative to the notebook's
# working directory and the target folder must already exist.
income.to_parquet(path='../data/raw/income_cleaned.parquet')

In [None]:
# Pick the ten SA2 regions with the HIGHEST median income, then order them from lowest to
# highest so the bars match the chart title. (The previous ascending sort followed by
# head(10) selected the ten lowest medians, contradicting the "Top 10" title.)
top_10_median = (
    income.sort_values(by='Median', ascending=False)
    .head(10)
    .sort_values(by='Median', ascending=True)
)

# Draw one bar per region on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='SA2 name', y='Median', data=top_10_median, palette='viridis', ax=ax)

# Rotate the x-axis labels so the long SA2 names do not overlap
plt.xticks(rotation=45, ha='right')

# Title and axis labels
ax.set_title('Top 10 SA2 Regions by Median Income (Lowest to Highest)', fontsize=14)
ax.set_xlabel('SA2 Name', fontsize=12)
ax.set_ylabel('Median Income ($)', fontsize=12)

# Render the figure
fig.tight_layout()
plt.show()

In [None]:
# Make sure the Gini column is numeric before ranking (unparseable values become NaN)
income['Gini coefficient'] = pd.to_numeric(income['Gini coefficient'], errors='coerce')

# Ten SA2 regions with the largest Gini coefficient, largest first
gini_descending = income.sort_values(by='Gini coefficient', ascending=False)
top_10_gini = gini_descending.head(10)

# One bar per region on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='SA2 name', y='Gini coefficient', data=top_10_gini, palette='viridis', ax=ax)

# Slant the x-axis labels so they do not overlap
plt.xticks(rotation=45, ha='right')

# Title and axis labels
ax.set_title('Top 10 SA2 with the Highest Gini Coefficient', fontsize=14)
ax.set_xlabel('SA2 Name', fontsize=12)
ax.set_ylabel('Gini Coefficient', fontsize=12)

# Render the figure
fig.tight_layout()
plt.show()


In [None]:
# Ten SA2 regions where the 'Top 10%' column is largest, largest first
share_descending = income.sort_values(by='Top 10%', ascending=False)
top_10_top_10 = share_descending.head(10)

# One bar per region on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='SA2 name', y='Top 10%', data=top_10_top_10, palette='viridis', ax=ax)

# Slant the x-axis labels for readability
plt.xticks(rotation=45, ha='right')

# Title and axis labels
ax.set_title('Top 10 SA2 Regions with the Highest Top 10% Income Share', fontsize=14)
ax.set_xlabel('SA2 Name', fontsize=12)
ax.set_ylabel('Top 10% Income Share (%)', fontsize=12)

# Render the figure
fig.tight_layout()
plt.show()


In [None]:
# Ten SA2 regions with the most earners, largest first
earners_descending = income.sort_values(by='Earners', ascending=False)
top_10_earners = earners_descending.head(10)

# One bar per region on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='SA2 name', y='Earners', data=top_10_earners, palette='viridis', ax=ax)

# Slant the x-axis labels for readability
plt.xticks(rotation=45, ha='right')

# Title and axis labels
ax.set_title('Top 10 SA2 Regions with the Most Earners', fontsize=14)
ax.set_xlabel('SA2 Name', fontsize=12)
ax.set_ylabel('Number of Earners', fontsize=12)

# Render the figure
fig.tight_layout()
plt.show()

## Check correlation between income measures and extracted price

In [None]:
# Load the cleaned housing records from their snappy-compressed parquet part file
# (path is relative to the notebook's working directory)
house = pd.read_parquet(
    '../data/landing/domain_cleaned/part-00000-7dc1f449-c232-484e-b4e6-1dfbe30d6a77-c000.snappy.parquet'
)

In [None]:
# preview the first five housing rows (head() defaults to 5 rows)
house.head()

In [None]:
# list the column labels of the housing frame to see which fields are available for the merge
house.columns

In [None]:
# preview the first five income rows before merging (head() defaults to 5 rows)
income.head()

In [None]:
# Inner-join housing rows to income rows by SA2 name only
# (house['sa2_name'] against income['SA2 name']); SA2 codes are not part of the join key.
merged = house.merge(income, how='inner', left_on='sa2_name', right_on='SA2 name')

In [None]:
# preview the first five rows of the merged frame (head() defaults to 5 rows)
merged.head()

In [None]:
# check the column labels of the merged frame
merged.columns

In [None]:
# Income measures plus the price column that go into the correlation analysis
cols_of_interest = [
    'Earners', 'Median age of earners', 'Sum', 'Median', 'Mean',
    'P80/P20', 'P80/P50', 'P20/P50', 'P10/P50', 'Gini coefficient',
    'Top 1%', 'Top 5%', 'Top 10%',
    'Lowest Quartile', 'Second Quartile', 'Third Quartile', 'Highest Quartile',
    'extracted_price',
]

# Take just those columns and parse each as numeric; unparseable values become NaN
df_selected = merged[cols_of_interest].apply(
    lambda col: pd.to_numeric(col, errors='coerce')
)

In [None]:
# Pairwise correlation between all selected columns (DataFrame.corr default method)
corr_matrix = df_selected.corr()

# Keep just the column of correlations against 'extracted_price'
extracted_price_corr = corr_matrix.loc[:, 'extracted_price']


In [None]:
# Plot each variable's correlation with 'extracted_price' as a one-column annotated heatmap.
# seaborn (sns) and matplotlib.pyplot (plt) are imported in the notebook's first cell, so the
# duplicate imports that used to sit in this cell are not repeated here.
fig, ax = plt.subplots(figsize=(10, 8))

# Pin the colour scale to the full correlation range [-1, 1] so the neutral midpoint of the
# diverging 'coolwarm' palette means zero correlation; without fixed limits the scale is
# stretched to the data's own min/max and the colours are not comparable in sign.
sns.heatmap(
    extracted_price_corr.to_frame(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    cbar_kws={'shrink': .8},
    ax=ax,
)

# Set title
ax.set_title('Correlation with Extracted Price')

# Show plot
plt.show()
