In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import re

In [12]:
data_dir = "/mnt/c/Users/21659/Documents/GitHub/project-2-group-real-estate-industry-project-10/data/"
# Load the data
rent_data = pd.read_csv(data_dir + 'landing/domain_data/rent_data.csv')
population_data = pd.read_csv(data_dir + 'landing/External_data/SA2_population/SA2_population_data_table_3.csv')
income_data = pd.read_csv(data_dir + 'landing/External_data/SA2_income/SA2_income_data_table_1.csv')
school_data = pd.read_csv(data_dir + 'landing/External_data/hospitals_and_schools/hospitals_and_schools.csv')
train_station_data = pd.read_csv(data_dir + 'landing/External_data/train_stations/train_stations_victoria.csv')
mall_data = pd.read_csv(data_dir + 'landing/External_data/shopping_mall/shopping_mall_locations.csv')

  income_data = pd.read_csv(data_dir + 'landing/External_data/SA2_income/SA2_income_data_table_1.csv')


In [13]:
# Data Preprocessing and Feature Engineering
# Convert rent prices to numeric values
rent_data['price'] = rent_data['price'].astype(str).replace({'$': '', ' pw': '', 'per week': ''}, regex=True)

In [14]:
rent_data['price'] = rent_data['price'].apply(lambda x: float((re.findall(r'\d+\.?\d*', x) or [0])[0] )).astype(float)

In [15]:
population_data.iloc[:, 9:]

Unnamed: 0,SA2 code,SA2 name,0-4,5-9,10-14,15-19,20-24,25-29,30-34,35-39,...,45-49,50-54,55-59,60-64,65-69,70-74,75-79,80-84,85 and over,Total persons
0,101021007,Braidwood,154,165,173,157,94,129,153,198,...,221,262,233,197,125,108,79,57,37,2760
1,101021008,Karabar,720,749,741,688,623,657,735,746,...,675,585,491,300,223,201,144,64,32,9129
2,101021009,Queanbeyan,592,509,514,624,799,757,767,708,...,633,620,520,444,384,394,320,214,226,9717
3,101021010,Queanbeyan - East,247,242,237,269,338,336,327,312,...,323,297,199,152,97,94,82,54,20,3925
4,101021012,Queanbeyan West - Jerrabomberra,969,923,777,602,451,640,1068,1174,...,600,508,294,224,159,105,70,29,24,9425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56437,801111141,Namadgi,9,10,4,4,4,5,3,1,...,5,9,0,7,4,1,0,0,0,67
56438,901011001,Christmas Island,72,100,107,91,75,135,185,187,...,107,149,113,112,98,102,21,18,6,1823
56439,901021002,Cocos (Keeling) Islands,48,50,41,14,17,59,21,57,...,27,32,45,67,32,45,24,12,8,631
56440,901031003,Jervis Bay,20,14,28,18,24,21,17,16,...,10,29,27,4,25,13,6,3,8,307


In [16]:
# Extract relevant population data (e.g., total persons)
population_data['Total_population'] = population_data.dropna().iloc[:, 11:].astype(int).sum(axis=1)

In [17]:
# Merge datasets to link rent data with population and income data
rent_data = rent_data.merge(population_data[['SA2 code', 'Total_population']], left_on='postcode', right_on='SA2 code', how='left')

# todo find post code to sa2 code projection

In [18]:
rent_data

Unnamed: 0,price,bathrooms,bedrooms,parking,type,street,suburb,postcode,latitude,longitude,school,features,SA2 code,Total_population
0,725.0,0,2,1,Apartment / Unit / Flat,26/418 St Kilda Road,MELBOURNE,3000.0,-37.821404,144.96938,South Melbourne Primary School1.6 km awayPrep-...,,,
1,675.0,0,2,1,Apartment / Unit / Flat,3712/80 ABeckett Street,MELBOURNE,3000.0,-37.809000,144.96107,Carlton Gardens Primary School1.1 km awayPrep-...,"Dishwasher,Split System Heating,Split System A...",,
2,675.0,0,2,0,Apartment / Unit / Flat,901/422 Collins Street,MELBOURNE,3000.0,-37.817097,144.96014,Docklands Primary School1.7 km awayPrep-6CoEdG...,"Built in wardrobes,Dishwasher,Floorboards,Inte...",,
3,650.0,0,2,1,Apartment / Unit / Flat,"3002B/11 Rose Lane,",MELBOURNE,3000.0,-37.815487,144.95422,Docklands Primary School1.1 km awayPrep-6CoEdG...,,,
4,620.0,0,1,0,Apartment / Unit / Flat,2313/250 Elizabeth Street,MELBOURNE,3000.0,-37.812550,144.96275,Carlton Gardens Primary School1.3 km awayPrep-...,"Gym,Intercom,Built in wardrobes,Internal Laund...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13963,330.0,0,2,1,House,7A Mckenzie Street,WONTHAGGI,3995.0,-38.610584,145.59688,Bass Coast College0.1 km away7-12CoEdGovernmen...,,,
13964,77.0,0,1,0,Apartment / Unit / Flat,120-127 Mc Kenzie Street,WONTHAGGI,3995.0,-38.597340,145.59306,Wonthaggi North Primary School0.6 km awayPrep-...,,,
13965,450.0,0,3,0,House,35 Anderson Avenue,INVERLOCH,3996.0,-38.629770,145.73927,Inverloch Primary School0.9 km awayPrep-6CoEdG...,"Gas*,Dishwasher*",,
13966,435.0,0,3,0,House,3 Kennards Court,INVERLOCH,3996.0,-38.626730,145.72865,Inverloch Primary School0.5 km awayPrep-6CoEdG...,"Built in wardrobes,Shed,Fully fenced",,


In [19]:
rent_data = rent_data.merge(income_data[['Code', 'Median total income (excl. Government pensions and allowances) ($)']], left_on='postcode', right_on='Code', how='left')

In [20]:
rent_data

Unnamed: 0,price,bathrooms,bedrooms,parking,type,street,suburb,postcode,latitude,longitude,school,features,SA2 code,Total_population,Code,Median total income (excl. Government pensions and allowances) ($)
0,725.0,0,2,1,Apartment / Unit / Flat,26/418 St Kilda Road,MELBOURNE,3000.0,-37.821404,144.96938,South Melbourne Primary School1.6 km awayPrep-...,,,,,
1,675.0,0,2,1,Apartment / Unit / Flat,3712/80 ABeckett Street,MELBOURNE,3000.0,-37.809000,144.96107,Carlton Gardens Primary School1.1 km awayPrep-...,"Dishwasher,Split System Heating,Split System A...",,,,
2,675.0,0,2,0,Apartment / Unit / Flat,901/422 Collins Street,MELBOURNE,3000.0,-37.817097,144.96014,Docklands Primary School1.7 km awayPrep-6CoEdG...,"Built in wardrobes,Dishwasher,Floorboards,Inte...",,,,
3,650.0,0,2,1,Apartment / Unit / Flat,"3002B/11 Rose Lane,",MELBOURNE,3000.0,-37.815487,144.95422,Docklands Primary School1.1 km awayPrep-6CoEdG...,,,,,
4,620.0,0,1,0,Apartment / Unit / Flat,2313/250 Elizabeth Street,MELBOURNE,3000.0,-37.812550,144.96275,Carlton Gardens Primary School1.3 km awayPrep-...,"Gym,Intercom,Built in wardrobes,Internal Laund...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13969,330.0,0,2,1,House,7A Mckenzie Street,WONTHAGGI,3995.0,-38.610584,145.59688,Bass Coast College0.1 km away7-12CoEdGovernmen...,,,,,
13970,77.0,0,1,0,Apartment / Unit / Flat,120-127 Mc Kenzie Street,WONTHAGGI,3995.0,-38.597340,145.59306,Wonthaggi North Primary School0.6 km awayPrep-...,,,,,
13971,450.0,0,3,0,House,35 Anderson Avenue,INVERLOCH,3996.0,-38.629770,145.73927,Inverloch Primary School0.9 km awayPrep-6CoEdG...,"Gas*,Dishwasher*",,,,
13972,435.0,0,3,0,House,3 Kennards Court,INVERLOCH,3996.0,-38.626730,145.72865,Inverloch Primary School0.5 km awayPrep-6CoEdG...,"Built in wardrobes,Shed,Fully fenced",,,,


# 1. Predicting Rent Price - Identify Key Features

In [21]:
# Encode categorical variables
rent_data = pd.get_dummies(rent_data, columns=['type', 'suburb'], drop_first=True)

# Select features and target variable
#'Total_population', 'Median total income (excl. Government pensions and allowances) ($)'
features = ['bathrooms', 'bedrooms', 'parking'] + [col for col in rent_data.columns if 'type_' in col or 'suburb_' in col]
target = 'price'

# Handle missing data
rent_data_miss = rent_data.dropna(subset=features)

In [22]:
rent_data.isna()['Median total income (excl. Government pensions and allowances) ($)']

0        True
1        True
2        True
3        True
4        True
         ... 
13969    True
13970    True
13971    True
13972    True
13973    True
Name: Median total income (excl. Government pensions and allowances) ($), Length: 13974, dtype: bool

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(rent_data[features], rent_data[target], test_size=0.3, random_state=42)

# Train a Random Forest model to identify important features
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [24]:
# feature importance
feature_importance = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print("Top features predicting rent price:")
print(feature_importance.head(10))

Top features predicting rent price:
bedrooms                        0.215158
parking                         0.154826
suburb_ECHUCA                   0.053908
suburb_POINT COOK               0.027007
type_Apartment / Unit / Flat    0.024882
suburb_MELBOURNE                0.016147
type_House                      0.013307
suburb_MALVERN                  0.009738
suburb_SOUTH YARRA              0.009327
suburb_BRIGHTON                 0.008499
dtype: float64
