In [39]:
import pandas as pd
import numpy as np

In [40]:
# read in train data and data for predictions
train = pd.read_csv("../data/development/train.csv")
test = pd.read_csv("../data/development/test.csv")

In [41]:
train.head()

Unnamed: 0.1,Unnamed: 0,lga_name,lga_code,name,geometry,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
0,0,alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,592.342001
1,1,ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,579.692855
2,2,ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,617.844766
3,3,banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,762.060273
4,4,bass coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,528.836271


In [42]:
train_df = train.drop(columns=['Unnamed: 0', 'name', 'geometry', 'lga_code'])
test_df = test.drop(columns=['Unnamed: 0', 'name', 'geometry', 'lga_code'])

In [43]:
# check for null values
if train_df.isnull().any().any():
    print("The DataFrame has null values.")
else:
    print("The DataFrame does not have any null values.")

# To see which columns have null values
null_columns = train_df.columns[train_df.isnull().any()]
print("Columns with null values:", null_columns)

# To see the count of null values in each column
null_counts = train_df[null_columns].isnull().sum()
print("\nCount of null values in each column:")
print(null_counts)

The DataFrame has null values.
Columns with null values: Index(['population'], dtype='object')

Count of null values in each column:
population    7
dtype: int64


In [44]:
# inspect null values
rows_with_null = train_df[train_df.isnull().any(axis=1)]

rows_with_null

Unnamed: 0,lga_name,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
60,queenscliffe,360,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2017,152,,700.827647
139,queenscliffe,320,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2018,115,,731.981209
218,queenscliffe,375,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2019,321,,764.519626
297,queenscliffe,350,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2020,120,,798.504457
376,queenscliffe,-,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2021,104,,834.0
455,queenscliffe,410,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2022,137,,871.07341
534,queenscliffe,420.0,2.0,1.0,1.0,34.58,79.336667,19.263333,74.481667,1.67,4.243333,96.275,2023,56,,909.794826


In [45]:
# remove queenscliffe
remove_list = [
    "queenscliffe"
]

# Remove rows with LGA_NAME23 in remove_list from train_df
train_df = train_df[~train_df['lga_name'].isin(remove_list)]

# Remove rows with LGA_NAME23 in remove_list from test_df (if needed)
test_df = test_df[~test_df['lga_name'].isin(remove_list)]


In [46]:
train_df.dtypes

lga_name                object
cost                    object
beds                   float64
baths                  float64
parkings               float64
nearest_station        float64
nearest_park           float64
nearest_shop           float64
nearest_hospital       float64
nearest_school         float64
nearest_supermarket    float64
distance_to_cbd        float64
year                     int64
offence_count            int64
population             float64
weekly_income          float64
dtype: object

In [47]:
# we noticed that there is one row with cost '-'
filtered_df = train_df[train_df['cost'] == '-']
filtered_df

Unnamed: 0,lga_name,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
229,west wimmera,-,1.0,1.0,0.0,78.161667,999.0,76.206667,98.705,1.476667,0.196667,999.0,2019,160,2702.0,639.027371


In [48]:
# to handle this value, we used the average of 2018 and 2020 to replace it
average_cost = train_df[(train_df['lga_name'] == 'west wimmera') & (train_df['year'].isin([2018, 2020]))]['cost'].astype(float).mean()

In [49]:
train_df.loc[(train_df['lga_name'] == 'west wimmera') & (train_df['year'] == 2019), 'cost'] = average_cost

# Feature Selection

In [50]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# One-hot encoding
df_encoded = pd.get_dummies(train_df, columns=['lga_name'], drop_first=True)

# Log transformation for numerical columns (excluding 'cost' and year as you might not want to log-transform year)
numerical_cols = ['nearest_station', 'nearest_park', 'nearest_shop', 'nearest_hospital', 'nearest_school', 'nearest_supermarket', 'distance_to_cbd', 'offence_count', 'population', 'weekly_income']

for col in numerical_cols:
    df_encoded[col] = np.log1p(df_encoded[col])

def forward_selection(data, target):
    """
    forward selection based on AIC for all numerical features
    """
    null_X = pd.DataFrame(sm.add_constant(data.iloc[:, 0]))  # Using the first column as an initial column.
    y = target.copy()
    all_candidates = data.columns.tolist()
    
    AIC_dict = {}
    null_model = sm.OLS(y, null_X).fit()
    last_min = null_model.aic
    candidates = []

    while True:
        AIC_dict = {}
        for x in all_candidates:
            print(f"Trying feature {x}")
            
            forward_X = pd.concat([data[x], null_X], axis=1)
            model = sm.OLS(y, forward_X).fit()
            
            AIC_dict[x] = model.aic
            print(f"AIC = {model.aic}")

        min_aic = min(AIC_dict.values())
        min_aic_key = min(AIC_dict, key=AIC_dict.get)

        if min_aic < last_min:
            candidates.append(min_aic_key)
            all_candidates.remove(min_aic_key)
            last_min = min_aic
            null_X = pd.concat([data[min_aic_key], null_X], axis=1)

            print(f'Step: {len(candidates)}')
            print(candidates)
            print(f'New AIC: {min_aic}')
            print('===============')
        else:
            final_model = sm.OLS(y, null_X).fit()
            print(final_model.summary())
            break

    return candidates


# Forward selection
X_encoded = df_encoded.drop(['cost'], axis=1)

y = df_encoded['cost']
y = y.astype("float")
selected_features = forward_selection(X_encoded, y)
print(selected_features)



Trying feature beds
AIC = 6506.148993228011
Trying feature baths
AIC = 6438.388782343059
Trying feature parkings
AIC = 6504.26878056694
Trying feature nearest_station
AIC = 6438.333036976214
Trying feature nearest_park
AIC = 6399.199257064839
Trying feature nearest_shop
AIC = 6431.378613914694
Trying feature nearest_hospital
AIC = 6378.145134289424
Trying feature nearest_school
AIC = 6507.99332155896
Trying feature nearest_supermarket
AIC = 6474.762187492515
Trying feature distance_to_cbd
AIC = 6174.316347672641
Trying feature year
AIC = 6388.789518166839
Trying feature offence_count
AIC = 6354.2018458585435
Trying feature population
AIC = 6270.731327977174
Trying feature weekly_income
AIC = 5981.317009442646
Trying feature lga_name_ararat
AIC = 6504.156399083129
Trying feature lga_name_ballarat
AIC = 6507.118115706934
Trying feature lga_name_banyule
AIC = 6505.720096700724
Trying feature lga_name_bass coast
AIC = 6507.9523763310335
Trying feature lga_name_baw baw
AIC = 6508.1396977035

AIC = 5903.857102513588
Trying feature lga_name_brimbank
AIC = 5902.971487876774
Trying feature lga_name_buloke
AIC = 5899.356459491769
Trying feature lga_name_campaspe
AIC = 5902.982993192175
Trying feature lga_name_cardinia
AIC = 5900.693760197996
Trying feature lga_name_casey
AIC = 5905.191753087389
Trying feature lga_name_central goldfields
AIC = 5901.265066416849
Trying feature lga_name_colac otway
AIC = 5905.254074147912
Trying feature lga_name_corangamite
AIC = 5905.455464465902
Trying feature lga_name_darebin
AIC = 5905.44248420343
Trying feature lga_name_east gippsland
AIC = 5899.549298614237
Trying feature lga_name_frankston
AIC = 5905.013656394297
Trying feature lga_name_gannawarra
AIC = 5904.580176124286
Trying feature lga_name_glen eira
AIC = 5905.4397131746155
Trying feature lga_name_glenelg
AIC = 5905.499886094785
Trying feature lga_name_golden plains
AIC = 5905.114446170083
Trying feature lga_name_greater bendigo
AIC = 5905.121344595868
Trying feature lga_name_greater d

# Modelling include Offence Count

## train and validate

In [51]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Splitting the data based on the year
train_subset = train_df[train_df['year'] < 2023]
validation_subset = train_df[train_df['year'] == 2023]

X_train_subset = train_subset.drop(columns=['cost'])
y_train_subset = train_subset['cost']

X_val = validation_subset.drop(columns=['cost'])
y_val = validation_subset['cost']

# One-hot encoding
X_train_encoded = pd.get_dummies(X_train_subset, columns=['lga_name'], drop_first=True)
X_val_encoded = pd.get_dummies(X_val, columns=['lga_name'], drop_first=True)

numerical_cols = ['nearest_station', 'nearest_park', 'nearest_shop', 'nearest_hospital', 'nearest_school', 'nearest_supermarket', 'distance_to_cbd', 'offence_count', 'population', 'weekly_income']

# Log-transformation
for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_val_encoded[col] = np.log1p(X_val_encoded[col])


model = LinearRegression()

# Train the model
model.fit(X_train_encoded[selected_features], y_train_subset)

# Predict on the validation set
val_preds = model.predict(X_val_encoded[selected_features])

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")


Validation RMSE: 83.40485712193772


In [52]:
from sklearn.ensemble import RandomForestRegressor

# predict by random forest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

rf_regressor.fit(X_train_encoded, y_train_subset)

# Predict on the scaled validation set
val_preds = rf_regressor.predict(X_val_encoded)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")

Validation RMSE: 97.31102755913835


In [53]:
importances = rf_regressor.feature_importances_

In [54]:
# Get feature importances
importances = rf_regressor.feature_importances_

feature_names = X_train_encoded.columns.tolist()  

# Pairing the names with the importances and sort them
paired_importances = sorted(zip(importances, feature_names), reverse=True)

# Displaying the feature importances
for importance, name in paired_importances:
    print(f"{name}: {importance}")


distance_to_cbd: 0.5767331230861671
weekly_income: 0.16760999297160964
year: 0.05369612197008177
population: 0.044484362577563615
lga_name_bayside (vic.): 0.038758430951512296
nearest_station: 0.018560090825444207
nearest_school: 0.01690200471743094
nearest_shop: 0.015846776807616245
offence_count: 0.0152156977997024
nearest_supermarket: 0.011124704773181181
nearest_hospital: 0.008783321386088127
nearest_park: 0.008007600176673872
lga_name_yarra: 0.004121265735785922
beds: 0.0025724492661085645
lga_name_mount alexander: 0.00256504089678775
lga_name_mansfield: 0.0024887157671129568
parkings: 0.0018351177674680971
baths: 0.0016590872443167464
lga_name_mornington peninsula: 0.0011290880539890525
lga_name_towong: 0.0009037820009238453
lga_name_bass coast: 0.0004503276057594094
lga_name_east gippsland: 0.00039598308722135636
lga_name_surf coast: 0.0003013986495916873
lga_name_gannawarra: 0.0002921161979479181
lga_name_strathbogie: 0.00028999606253173336
lga_name_southern grampians: 0.000283

# Prediction for Offence Count in future three years

In [55]:
train_df.head()

Unnamed: 0,lga_name,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
0,alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,592.342001
1,ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,579.692855
2,ballarat,280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,617.844766
3,banyule,395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,762.060273
4,bass coast,285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,528.836271


In [56]:
from sklearn.ensemble import RandomForestRegressor


X_train_subset = train_df.drop(columns=['cost', 'offence_count'])
y_train_subset = train_df['offence_count']

X_test_subset = test_df.drop(columns=['cost', 'offence_count'])
y_test_subset = test_df['offence_count']

X_train_encoded = pd.get_dummies(X_train_subset, columns=['lga_name'], drop_first=True)
X_test_encoded = pd.get_dummies(X_test_subset, columns=['lga_name'], drop_first=True)

numerical_cols = ['nearest_station', 'nearest_park', 'nearest_shop', 'nearest_hospital', 'nearest_school', 'nearest_supermarket', 'distance_to_cbd', 'population', 'weekly_income']

for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_test_encoded[col] = np.log1p(X_test_encoded[col])

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model using the scaled data
rf_regressor.fit(X_train_encoded, y_train_subset)

# Predict on the scaled validation set
oc_preds = rf_regressor.predict(X_test_encoded)



# fill in the offence count data for future three years

In [57]:
test_df['offence_count'] = oc_preds

# Prediction

In [58]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


X_train = train_df.drop(columns=['cost'])
y_train = train_df['cost']

X_pred = test_df.drop(columns=['cost'])
y_pred = test_df['cost']

X_train_encoded = pd.get_dummies(X_train, columns=['lga_name'], drop_first=True)
X_pred_encoded = pd.get_dummies(X_pred, columns=['lga_name'], drop_first=True)

numerical_cols = ['nearest_station', 'nearest_park', 'nearest_shop', 'nearest_hospital', 'nearest_school', 'nearest_supermarket', 'distance_to_cbd', 'offence_count', 'population', 'weekly_income']

for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_pred_encoded[col] = np.log1p(X_pred_encoded[col])

model = LinearRegression()

# Train the model
model.fit(X_train_encoded[selected_features], y_train)

# Predict on the validation set
val_preds = model.predict(X_pred_encoded[selected_features])



In [59]:
test_df['cost'] = val_preds

In [60]:
train_df.to_csv("../data/scoring/past_data.csv")
test_df.to_csv("../data/scoring/future_data.csv")

# Growth Rate

In [61]:
train_df.head()

Unnamed: 0,lga_name,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
0,alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,592.342001
1,ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,579.692855
2,ballarat,280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,617.844766
3,banyule,395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,762.060273
4,bass coast,285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,528.836271


In [62]:
test_df.head()

Unnamed: 0,lga_name,cost,beds,baths,parkings,nearest_station,nearest_park,nearest_shop,nearest_hospital,nearest_school,nearest_supermarket,distance_to_cbd,year,offence_count,population,weekly_income
0,alpine,423.877335,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2024,494.82,13547.0,855.905888
1,ararat,392.328767,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2024,1425.13,11884.0,776.354437
2,ballarat,429.35355,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2024,10663.03,169198.0,853.239387
3,banyule,519.373359,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2024,7947.73,138961.0,1049.530159
4,bass coast,429.023916,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2024,3010.08,39431.0,710.418329


In [64]:
current_costs = train_df[train_df['year'] == 2023].set_index('lga_name')['cost'].astype(float).to_dict()

predicted_costs = test_df[test_df['year'] == 2025].set_index('lga_name')['cost'].astype(float).to_dict()

In [65]:
# Initialize a dictionary for growth rates
growth_rates = {}

# Calculate growth rate for each lga_name
for lga, cost_2023 in current_costs.items():
    # Ensure the LGA_NAME23 exists in both dictionaries before calculating
    if lga in predicted_costs:
        growth_rate = (predicted_costs[lga] - cost_2023) / cost_2023 * 100
        growth_rates[lga] = growth_rate

# Convert the growth_rates dictionary to a DataFrame
growth_df = pd.DataFrame(list(growth_rates.items()), columns=['lga_name', 'growth_rate'])

# Get top 10 LGA_NAME23 with the highest growth rate
top_10_lga = growth_df.nlargest(10, 'growth_rate')

print(top_10_lga)


              lga_name  growth_rate
57  northern grampians    89.624713
66              towong    60.889520
10              buloke    52.778781
0               alpine    48.513948
70        west wimmera    33.175160
63         strathbogie    33.062787
61  southern grampians    21.172683
20          gannawarra    18.417677
50           moorabool    13.602755
59            pyrenees    12.341980
