In [1]:
import pandas as pd
import numpy as np



In [2]:
train = pd.read_csv("../data/development/train.csv")
test = pd.read_csv("../data/development/test.csv")

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,LGA_NAME23,LGA_CODE23,name,geometry,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,0,Alpine,20110,34 O'Donnell Avenue Myrtleford VIC 3737,POINT (146.7271339 -36.5522656),270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,621.947682
1,1,Ararat,20260,4 Grano Street Ararat VIC 3377,POINT (142.9414137 -37.2818641),260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,583.176092
2,2,Ballarat,20570,3/41 Longley Street Alfredton VIC 3350,POINT (143.8376317 -37.5653948),280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,731.935668
3,3,Banyule,20660,3 Clinton Street Heidelberg Heights VIC 3081,POINT (145.0598679 -37.7598674),395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,573.955394
4,4,Bass Coast,20740,23 Carpathia Street Coronet Bay VIC 3984,POINT (145.4968004 -38.4790177),285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,549.541548


In [4]:
train_df = train.drop(columns=['Unnamed: 0', 'name', 'geometry', 'LGA_CODE23'])
test_df = test.drop(columns=['Unnamed: 0', 'name', 'geometry', 'LGA_CODE23'])

In [5]:
# List of LGA_NAME23 values to remove
remove_list = [
    "East Gippsland", 
    "Queenscliffe",
    "West Wimmera"
]

# Remove rows with LGA_NAME23 in remove_list from train_df
train_df = train_df[~train_df['LGA_NAME23'].isin(remove_list)]

# Remove rows with LGA_NAME23 in remove_list from test_df (if needed)
test_df = test_df[~test_df['LGA_NAME23'].isin(remove_list)]


In [6]:
train_df

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,999.000000,2017,396,13113.0,621.947682
1,Ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,999.000000,2017,1249,11613.0,583.176092
2,Ballarat,280,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,999.000000,2017,11885,152520.0,731.935668
3,Banyule,395,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,23.140000,2017,9703,129192.0,573.955394
4,Bass Coast,285,3.0,2.0,2.0,86.750000,32.095000,22.930000,39.630000,3.885000,7.376667,116.675000,2017,2613,34166.0,549.541548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,Wodonga,450.0,3.0,2.0,2.0,10.213333,999.000000,5.975000,81.821667,2.803333,3.890000,999.000000,2023,3750,74233.0,804.199699
549,Wyndham,480.0,4.0,2.0,2.0,6.440000,12.295000,7.668333,13.736667,2.543333,3.648333,34.695000,2023,18155,685662.0,782.273895
550,Yarra,650.0,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,9.305000,2023,13415,153858.0,1488.804350
551,Yarra Ranges,530.0,3.0,2.0,2.0,4.188333,15.453333,6.746667,9.856667,2.313333,3.796667,44.118333,2023,7130,166556.0,576.773200


# Feature Selection

In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Assuming you've loaded your dataframe as df
# One-hot encoding
df_encoded = pd.get_dummies(train_df, columns=['LGA_NAME23'], drop_first=True)

# Log transformation for numerical columns (excluding 'cost' and year as you might not want to log-transform year)
numerical_cols = ['Nearest_station', 'Nearest_park', 'Nearest_shop', 'Nearest_hospital', 'Nearest_school', 'Nearest_supermarket', 'distance_to_cbd', 'Offence Count', 'population', 'weekly_income']

for col in numerical_cols:
    df_encoded[col] = np.log1p(df_encoded[col])

def forward_selection(data, target):
    null_X = pd.DataFrame(sm.add_constant(data.iloc[:, 0]))  # Using the first column as an initial column.
    y = target.copy()
    all_candidates = data.columns.tolist()
    
    AIC_dict = {}
    null_model = sm.OLS(y, null_X).fit()
    last_min = null_model.aic
    candidates = []

    while True:
        AIC_dict = {}
        for x in all_candidates:
            print(f"Trying feature {x}")
            
            forward_X = pd.concat([data[x], null_X], axis=1)
            model = sm.OLS(y, forward_X).fit()
            
            AIC_dict[x] = model.aic
            print(f"AIC = {model.aic}")

        min_aic = min(AIC_dict.values())
        min_aic_key = min(AIC_dict, key=AIC_dict.get)

        if min_aic < last_min:
            candidates.append(min_aic_key)
            all_candidates.remove(min_aic_key)
            last_min = min_aic
            null_X = pd.concat([data[min_aic_key], null_X], axis=1)

            print(f'Step: {len(candidates)}')
            print(candidates)
            print(f'New AIC: {min_aic}')
            print('===============')
        else:
            final_model = sm.OLS(y, null_X).fit()
            print(final_model.summary())
            break

    return candidates


# Forward selection
X_encoded = df_encoded.drop(['cost'], axis=1)

y = df_encoded['cost']
y = y.astype("float")
selected_features = forward_selection(X_encoded, y)
print(selected_features)



Trying feature beds
AIC = 6338.113946812567
Trying feature baths
AIC = 6265.447394627321
Trying feature parkings
AIC = 6337.573971199878
Trying feature Nearest_station
AIC = 6281.67103803947
Trying feature Nearest_park
AIC = 6246.087560433344
Trying feature Nearest_shop
AIC = 6275.127808399312
Trying feature Nearest_hospital
AIC = 6222.042685364723
Trying feature Nearest_school
AIC = 6340.1124865727725
Trying feature Nearest_supermarket
AIC = 6300.5077397802415
Trying feature distance_to_cbd
AIC = 6017.3984387081555
Trying feature year
AIC = 6221.8720843778165
Trying feature Offence Count
AIC = 6198.534508147924
Trying feature population
AIC = 6118.556073855749
Trying feature weekly_income
AIC = 6004.5326214760125
Trying feature LGA_NAME23_Ararat
AIC = 6336.0263268640765
Trying feature LGA_NAME23_Ballarat
AIC = 6339.037618922087
Trying feature LGA_NAME23_Banyule
AIC = 6337.742883131028
Trying feature LGA_NAME23_Bass Coast
AIC = 6339.897952697152
Trying feature LGA_NAME23_Baw Baw
AIC = 

In [8]:
test_df

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,999.000000,2024,,13547.0,899.635833
1,Ararat,,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,999.000000,2024,,11884.0,764.993354
2,Ballarat,,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,999.000000,2024,,169198.0,980.445606
3,Banyule,,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,23.140000,2024,,138961.0,824.611943
4,Bass Coast,,3.0,2.0,2.0,86.750000,32.095000,22.930000,39.630000,3.885000,7.376667,116.675000,2024,,39431.0,717.487857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,Wodonga,,3.0,2.0,2.0,10.213333,999.000000,5.975000,81.821667,2.803333,3.890000,999.000000,2026,,77804.0,870.207773
233,Wyndham,,4.0,2.0,2.0,6.440000,12.295000,7.668333,13.736667,2.543333,3.648333,34.695000,2026,,778022.0,864.232258
234,Yarra,,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,9.305000,2026,,164835.0,1716.588015
235,Yarra Ranges,,3.0,2.0,2.0,4.188333,15.453333,6.746667,9.856667,2.313333,3.796667,44.118333,2026,,171416.0,636.681223


# Modelling include Offence Count

## train and validate

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Splitting the data based on the year
train_subset = train_df[train_df['year'] < 2023]
validation_subset = train_df[train_df['year'] == 2023]

X_train_subset = train_subset.drop(columns=['cost'])
y_train_subset = train_subset['cost']

X_val = validation_subset.drop(columns=['cost'])
y_val = validation_subset['cost']

X_train_encoded = pd.get_dummies(X_train_subset, columns=['LGA_NAME23'], drop_first=True)
X_val_encoded = pd.get_dummies(X_val, columns=['LGA_NAME23'], drop_first=True)

numerical_cols = ['Nearest_station', 'Nearest_park', 'Nearest_shop', 'Nearest_hospital', 'Nearest_school', 'Nearest_supermarket', 'distance_to_cbd', 'Offence Count', 'population', 'weekly_income']

for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_val_encoded[col] = np.log1p(X_val_encoded[col])


model = LinearRegression()

# Train the model
model.fit(X_train_encoded[selected_features], y_train_subset)

# Predict on the validation set
val_preds = model.predict(X_val_encoded[selected_features])

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")


Validation RMSE: 83.6587908376676


In [10]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model using the scaled data
rf_regressor.fit(X_train_encoded, y_train_subset)

# Predict on the scaled validation set
val_preds = rf_regressor.predict(X_val_encoded)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")

Validation RMSE: 100.0058763076082


In [11]:
importances = rf_regressor.feature_importances_

In [12]:
# Get feature importances
importances = rf_regressor.feature_importances_

# If you have the list of column names (assuming you do for the encoded data), 
# you can map these importances to the respective column names to make the output more interpretable
feature_names = X_train_encoded.columns.tolist()  # Assuming X_train_encoded is a DataFrame

# Pairing the names with the importances and sort them
paired_importances = sorted(zip(importances, feature_names), reverse=True)

# Displaying the feature importances
for importance, name in paired_importances:
    print(f"{name}: {importance}")


distance_to_cbd: 0.5652200956430959
weekly_income: 0.1331374298606711
year: 0.07199201189824096
LGA_NAME23_Bayside (Vic.): 0.05172293660276203
population: 0.04388025311415424
Nearest_shop: 0.018723336305883586
Nearest_park: 0.017747136531819262
Offence Count: 0.01758576360483706
Nearest_supermarket: 0.01626118981357134
Nearest_station: 0.015058893017564698
Nearest_school: 0.013364236679688621
Nearest_hospital: 0.011068730120021358
LGA_NAME23_Surf Coast: 0.005076936920400503
LGA_NAME23_Yarra: 0.002979391608381732
LGA_NAME23_Mornington Peninsula: 0.0020927012756648423
LGA_NAME23_Mansfield: 0.0017548957377025568
baths: 0.0016049479362139271
LGA_NAME23_Mount Alexander: 0.0012635035426012485
beds: 0.0011636470358177963
parkings: 0.001058468335043381
LGA_NAME23_Hindmarsh: 0.0009002047518667689
LGA_NAME23_Greater Dandenong: 0.0006619866916063072
LGA_NAME23_Yarra Ranges: 0.0004932723641501587
LGA_NAME23_Warrnambool: 0.00041675749326347387
LGA_NAME23_Manningham: 0.00040883241851138243
LGA_NAME2

# Prediction for Offence Count in future three years

In [13]:
train_df.head()

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,621.947682
1,Ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,583.176092
2,Ballarat,280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,731.935668
3,Banyule,395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,573.955394
4,Bass Coast,285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,549.541548


In [14]:
from sklearn.ensemble import RandomForestRegressor


X_train_subset = train_df.drop(columns=['cost', 'Offence Count'])
y_train_subset = train_df['Offence Count']

X_test_subset = test_df.drop(columns=['cost', 'Offence Count'])
y_test_subset = test_df['Offence Count']

X_train_encoded = pd.get_dummies(X_train_subset, columns=['LGA_NAME23'], drop_first=True)
X_test_encoded = pd.get_dummies(X_test_subset, columns=['LGA_NAME23'], drop_first=True)

numerical_cols = ['Nearest_station', 'Nearest_park', 'Nearest_shop', 'Nearest_hospital', 'Nearest_school', 'Nearest_supermarket', 'distance_to_cbd', 'population', 'weekly_income']

for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_test_encoded[col] = np.log1p(X_test_encoded[col])

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model using the scaled data
rf_regressor.fit(X_train_encoded, y_train_subset)

# Predict on the scaled validation set
oc_preds = rf_regressor.predict(X_test_encoded)



# fill in the offence count data for future three years

In [15]:
test_df['Offence Count'] = oc_preds

In [16]:
train_df.head()

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,621.947682
1,Ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,583.176092
2,Ballarat,280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,731.935668
3,Banyule,395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,573.955394
4,Bass Coast,285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,549.541548


In [17]:
test_df

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,999.000000,2024,494.28,13547.0,899.635833
1,Ararat,,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,999.000000,2024,1434.46,11884.0,764.993354
2,Ballarat,,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,999.000000,2024,10836.13,169198.0,980.445606
3,Banyule,,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,23.140000,2024,8042.20,138961.0,824.611943
4,Bass Coast,,3.0,2.0,2.0,86.750000,32.095000,22.930000,39.630000,3.885000,7.376667,116.675000,2024,3005.98,39431.0,717.487857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,Wodonga,,3.0,2.0,2.0,10.213333,999.000000,5.975000,81.821667,2.803333,3.890000,999.000000,2026,3007.73,77804.0,870.207773
233,Wyndham,,4.0,2.0,2.0,6.440000,12.295000,7.668333,13.736667,2.543333,3.648333,34.695000,2026,14655.10,778022.0,864.232258
234,Yarra,,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,9.305000,2026,12934.89,164835.0,1716.588015
235,Yarra Ranges,,3.0,2.0,2.0,4.188333,15.453333,6.746667,9.856667,2.313333,3.796667,44.118333,2026,8250.71,171416.0,636.681223


# Prediction

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


X_train = train_df.drop(columns=['cost'])
y_train = train_df['cost']

X_pred = test_df.drop(columns=['cost'])
y_pred = test_df['cost']

X_train_encoded = pd.get_dummies(X_train, columns=['LGA_NAME23'], drop_first=True)
X_pred_encoded = pd.get_dummies(X_pred, columns=['LGA_NAME23'], drop_first=True)

numerical_cols = ['Nearest_station', 'Nearest_park', 'Nearest_shop', 'Nearest_hospital', 'Nearest_school', 'Nearest_supermarket', 'distance_to_cbd', 'Offence Count', 'population', 'weekly_income']

for col in numerical_cols:
    X_train_encoded[col] = np.log1p(X_train_encoded[col])
    X_pred_encoded[col] = np.log1p(X_pred_encoded[col])

model = LinearRegression()

# Train the model
model.fit(X_train_encoded[selected_features], y_train)

# Predict on the validation set
val_preds = model.predict(X_pred_encoded[selected_features])



In [19]:
test_df['cost'] = val_preds

In [20]:
train_df.to_csv("../data/scoring/past_data.csv")
test_df.to_csv("../data/scoring/future_data.csv")

# Growth Rate

In [21]:
train_df.head()

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,270,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2017,396,13113.0,621.947682
1,Ararat,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2017,1249,11613.0,583.176092
2,Ballarat,280,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2017,11885,152520.0,731.935668
3,Banyule,395,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2017,9703,129192.0,573.955394
4,Bass Coast,285,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2017,2613,34166.0,549.541548


In [22]:
test_df.head()

Unnamed: 0,LGA_NAME23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,distance_to_cbd,year,Offence Count,population,weekly_income
0,Alpine,416.624783,2.0,1.0,2.0,46.428333,5.346667,38.88,29.888333,2.526667,2.446667,999.0,2024,494.28,13547.0,899.635833
1,Ararat,386.201873,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.0,1.433333,3.033333,999.0,2024,1434.46,11884.0,764.993354
2,Ballarat,424.105138,3.0,2.0,2.0,7.341667,9.785,6.483333,24.81,2.235,4.776667,999.0,2024,10836.13,169198.0,980.445606
3,Banyule,526.697982,3.0,1.0,1.0,2.876667,11.215,9.528333,5.353333,1.76,2.63,23.14,2024,8042.2,138961.0,824.611943
4,Bass Coast,450.009746,3.0,2.0,2.0,86.75,32.095,22.93,39.63,3.885,7.376667,116.675,2024,3005.98,39431.0,717.487857


In [29]:
current_costs = train_df[train_df['year'] == 2023].set_index('LGA_NAME23')['cost'].astype(float).to_dict()

predicted_costs = test_df[test_df['year'] == 2025].set_index('LGA_NAME23')['cost'].astype(float).to_dict()


In [30]:
# Initialize a dictionary for growth rates
growth_rates = {}

# Calculate growth rate for each LGA_NAME23
for lga, cost_2023 in current_costs.items():
    # Ensure the LGA_NAME23 exists in both dictionaries before calculating
    if lga in predicted_costs:
        growth_rate = (predicted_costs[lga] - cost_2023) / cost_2023 * 100
        growth_rates[lga] = growth_rate

# Convert the growth_rates dictionary to a DataFrame
growth_df = pd.DataFrame(list(growth_rates.items()), columns=['LGA_NAME23', 'growth_rate'])

# Get top 10 LGA_NAME23 with the highest growth rate
top_10_lga = growth_df.nlargest(10, 'growth_rate')

print(top_10_lga)


            LGA_NAME23  growth_rate
56  Northern Grampians    88.508752
65              Towong    60.099057
10              Buloke    52.740218
0               Alpine    45.982079
62         Strathbogie    30.111469
60  Southern Grampians    22.070891
19          Gannawarra    21.045136
14  Central Goldfields    12.726634
15         Colac Otway    12.507342
5              Baw Baw    12.338420
