In [164]:
import pandas as pd
import numpy as np

In [165]:
train = pd.read_csv("../data/development/train.csv")
test = pd.read_csv("../data/development/test.csv")

In [166]:
train_df = train.drop(columns=['Unnamed: 0'])
test_df = test.drop(columns=['Unnamed: 0'])

In [167]:
# List of LGA_NAME23 values to remove
remove_list = [
    "East Gippsland", 
    "Queenscliffe",
    "West Wimmera"
]

# Remove rows with LGA_NAME23 in remove_list from train_df
train_df = train_df[~train_df['LGA_NAME23'].isin(remove_list)]

# Remove rows with LGA_NAME23 in remove_list from test_df (if needed)
test_df = test_df[~test_df['LGA_NAME23'].isin(remove_list)]


In [168]:
train_df.replace([np.inf, -np.inf], 999, inplace=True)

# Replace inf and -inf in test_df with 999
test_df.replace([np.inf, -np.inf], 999, inplace=True)

In [169]:
train_df

Unnamed: 0,LGA_NAME23,LGA_CODE23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,year,Offence Count,population,weekly_income
0,Alpine,20110,270,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,2017,396,13113.0,621.947682
1,Ararat,20260,260,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,2017,1249,11613.0,583.176092
2,Ballarat,20570,280,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,2017,11885,152520.0,731.935668
3,Banyule,20660,395,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,2017,9703,129192.0,573.955394
4,Bass Coast,20740,285,3.0,2.0,2.0,86.876667,32.528333,23.363333,39.832500,3.904167,7.377500,2017,2613,34166.0,549.541548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548,Wodonga,27170,450.0,3.0,2.0,2.0,10.300000,999.000000,5.999167,81.853333,2.840833,3.923333,2023,3750,74233.0,804.199699
549,Wyndham,27260,480.0,4.0,2.0,2.0,6.440000,12.302500,7.693333,13.742500,2.543333,3.651667,2023,18155,685662.0,782.273895
550,Yarra,27350,650.0,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,2023,13415,153858.0,1488.804350
551,Yarra Ranges,27450,535.0,3.0,2.0,2.0,4.250000,15.481667,6.949167,9.932500,2.348333,3.799167,2023,7130,166556.0,576.773200


In [170]:
test_df

Unnamed: 0,LGA_NAME23,LGA_CODE23,cost,beds,baths,parkings,Nearest_station,Nearest_park,Nearest_shop,Nearest_hospital,Nearest_school,Nearest_supermarket,year,Offence Count,population,weekly_income
0,Alpine,20110,,2.0,1.0,2.0,46.428333,5.346667,38.880000,29.888333,2.526667,2.446667,2024,,13547.0,899.635833
1,Ararat,20260,,3.0,1.0,1.0,2.428333,28.928333,2.536667,999.000000,1.433333,3.033333,2024,,11884.0,764.993354
2,Ballarat,20570,,3.0,2.0,2.0,7.341667,9.785000,6.483333,24.810000,2.235000,4.776667,2024,,169198.0,980.445606
3,Banyule,20660,,3.0,1.0,1.0,2.876667,11.215000,9.528333,5.353333,1.760000,2.630000,2024,,138961.0,824.611943
4,Bass Coast,20740,,3.0,2.0,2.0,86.876667,32.528333,23.363333,39.832500,3.904167,7.377500,2024,,39431.0,717.487857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,Wodonga,27170,,3.0,2.0,2.0,10.300000,999.000000,5.999167,81.853333,2.840833,3.923333,2026,,77804.0,870.207773
233,Wyndham,27260,,4.0,2.0,2.0,6.440000,12.302500,7.693333,13.742500,2.543333,3.651667,2026,,778022.0,864.232258
234,Yarra,27350,,2.0,1.0,1.0,3.283333,6.468333,4.310000,4.795000,1.603333,1.371667,2026,,164835.0,1716.588015
235,Yarra Ranges,27450,,3.0,2.0,2.0,4.250000,15.481667,6.949167,9.932500,2.348333,3.799167,2026,,171416.0,636.681223


# Modelling without Offence Count

In [171]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Splitting the data based on the year
train_subset = train_df[train_df['year'] < 2023]
validation_subset = train_df[train_df['year'] == 2023]

X_train_subset = train_subset.drop(columns=['cost', 'Offence Count'])
y_train_subset = train_subset['cost']

X_val = validation_subset.drop(columns=['cost', 'Offence Count'])
y_val = validation_subset['cost']

# Identify categorical and numerical columns
categorical_cols = [cname for cname in X_train_subset.columns if X_train_subset[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_subset.columns if X_train_subset[cname].dtype in ['int64', 'float64'] and cname != 'year']


# Manually scale data by year
X_train_scaled = X_train_subset.copy()
for year in X_train_scaled['year'].unique():
    subset_indices = X_train_scaled[X_train_scaled['year'] == year].index
    mean = X_train_subset.loc[subset_indices, numerical_cols].mean()
    std = X_train_subset.loc[subset_indices, numerical_cols].std()
    X_train_scaled.loc[subset_indices, numerical_cols] = (X_train_subset.loc[subset_indices, numerical_cols] - mean) / std

X_val_scaled = X_val.copy()
for year in X_val['year'].unique():
    subset_indices = X_val[X_val['year'] == year].index
    if year in X_train_subset['year'].unique():
        mean = X_train_subset[X_train_subset['year'] == year][numerical_cols].mean()
        std = X_train_subset[X_train_subset['year'] == year][numerical_cols].std()
        X_val_scaled.loc[subset_indices, numerical_cols] = (X_val.loc[subset_indices, numerical_cols] - mean) / std

# Preprocessing for numerical data: only impute missing values now
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: impute missing values and apply one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Model definition
model = LinearRegression()

# Main pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train the model using the scaled data
pipeline.fit(X_train_scaled, y_train_subset)

# Predict on the scaled validation set
val_preds = pipeline.predict(X_val_scaled)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")







Validation RMSE: 19967615.547751546


In [172]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Main pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_regressor)
])

# Train the model using the scaled data
pipeline.fit(X_train_scaled, y_train_subset)

# Predict on the scaled validation set
val_preds = pipeline.predict(X_val_scaled)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")


Validation RMSE: 141.36917117678283


In [176]:
from sklearn.feature_selection import mutual_info_regression
# Compute mutual information between each feature and the target
X_train_numerical = X_train_scaled[numerical_cols]

# Compute mutual information between each numerical feature and the target
mi = mutual_info_regression(X_train_numerical, y_train_subset)

# Convert to a Series for easier visualization
mi_series = pd.Series(mi, index=X_train_numerical.columns)


In [177]:
mi_series

LGA_CODE23             0.180190
beds                   0.082147
baths                  0.107758
parkings               0.038793
Nearest_station        0.117498
Nearest_park           0.206141
Nearest_shop           0.294937
Nearest_hospital       0.238535
Nearest_school         0.145024
Nearest_supermarket    0.227155
population             0.498166
weekly_income          0.376413
dtype: float64

# Modelling include Offence Count

In [178]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Splitting the data based on the year
train_subset = train_df[train_df['year'] < 2023]
validation_subset = train_df[train_df['year'] == 2023]

X_train_subset = train_subset.drop(columns=['cost'])
y_train_subset = train_subset['cost']

X_val = validation_subset.drop(columns=['cost'])
y_val = validation_subset['cost']

# Identify categorical and numerical columns
categorical_cols = [cname for cname in X_train_subset.columns if X_train_subset[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_subset.columns if X_train_subset[cname].dtype in ['int64', 'float64'] and cname != 'year']


# Manually scale data by year
X_train_scaled = X_train_subset.copy()
for year in X_train_scaled['year'].unique():
    subset_indices = X_train_scaled[X_train_scaled['year'] == year].index
    mean = X_train_subset.loc[subset_indices, numerical_cols].mean()
    std = X_train_subset.loc[subset_indices, numerical_cols].std()
    X_train_scaled.loc[subset_indices, numerical_cols] = (X_train_subset.loc[subset_indices, numerical_cols] - mean) / std

X_val_scaled = X_val.copy()
for year in X_val['year'].unique():
    subset_indices = X_val[X_val['year'] == year].index
    if year in X_train_subset['year'].unique():
        mean = X_train_subset[X_train_subset['year'] == year][numerical_cols].mean()
        std = X_train_subset[X_train_subset['year'] == year][numerical_cols].std()
        X_val_scaled.loc[subset_indices, numerical_cols] = (X_val.loc[subset_indices, numerical_cols] - mean) / std

# Preprocessing for numerical data: only impute missing values now
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: impute missing values and apply one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Model definition
model = LinearRegression()

# Main pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Train the model using the scaled data
pipeline.fit(X_train_scaled, y_train_subset)

# Predict on the scaled validation set
val_preds = pipeline.predict(X_val_scaled)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")







Validation RMSE: 21946354.016449966


In [179]:
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Main pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_regressor)
])

# Train the model using the scaled data
pipeline.fit(X_train_scaled, y_train_subset)

# Predict on the scaled validation set
val_preds = pipeline.predict(X_val_scaled)

# Compute RMSE for the validation set
rmse_val = mean_squared_error(y_val, val_preds, squared=False)
print(f"Validation RMSE: {rmse_val}")

Validation RMSE: 136.43229036392876


In [180]:
# Assuming your dataframe is named train_df

# Drop non-numeric columns for simplicity (or you can encode them if needed)
numeric_train_df = train_df.select_dtypes(include=['int64', 'float64'])

# Separate features and target
X = numeric_train_df.drop(columns=['Offence Count'])
y = numeric_train_df['Offence Count']

# Compute mutual information
mi = mutual_info_regression(X, y)

# Create a Series for visualization
mi_series = pd.Series(mi, index=X.columns)


In [181]:
mi_series

LGA_CODE23             1.775497
beds                   0.211888
baths                  0.208547
parkings               0.241151
Nearest_station        0.845504
Nearest_park           0.944114
Nearest_shop           1.552181
Nearest_hospital       0.939835
Nearest_school         1.404990
Nearest_supermarket    1.152431
year                   0.000000
population             1.574727
weekly_income          0.215681
dtype: float64