In [None]:
import pandas as pd

file_path = '../data/curated/external/final_file.csv'
df = pd.read_csv(file_path)


def _growth_rate_pct(current, previous):
    """Return the element-wise percentage change from `previous` to `current`.

    Rows whose `previous` value is 0 yield NaN instead of +/-inf (or an
    undefined 0/0), so the `.mean()` aggregations further down, which skip
    NaN by default, are not turned into inf by a single zero base value.
    """
    base = previous.where(previous != 0)  # zero base -> NaN
    return ((current - base) / base) * 100


# Year-over-year population and crime growth rates (percentage)
df['population_growth_rate_21_22'] = _growth_rate_pct(df['2022_population'], df['2021_population'])
df['population_growth_rate_22_23'] = _growth_rate_pct(df['2023_population'], df['2022_population'])

df['crime_growth_rate_21_22'] = _growth_rate_pct(df['2022crime'], df['2021crime'])
df['crime_growth_rate_22_23'] = _growth_rate_pct(df['2023crime'], df['2022crime'])

# Per-row average of the two yearly growth rates (2021-2023)
df['average_population_growth_rate_21_23'] = (df['population_growth_rate_21_22'] + df['population_growth_rate_22_23']) / 2
df['average_crime_growth_rate_21_23'] = (df['crime_growth_rate_21_22'] + df['crime_growth_rate_22_23']) / 2

# Mean of the per-row averages across all rows, for population and for crime
# (rows with an undefined growth rate are NaN and are skipped by .mean())
average_population_growth_rate_21_23 = df['average_population_growth_rate_21_23'].mean()
average_crime_growth_rate_21_23 = df['average_crime_growth_rate_21_23'].mean()



In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the dataset and discard every row that has a missing value in any column
file_path = '../data/curated/external/final_file.csv'
df = pd.read_csv(file_path).dropna()

# Columns used as model inputs (X); 'price' is the regression target (y)
feature_columns = [
    'beds', 'baths', 'parking',
    'minimum_distance_station', 'minimum_distance_school',
    'minimum_distance_police', 'minimum_distance_supermarket',
    'minimum_distance_library', 'minimum_distance_gym',
    'minimum_distance_cbd',
    '2022_population', '2023_population',
    'ERP change %', 'Net overseas migration',
    'Population density 2023 (persons/km2)',
    'Median_tot_prsnl_inc_weekly',
    '2021_population',
    'Mar 2021', 'Jun 2021', 'Sep 2021', 'Dec 2021',
    'Mar 2022', 'Jun 2022', 'Sep 2022', 'Dec 2022',
    'Mar 2023',
    'Number_of_Schools',
    '2021crime', '2022crime', '2023crime',
    'Median_Age',
    'People aged 0-14 years', 'People aged 15-64 years',
    'People aged 65 years and over',
]
X = df[feature_columns]
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only,
# and the test split is transformed with those same training statistics
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base estimator; its hyperparameters are selected by the grid search below
svr = SVR()

# Hyperparameter grid, split per kernel so that each kernel is only crossed
# with the parameters it actually uses: 'gamma' has no effect on the linear
# kernel and 'degree' only applies to the 'poly' kernel. A single flat grid
# would refit identical linear/rbf models for every gamma/degree value.
C_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto']
param_grid = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values},
    {'kernel': ['poly'], 'C': C_values, 'gamma': gamma_values, 'degree': [2, 3, 4]},
]

# Hyperparameter tuning with 3-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

# Fit every candidate on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Keep the estimator refitted with the best-scoring parameters
best_model = grid_search.best_estimator_

def _regression_metrics(y_true, y_pred):
    """Return the tuple (MSE, RMSE, MAE, R²) for the given true and predicted targets."""
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


# Predict on both splits with the tuned model
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)

# Evaluate the model on the training and the testing split
mse_train, rmse_train, mae_train, r2_train = _regression_metrics(y_train, y_train_pred)
mse_test, rmse_test, mae_test, r2_test = _regression_metrics(y_test, y_test_pred)

# Report both sets of metrics, rounded to four decimals
print("Evaluation indicators on the training set:")
print(f"MSE: {mse_train:.4f}, RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}, R²: {r2_train:.4f}")

print("Evaluation indicators on the testing set:")
print(f"MSE: {mse_test:.4f}, RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}")
