Primary School Analysis

In [9]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import cdist


# --- Data loading -------------------------------------------------------------
# Rental records come from the preprocessed training file; the school table is
# the auxiliary primary-school list.
df = pd.read_csv("./preprocessed/train.csv")
schools_df = pd.read_csv("./Dataset/auxiliary-data/sg-primary-schools.csv")
# test_df = pd.read_csv("./Dataset/test.csv")

# --- Nearest primary school per rental property -------------------------------
# Pairwise distance matrix: one row per rental property, one column per school.
# NOTE(review): this is plain Euclidean distance on raw latitude/longitude, so
# the resulting feature is expressed in degrees rather than metres/kilometres.
df_values = df[['latitude', 'longitude']].to_numpy()
schools_values = schools_df[['latitude', 'longitude']].to_numpy()
distances = cdist(df_values, schools_values, metric='euclidean')

# Column position of the closest school for every rental property.
nearest_school_indices = distances.argmin(axis=1)

# Use the argmin positions to pick out both the minimum distance and the
# matching school name, so the two new columns always describe the same school.
row_positions = np.arange(distances.shape[0])
df['nearest_school_distance'] = distances[row_positions, nearest_school_indices]
df['nearest_school_name'] = schools_df['name'].values[nearest_school_indices]

# Model inputs: the engineered school-distance column plus the location and
# flat attributes already present in the frame. The target is the monthly rent.
features = [
    'nearest_school_distance',
    'subzone',
    'region',
    'town',
    'street_name',
    'block',
    'flat_type',
    'flat_model',
    'floor_area_sqm',
]
y = df['monthly_rent']
X = df[features]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gradient-boosted tree regressor for monthly rent.
xgboost_model = XGBRegressor(
    objective='reg:squarederror',
    # Boosting schedule and tree shape.
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    gamma=0.6,
    # Row / column subsampling per tree. Both are stochastic, which is why a
    # single, unambiguous seed matters for reproducibility.
    subsample=0.6,
    colsample_bytree=0.7,
    # L1 regularisation on leaf weights.
    reg_alpha=0.00006,
    # `n_jobs` is the scikit-learn-API name for the legacy `nthread` option;
    # -1 asks for all available threads.
    n_jobs=-1,
    # Pass exactly one seed. Supplying both the legacy `seed` alias and
    # `random_state` with different values leaves the effective seed up to
    # library internals.
    # NOTE(review): 27 is the value previously supplied through `seed`, which is
    # presumably the one that took effect - confirm against the recorded score.
    random_state=27,
)

# Fit on the training split, then score on the held-out split.
xgboost_model.fit(X_train, y_train)
predictions = xgboost_model.predict(X_test)

# MSE is reported in squared units of the target (monthly_rent).
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 389085.1673830981
