# Preprocessing - Complete Dataset

## Landing Layer

In [2]:
import pandas as pd

In [7]:
# Load the complete dataset
rental_df_landing = pd.read_csv('../../data/landing/rental_df_landing.csv')

rental_df_landing = rental_df_landing.rename(columns={'distance/米': 'distance/m'})

## Raw Layer

In [8]:
import numpy as np
import pandas as pd

- range

In [9]:
# Step 1: Apply range check for bedroom, bathroom, and parking on the complete dataset
rental_df_landing['bedroom'] = rental_df_landing['bedroom'].apply(lambda x: x if 1 <= x <= 6 else np.nan)
rental_df_landing['bathroom'] = rental_df_landing['bathroom'].apply(lambda x: x if 1 <= x <= 8 else np.nan)
rental_df_landing['parking'] = rental_df_landing['parking'].apply(lambda x: x if 0 <= x <= 5 else np.nan)

- duplicates

In [11]:
# Load train and test datasets containing the education index
train_df_schoolRelated_w_educationIndex = pd.read_csv('../../data/curated/train_df_schoolRelated_curated.csv')
test_df_schoolRelated_w_educationIndex = pd.read_csv('../../data/curated/test_df_schoolRelated_curated.csv')

# Separate the train and test datasets from rental_df_landing based on 'id'
rental_df_landing_train = rental_df_landing[rental_df_landing['id'].isin(train_df_schoolRelated_w_educationIndex['id'])]
rental_df_landing_test = rental_df_landing[rental_df_landing['id'].isin(test_df_schoolRelated_w_educationIndex['id'])]

# Perform left merge to include the education index in the train dataset
rental_df_landing_train = pd.merge(
    rental_df_landing_train, 
    train_df_schoolRelated_w_educationIndex[['id', 'educationIndex']], 
    on='id', 
    how='left'
)

# Perform left merge to include the education index in the test dataset
rental_df_landing_test = pd.merge(
    rental_df_landing_test, 
    test_df_schoolRelated_w_educationIndex[['id', 'educationIndex']], 
    on='id', 
    how='left'
)

# Replace the 'rent' column in the test dataset with values from test_df_schoolRelated_w_educationIndex
rental_df_landing_train['rent'] = rental_df_landing_train['id'].map(
    train_df_schoolRelated_w_educationIndex.set_index('id')['rent']
)

rental_df_landing_test['rent'] = rental_df_landing_test['id'].map(
    test_df_schoolRelated_w_educationIndex.set_index('id')['rent']
)

In [12]:
# Define the school-related columns to drop
school_related_columns_to_drop = ['educationLevel', 'name', 'distance/m', 'year', 'gender', 'get_type']

# Drop school-related columns from the train dataset
rental_df_landing_train = rental_df_landing_train.drop(columns=school_related_columns_to_drop)

rental_df_landing_test = rental_df_landing_test.drop(columns=school_related_columns_to_drop)

# Remove duplicates based on the 'id' column (keep the first occurrence) in the train dataset
rental_df_landing_train_no_duplicates = rental_df_landing_train.drop_duplicates(subset=['id'], keep='first')

rental_df_landing_test_no_duplicates = rental_df_landing_test.drop_duplicates(subset=['id'], keep='first')

- representations

In [13]:
rental_df_train_clean_represent = rental_df_landing_train_no_duplicates.copy()
rental_df_test_clean_represent = rental_df_landing_test_no_duplicates.copy()

rental_df_train_clean_represent['land'] = rental_df_train_clean_represent['land'].str.extract(r'^(\d+)').astype(float)

# Define the minimum and maximum allowed land values
min_allowed_land = 10  # Example minimum allowed value
max_allowed_land = 10000  # Example maximum allowed value

# Replace values outside the allowed range with NaN in the train dataset
rental_df_train_clean_represent['land'] = rental_df_train_clean_represent['land'].apply(lambda x: x if min_allowed_land <= x <= max_allowed_land else np.nan)

mean_land_value_train = rental_df_train_clean_represent['land'].mean()

# Fill missing values in the train dataset with the mean
rental_df_train_clean_represent['land'].fillna(mean_land_value_train, inplace=True)

rental_df_test_clean_represent['land'] = rental_df_test_clean_represent['land'].str.extract(r'^(\d+)').astype(float)

# Replace values outside the allowed range with NaN in the test dataset
rental_df_test_clean_represent['land'] = rental_df_test_clean_represent['land'].apply(lambda x: x if min_allowed_land <= x <= max_allowed_land else np.nan)

In [14]:
# Define the percentage columns
percentage_columns = ['under 20', '20-39', '40-59', '60+', 'Owner', 'Renter', 'Family', 'Single']

# Step 2: Clean percentage columns for the train dataset (remove '%', convert to int)
rental_df_train_clean_represent[percentage_columns] = rental_df_train_clean_represent[percentage_columns].apply(
    lambda x: x.str.replace('%', '').astype(int)
)

# Step 2: Clean percentage columns for the test dataset (remove '%', convert to int)
rental_df_test_clean_represent[percentage_columns] = rental_df_test_clean_represent[percentage_columns].apply(
    lambda x: x.str.replace('%', '').astype(int)
)

In [15]:
print("Unique property types in the train dataset BEFORE standardizing:")
print(rental_df_train_clean_represent['propertyType'].unique())

# Simplifying and standardizing the property types for the train dataset
rental_df_train_clean_represent['propertyType'] = rental_df_train_clean_represent['propertyType'].replace({
    'New House & Land': 'House', 
    'Apartment / Unit / Flat': 'Apartment',
    'Studio': 'Apartment',
    'New Apartments / Off the Plan': 'Apartment',
    # Add more replacements if necessary
})

rental_df_test_clean_represent['propertyType'] = rental_df_test_clean_represent['propertyType'].replace({
    'New House & Land': 'House', 
    'Apartment / Unit / Flat': 'Apartment',
    'Studio': 'Apartment',
    'New Apartments / Off the Plan': 'Apartment',
    # Add more replacements if necessary
})

print("\nUnique property types in the train dataset AFTER standardizing:")
print(rental_df_train_clean_represent['propertyType'].unique())

Unique property types in the train dataset BEFORE standardizing:
['House' 'Terrace' 'Villa' 'Semi-Detached' 'New House & Land' 'Duplex'
 'Apartment / Unit / Flat' 'Studio' 'New Apartments / Off the Plan'
 'Block of Units' 'Townhouse']

Unique property types in the train dataset AFTER standardizing:
['House' 'Terrace' 'Villa' 'Semi-Detached' 'Duplex' 'Apartment'
 'Block of Units' 'Townhouse']


In [16]:
rental_df_train_clean_represent['property'] = rental_df_train_clean_represent['property'].apply(
    lambda x: ', '.join(sorted(x.split(', '))) if pd.notna(x) else x
)

rental_df_test_clean_represent['property'] = rental_df_test_clean_represent['property'].apply(
    lambda x: ', '.join(sorted(x.split(', '))) if pd.notna(x) else x
)

- missing values

In [17]:
rental_df_train_nomiss = rental_df_train_clean_represent.copy()
rental_df_test_nomiss = rental_df_test_clean_represent.copy()

# Step 2: Convert bedroom, bathroom, and parking columns to integers if not NaN for the train dataset
rental_df_train_nomiss['bedroom'] = rental_df_train_nomiss['bedroom'].apply(lambda x: int(x) if not pd.isna(x) else x)
rental_df_train_nomiss['bathroom'] = rental_df_train_nomiss['bathroom'].apply(lambda x: int(x) if not pd.isna(x) else x)
rental_df_train_nomiss['parking'] = rental_df_train_nomiss['parking'].apply(lambda x: int(x) if not pd.isna(x) else x)

# Fill missing values in the train dataset with the mode
bedroom_mode_train = rental_df_train_nomiss['bedroom'].mode()[0]
bathroom_mode_train = rental_df_train_nomiss['bathroom'].mode()[0]
parking_mode_train = rental_df_train_nomiss['parking'].mode()[0]

rental_df_train_nomiss['bedroom'].fillna(bedroom_mode_train, inplace=True)
rental_df_train_nomiss['bathroom'].fillna(bathroom_mode_train, inplace=True)
rental_df_train_nomiss['parking'].fillna(parking_mode_train, inplace=True)

# Step 3: Convert bedroom, bathroom, and parking columns to integers if not NaN for the test dataset
rental_df_test_nomiss['bedroom'] = rental_df_test_nomiss['bedroom'].apply(lambda x: int(x) if not pd.isna(x) else x)
rental_df_test_nomiss['bathroom'] = rental_df_test_nomiss['bathroom'].apply(lambda x: int(x) if not pd.isna(x) else x)
rental_df_test_nomiss['parking'] = rental_df_test_nomiss['parking'].apply(lambda x: int(x) if not pd.isna(x) else x)

# Fill missing values in the test dataset using the mode from the train dataset
rental_df_test_nomiss['bedroom'].fillna(bedroom_mode_train, inplace=True)
rental_df_test_nomiss['bathroom'].fillna(bathroom_mode_train, inplace=True)
rental_df_test_nomiss['parking'].fillna(parking_mode_train, inplace=True)

In [18]:
# Calculate the mean land value from the train dataset
mean_land_value_train = rental_df_train_nomiss['land'].mean()

# Fill missing values in the train dataset with the mean
rental_df_train_nomiss['land'].fillna(mean_land_value_train, inplace=True)

rental_df_test_nomiss['land'].fillna(mean_land_value_train, inplace=True)

The 'property' column contains multiple labels, so missing values are filled with 'Unknown' for consistency

In [19]:
rental_df_train_nomiss['property'] = rental_df_train_nomiss['property'].fillna('Unknown')

rental_df_test_nomiss['property'] = rental_df_test_nomiss['property'].fillna('Unknown')

In [20]:
rental_df_train_nomiss = rental_df_train_nomiss.rename(columns={
    'property': 'propertyFeatures'
})

rental_df_test_nomiss = rental_df_test_nomiss.rename(columns={
    'property': 'propertyFeatures'
})

# Define columns to drop and apply it to both the train and test datasets
columns_to_drop = ['_url', 'page', 'feature', 'type', 'Available', 'Bond']

rental_df_train_nomiss = rental_df_train_nomiss.drop(columns=columns_to_drop, errors='ignore')
rental_df_test_nomiss = rental_df_test_nomiss.drop(columns=columns_to_drop, errors='ignore')

# Save the cleaned train and test datasets to CSV files
rental_df_train_nomiss.to_csv('../../data/raw/rental_df_train_raw.csv', index=False)
rental_df_test_nomiss.to_csv('../../data/raw/rental_df_test_raw.csv', index=False)

## Curated Layer

In [21]:
import time
import requests
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry import Polygon
from geopy.geocoders import Nominatim
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import cross_val_predict, KFold


### (Part 1) Feature Engineering on address

- suburb (mean encoding)

In [22]:
# Step 1: Extract suburb from the 'address' column and insert it after 'address' in the train dataset
rental_df_train_nomiss['suburb'] = rental_df_train_nomiss['address'].str.extract(r'(\d{4})$')

# Find the index of the 'address' column in the train dataset
address_idx_train = rental_df_train_nomiss.columns.get_loc('address')

# Insert 'suburb' column next to 'address'
rental_df_train_nomiss.insert(address_idx_train + 1, 'suburb', rental_df_train_nomiss.pop('suburb'))

# Step 2: Extract suburb from the 'address' column and insert it after 'address' in the test dataset
rental_df_test_nomiss['suburb'] = rental_df_test_nomiss['address'].str.extract(r'(\d{4})$')

# Find the index of the 'address' column in the test dataset
address_idx_test = rental_df_test_nomiss.columns.get_loc('address')

# Insert 'suburb' column next to 'address'
rental_df_test_nomiss.insert(address_idx_test + 1, 'suburb', rental_df_test_nomiss.pop('suburb'))

In [23]:
# Calculate the mean rent by suburb in the train dataset
suburb_mean_rent_train = rental_df_train_nomiss.groupby('suburb')['rent'].mean()

# Map the means to the 'suburb' column in both train and test datasets
rental_df_train_nomiss['suburb_encoded'] = rental_df_train_nomiss['suburb'].map(suburb_mean_rent_train)
rental_df_test_nomiss['suburb_encoded'] = rental_df_test_nomiss['suburb'].map(suburb_mean_rent_train)

# Fill NaN values in the test set (if a suburb is not in the train set) with the overall mean rent
overall_mean_rent = rental_df_train_nomiss['rent'].mean()
rental_df_test_nomiss['suburb_encoded'].fillna(overall_mean_rent, inplace=True)

# Step 1: Get the column index of 'suburb'
suburb_index_train = rental_df_train_nomiss.columns.get_loc('suburb')
suburb_index_test = rental_df_test_nomiss.columns.get_loc('suburb')

# Step 2: Drop the original 'suburb' column (for now)
rental_df_train_nomiss = rental_df_train_nomiss.drop(columns=['suburb'])
rental_df_test_nomiss = rental_df_test_nomiss.drop(columns=['suburb'])

# Step 3: Insert 'suburb_encoded' into the original position of 'suburb'
rental_df_train_nomiss.insert(suburb_index_train, 'suburb_encoded', rental_df_train_nomiss.pop('suburb_encoded'))
rental_df_test_nomiss.insert(suburb_index_test, 'suburb_encoded', rental_df_test_nomiss.pop('suburb_encoded'))

- Transport Index

Since each station serves routes in both directions, we should treat them as a single station when calculating the transport index for a location.

In [25]:
# Step 1: Load bus stop data
bus_gdf_path = '../../data/external/BUS.gdb/'
bus_gdf = gpd.read_file(bus_gdf_path)

# Step 2: Find duplicate stops based on 'STOP_NAME'
duplicate_stops = bus_gdf[bus_gdf.duplicated(subset='STOP_NAME', keep=False)]

# Step 3: Calculate the number of routes for each stop
bus_gdf['num_routes'] = bus_gdf['ROUTES_USING_STOP'].apply(lambda x: len(str(x).split(',')))  # Count routes per stop

# Step 4: Sort by 'STOP_NAME' and 'num_routes' to prioritize stops with more routes
bus_gdf_sorted = bus_gdf.sort_values(by=['STOP_NAME', 'num_routes'], ascending=[True, False])

# Step 5: Drop duplicate stops, keeping the one with the most routes
bus_gdf_cleaned = bus_gdf_sorted.drop_duplicates(subset='STOP_NAME', keep='first').reset_index(drop=True)

  result = read_func(


In [27]:
# Step 1: Load tram stop data
tram_gdf_path = '../../data/external/TRAM/'
tram_gdf = gpd.read_file(tram_gdf_path)

# Step 2: Find duplicate stops based on 'STOP_NAME'
duplicate_stops = tram_gdf[tram_gdf.duplicated(subset='STOP_NAME', keep=False)]

# Step 3: Calculate the number of routes for each tram stop
tram_gdf['num_routes'] = tram_gdf['ROUTEUSSP'].apply(lambda x: len(str(x).split(',')))

# Step 4: Sort by 'STOP_NAME' and 'num_routes' to prioritize stops with more routes
tram_gdf_sorted = tram_gdf.sort_values(by=['STOP_NAME', 'num_routes'], ascending=[True, False])

# Step 5: Drop duplicate stops, keeping the one with the most routes
tram_gdf_cleaned = tram_gdf_sorted.drop_duplicates(subset='STOP_NAME', keep='first').reset_index(drop=True)


In [28]:
train_gdf_path = '../../data/external/TRAIN/'
train_gdf = gpd.read_file(train_gdf_path)

In [29]:
tram_gdf_cleaned.rename(columns={'ROUTEUSSP': 'ROUTES_USING_STOP'}, inplace=True)


# Define Melbourne CBD coordinates and create the polygon
# Coordinates are defined in EPSG:4326
melbourne_cbd_coords = [
    (144.946457, -37.819722),  # North-West corner (near Flagstaff)
    (144.9785, -37.810087),    # North-East corner (near Parliament)
    (144.9609, -37.818257),    # South-East corner (near Southbank)
    (144.946457, -37.824722),  # South-West corner (near Southern Cross)
]
melbourne_cbd_polygon = Polygon(melbourne_cbd_coords)

# Create a GeoDataFrame for the CBD
# The CBD polygon is created in EPSG:4326 (WGS84), so we set that CRS here
melbourne_cbd_gdf = gpd.GeoDataFrame(index=[0], crs='EPSG:4326', geometry=[melbourne_cbd_polygon])

# Function to standardize and filter stops within the CBD
def filter_stops_within_cbd(stops_gdf, cbd_polygon):
    # Ensure that stops are reprojected to the same CRS as the CBD polygon (EPSG:4326)
    if stops_gdf.crs != 'EPSG:4326':
        stops_gdf = stops_gdf.to_crs('EPSG:4326')
    # Filter stops within the CBD polygon
    stops_in_cbd = stops_gdf[stops_gdf['geometry'].within(cbd_polygon)]
    return stops_in_cbd

# Load and filter bus and tram stops within the CBD
gdf_bus_stops_in_cbd = filter_stops_within_cbd(bus_gdf_cleaned, melbourne_cbd_polygon)
gdf_tram_stops_in_cbd = filter_stops_within_cbd(tram_gdf_cleaned, melbourne_cbd_polygon)

# Extract unique routes for bus and tram stops within the CBD
bus_routes_across_cbd = set(','.join(gdf_bus_stops_in_cbd['ROUTES_USING_STOP'].unique()).split(','))
tram_routes_across_cbd = set(','.join(gdf_tram_stops_in_cbd['ROUTES_USING_STOP'].unique()).split(','))

# Step 6: Print the results
print(f"Bus routes across CBD: {bus_routes_across_cbd}")
print(f"Tram routes across CBD: {tram_routes_across_cbd}")

Bus routes across CBD: {'237', '232', '350', '303', '309', '216', '220', '234', '235', '236', '250', '200', '251', '207'}
Tram routes across CBD: {'96', '11', '35', '48', '109', '59', '75', '70', '57', '12', '19', '58'}


In [30]:
def has_bus_route_across_cbd(routes):
    stop_routes = set(routes.split(','))
    return bool(stop_routes & bus_routes_across_cbd)

def has_tram_route_across_cbd(routes):
    stop_routes = set(routes.split(','))
    return bool(stop_routes & tram_routes_across_cbd)

def flag_stations(row):
    if row['HAS_ROUTE_ACROSS_CBD']:  # Already flagged as inside CBD
        return True
    elif row['STOP_ZONE'] and '1' in row['STOP_ZONE']:  # Likely connected to CBD (Zone 1) and non-None
        return True
    else:
        return np.nan  # Fill with NaN for now, to be handled later

# Apply the relevant function to bus and tram stops
bus_gdf_cleaned['HAS_ROUTE_ACROSS_CBD'] = bus_gdf_cleaned['ROUTES_USING_STOP'].apply(has_bus_route_across_cbd)
tram_gdf_cleaned['HAS_ROUTE_ACROSS_CBD'] = tram_gdf_cleaned['ROUTES_USING_STOP'].apply(has_tram_route_across_cbd)

train_gdf['HAS_ROUTE_ACROSS_CBD'] = train_gdf['geometry'].within(melbourne_cbd_polygon)

# Combine the data from all transportation modes temporarily to calculate the mode
combined_transport_gdf = pd.concat([bus_gdf_cleaned[['HAS_ROUTE_ACROSS_CBD']], 
                                    tram_gdf_cleaned[['HAS_ROUTE_ACROSS_CBD']], 
                                    train_gdf[['HAS_ROUTE_ACROSS_CBD']]])

# Get the mode of 'HAS_ROUTE_ACROSS_CBD' across all transportation modes, excluding NaN
has_route_mode_all_transport = combined_transport_gdf['HAS_ROUTE_ACROSS_CBD'].dropna().mode()[0]

# Apply the updated function to the train data
train_gdf['HAS_ROUTE_ACROSS_CBD'] = train_gdf.apply(flag_stations, axis=1)

# Now fill missing values in the train data using the mode from the combined dataset
train_gdf['HAS_ROUTE_ACROSS_CBD'].fillna(has_route_mode_all_transport, inplace=True)

# Step 5: Fill missing values in the 'HAS_ROUTE_ACROSS_CBD' column for the train data using the mode
train_gdf['HAS_ROUTE_ACROSS_CBD'].fillna(has_route_mode_all_transport, inplace=True)

#### Distance Caculation

we first need to find geometry of each property's address

In [None]:
# Initialize the Nominatim geocoder with a custom user agent
geolocator = Nominatim(user_agent="junqis")

# Function to geocode an address with a delay and handle errors
def geocode_address_with_delay(address):
    try:
        location = geolocator.geocode(address, timeout=10)  # Increased timeout
        time.sleep(1)  # Delay of 1 second to avoid rate limits
        if location:
            return location.latitude, location.longitude
        else:
            return None, None
    except Exception as e:
        print(f"Error geocoding {address}: {e}")
        return None, None

# Step 1: Load or prepare train and test datasets
train_df = rental_df_train_nomiss.copy()
test_df = rental_df_test_nomiss.copy()

# Step 2: Concatenate train and test datasets into one combined dataframe
combined_df = pd.concat([train_df, test_df], axis=0)

# Initialize latitude and longitude columns as None if they do not exist
combined_df[['latitude', 'longitude']] = None

# Step 3: Iterate over rows and geocode addresses for the combined dataframe
for idx, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):  # Geocode only if lat/lon is missing
        lat, lon = geocode_address_with_delay(row['address'])
        combined_df.at[idx, 'latitude'] = lat
        combined_df.at[idx, 'longitude'] = lon

# combined_df.to_csv('../../data/distance/rental_df_w_geometry.csv')

In [32]:
rental_df_w_geometry = pd.read_csv('../../data/distance/rental_df_w_geometry.csv')

rental_df_w_geometry['geometry'] = rental_df_w_geometry.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

Once the geometry is determined, we can then find the nearest transportation stops from the properties using straight-line distance.

In [33]:
# Function to find the n nearest stations for all properties in one go (vectorized)
def find_nearest_stations(property_gdf, stops_gdf, n=5, transport_type='Bus'):
    expanded_data = []
    
    # For each property, calculate the distance to all stops at once (vectorized)
    for idx, property_row in property_gdf.iterrows():
        property_geom = property_row['geometry']
        
        # Calculate distances to all stops at once (vectorized operation)
        stops_gdf['distance'] = stops_gdf['geometry'].distance(property_geom)
        
        # Filter out invalid distances (NaN or inf)
        valid_stops = stops_gdf[stops_gdf['distance'].notna() & (stops_gdf['distance'] != float('inf'))]
        
        # Sort the stops by distance and take the top n nearest stops
        nearest_stops = valid_stops.nsmallest(n, 'distance')
        
        # Fill in missing stops with None if fewer than n stops are found
        for i in range(n):
            if i < len(nearest_stops):
                stop_row = nearest_stops.iloc[i]
                expanded_data.append({
                    'id': property_row['id'],
                    'rent': property_row['rent'],
                    'address': property_row['address'],
                    'nearest_stop_id': stop_row.get('STOP_ID', None),
                    'nearest_stop_name': stop_row.get('STOP_NAME', None),
                    'nearest_stop_has_route_across_cbd': stop_row.get('HAS_ROUTE_ACROSS_CBD', None),
                    'nearest_stop_distance': stop_row['distance'],
                    'transport_type': transport_type  # Include the transportation type
                })
            else:
                # Fill in None for missing stop values if fewer than n stops are found
                expanded_data.append({
                    'id': property_row['id'],
                    'rent': property_row['rent'],
                    'address': property_row['address'],
                    'nearest_stop_id': None,
                    'nearest_stop_name': None,
                    'nearest_stop_has_route_across_cbd': None,
                    'nearest_stop_distance': None,
                    'transport_type': transport_type  # Include the transportation type
                })
    
    return expanded_data

train_gdf['STOP_ID'] = range(1, len(train_gdf) + 1)

# Prepare GeoDataFrames for bus, tram, and train (already projected to EPSG:3857)
property_gdf = gpd.GeoDataFrame(rental_df_w_geometry, geometry='geometry', crs="EPSG:4326").to_crs(epsg=3857)
busStop_gdf = gpd.GeoDataFrame(bus_gdf_cleaned, geometry='geometry', crs="EPSG:4326").to_crs(epsg=3857)
tram_gdf = gpd.GeoDataFrame(tram_gdf_cleaned, geometry='geometry', crs="EPSG:4326").to_crs(epsg=3857)
train_gdf = gpd.GeoDataFrame(train_gdf.rename(columns={'STATION': 'STOP_NAME'}), geometry='geometry', crs="EPSG:4326").to_crs(epsg=3857)

In [83]:
nearest_bus = find_nearest_stations(property_gdf, busStop_gdf, n=5, transport_type='Bus')

nearest_tram = find_nearest_stations(property_gdf, tram_gdf, n=5, transport_type='Tram')

nearest_train = find_nearest_stations(property_gdf, train_gdf, n=5, transport_type='Train')

nearest_stops = pd.DataFrame(nearest_bus + nearest_tram + nearest_train)

In [100]:
nearest_stops.to_csv('../../data/distance/nearest_stops.csv')

Now, using the information obtained from the straight-line distance, we can refine the search to find the nearest transportation stops to the properties based on route distance.

In [None]:
# Function to get route distances using Google Distance Matrix API
def get_route_distances(origin, destinations, api_key):
    destination_string = '|'.join(destinations)  # Join destinations into a string for API request
    
    url = (
        f"https://maps.googleapis.com/maps/api/distancematrix/json"
        f"?origins={origin}"
        f"&destinations={destination_string}"
        f"&key={api_key}"
    )
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful (HTTP 200)
        
        data = response.json()
        
        # Initialize route distances list
        route_distances = []
        
        # Check if 'rows' exists in the response
        if 'rows' in data and 'elements' in data['rows'][0]:
            for element in data['rows'][0]['elements']:
                if element['status'] == 'OK':
                    route_distances.append(element['distance']['value'])  # Distance in meters
                else:
                    route_distances.append(None)  # Handle cases where distance isn't available
        else:
            print("Warning: No valid distance data returned.")
            route_distances = [None] * len(destinations)  # Fill with None if no valid response
        
        return route_distances
    
    except requests.exceptions.RequestException as e:
        # Handle any HTTP errors, network errors, etc.
        print(f"Error fetching data from API: {e}")
        return [None] * len(destinations)

# Group the data by 'id' and 'address' and aggregate the list of nearest stop names
rental_df_perProperty = nearest_stops.groupby(['id', 'address'])['nearest_stop_name'].apply(list).reset_index()

# Initialize an empty list to store the results
to_stop_routeDistances = []

# Loop through each property and calculate route distances to its nearest stops
for index, property in rental_df_perProperty.iterrows():
    property_address = property['address']  # The address of the property
    stop_names = property['nearest_stop_name']  # List of nearest stop names

    # Get distances using the Google API
    distances = get_route_distances(property_address, stop_names, api_key='YOUR_API_KEY_HERE')  # Replace with your API key

    # Check if the lengths of stop_names and distances match
    if len(distances) != len(stop_names):
        print(f"Warning: Mismatch between stop names and distances for property: {property_address}. Skipping this property.")
        continue  # Skip this property if there's a mismatch

    # If distances were successfully calculated, store the results
    if distances:
        for i, stop in enumerate(stop_names):
            distance = {
                'id': property['id'],
                'address': property_address,
                'stop_name': stop,
                'route_distance/m': distances[i] if distances[i] is not None else None  # Handle missing distances
            }
            to_stop_routeDistances.append(distance)

# Save the route distances to a CSV file
file_path = '../../data/distance/to_stop_routeDistances.csv'
to_stop_routeDistances_df = pd.DataFrame(to_stop_routeDistances)
to_stop_routeDistances_df.to_csv(file_path, index=False)

In [85]:
nearest_stops = pd.read_csv('../../data/distance/nearest_stops.csv')
to_stop_distance = pd.read_csv('../../data/distance/to_stop_routeDistances.csv')

In [97]:
# Remove duplicate rows based on 'id', 'address', and 'stop_name'
to_stop_distance = to_stop_distance.drop_duplicates(subset=['id', 'address', 'stop_name'])

to_stop_distance.reset_index(drop=True, inplace=True)

# Add an 'order' column to the 'all_transportations' DataFrame to preserve original order after merging
nearest_stops['order'] = nearest_stops.index

# Step 2: Merge the transport data with the route distance data
# Merging on 'id', 'address', and 'nearest_stop_name' from 'all_transportations' 
# with 'id', 'address', and 'stop_name' from 'to_stop_distance'
nearest_stops_w_stop_routeDistances = pd.merge(
    nearest_stops, 
    to_stop_distance[['id', 'address', 'stop_name', 'route_distance/m']], 
    left_on=['id', 'address', 'nearest_stop_name'],  # Matching 'nearest_stop_name' with 'stop_name'
    right_on=['id', 'address', 'stop_name'], 
    how='left'  # Keeping all records from 'all_transportations'
)

# Sort the merged DataFrame based on the original 'order' and reset the index
nearest_stops_w_stop_routeDistances = nearest_stops_w_stop_routeDistances.sort_values(by='order').reset_index(drop=True)

nearest_stops_w_stop_routeDistances = nearest_stops_w_stop_routeDistances.drop(columns=['order'])

nearest_stops = nearest_stops.drop(columns=['order'], errors='ignore')

#### Stops Scoring

In [87]:
train_ids = rental_df_train_nomiss['id'].unique()  # Using 'rental_df_train_nomiss' for train IDs
test_ids = rental_df_test_nomiss['id'].unique()    

# Split transportation data into training and testing sets
nearest_stops_train_w_stop_routeDistances = nearest_stops_w_stop_routeDistances[nearest_stops_w_stop_routeDistances['id'].isin(train_ids)]
nearest_stops_test_w_stop_routeDistances = nearest_stops_w_stop_routeDistances[nearest_stops_w_stop_routeDistances['id'].isin(test_ids)]

In [88]:
# Calculate 'distance_ratio' for both train and test datasets
nearest_stops_train_w_stop_routeDistances['distance_ratio'] = nearest_stops_train_w_stop_routeDistances.apply(
    lambda row: row['route_distance/m'] / row['nearest_stop_distance'] 
    if not pd.isna(row['route_distance/m']) else np.nan, axis=1
)

nearest_stops_test_w_stop_routeDistances['distance_ratio'] = nearest_stops_test_w_stop_routeDistances.apply(
    lambda row: row['route_distance/m'] / row['nearest_stop_distance'] 
    if not pd.isna(row['route_distance/m']) else np.nan, axis=1
)

# Apply upper and lower ratio thresholds for both train and test
upper_ratio_threshold = 5
lower_ratio_threshold = 0.2

# For train
nearest_stops_train_w_stop_routeDistances.loc[
    (nearest_stops_train_w_stop_routeDistances['distance_ratio'] > upper_ratio_threshold) |
    (nearest_stops_train_w_stop_routeDistances['distance_ratio'] < lower_ratio_threshold), 
    ['route_distance/m', 'distance_ratio']] = np.nan

# For test
nearest_stops_test_w_stop_routeDistances.loc[
    (nearest_stops_test_w_stop_routeDistances['distance_ratio'] > upper_ratio_threshold) |
    (nearest_stops_test_w_stop_routeDistances['distance_ratio'] < lower_ratio_threshold), 
    ['route_distance/m', 'distance_ratio']] = np.nan

# Calculate property-level and dataset-wide average ratios for the train dataset
property_avg_ratios_train = nearest_stops_train_w_stop_routeDistances.groupby('id')['distance_ratio'].apply(
    lambda x: np.mean([val for val in x if not pd.isna(val)])
)

dataset_avg_ratio_train = nearest_stops_train_w_stop_routeDistances['distance_ratio'].mean(skipna=True)

# Define function to fill missing route distances
def fill_missing_route_distances(row, property_avg_ratios, dataset_avg_ratio):
    if pd.isna(row['route_distance/m']):
        property_avg_ratio = property_avg_ratios.get(row['id'], np.nan)
        if not pd.isna(property_avg_ratio):
            return row['nearest_stop_distance'] * property_avg_ratio
        else:
            return row['nearest_stop_distance'] * dataset_avg_ratio
    return row['route_distance/m']

# Apply the function to fill missing route distances in both train and test datasets
# For train
nearest_stops_train_w_stop_routeDistances['route_distance/m'] = nearest_stops_train_w_stop_routeDistances.apply(
    fill_missing_route_distances, axis=1, 
    property_avg_ratios=property_avg_ratios_train, 
    dataset_avg_ratio=dataset_avg_ratio_train
)

# For test (using train-derived averages)
nearest_stops_test_w_stop_routeDistances['route_distance/m'] = nearest_stops_test_w_stop_routeDistances.apply(
    fill_missing_route_distances, axis=1, 
    property_avg_ratios=property_avg_ratios_train,  # Use property ratios from train
    dataset_avg_ratio=dataset_avg_ratio_train  # Use dataset ratio from train
)

nearest_stops_train_w_stop_routeDistances = nearest_stops_train_w_stop_routeDistances.drop(columns=['distance_ratio'])
nearest_stops_test_w_stop_routeDistances = nearest_stops_test_w_stop_routeDistances.drop(columns=['distance_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_stops_train_w_stop_routeDistances['distance_ratio'] = nearest_stops_train_w_stop_routeDistances.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_stops_test_w_stop_routeDistances['distance_ratio'] = nearest_stops_test_w_stop_routeDistances.apply(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

#### Caculating Transport Index

In [89]:
# OneHotEncode the transport-related categorical features
transport_related_categorical = ['nearest_stop_has_route_across_cbd', 'transport_type']

# Initialize the OneHotEncoder
transport_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the train dataset
transport_encoded_train = transport_encoder.fit_transform(nearest_stops_train_w_stop_routeDistances[transport_related_categorical])

# Transform the test dataset using the encoder fitted on the train dataset
transport_encoded_test = transport_encoder.transform(nearest_stops_test_w_stop_routeDistances[transport_related_categorical])

# Get the encoded feature names
encoded_transport_feature_names = transport_encoder.get_feature_names_out(transport_related_categorical)

# Create DataFrames for train and test sets with the encoded features
# Reset index to ensure alignment when merging the columns back
nearest_stops_train_w_stop_routeDistances = nearest_stops_train_w_stop_routeDistances.reset_index(drop=True)
nearest_stops_test_w_stop_routeDistances = nearest_stops_test_w_stop_routeDistances.reset_index(drop=True)

transport_df_encoded_train = pd.DataFrame(transport_encoded_train, columns=encoded_transport_feature_names)
transport_df_encoded_train['id'] = nearest_stops_train_w_stop_routeDistances['id']
transport_df_encoded_train['rent'] = nearest_stops_train_w_stop_routeDistances['rent']
transport_df_encoded_train['route_distance/m'] = nearest_stops_train_w_stop_routeDistances['route_distance/m']

transport_df_encoded_test = pd.DataFrame(transport_encoded_test, columns=encoded_transport_feature_names)
transport_df_encoded_test['id'] = nearest_stops_test_w_stop_routeDistances['id']
transport_df_encoded_test['rent'] = nearest_stops_test_w_stop_routeDistances['rent']
transport_df_encoded_test['route_distance/m'] = nearest_stops_test_w_stop_routeDistances['route_distance/m']

# Cross-validation on the train set
transport_related_v2 = list(encoded_transport_feature_names)

# X and y for the training set
X_transport_train = transport_df_encoded_train[transport_related_v2]
y_transport_train = transport_df_encoded_train['rent']

# Define cross-validation
cv_transport = KFold(n_splits=5, shuffle=True, random_state=42)
rf_model_transport = RandomForestRegressor(n_estimators=100, random_state=42)

# Perform cross-validated predictions on the training set
transport_score_train = cross_val_predict(rf_model_transport, X_transport_train, y_transport_train, cv=cv_transport)

# Step 10: Fit the model on the full training set and predict on the test set
rf_model_transport.fit(X_transport_train, y_transport_train)

# X for the test set
X_transport_test = transport_df_encoded_test[transport_related_v2]

# Predict rent for the test set using the trained model
transport_score_test = rf_model_transport.predict(X_transport_test)

# Step 11: Assign the scores to the respective train and test DataFrames
transport_df_encoded_train['transport_score'] = transport_score_train
transport_df_encoded_test['transport_score'] = transport_score_test

# Normalize the transport scores and distances
scaler_transport = MinMaxScaler()

# Normalize transport scores for the train set
transport_score_reshaped_train = transport_df_encoded_train['transport_score'].values.reshape(-1, 1)
transport_score_normalized_train = scaler_transport.fit_transform(transport_score_reshaped_train)
transport_df_encoded_train['transport_score'] = transport_score_normalized_train

# Normalize transport scores for the test set using the same scaler
transport_score_reshaped_test = transport_df_encoded_test['transport_score'].values.reshape(-1, 1)
transport_score_normalized_test = scaler_transport.transform(transport_score_reshaped_test)
transport_df_encoded_test['transport_score'] = transport_score_normalized_test

# Step 13: Normalize reverse distances for both train and test sets
distance_scaler_transport = MinMaxScaler()

# Train set
transport_df_encoded_train['reverse_distance/m'] = 1 - distance_scaler_transport.fit_transform(
    transport_df_encoded_train[['route_distance/m']]
)

# Test set using the scaler fitted on train
transport_df_encoded_test['reverse_distance/m'] = 1 - distance_scaler_transport.transform(
    transport_df_encoded_test[['route_distance/m']]
)

# Step 14: Calculate the transportation index for both train and test sets
transport_df_encoded_train['transport_index_equally_important_score'] = (
    0.5 * transport_df_encoded_train['transport_score'] + 0.5 * transport_df_encoded_train['reverse_distance/m']
)

transport_df_encoded_test['transport_index_equally_important_score'] = (
    0.5 * transport_df_encoded_test['transport_score'] + 0.5 * transport_df_encoded_test['reverse_distance/m']
)

# Normalize the final transportation index
scaler = MinMaxScaler()

# Normalize for train set
transport_df_encoded_train['transport_index_equally_important_score'] = scaler.fit_transform(
    transport_df_encoded_train[['transport_index_equally_important_score']]
)

# Normalize for test set using the scaler fitted on train
transport_df_encoded_test['transport_index_equally_important_score'] = scaler.transform(
    transport_df_encoded_test[['transport_index_equally_important_score']]
)

# Compute the final transport index for both train and test sets
def compute_transport_index(group):
    # Sort by transport index score and select the top 5 nearest stops
    nearest_stops = group.sort_values(by='transport_index_equally_important_score', ascending=False).head(5)
    # Sum the transport scores for the top 5 nearest stops
    transport_index = np.sum(nearest_stops['transport_index_equally_important_score'])
    return transport_index

# Apply the function for train and test sets
transport_index_train = transport_df_encoded_train.groupby('id').apply(lambda group: compute_transport_index(group))
transport_index_test = transport_df_encoded_test.groupby('id').apply(lambda group: compute_transport_index(group))

# Assign the transportation index back to the DataFrame
transport_df_encoded_train['transport_index'] = transport_df_encoded_train['id'].map(transport_index_train)
transport_df_encoded_test['transport_index'] = transport_df_encoded_test['id'].map(transport_index_test)

In [90]:
# Step 1: Ensure transport index has the correct name for merging
transport_index_train.name = 'transportIndex'
transport_index_test.name = 'transportIndex'

# Step 2: Merge the transportation index with the existing rental_df_w_education_index_train DataFrame
rental_df_train_w_transportIndex = pd.merge(
    rental_df_train_nomiss,  # the dataset that already contains the education index
    transport_index_train,  # the calculated transport index for the train set
    left_on='id',  # merging on 'id'
    right_index=True,  # the transportation index has 'id' as the index
    how='inner'  # use 'inner' to keep only the records that exist in both datasets
)

# Step 3: Merge the transportation index with the existing rental_df_w_education_index_test DataFrame
rental_df_test_w_transportIndex = pd.merge(
    rental_df_test_nomiss,  # the dataset that already contains the education index
    transport_index_test,  # the calculated transport index for the test set
    left_on='id',  # merging on 'id'
    right_index=True,  # the transportation index has 'id' as the index
    how='inner'  # use 'inner' to keep only the records that exist in both datasets
)

- Life Index

#### Distance Caculation

In [43]:
shapefile_path = "../../data/external/VPA_Open_Space-shp"

open_space_df = gpd.read_file(shapefile_path)

rental_df_w_geometry = pd.read_csv('../../data/distance/rental_df_w_geometry.csv')

In [44]:
# List of relevant features for open space data
relevant_features = [
    'FID',               # Unique identifier
    'OS_CATEGOR',        # Category of open space      
    'OS_STATUS',         # Status of the open space (existing, proposed)
    'OS_ACCESS',         # Accessibility information
    'MANAGER_TY',        # Management type of the open space
    'OS_TYPE',           # Specific type of open space
    'SHAPE_Area'         # Area of the open space
]

# Print unique values for categorical features in order
for feature in relevant_features:
    if feature not in ['FID', 'SHAPE_Area']:  # Exclude identifier and area feature
        unique_values = open_space_df[feature].unique()
        print(f"Unique values in '{feature}': {sorted(unique_values)}")

shape_area_range = (open_space_df['SHAPE_Area'].min(), open_space_df['SHAPE_Area'].max())
print(f"Range of 'SHAPE_Area': {shape_area_range}")

Unique values in 'OS_CATEGOR': ['Cemeteries', 'Civic squares and promenades', 'Conservation reserves', 'Government schools', 'Natural and semi-natural open space', 'Non-government schools', 'Parks and gardens', 'Public housing reserves', 'Recreation corridor', 'Services and utilities reserves', 'Sportsfields and organised recreation', 'Tertiary institutions', 'Transport reservations']
Unique values in 'OS_STATUS': ['Existing', 'Planned']
Unique values in 'OS_ACCESS': ['Closed', 'Highly Limited', 'Limited', 'Open']
Unique values in 'MANAGER_TY': ['Committee - local government', 'Committee - other', 'Crown', 'Local government', 'NO DATA', 'Private', 'Public authority', 'State Government']
Unique values in 'OS_TYPE': ['Private open space', 'Public open space', 'Restricted public land']
Range of 'SHAPE_Area': (0.0, 0.027314988415512)


In [45]:
# Replace "NO DATA" with NaN in 'MANAGER_TY'
open_space_df['MANAGER_TY'] = open_space_df['MANAGER_TY'].replace('NO DATA', pd.NA)

# Fill missing values in 'MANAGER_TY' with the mode
open_space_df['MANAGER_TY'].fillna(open_space_df['MANAGER_TY'].mode()[0], inplace=True)

# Calculate the mean of 'SHAPE_Area' where the area is greater than 0
mean_shape_area = open_space_df.loc[open_space_df['SHAPE_Area'] > 0, 'SHAPE_Area'].mean()

# Replace 0.0 values in 'SHAPE_Area' with the mean value
open_space_df['SHAPE_Area'] = open_space_df['SHAPE_Area'].replace(0.0, mean_shape_area)

# Convert the DataFrame to EPSG:3857 coordinate reference system for geospatial analysis
open_spaces_gdf = open_space_df.to_crs(epsg=3857)

In [46]:
rental_gdf = rental_df_w_geometry.copy()

# Create the geometry column from latitude and longitude
rental_gdf['geometry'] = rental_gdf.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)
rental = gpd.GeoDataFrame(rental_gdf, geometry='geometry', crs="EPSG:4326").to_crs(epsg=3857)

In [None]:
def find_nearest_open_spaces(property_gdf, open_spaces_gdf, n=5):
    expanded_data = []
    
    # For each property, calculate the distance to all open spaces at once (vectorized)
    for idx, property_row in property_gdf.iterrows():
        property_geom = property_row['geometry']
        
        # Calculate distances to all open spaces in one go
        open_spaces_gdf['distance'] = open_spaces_gdf['geometry'].distance(property_geom)
        
        # Filter out invalid distances (NaN or infinity)
        valid_open_spaces = open_spaces_gdf[open_spaces_gdf['distance'].notna() & (open_spaces_gdf['distance'] != float('inf'))]
        
        # Sort the open spaces by distance and select the top n nearest ones
        nearest_open_spaces = valid_open_spaces.nsmallest(n, 'distance')
        
        # Append nearest open spaces to the results or fill with None if fewer than n are found
        for i in range(n):
            if i < len(nearest_open_spaces):
                open_space_row = nearest_open_spaces.iloc[i]
                expanded_data.append({
                    'property_id': property_row['id'],
                    'rent': property_row['rent'],
                    'nearest_open_space_id': open_space_row.get('FID', None),
                    'nearest_open_space_distance': open_space_row['distance'],
                    'OS_CATEGOR': open_space_row['OS_CATEGOR'],
                    'OS_STATUS': open_space_row['OS_STATUS'],
                    'OS_ACCESS': open_space_row['OS_ACCESS'],
                    'MANAGER_TY': open_space_row['MANAGER_TY'],
                    'OS_TYPE': open_space_row['OS_TYPE'],
                    'SHAPE_Area': open_space_row['SHAPE_Area']
                })
            else:
                # If fewer than n open spaces, append None values
                expanded_data.append({
                    'property_id': property_row['id'],
                    'rent': property_row['rent'],
                    'nearest_open_space_id': None,
                    'nearest_open_space_distance': None,
                    'OS_CATEGOR': None,
                    'OS_STATUS': None,
                    'OS_ACCESS': None,
                    'MANAGER_TY': None,
                    'OS_TYPE': None,
                    'SHAPE_Area': None
                })
    
    return expanded_data

# Call the function with the rental and open spaces GeoDataFrames
nearest_open_spaces_data = find_nearest_open_spaces(rental, open_spaces_gdf)

# Convert the result to a DataFrame
nearest_open_spaces_df = pd.DataFrame(nearest_open_spaces_data)

# Handle missing distances by calculating property-level average distance
property_avg_distances = nearest_open_spaces_df.groupby('property_id')['nearest_open_space_distance'].mean()

# Fill missing distances with the average distance for the respective property
nearest_open_spaces_df['nearest_open_space_distance'].fillna(
    nearest_open_spaces_df['property_id'].map(property_avg_distances), inplace=True
)

#nearest_open_spaces_df.to_csv('../../data/distance/to_openSpace_distance.csv')

#### Caculating Life Index

In [47]:
nearest_open_spaces_df = pd.read_csv('../../data/distance/to_openSpace_distance.csv')
nearest_open_spaces_df = nearest_open_spaces_df.drop(columns='Unnamed: 0')

In [48]:
# Step 1: Split the dataset by train and test IDs
train_ids = rental_df_train_nomiss['id'].unique()
test_ids = rental_df_test_nomiss['id'].unique()

nearest_open_spaces_train = nearest_open_spaces_df[nearest_open_spaces_df['property_id'].isin(train_ids)]
nearest_open_spaces_test = nearest_open_spaces_df[nearest_open_spaces_df['property_id'].isin(test_ids)]

# Step 2: Fill missing distances with the dataset-wide average distance for train and test sets
dataset_avg_distance_train = nearest_open_spaces_train['nearest_open_space_distance'].mean()
dataset_avg_distance_test = nearest_open_spaces_test['nearest_open_space_distance'].mean()

nearest_open_spaces_train['nearest_open_space_distance'].fillna(dataset_avg_distance_train, inplace=True)
nearest_open_spaces_test['nearest_open_space_distance'].fillna(dataset_avg_distance_test, inplace=True)

# Step 3: One-Hot Encoding for train and test sets
open_space_related_categorical = ['OS_CATEGOR', 'OS_STATUS', 'OS_ACCESS', 'MANAGER_TY', 'OS_TYPE']

# Initialize OneHotEncoder
open_space_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on the train dataset
open_space_encoded_train = open_space_encoder.fit_transform(nearest_open_spaces_train[open_space_related_categorical])

# Transform the test dataset using the encoder fitted on the train dataset
open_space_encoded_test = open_space_encoder.transform(nearest_open_spaces_test[open_space_related_categorical])

# Get encoded feature names
encoded_open_space_feature_names = open_space_encoder.get_feature_names_out(open_space_related_categorical)

# Step 4: Create DataFrames for train and test sets with encoded features
open_space_df_encoded_train = pd.DataFrame(open_space_encoded_train, columns=encoded_open_space_feature_names)
open_space_df_encoded_train['property_id'] = nearest_open_spaces_train['property_id'].values
open_space_df_encoded_train['rent'] = nearest_open_spaces_train['rent'].values
open_space_df_encoded_train['nearest_open_space_distance'] = nearest_open_spaces_train['nearest_open_space_distance'].values

open_space_df_encoded_test = pd.DataFrame(open_space_encoded_test, columns=encoded_open_space_feature_names)
open_space_df_encoded_test['property_id'] = nearest_open_spaces_test['property_id'].values
open_space_df_encoded_test['rent'] = nearest_open_spaces_test['rent'].values
open_space_df_encoded_test['nearest_open_space_distance'] = nearest_open_spaces_test['nearest_open_space_distance'].values

# Step 5: Min-Max Scaling on encoded features and nearest_open_space_distance
scaler = MinMaxScaler()

# Scale the encoded categorical features and distance separately
scaled_features_train = scaler.fit_transform(pd.concat([open_space_df_encoded_train[encoded_open_space_feature_names],
                                                        open_space_df_encoded_train[['nearest_open_space_distance']]], axis=1))

scaled_features_test = scaler.transform(pd.concat([open_space_df_encoded_test[encoded_open_space_feature_names],
                                                   open_space_df_encoded_test[['nearest_open_space_distance']]], axis=1))

# Ensure the scaled features have the correct column names
scaled_feature_names_train = encoded_open_space_feature_names.tolist() + ['nearest_open_space_distance']
scaled_feature_names_test = encoded_open_space_feature_names.tolist() + ['nearest_open_space_distance']

# Update the DataFrames with scaled features
open_space_df_encoded_train[scaled_feature_names_train] = scaled_features_train
open_space_df_encoded_test[scaled_feature_names_test] = scaled_features_test

# Step 6: Define feature matrix (X) and target variable (y) for train and test sets
X_open_space_train = open_space_df_encoded_train[scaled_feature_names_train]
y_open_space_train = open_space_df_encoded_train['rent']

X_open_space_test = open_space_df_encoded_test[scaled_feature_names_test]
y_open_space_test = open_space_df_encoded_test['rent']

# Step 7: Cross-validation with Random Forest on the train set
cv_open_space = KFold(n_splits=5, shuffle=True, random_state=42)
rf_model_open_space = RandomForestRegressor(n_estimators=100, random_state=42)
open_space_score_train = cross_val_predict(rf_model_open_space, X_open_space_train, y_open_space_train, cv=cv_open_space)

# Fit the model on the full train set and predict on the test set
rf_model_open_space.fit(X_open_space_train, y_open_space_train)
open_space_score_test = rf_model_open_space.predict(X_open_space_test)

# Step 8: Normalize Open Space Scores for both train and test sets
scaler_open_space = MinMaxScaler()

# Train set
open_space_score_reshaped_train = open_space_score_train.reshape(-1, 1)
open_space_score_normalized_train = scaler_open_space.fit_transform(open_space_score_reshaped_train)

# Test set
open_space_score_reshaped_test = open_space_score_test.reshape(-1, 1)
open_space_score_normalized_test = scaler_open_space.transform(open_space_score_reshaped_test)

# Assign normalized scores to train and test DataFrames
open_space_df_encoded_train['open_space_score'] = open_space_score_normalized_train
open_space_df_encoded_test['open_space_score'] = open_space_score_normalized_test

# Step 9: Normalize reverse distances for both train and test sets
distance_scaler_open_space = MinMaxScaler()

# Train set
open_space_df_encoded_train['reverse_distance/m'] = 1 - distance_scaler_open_space.fit_transform(
    open_space_df_encoded_train[['nearest_open_space_distance']]
)

# Test set
open_space_df_encoded_test['reverse_distance/m'] = 1 - distance_scaler_open_space.transform(
    open_space_df_encoded_test[['nearest_open_space_distance']]
)

# Step 10: Calculate Open Space Index for both train and test sets
open_space_df_encoded_train['open_space_index_equally_important_score'] = (
    0.5 * open_space_df_encoded_train['open_space_score'] + 0.5 * open_space_df_encoded_train['reverse_distance/m']
)

open_space_df_encoded_test['open_space_index_equally_important_score'] = (
    0.5 * open_space_df_encoded_test['open_space_score'] + 0.5 * open_space_df_encoded_test['reverse_distance/m']
)

# Step 11: Normalize the final Open Space Index for both train and test sets
scaler_open_space_final = MinMaxScaler()

# Train set
open_space_df_encoded_train['open_space_index_equally_important_score'] = scaler_open_space_final.fit_transform(
    open_space_df_encoded_train[['open_space_index_equally_important_score']]
)

# Test set
open_space_df_encoded_test['open_space_index_equally_important_score'] = scaler_open_space_final.transform(
    open_space_df_encoded_test[['open_space_index_equally_important_score']]
)

# Step 12: Compute the final Open Space Index for both train and test sets
def compute_open_space_index(group):
    # Sort by open space index score and select the top 5 nearest spaces
    nearest_open_spaces = group.sort_values(by='open_space_index_equally_important_score', ascending=False).head(5)
    # Sum the open space scores for the top 5 nearest spaces
    open_space_index = np.sum(nearest_open_spaces['open_space_index_equally_important_score'])
    return open_space_index

# Apply the function for train and test sets
open_space_index_train = open_space_df_encoded_train.groupby('property_id').apply(lambda group: compute_open_space_index(group))
open_space_index_test = open_space_df_encoded_test.groupby('property_id').apply(lambda group: compute_open_space_index(group))

# Step 13: Assign the Open Space Index back to the DataFrame
open_space_df_encoded_train['open_space_index'] = open_space_df_encoded_train['property_id'].map(open_space_index_train)
open_space_df_encoded_test['open_space_index'] = open_space_df_encoded_test['property_id'].map(open_space_index_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_open_spaces_train['nearest_open_space_distance'].fillna(dataset_avg_distance_train, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nearest_open_spaces_test['nearest_open_space_distance'].fillna(dataset_avg_distance_test, inplace=True)


In [49]:
# Function to compute the open space index
def compute_open_space_index(group):
    # Sort by reverse distance (closer open spaces first) and select the top 5 nearest spaces
    nearest_open_spaces = group.sort_values(by='open_space_index_equally_important_score', ascending=False).head(5)
    
    # Sum the open space scores for the top 5 nearest open spaces
    open_space_index = np.sum(nearest_open_spaces['open_space_index_equally_important_score'])
    
    return open_space_index

# Apply this function to compute the open space index for the train set
open_space_index_train = open_space_df_encoded_train.groupby('property_id').apply(lambda group: compute_open_space_index(group))

# Apply this function to compute the open space index for the test set
open_space_index_test = open_space_df_encoded_test.groupby('property_id').apply(lambda group: compute_open_space_index(group))

# Ensure that the column has the correct name for merging
open_space_index_train.name = 'lifeIndex'
open_space_index_test.name = 'lifeIndex'

In [50]:
rental_df_w_index_train = pd.merge(
    rental_df_train_w_transportIndex,  # The dataset that already contains the education and transport index
    open_space_index_train.rename('lifeIndex'),  # The open space index, renamed to 'lifeIndex'
    left_on='id',  # Merging on 'id' in the rental dataset
    right_index=True,  # Merging on the index from the open space index (property_id is the index)
    how='inner'  # Perform an inner join to keep only matching records
)

# Step 2: Merge the open space index into the rental_df_w_education_transport_index_test
rental_df_w_index_test = pd.merge(
    rental_df_test_w_transportIndex,  # The dataset that already contains the education and transport index
    open_space_index_test.rename('lifeIndex'),  # The open space index, renamed to 'lifeIndex'
    left_on='id',  # Merging on 'id' in the rental dataset
    right_index=True,  # Merging on the index from the open space index (property_id is the index)
    how='inner'  # Perform an inner join to keep only matching records
)

# Step 3: Drop the 'order' column if it's not needed in both train and test datasets
rental_df_w_index_train.drop(columns='order', inplace=True, errors='ignore')
rental_df_w_index_test.drop(columns='order', inplace=True, errors='ignore')

In [59]:
rental_df_w_index_train = rental_df_w_index_train.drop(columns='address')
rental_df_w_index_test = rental_df_w_index_test.drop(columns='address')

### (Part 1) Feature Engineering on physical characteristics of property

- property type (one hot encoding)

In [74]:
# Step 1: Get the column index of 'propertyType' in both train and test datasets
col_index_train = rental_df_w_index_train.columns.get_loc('propertyType')

# Step 2: Apply get_dummies to 'propertyType' for the train dataset
rental_df_numericType_train = pd.get_dummies(rental_df_w_index_train, columns=['propertyType'], prefix='propertyType')

# Apply get_dummies to 'propertyType' for the test dataset
rental_df_numericType_test = pd.get_dummies(rental_df_w_index_test, columns=['propertyType'], prefix='propertyType')

# Step 3: Align the columns of the test dataset with the train dataset
# Ensure the test set has the same columns as the train set
rental_df_numericType_test = rental_df_numericType_test.reindex(columns=rental_df_numericType_train.columns, fill_value=0)

# Step 4: Reorder the columns in the train dataset to maintain the position of the encoded propertyType
property_type_columns_train = [col for col in rental_df_numericType_train.columns if col.startswith('propertyType_')]
columns_train = rental_df_numericType_train.columns.tolist()

for col in property_type_columns_train:
    columns_train.remove(col)
columns_train[col_index_train:col_index_train] = property_type_columns_train

rental_df_numericType_train = rental_df_numericType_train[columns_train]

# Step 5: Reorder the columns in the test dataset to maintain the position of the encoded propertyType
property_type_columns_test = [col for col in rental_df_numericType_test.columns if col.startswith('propertyType_')]
columns_test = rental_df_numericType_test.columns.tolist()

for col in property_type_columns_test:
    columns_test.remove(col)
columns_test[col_index_train:col_index_train] = property_type_columns_test  # Use col_index_train to align with the train set

rental_df_numericType_test = rental_df_numericType_test[columns_test]

- property feature (frequency encoding)

In [75]:
rental_df_numericFeatures_train = rental_df_numericType_train.copy()
rental_df_numericFeatures_test = rental_df_numericType_test.copy()

# Split the 'propertyFeatures' column in both datasets
rental_df_numericFeatures_train['propertyFeatures'] = rental_df_numericFeatures_train['propertyFeatures'].str.split(',')
rental_df_numericFeatures_test['propertyFeatures'] = rental_df_numericFeatures_test['propertyFeatures'].str.split(',')

# Find all unique features in the train dataset
all_features_train = [feature.strip() for sublist in rental_df_numericFeatures_train['propertyFeatures'] for feature in sublist]

# Create feature counts for train data and calculate log-inverse feature mapping
feature_counts_train = pd.Series(all_features_train).value_counts()
log_inverse_feature_mapping_train = {feature: 1 / np.log(count + 1) for feature, count in feature_counts_train.items()}

# Calculate mean log-inverse score for train data
mean_log_inverse_score_train = pd.Series(list(log_inverse_feature_mapping_train.values())).mean()

# Define a function to encode features based on the log-inverse frequency
def encode_features(features, feature_mapping, mean_score):
    if features == ['unknown']:
        return mean_score  # Use mean inverse score for 'unknown'
    return sum([feature_mapping.get(f.strip(), 0) for f in features])

# Apply encoding to the 'propertyFeatures' column for train dataset
rental_df_numericFeatures_train['propertyFeatures_encoded'] = rental_df_numericFeatures_train['propertyFeatures'].apply(
    lambda features: encode_features(features, log_inverse_feature_mapping_train, mean_log_inverse_score_train)
)

# Apply encoding to the 'propertyFeatures' column for test dataset (using train mapping)
rental_df_numericFeatures_test['propertyFeatures_encoded'] = rental_df_numericFeatures_test['propertyFeatures'].apply(
    lambda features: encode_features(features, log_inverse_feature_mapping_train, mean_log_inverse_score_train)
)

#  Reorder the columns to place 'propertyFeatures_encoded' in the original 'propertyFeatures' position for both train and test datasets
col_index_train = rental_df_numericFeatures_train.columns.get_loc('propertyFeatures')
col_index_test = rental_df_numericFeatures_test.columns.get_loc('propertyFeatures')

columns_train = rental_df_numericFeatures_train.columns.tolist()
columns_train.insert(col_index_train, columns_train.pop(columns_train.index('propertyFeatures_encoded')))

columns_test = rental_df_numericFeatures_test.columns.tolist()
columns_test.insert(col_index_test, columns_test.pop(columns_test.index('propertyFeatures_encoded')))

rental_df_numericFeatures_train = rental_df_numericFeatures_train[columns_train]
rental_df_numericFeatures_test = rental_df_numericFeatures_test[columns_test]

# Step 10: Drop the original 'propertyFeatures' column from both train and test datasets
rental_df_numericFeatures_train = rental_df_numericFeatures_train.drop(columns=['propertyFeatures'])
rental_df_numericFeatures_test = rental_df_numericFeatures_test.drop(columns=['propertyFeatures'])

- description (Doc2Vec embeddings than PCA reduce dimension)

In [76]:
# Step 1: Copy train and test datasets
rental_df_numericDescription_train = rental_df_numericFeatures_train.copy()
rental_df_numericDescription_test = rental_df_numericFeatures_test.copy()

# Step 2: Reset the index to ensure a clean, continuous sequence
rental_df_numericDescription_train = rental_df_numericDescription_train.reset_index(drop=True)
rental_df_numericDescription_test = rental_df_numericDescription_test.reset_index(drop=True)

# Step 3: Ensure that no descriptions are missing or empty in the train and test datasets
rental_df_numericDescription_train = rental_df_numericDescription_train[
    rental_df_numericDescription_train['description'].notna() & 
    (rental_df_numericDescription_train['description'].str.strip() != "")
]
rental_df_numericDescription_test = rental_df_numericDescription_test[
    rental_df_numericDescription_test['description'].notna() & 
    (rental_df_numericDescription_test['description'].str.strip() != "")
]

# Step 4: Prepare tagged data for training the Doc2Vec model using the train dataset
# Use the newly reset DataFrame index as tags to ensure matching tags and indices
tagged_data_train = [
    TaggedDocument(words=desc.split(), tags=[str(i)]) 
    for i, desc in zip(rental_df_numericDescription_train.index, rental_df_numericDescription_train['description'])
]

# Step 5: Train the Doc2Vec model on the train dataset descriptions
doc2vec_model = Doc2Vec(tagged_data_train, vector_size=300, window=5, min_count=1, workers=4, epochs=40)

# Save the model if needed
doc2vec_model.save("../../models/doc2vec_rental_descriptions_train.model")

# Step 6: Retrieve the Doc2Vec embeddings for the train dataset using matching index values
def get_doc2vec_embedding(index):
    try:
        return doc2vec_model.dv.get_vector(str(index))  # Use get_vector instead of get
    except KeyError:
        return np.zeros(doc2vec_model.vector_size)  # Return zero vector for missing keys

# Apply the function to retrieve embeddings for the train dataset
rental_df_numericDescription_train['doc2vec_embeddings'] = rental_df_numericDescription_train.index.to_series().apply(get_doc2vec_embedding)

# Step 7: Infer the embeddings for the test dataset
rental_df_numericDescription_test['doc2vec_embeddings'] = rental_df_numericDescription_test['description'].apply(
    lambda x: doc2vec_model.infer_vector(x.split())  # Infer embeddings for each description in the test dataset
)

# Step 8: Combine embeddings into arrays for PCA transformation
embeddings_train = np.vstack(rental_df_numericDescription_train['doc2vec_embeddings'].values)
embeddings_test = np.vstack(rental_df_numericDescription_test['doc2vec_embeddings'].values)

In [77]:
# Step 9: Apply PCA to reduce the dimensionality of embeddings (fit on train, transform test)
pca = PCA(n_components=10)

# PCA on train data
reduced_embeddings_train = pca.fit_transform(embeddings_train)

# PCA on test data (using the same PCA model trained on train data)
reduced_embeddings_test = pca.transform(embeddings_test)

# Step 10: Convert reduced embeddings into DataFrames
reduced_embeddings_train_df = pd.DataFrame(reduced_embeddings_train, columns=[f'doc2vec_embedding_{i+1}' for i in range(reduced_embeddings_train.shape[1])])
reduced_embeddings_test_df = pd.DataFrame(reduced_embeddings_test, columns=[f'doc2vec_embedding_{i+1}' for i in range(reduced_embeddings_test.shape[1])])

# Step 11: Find the position of the 'description' column
col_index_train = rental_df_numericDescription_train.columns.get_loc('description')
col_index_test = rental_df_numericDescription_test.columns.get_loc('description')

# Step 12: Insert reduced embeddings right after the original 'description' column
for i, col in enumerate(reduced_embeddings_train_df.columns):
    rental_df_numericDescription_train.insert(col_index_train + 1 + i, col, reduced_embeddings_train_df[col])

for i, col in enumerate(reduced_embeddings_test_df.columns):
    rental_df_numericDescription_test.insert(col_index_test + 1 + i, col, reduced_embeddings_test_df[col])

# Step 13: Drop the original 'description' column from both datasets
rental_df_numericDescription_train = rental_df_numericDescription_train.drop(columns=['description'])
rental_df_numericDescription_test = rental_df_numericDescription_test.drop(columns=['description'])

# Step 14: Drop the 'doc2vec_embeddings' column if present in both datasets
if 'doc2vec_embeddings' in rental_df_numericDescription_train.columns:
    rental_df_numericDescription_train = rental_df_numericDescription_train.drop(columns=['doc2vec_embeddings'])
if 'doc2vec_embeddings' in rental_df_numericDescription_test.columns:
    rental_df_numericDescription_test = rental_df_numericDescription_test.drop(columns=['doc2vec_embeddings'])

In [78]:
rental_df_numericDescription_train.to_csv('../../data/curated/rental_df_numericDescription_train.csv')
rental_df_numericDescription_test.to_csv('../../data/curated/rental_df_numericDescription_test.csv')

In [81]:
rental_df_numericDescription_test


Unnamed: 0,id,rent,suburb_encoded,bedroom,bathroom,parking,propertyType_Apartment,propertyType_Block of Units,propertyType_Duplex,propertyType_House,...,doc2vec_embedding_4,doc2vec_embedding_5,doc2vec_embedding_6,doc2vec_embedding_7,doc2vec_embedding_8,doc2vec_embedding_9,doc2vec_embedding_10,educationIndex,transportIndex,lifeIndex
0,17205306,550.000000,562.156107,3.0,2,3.0,0,0,0,1,...,0.836285,0.036708,-1.523336,0.145149,0.608593,-0.013410,-0.449576,3.336283,3.632117,3.724052
1,17198219,470.000000,481.428571,4.0,2,2.0,0,0,0,1,...,-0.319007,-0.185621,-0.046673,-0.164099,-0.342121,0.692183,0.000414,3.336073,2.803054,2.730468
2,17192805,400.000000,415.384615,3.0,1,3.0,0,0,0,1,...,0.311565,0.183104,0.645433,0.586536,0.745326,-0.234981,-1.923859,3.316745,2.946476,3.063298
3,17186367,601.020209,613.008533,4.0,4,2.0,0,0,0,1,...,0.462641,-0.007321,-0.536598,0.301277,-0.902802,0.061945,0.174813,3.753792,3.842829,3.758799
4,17183641,470.000000,496.333333,3.0,1,1.0,0,0,0,1,...,0.602301,0.116313,-0.873118,-0.532432,-0.095673,-0.477763,-0.348878,3.540573,2.889705,3.536137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475,17074068,800.000000,713.000000,3.0,2,2.0,0,0,0,0,...,0.311331,-0.441705,0.215627,-0.055897,-0.562288,-0.073010,-0.582896,3.679646,3.621857,3.774974
1476,17203930,430.000000,492.500000,3.0,2,2.0,0,0,0,0,...,1.331228,-0.631301,0.657347,0.227995,-0.444900,-1.123487,0.384566,3.221527,0.695847,0.636994
1477,17199166,950.000000,641.190476,4.0,3,1.0,0,0,0,0,...,0.221875,-0.951697,-1.025178,-0.192776,0.327907,-0.679706,-0.596598,3.544625,3.068617,3.619969
1478,17198474,450.000000,550.000000,3.0,2,2.0,0,0,0,0,...,0.697269,0.423407,-0.712353,0.186623,-1.073208,-0.209014,0.942093,1.262079,2.757537,3.702783
