In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [25]:
# Input files: the raw test set, and the already-preprocessed training set
# whose label / average-rent columns are reused further down.
TEST_CSV_PATH = 'core-data/test.csv'
TRAIN_CLEANED_CSV_PATH = 'train-data-for-test.csv'

test_data = pd.read_csv(TEST_CSV_PATH)
train_data_cleaned = pd.read_csv(TRAIN_CLEANED_CSV_PATH)

In [26]:
# Preview the first rows of the preprocessed training data
train_data_cleaned.head()

Unnamed: 0.1,Unnamed: 0,flat_model,floor_area_sqm,latitude,longitude,subzone,planning_area,region,monthly_rent,rent_approved_year,...,monthly_avg_rent_by_planning_area,monthly_avg_rent_by_flat_model,monthly_avg_rent_per_sqm_by_region,monthly_avg_rent_per_sqm_by_subzone,monthly_avg_rent_per_sqm_by_planning_area,monthly_avg_rent_per_sqm_by_flat_model,region_label,subzone_label,planning_area_label,flat_model_label
0,0,new generation,67.0,1.344518,103.73863,yuhua east,jurong east,west region,1600,2021,...,2595.146199,2369.965462,27.233545,29.73822,27.915301,31.060362,3,92,16,18
1,1,new generation,92.0,1.330186,103.938717,bedok north,bedok,east region,2250,2022,...,2438.227223,2369.965462,27.057755,30.937463,29.456489,31.060362,2,136,27,18
2,2,improved,67.0,1.332242,103.845643,toa payoh central,toa payoh,central region,1900,2022,...,2516.680515,2636.211052,33.891545,33.034791,32.040421,27.884012,1,36,20,13
3,3,apartment,149.0,1.370239,103.962894,pasir ris drive,pasir ris,east region,2850,2021,...,2686.857477,2878.725962,27.057755,21.639655,22.531163,20.31339,2,66,9,11
4,4,improved,68.0,1.320502,103.863341,bendemeer,kallang,central region,2100,2022,...,2702.635659,2636.211052,33.891545,34.378542,33.885155,27.884012,1,37,8,13


In [27]:
# Preview the first rows of the raw test data
test_data.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2023-01,hougang,245,hougang street 22,5-room,improved,121.0,yes,1984,1.358411,103.891722,0.0,lorong ah soo,hougang,north-east region
1,2022-09,sembawang,316,sembawang vista,4-room,model a,100.0,yes,1999,1.446343,103.820817,0.0,sembawang central,sembawang,north region
2,2023-07,clementi,708,Clementi West Street 2,4-room,new generation,91.0,yes,1980,1.305719,103.762168,0.0,clementi west,clementi,west region
3,2021-08,jurong east,351,Jurong East Street 31,3 room,model a,74.0,yes,1986,1.344832,103.730778,0.0,yuhua west,jurong east,west region
4,2022-03,jurong east,305,jurong east street 32,5-room,improved,121.0,yes,1983,1.345437,103.735241,0.0,yuhua west,jurong east,west region


In [28]:
# Lower-case the street names so that mixed-case and lower-case spellings of
# the same street are unified
street_name_lower = test_data['street_name'].str.lower()
test_data['street_name'] = street_name_lower

In [29]:
# Derive the same features that were created in the training-data preprocessing
# (per the original note): approval year / month and remaining lease years.

# 'YYYY-MM' -> two string columns (year, month)
approval_parts = test_data['rent_approval_date'].str.split('-', expand=True)
test_data[['rent_approved_year', 'rent_approved_month']] = approval_parts

# Years elapsed between lease commencement and rent approval.
# 99 is presumably the total lease length in years -- TODO confirm.
approval_year = test_data['rent_approved_year'].astype(int)
years_since_lease_start = approval_year - test_data['lease_commence_date']
remaining_year = 99 - years_since_lease_start
test_data['remaining_year'] = remaining_year

# The month is stored as an integer; the year column is still a string here
month_as_int = test_data['rent_approved_month'].astype(int)
test_data['rent_approved_month'] = month_as_int

# Unify the two spellings of flat type ('3 room' vs '3-room')
flat_type_hyphenated = test_data['flat_type'].str.replace(' ', '-')
test_data['flat_type'] = flat_type_hyphenated

In [30]:
# Remove the columns that are not used as features, mirroring the training-data
# preprocessing (per the original note). All of them are dropped in one call.
columns_to_drop = [
    "furnished",
    "elevation",
    "town",
    "street_name",
    "rent_approval_date",
    "block",
    "flat_type",
    "lease_commence_date",
]
test_data = test_data.drop(columns=columns_to_drop)

In [31]:
# Encode the categorical columns of the test set with the integer labels that
# were assigned to the training data, so both sets share one labeling.

# Category -> label dictionaries built from the training data. A category that
# occurs on many rows keeps the label found on its last row.
region_to_label_mapping = train_data_cleaned.set_index('region')['region_label'].to_dict()
subzone_to_label_mapping = train_data_cleaned.set_index('subzone')['subzone_label'].to_dict()
planning_area_to_label_mapping = train_data_cleaned.set_index('planning_area')['planning_area_label'].to_dict()
flat_model_to_label_mapping = train_data_cleaned.set_index('flat_model')['flat_model_label'].to_dict()

label_mappings = {
    'region': region_to_label_mapping,
    'subzone': subzone_to_label_mapping,
    'planning_area': planning_area_to_label_mapping,
    'flat_model': flat_model_to_label_mapping,
}

# Series.map turns every value that is missing from the dictionary into NaN
# without complaint. That happens for categories the training data never
# contained, and also when this cell is run a second time (the columns then
# hold labels instead of names). Encode into temporaries first and fail loudly
# rather than writing NaN labels into test_data.
encoded_columns = {}
for column, mapping in label_mappings.items():
    encoded = test_data[column].map(mapping)
    unmapped = test_data.loc[test_data[column].notna() & encoded.isna(), column]
    if not unmapped.empty:
        raise ValueError(
            f"{len(unmapped)} value(s) in test_data['{column}'] have no label in the "
            f"training data, e.g. {unmapped.unique()[:5].tolist()}"
        )
    encoded_columns[column] = encoded

# Overwrite the columns only after every one of them encoded cleanly
for column, encoded in encoded_columns.items():
    test_data[column] = encoded

# Display the first rows to verify the applied labels
test_data.head()

Unnamed: 0,flat_model,floor_area_sqm,latitude,longitude,subzone,planning_area,region,rent_approved_year,rent_approved_month,remaining_year
0,13,121.0,1.358411,103.891722,123,21,4,2023,1,60
1,14,100.0,1.446343,103.820817,78,19,5,2022,9,76
2,18,91.0,1.305719,103.762168,130,11,3,2023,7,56
3,14,74.0,1.344832,103.730778,128,16,3,2021,8,64
4,13,121.0,1.345437,103.735241,128,16,3,2022,3,60


In [32]:
# Copy the aggregate rent features of the training data onto the test data so
# that both sets expose the same columns: average rent and average rent per
# square metre by region / subzone / planning area / flat model, plus the
# average rent per approval year.


def _train_lookup(key_column, value_column):
    """Return a {key: value} dict built from two columns of train_data_cleaned."""
    return train_data_cleaned.set_index(key_column)[value_column].to_dict()


# label -> average monthly rent
avg_rent_by_region_dict = _train_lookup('region_label', 'monthly_avg_rent_by_region')
avg_rent_by_subzone_dict = _train_lookup('subzone_label', 'monthly_avg_rent_by_subzone')
avg_rent_by_flat_model_dict = _train_lookup('flat_model_label', 'monthly_avg_rent_by_flat_model')
avg_rent_by_planning_area_dict = _train_lookup('planning_area_label', 'monthly_avg_rent_by_planning_area')

# label -> average monthly rent per square metre
avg_rent_per_sqm_by_region_dict = _train_lookup('region_label', 'monthly_avg_rent_per_sqm_by_region')
avg_rent_per_sqm_by_subzone_dict = _train_lookup('subzone_label', 'monthly_avg_rent_per_sqm_by_subzone')
avg_rent_per_sqm_by_flat_model_dict = _train_lookup('flat_model_label', 'monthly_avg_rent_per_sqm_by_flat_model')
avg_rent_per_sqm_by_planning_area_dict = _train_lookup('planning_area_label', 'monthly_avg_rent_per_sqm_by_planning_area')

# The year must be an integer in both frames so the year lookup keys line up
train_data_cleaned['rent_approved_year'] = train_data_cleaned['rent_approved_year'].astype(int)
test_data['rent_approved_year'] = test_data['rent_approved_year'].astype(int)
yearly_avg_rent = _train_lookup('rent_approved_year', 'yearly_avg_rent')
test_data['yearly_avg_rent'] = test_data['rent_approved_year'].map(yearly_avg_rent)

# (new test column, encoded key column, lookup) -- the list order is the order
# in which the columns are appended to test_data
feature_sources = [
    ('monthly_avg_rent_by_flat_model', 'flat_model', avg_rent_by_flat_model_dict),
    ('monthly_avg_rent_by_subzone', 'subzone', avg_rent_by_subzone_dict),
    ('monthly_avg_rent_by_planning_area', 'planning_area', avg_rent_by_planning_area_dict),
    ('monthly_avg_rent_by_region', 'region', avg_rent_by_region_dict),
    ('monthly_avg_rent_per_sqm_by_flat_model', 'flat_model', avg_rent_per_sqm_by_flat_model_dict),
    ('monthly_avg_rent_per_sqm_by_subzone', 'subzone', avg_rent_per_sqm_by_subzone_dict),
    ('monthly_avg_rent_per_sqm_by_planning_area', 'planning_area', avg_rent_per_sqm_by_planning_area_dict),
    ('monthly_avg_rent_per_sqm_by_region', 'region', avg_rent_per_sqm_by_region_dict),
]
for feature_name, key_column, lookup in feature_sources:
    test_data[feature_name] = test_data[key_column].map(lookup)

test_data.head()

Unnamed: 0,flat_model,floor_area_sqm,latitude,longitude,subzone,planning_area,region,rent_approved_year,rent_approved_month,remaining_year,yearly_avg_rent,monthly_avg_rent_by_flat_model,monthly_avg_rent_by_subzone,monthly_avg_rent_by_planning_area,monthly_avg_rent_by_region,monthly_avg_rent_per_sqm_by_flat_model,monthly_avg_rent_per_sqm_by_subzone,monthly_avg_rent_per_sqm_by_planning_area,monthly_avg_rent_per_sqm_by_region
0,13,121.0,1.358411,103.891722,123,21,4,2023,1,60,3158.694858,2636.211052,2427.604167,2503.252886,2558.82271,27.884012,27.108615,26.405385,27.838756
1,14,100.0,1.446343,103.820817,78,19,5,2022,9,76,2651.014066,2612.031305,2592.33279,2540.49101,2450.623806,28.536668,24.489129,24.87347,25.480723
2,18,91.0,1.305719,103.762168,130,11,3,2023,7,56,3158.694858,2369.965462,2395.588235,2646.808979,2569.167537,31.060362,31.618975,33.177556,27.233545
3,14,74.0,1.344832,103.730778,128,16,3,2021,8,64,2225.773817,2612.031305,2400.15015,2595.146199,2569.167537,28.536668,26.911587,27.915301,27.233545
4,13,121.0,1.345437,103.735241,128,16,3,2022,3,60,2651.014066,2636.211052,2400.15015,2595.146199,2569.167537,27.884012,26.911587,27.915301,27.233545


In [33]:
# Report the number of rows in the prepared test set, then save it.
# index=True is the to_csv default: the row index is written as the first column.
print(test_data.shape[0])
test_data.to_csv('test-data-cleaned.csv', index=True)

30000
