In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics.pairwise import euclidean_distances

plt.style.use('fivethirtyeight')

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the raw test listings and preview the first rows.
TEST_CSV = './data/test.csv'
data = pd.read_csv(TEST_CSV)
data.head()

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area
0,777912,1 bed condo for sale in the gazania,17 how sun drive,the gazania,condo,freehold,2022.0,1.0,1.0,463,,unfurnished,"studio, 1, 2, 3, 4, 5 br",250.0,https://www.99.co/singapore/condos-apartments/...,1.344334,103.87869,0,upper paya lebar,serangoon
1,936612,3 bed condo for sale in vue 8 residence,95 pasir ris heights,vue 8 residence,Condo,99-year leasehold,2017.0,3.0,3.0,1033,high,unspecified,"studio, 1, 2, 3, 4, 5 br",463.0,https://www.99.co/singapore/condos-apartments/...,1.380281,103.943878,0,pasir ris west,pasir ris
2,995264,1 bed condo for sale in icon,10 gopeng street,icon,condo,99-year leasehold,2007.0,1.0,1.0,570,,fully,"studio, 1, 2, 3 br",646.0,https://www.99.co/singapore/condos-apartments/...,1.294668,103.850074,0,bras basah,museum
3,477435,hdb flat for sale in 812b choa chu kang avenue 7,bukit batok / bukit panjang / choa chu kang (d23),keat hong colours,Hdb,99-year leasehold,2017.0,3.0,2.0,1216,,unspecified,"1, 2, 3, 4, 5 br",968.0,https://www.99.co/singapore/hdb/keat-hong-colo...,1.37312,103.746094,0,keat hong,choa chu kang
4,222529,hdb flat for sale in 204 toa payoh north,balestier / toa payoh (d12),toa payoh spring,hdb 4 rooms,99-year leasehold,1973.0,3.0,2.0,936,,unspecified,"1, 2, 3, 4 br",,https://www.99.co/singapore/hdb/toa-payoh-spri...,1.341468,103.849047,0,braddell,toa payoh


In [3]:
# Row count plus the number of missing values per column, before any cleaning.
print(f'Before cleaning, there are {len(data)} records.')
data.isna().sum()

Before cleaning, there are 6966 records.


listing_id                 0
title                      0
address                    2
property_name              0
property_type              0
tenure                   637
built_year               358
num_beds                  35
num_baths                149
size_sqft                  0
floor_level             5810
furnishing                 0
available_unit_types     520
total_num_units         1900
property_details_url       0
lat                        0
lng                        0
elevation                  0
subzone                   33
planning_area             33
dtype: int64

In [4]:
# Columns removed before feature engineering.
DROPPED_COLUMNS = ['elevation', 'floor_level', 'listing_id', 'address', 'property_name', 'title',
                   'property_details_url', 'total_num_units']
# Numeric columns whose gaps are filled with the column mean.
MEAN_FILLED_COLUMNS = ['built_year', 'num_beds', 'num_baths']
# Categorical columns whose gaps are filled with the most frequent value.
MODE_FILLED_COLUMNS = ['available_unit_types', 'tenure']

data = data.drop(columns=DROPPED_COLUMNS)

column_means = data[MEAN_FILLED_COLUMNS].mean()
data[MEAN_FILLED_COLUMNS] = data[MEAN_FILLED_COLUMNS].fillna(column_means)

for column in MODE_FILLED_COLUMNS:
    # mode() returns the modes in sorted order; [0] picks the first on ties.
    most_frequent = data[column].mode()[0]
    data[column] = data[column].fillna(most_frequent)

In [5]:
# Fill missing `planning_area` / `subzone` values with the most common value
# among the geographically nearest listings that already have one.
#
# Differences from the previous implementation:
#   * distances are computed only from the rows being imputed, giving an
#     (n_missing x n_rows) matrix instead of the full pairwise one;
#   * rows are addressed by position, so a non-default index cannot misalign
#     the neighbour lookup;
#   * only values known before imputation may vote, so the result does not
#     depend on the order in which rows are processed;
#   * each column is imputed for its own missing rows, and existing values are
#     never overwritten;
#   * if none of the closest listings has a value, the search widens instead
#     of failing with an IndexError.
NEIGHBOUR_COUNT = 50  # number of nearest listings that vote on the imputed value

coords = data[['lat', 'lng']].to_numpy()
for column in ('planning_area', 'subzone'):
    is_missing = data[column].isna().to_numpy()
    if not is_missing.any():
        continue
    if is_missing.all():
        raise ValueError(f'cannot impute {column!r}: every row is missing a value')

    column_pos = data.columns.get_loc(column)
    known_values = data[column].copy()  # snapshot: imputed rows never vote
    neighbour_order = euclidean_distances(coords[is_missing], coords).argsort(axis=1)

    for row_pos, order in zip(np.flatnonzero(is_missing), neighbour_order):
        nearest = order[:NEIGHBOUR_COUNT]
        voters = nearest[~is_missing[nearest]]
        if voters.size == 0:
            # Every close neighbour is missing too: fall back to the nearest
            # listings that do have a value.
            voters = order[~is_missing[order]][:NEIGHBOUR_COUNT]
        # mode() is sorted, so ties resolve to the first value in sort order.
        data.iloc[row_pos, column_pos] = known_values.iloc[voters].mode()[0]

In [6]:
# Row count plus remaining missing values per column, after imputation.
print(f'After cleaning, there are now {len(data)} records.')
data.isna().sum()

After cleaning, there are now 6966 records.


property_type           0
tenure                  0
built_year              0
num_beds                0
num_baths               0
size_sqft               0
furnishing              0
available_unit_types    0
lat                     0
lng                     0
subzone                 0
planning_area           0
dtype: int64

In [7]:
normalizer = preprocessing.MinMaxScaler()


def normLatLng(X):
    """Min-max scale the ``lat`` and ``lng`` columns of ``X`` in place.

    The shared module-level ``normalizer`` is re-fitted on each column
    separately, so every column is scaled against its own minimum and maximum
    within ``X``. The scaled values are therefore not comparable across
    different frames, nor with unscaled coordinates.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``lat`` and ``lng`` columns; both are overwritten.

    Returns
    -------
    None
    """
    for axis_name in ('lat', 'lng'):
        column_vector = X[axis_name].values.reshape(-1, 1)
        X[axis_name] = normalizer.fit_transform(column_vector)

In [8]:

# Auxiliary location tables used to derive the nearest-amenity and density features.
commercials = pd.read_csv('./data/auxiliary-data/sg-commerical-centres.csv')
mrts = pd.read_csv('./data/auxiliary-data/sg-mrt-stations.csv')
primary_schools = pd.read_csv('./data/auxiliary-data/sg-primary-schools.csv')
second_schools = pd.read_csv('./data/auxiliary-data/sg-secondary-schools.csv')
shoppings = pd.read_csv('./data/auxiliary-data/sg-shopping-malls.csv')
subzones = pd.read_csv('./data/auxiliary-data/sg-subzones.csv')

# The amenity coordinates are intentionally left in raw degrees.
# The listings' lat/lng are unscaled when the nearest-amenity distances are
# computed, and normLatLng rescales each frame against its own min/max, so
# scaled amenity coordinates would live in a different coordinate system and
# the resulting distances would be meaningless.
# NOTE(review): any other notebook that builds the same features (e.g. for the
# training split) must treat the coordinates identically - confirm.

In [9]:
# Distance from every listing to the closest amenity of each kind.
# NOTE(review): the listing and amenity frames must share one coordinate
# system for these distances to be meaningful.
amenity_frames = {
    'shop_dist': shoppings,
    'mrt_dist': mrts,
    'commercials_dist': commercials,
    'second_school_dist': second_schools,
    'primary_school_dist': primary_schools,
}
listing_coords = data[['lat', 'lng']]
for feature_name, amenities in amenity_frames.items():
    data[feature_name] = euclidean_distances(listing_coords, amenities[['lat', 'lng']]).min(axis=1)

# subzone: compute population density within the subzone.
# Built as one non-mutating chain: no assignment on a filtered slice, and the
# cell gives the same result when it is run again.
subzones = (
    subzones.loc[subzones['population'] > 0]
    .assign(density=lambda frame: frame['population'] / frame['area_size'])
    .rename(columns={'name': 'subzone'})
)

rows_before = len(data)
# Dropping a previously merged `density` keeps a re-run from producing
# `density_x` / `density_y` columns.
data = data.drop(columns='density', errors='ignore').merge(
    subzones[['subzone', 'density']], how='left', on='subzone')
if len(data) != rows_before:
    # A left merge only grows when a subzone name matches several rows on the right.
    raise ValueError(
        f'density merge changed the row count from {rows_before} to {len(data)}; '
        'subzone names are not unique in the subzone table')

In [10]:
# Lower-case the property type labels (the raw file mixes e.g. 'condo' and 'Condo').
property_type_lower = data['property_type'].str.lower()
data['property_type'] = property_type_lower

In [11]:
# Missing values per column after the distance and density features were added.
data.isna().sum()

property_type           0
tenure                  0
built_year              0
num_beds                0
num_baths               0
size_sqft               0
furnishing              0
available_unit_types    0
lat                     0
lng                     0
subzone                 0
planning_area           0
shop_dist               0
mrt_dist                0
commercials_dist        0
second_school_dist      0
primary_school_dist     0
density                 0
dtype: int64

In [12]:
# Import only the helpers used here. A wildcard import hides where names come
# from and can silently overwrite notebook variables.
from preprocess import preprocess_property_type, preprocess_tenure

# Map each raw label through the project helpers.
# NOTE(review): the mapping rules live in preprocess.py, which is not visible here.
data['property_type'] = data['property_type'].apply(preprocess_property_type)
data['tenure'] = data['tenure'].apply(preprocess_tenure)

In [13]:
# Non-null count of every column per property type.
data.groupby('property_type').count()

Unnamed: 0_level_0,tenure,built_year,num_beds,num_baths,size_sqft,furnishing,available_unit_types,lat,lng,subzone,planning_area,shop_dist,mrt_dist,commercials_dist,second_school_dist,primary_school_dist,density
property_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
apartment,189,189,189,189,189,189,189,189,189,189,189,189,189,189,189,189,189
bungalow,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250
condo,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676,3676
hdb,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147,2147
house,602,602,602,602,602,602,602,602,602,602,602,602,602,602,602,602,602
landed,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
other,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94,94


In [15]:
# Non-null count of every column per tenure category.
data.groupby('tenure').count()

Unnamed: 0_level_0,property_type,built_year,num_beds,num_baths,size_sqft,furnishing,available_unit_types,lat,lng,subzone,planning_area,shop_dist,mrt_dist,commercials_dist,second_school_dist,primary_school_dist,density
tenure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
freehold,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194,2194
tenure-100,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603,4603
tenure-1000,169,169,169,169,169,169,169,169,169,169,169,169,169,169,169,169,169


In [None]:
# labelEnc = preprocessing.LabelEncoder()
# data.planning_area = labelEnc.fit_transform(data.planning_area)
# data.subzone = labelEnc.fit_transform(data.subzone)
# data.furnishing = labelEnc.fit_transform(data.furnishing)
# data.tenure = labelEnc.fit_transform(data.tenure)
# data.property_type = labelEnc.fit_transform(data.property_type)
# data.available_unit_types = labelEnc.fit_transform(data.available_unit_types)
# data.size_sqft = normalizer.fit_transform(data.size_sqft.values.reshape(-1, 1))
# data.lat = normalizer.fit_transform(data.lat.values.reshape(-1, 1))
# data.lng = normalizer.fit_transform(data.lng.values.reshape(-1, 1))
# data.density = normalizer.fit_transform(data.density.values.reshape(-1, 1))

In [16]:
# Write the cleaned listings. The output folder is created first, because
# to_csv does not create missing directories and would fail on a fresh checkout.
output_path = Path('./clean/test.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)