In [45]:
import pandas as pd

In [46]:
# Load the cleaned Melbourne housing dataset from its CSV file
csv_path = 'Data/archive/Melbourne_Housing_Data_Cleaned.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)

In [47]:
# Preview the first rows of the freshly loaded frame
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,LandArea,Council,Latitude,Longitude,Region,State
0,Abbotsford,85 Turner St,2,h,1480000.0,3/12/2016,2.5,3067,1,1,202.0,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,VIC
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,4/02/2016,2.5,3067,1,0,156.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,VIC
2,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC
3,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC
4,Abbotsford,5 Charles St,3,h,1465000.0,4/03/2017,2.5,3067,2,0,134.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,VIC


In [48]:
# Remove the columns that the rest of the analysis does not use
irrelevant_columns = ['Address', 'Date', 'Council', 'Latitude', 'Longitude', 'State', 'Region']
data = data.drop(labels=irrelevant_columns, axis=1)

In [49]:
# Preview the frame again to confirm which columns remain after the drop
data.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,LandArea
0,Abbotsford,2,h,1480000.0,2.5,3067,1,1,202.0
1,Abbotsford,2,h,1035000.0,2.5,3067,1,0,156.0
2,Abbotsford,3,h,1465000.0,2.5,3067,2,0,134.0
3,Abbotsford,3,h,1465000.0,2.5,3067,2,0,134.0
4,Abbotsford,3,h,1465000.0,2.5,3067,2,0,134.0


In [50]:
# Split the frame into the target ('Price') and the remaining feature columns
y = data['Price']
X = data.drop(columns=['Price'])

In [51]:
# Description of the property to find comparable rows for
suburb, property_type = 'South Yarra', 'h'
bedrooms, bathrooms, car_spaces = 3, 2, 2

In [52]:
# Look up the postcode recorded for the requested suburb.
# The first matching row is used (assumes a suburb maps to a single postcode — TODO confirm).
suburb_postcodes = X.loc[X['Suburb'] == suburb, 'Postcode']
if suburb_postcodes.empty:
    # Fail with a readable message instead of the bare IndexError that
    # indexing element 0 of an empty selection would raise.
    raise ValueError(f"Suburb {suburb!r} was not found in the dataset")
postcode = suburb_postcodes.iloc[0]

In [53]:
# Display the postcode that was looked up for the chosen suburb
postcode

3141

In [54]:
# Build a one-row frame describing the property being queried
query_record = {
    'Suburb': suburb,
    'Rooms': bedrooms,
    'Type': property_type,
    'Bathroom': bathrooms,
    'Car': car_spaces,
    'Postcode': postcode,
}
new_data = pd.DataFrame([query_record])

new_data

Unnamed: 0,Suburb,Rooms,Type,Bathroom,Car,Postcode
0,South Yarra,3,h,2,2,3141


In [55]:
# Keep only the feature rows from the same suburb as the queried property
same_suburb = X['Suburb'].eq(new_data['Suburb'].iloc[0])
X = X.loc[same_suburb]

In [56]:
# Same suburb filter applied to the full frame, which still carries the 'Price' column
in_query_suburb = data['Suburb'].eq(new_data['Suburb'].iloc[0])
suburb_data = data.loc[in_query_suburb]

In [57]:
# Display the rows of the full frame that belong to the queried suburb
suburb_data

Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Postcode,Bathroom,Car,LandArea
6052,South Yarra,1,u,430000.0,3.3,3141,1,1,0.0
6053,South Yarra,2,u,722000.0,3.3,3141,1,1,0.0
6054,South Yarra,2,u,930000.0,3.3,3141,2,2,0.0
6055,South Yarra,1,u,467500.0,3.3,3141,1,1,0.0
6056,South Yarra,2,u,475000.0,3.3,3141,1,0,0.0
...,...,...,...,...,...,...,...,...,...
19009,South Yarra,4,h,3030000.0,2.7,3141,1,1,436.0
19532,South Yarra,3,h,2160000.0,2.7,3141,2,0,217.0
19533,South Yarra,3,h,1950000.0,2.7,3141,1,0,252.0
20086,South Yarra,3,h,1720000.0,2.7,3141,2,1,277.0


In [58]:
# One-hot encode the categorical columns, dropping the first level of each
categorical_columns = ['Suburb', 'Type']
X_encoded = pd.get_dummies(data=X, columns=categorical_columns, drop_first=True)

In [59]:
# Per-feature multipliers: 1.0 leaves a column unchanged, 0.0 zeroes it out
weights = dict(
    Rooms=1.0,
    Bathroom=1.0,
    Car=1.0,
    Postcode=1.0,
    Distance=0.0,
    LandArea=0.0,
)

In [60]:
# Multiply every weighted column by its multiplier (a 0.0 weight blanks the column)
for column_name in weights:
    X_encoded[column_name] = X_encoded[column_name] * weights[column_name]

In [61]:
from sklearn.preprocessing import StandardScaler


# Fit the standardiser on the encoded suburb rows, then transform those same rows
scaler = StandardScaler().fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

In [62]:
from sklearn.neighbors import NearestNeighbors

# Ask for up to 10 neighbours, capped at the number of rows available:
# kneighbors() raises when n_neighbors exceeds the number of fitted samples,
# which would happen for any suburb with fewer than 10 rows.
k = min(10, len(X_scaled))

# Index the scaled suburb rows so the nearest ones to a query can be retrieved
knn = NearestNeighbors(n_neighbors=k, algorithm='auto')
knn.fit(X_scaled)

In [63]:
# Preprocess the query row the same way as the training data.
# drop_first must NOT be used here: the query is a single row, so the only
# level of each categorical column is also its "first" level and would be
# dropped, making every property type look like the baseline type once the
# missing dummy columns are filled with 0 below.
new_data_encoded = pd.get_dummies(new_data, columns=['Suburb', 'Type'])

# Align to the training columns: dummies that training dropped (or never saw)
# are discarded, and columns absent from the query are filled with 0.
new_data_encoded = new_data_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# The scaler was fitted on weighted columns, so weight the query identically
for feature, weight in weights.items():
    new_data_encoded[feature] *= weight

# Now the query can be scaled with the already-fitted scaler
new_data_scaled = scaler.transform(new_data_encoded)

In [64]:
# Retrieve the nearest fitted rows to the scaled query: their distances and row positions
distances, indices = knn.kneighbors(X=new_data_scaled, return_distance=True)

In [65]:
# Display the neighbour distances returned for the single query row
distances

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.26045748]])

In [66]:
# Show the neighbour rows; the indices are positions within the fitted rows, hence iloc
X.iloc[indices[0]]

Unnamed: 0,Suburb,Rooms,Type,Distance,Postcode,Bathroom,Car,LandArea
10939,South Yarra,3,h,2.7,3141,2,2,273.0
6128,South Yarra,3,h,3.3,3141,2,2,141.0
15703,South Yarra,3,h,2.7,3141,2,2,211.0
14098,South Yarra,3,h,2.7,3141,2,2,462.0
15317,South Yarra,3,h,2.7,3141,2,2,223.0
6135,South Yarra,3,h,3.3,3141,2,2,221.0
15318,South Yarra,3,h,2.7,3141,2,2,153.0
6171,South Yarra,3,h,3.3,3141,2,2,0.0
6115,South Yarra,3,h,3.3,3141,2,2,350.0
6088,South Yarra,2,h,3.3,3141,2,2,355.0


In [67]:
# Collect the row positions of neighbours that match the query exactly.
# Distances are computed floats, so compare against a small tolerance instead
# of testing `== 0.0`, which can miss matches that differ only by rounding error.
ZERO_DISTANCE_TOL = 1e-6
exact_matches = [
    row_position
    for distance, row_position in zip(distances[0], indices[0])
    if distance <= ZERO_DISTANCE_TOL
]

In [68]:
# Print the exact matches ordered by ascending price; if there are none,
# fall back to printing every nearest neighbour.
if exact_matches:
    print(suburb_data.iloc[exact_matches].sort_values(by='Price'))
else:
    print(suburb_data.iloc[indices[0]])

            Suburb  Rooms Type      Price  Distance  Postcode  Bathroom  Car  \
14098  South Yarra      3    h  1400000.0       2.7      3141         2    2   
6171   South Yarra      3    h  1400000.0       3.3      3141         2    2   
6128   South Yarra      3    h  1700000.0       3.3      3141         2    2   
15318  South Yarra      3    h  1750000.0       2.7      3141         2    2   
10939  South Yarra      3    h  1810000.0       2.7      3141         2    2   
15317  South Yarra      3    h  2180000.0       2.7      3141         2    2   
6135   South Yarra      3    h  2285000.0       3.3      3141         2    2   
15703  South Yarra      3    h  2300000.0       2.7      3141         2    2   
6115   South Yarra      3    h  2870000.0       3.3      3141         2    2   

       LandArea  
14098     462.0  
6171        0.0  
6128      141.0  
15318     153.0  
10939     273.0  
15317     223.0  
6135      221.0  
15703     211.0  
6115      350.0  
