# Multivariate K-Nearest Neighbors
上一篇主要看單一因素來預測價格

In [3]:
import pandas as pd
import numpy as np
np.random.seed(1)

# Load the Airbnb listings and shuffle the rows. The permutation is drawn from
# NumPy's global RNG, so the order depends on whatever seed was set beforehand.
dc_listings = pd.read_csv('dc_airbnb.csv')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]

# 'price' is read as text containing ',' and '$' characters (presumably values
# like "$1,200.00" -- confirm against the CSV). Remove both as *literal*
# characters: under regex matching '$' is an end-of-string anchor, and the
# default of `regex` has changed between pandas versions, so state it explicitly.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# Summarize column dtypes and non-null counts.
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 19 columns):
host_response_rate      3289 non-null object
host_acceptance_rate    3109 non-null object
host_listings_count     3723 non-null int64
accommodates            3723 non-null int64
room_type               3723 non-null object
bedrooms                3702 non-null float64
bathrooms               3696 non-null float64
beds                    3712 non-null float64
price                   3723 non-null float64
cleaning_fee            2335 non-null object
security_deposit        1426 non-null object
minimum_nights          3723 non-null int64
maximum_nights          3723 non-null int64
number_of_reviews       3723 non-null int64
latitude                3723 non-null float64
longitude               3723 non-null float64
city                    3723 non-null object
zipcode                 3714 non-null object
state                   3723 non-null object
dtypes: float64(6), int64(5), object(8)

先分析欄位的資料型態:
1.The following columns contain non-numerical values:
room_type: e.g. Private room
city: e.g. Washington
state: e.g. DC

2.while these columns contain numerical but non-ordinal values:
latitude: e.g. 38.913458
longitude: e.g. -77.031
zipcode: e.g. 20009

3.Since a host could have many living spaces and we don't have enough information to uniquely group living spaces to the hosts themselves, let's avoid using any columns that don't directly describe the living space or the listing itself
host_response_rate
host_acceptance_rate
host_listings_count

In [4]:
# Columns left out of the model, grouped by the reason they are excluded:
# non-numerical values, numerical but non-ordinal values, and host-level
# fields that do not describe the living space or the listing itself.
non_numerical = ['room_type', 'city', 'state']
non_ordinal = ['latitude', 'longitude', 'zipcode']
host_level = ['host_response_rate', 'host_acceptance_rate', 'host_listings_count']
drop_columns = non_numerical + non_ordinal + host_level

dc_listings = dc_listings.drop(labels=drop_columns, axis='columns')

# Number of missing values per remaining column.
print(dc_listings.isnull().sum())

accommodates            0
bedrooms               21
bathrooms              27
beds                   11
price                   0
cleaning_fee         1388
security_deposit     2297
minimum_nights          0
maximum_nights          0
number_of_reviews       0
dtype: int64


從各欄位的缺失值數量來看:
cleaning_fee - 37.3% of the rows are missing
security_deposit - 61.7% of the rows are missing
這兩個欄位的缺失比例過高,因此我們不將它們納入考慮

而bedrooms,bathrooms,beds 則有些為缺失值
我們將他的缺失值刪掉

In [5]:
# Remove the two fee columns entirely, then discard every row that still
# contains a missing value in any remaining column.
sparse_columns = ['cleaning_fee', 'security_deposit']
without_sparse = dc_listings.drop(labels=sparse_columns, axis='columns')
dc_listings = without_sparse.dropna(axis='index')

# Re-check the per-column missing-value counts after the cleanup.
print(dc_listings.isnull().sum())

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64


In [6]:
# Preview the first five rows of the cleaned listings as the cell output.
dc_listings.head(n=5)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
574,2,1.0,1.0,1.0,125.0,1,4,149
1593,2,1.0,1.5,1.0,85.0,1,30,49
3091,1,1.0,0.5,1.0,50.0,1,1125,1
420,2,1.0,1.0,1.0,209.0,4,730,2
808,12,5.0,2.0,5.0,215.0,2,1825,34


數據標準化:將每個欄位轉換為平均值為 0、標準差為 1 的 z-score(注意:結果並非落在 0~1 之間,可能出現負值)

normalized_listings = (dc_listings - dc_listings.mean()) / (dc_listings.std())


In [7]:
# Standardize every column: subtract the column mean, divide by the column
# standard deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = (dc_listings - column_means) / column_stds

# Put the original (un-normalized) price back into the price column, since
# price is the value being predicted.
normalized_listings['price'] = dc_listings['price']

print(normalized_listings.head(3))

      accommodates  bedrooms  bathrooms      beds  price  minimum_nights  \
574      -0.596544 -0.249467  -0.439151 -0.546858  125.0       -0.341375   
1593     -0.596544 -0.249467   0.412923 -0.546858   85.0       -0.341375   
3091     -1.095499 -0.249467  -1.291226 -0.546858   50.0       -0.341375   

      maximum_nights  number_of_reviews  
574        -0.016604           4.579650  
1593       -0.016603           1.159275  
3091       -0.016573          -0.482505  


We can instead use the distance.euclidean() function from scipy.spatial, which takes two vectors as parameters and calculates the Euclidean distance between them. The euclidean() function expects both vectors to be 1-dimensional, list-like objects with the same number of elements.

Calculate the Euclidean distance using only the accommodates and bathrooms features between the first row and fifth row in normalized_listings using the distance.euclidean() function.
Assign the distance value to first_fifth_distance and display using the print function.



In [8]:
from scipy.spatial import distance

# Euclidean distance between the first and fifth rows of normalized_listings,
# measured only on the two selected features.
distance_features = ['accommodates', 'bathrooms']
first_listing = normalized_listings.iloc[0][distance_features]
fifth_listing = normalized_listings.iloc[4][distance_features]

first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)

5.272543124668404


In [9]:
# Use scikit-learn's k-nearest-neighbors regressor.
from sklearn.neighbors import KNeighborsRegressor

# Split the data set: the first 2792 rows train the model, the remaining rows
# are held out for testing.
train_df = normalized_listings.iloc[:2792]
test_df = normalized_listings.iloc[2792:]

# Initialize the model: 5 neighbors, brute-force neighbor search.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# Fit on two features with price as the target, then predict the test rows.
two_feature_columns = ['accommodates', 'bathrooms']
train_x = train_df[two_feature_columns]
train_y = train_df['price']
knn.fit(train_x, train_y)
predictions = knn.predict(test_df[two_feature_columns])

In [17]:
from sklearn.metrics import mean_squared_error

# Two-feature model: 5 neighbors, brute-force search, Euclidean metric named
# explicitly.
train_columns = ['accommodates', 'bathrooms']
knn = KNeighborsRegressor(metric='euclidean', algorithm='brute', n_neighbors=5)
knn.fit(train_df[train_columns], train_df['price'])
predictions = knn.predict(test_df[train_columns])

# Score against the held-out prices; RMSE is the square root of the MSE.
actual_prices = test_df['price']
two_features_mse = mean_squared_error(actual_prices, predictions)
two_features_rmse = two_features_mse ** 0.5

two_features_rmse

125.02565961889556

In [15]:
from sklearn.neighbors import KNeighborsRegressor

# Four-feature model: same 5-neighbor brute-force regressor, wider feature set.
features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)
knn.fit(train_df[features], train_df['price'])

# Predict the held-out rows and convert the mean squared error to RMSE.
four_predictions = knn.predict(test_df[features])
held_out_prices = test_df['price']
four_mse = mean_squared_error(held_out_prices, four_predictions)
four_rmse = four_mse ** 0.5

four_rmse

116.01739640453459

In [13]:
# Use every column except the target ('price') as a feature.
attributes = list(train_df.columns)
attributes.remove('price')

# Same 5-neighbor brute-force regressor, now fitted on all remaining columns.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)
knn.fit(train_df[attributes], train_df['price'])

# Predict the held-out rows and convert the mean squared error to RMSE.
all_features_predictions = knn.predict(test_df[attributes])
held_out_prices = test_df['price']
all_features_mse = mean_squared_error(held_out_prices, all_features_predictions)
all_features_rmse = all_features_mse ** 0.5
all_features_rmse

123.87083532970432