In [14]:
import pandas as pd
import numpy as np

In [15]:
# Read the Airbnb listings from dc_airbnb.csv and preview the first five rows.
file = pd.read_csv(filepath_or_buffer="dc_airbnb.csv")
file.head(n=5)

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


In [16]:
# Convert the price strings (e.g. "$1,250.00") to floats.
# regex=False forces "$" and "," to be treated as literal characters: "$" is
# the end-of-string anchor in a regular expression, and the default value of
# `regex` has changed between pandas versions, so relying on the default makes
# the result version-dependent.
file["price"] = (
    file["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)


In [17]:
# Preview the first rows of the price column to check that it is now numeric.
file["price"].head()

0    160.0
1    350.0
2     50.0
3     95.0
4     50.0
Name: price, dtype: float64

In [18]:
# Shuffle the rows reproducibly: seed NumPy's global random generator, draw a
# random ordering of the index labels, then reorder the frame by those labels.
np.random.seed(123)
shuffle = np.random.permutation(file.index)
file = file.loc[shuffle, :]

### The algorithm doesn't work if a column contains null values

In [19]:
# Data cleaning: count the missing values in each column.
file.isna().sum()

host_response_rate       434
host_acceptance_rate     614
host_listings_count        0
accommodates               0
room_type                  0
bedrooms                  21
bathrooms                 27
beds                      11
price                      0
cleaning_fee            1388
security_deposit        2297
minimum_nights             0
maximum_nights             0
number_of_reviews          0
latitude                   0
longitude                  0
city                       0
zipcode                    9
state                      0
dtype: int64

In [20]:
# Print each column's non-null count and dtype to see which columns are non-numeric.
file.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 1657 to 3582
Data columns (total 19 columns):
host_response_rate      3289 non-null object
host_acceptance_rate    3109 non-null object
host_listings_count     3723 non-null int64
accommodates            3723 non-null int64
room_type               3723 non-null object
bedrooms                3702 non-null float64
bathrooms               3696 non-null float64
beds                    3712 non-null float64
price                   3723 non-null float64
cleaning_fee            2335 non-null object
security_deposit        1426 non-null object
minimum_nights          3723 non-null int64
maximum_nights          3723 non-null int64
number_of_reviews       3723 non-null int64
latitude                3723 non-null float64
longitude               3723 non-null float64
city                    3723 non-null object
zipcode                 3714 non-null object
state                   3723 non-null object
dtypes: float64(6), int64(5), object(8)

### The algorithm only works on numerical data, so we drop the columns whose data type is "object" (qualitative)

In [21]:
# Columns stored with dtype "object" (text, percentages, and currency strings).
obj = [
    "host_response_rate",
    "host_acceptance_rate",
    "room_type",
    "city",
    "zipcode",
    "state",
    "cleaning_fee",
    "security_deposit",
]
# Assign the result instead of dropping in place, and ignore labels that are
# already gone, so that re-running this cell does not raise a KeyError.
file = file.drop(obj, axis=1, errors="ignore")

In [22]:
# Discard every row that still has a missing value in any column, then
# re-count the nulls per column to confirm none are left.
file.dropna(axis="index", how="any", inplace=True)
file.isna().sum()

host_listings_count    0
accommodates           0
bedrooms               0
bathrooms              0
beds                   0
price                  0
minimum_nights         0
maximum_nights         0
number_of_reviews      0
latitude               0
longitude              0
dtype: int64

In [24]:
# Remove the longitude and latitude columns.
# Assign the result instead of dropping in place, and ignore labels that are
# already gone, so that re-running this cell does not raise a KeyError.
file = file.drop(["longitude", "latitude"], axis=1, errors="ignore")

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
1657,1,4,1.0,1.0,2.0,125.0,2,1125,1
820,6,1,0.0,1.0,1.0,95.0,5,1125,0
2405,2,6,2.0,2.5,2.0,399.0,2,1125,1
298,2,2,0.0,1.0,1.0,130.0,2,180,19
3670,1,2,1.0,1.0,1.0,90.0,1,1125,1
822,1,2,1.0,0.0,1.0,85.0,3,1125,1
1820,1,2,0.0,1.0,1.0,80.0,1,1125,4
3096,1,2,1.0,1.0,1.0,55.0,1,1125,34
1566,1,3,1.0,1.0,1.0,88.0,1,1125,0
2142,1,2,1.0,1.5,1.0,107.0,1,1125,4


### Columns such as `minimum_nights` are on very different scales, so they need to be standardized. This can be done by computing the z-score of each column: $z = (x - \mu) / \sigma$

In [25]:
# Standardize every column to a z-score: subtract the column mean, then divide
# by the column standard deviation.
normalized = file.sub(file.mean(), axis="columns").div(file.std(), axis="columns")
normalized.head()

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
1657,-0.19009,0.401366,-0.249467,-0.439151,0.297345,-0.173345,-0.065038,-0.016573,-0.482505
820,-0.109668,-1.095499,-1.43881,-0.439151,-0.546858,-0.391448,0.763971,-0.016573,-0.516709
2405,-0.174005,1.399275,0.939875,2.117072,0.297345,1.818657,-0.065038,-0.016573,-0.482505
298,-0.174005,-0.596544,-1.43881,-0.439151,-0.546858,-0.136995,-0.065038,-0.016599,0.133163
3670,-0.19009,-0.596544,-0.249467,-0.439151,-0.546858,-0.427798,-0.341375,-0.016573,-0.482505


### However, we need the price in its original form because it is the value we want to predict, so we restore this column from the un-normalized dataframe

In [26]:
# Overwrite the standardized price with the raw price from `file`, so the
# target stays in its original units.
normalized["price"] = file.loc[:, "price"]
normalized.head(5)

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
1657,-0.19009,0.401366,-0.249467,-0.439151,0.297345,125.0,-0.065038,-0.016573,-0.482505
820,-0.109668,-1.095499,-1.43881,-0.439151,-0.546858,95.0,0.763971,-0.016573,-0.516709
2405,-0.174005,1.399275,0.939875,2.117072,0.297345,399.0,-0.065038,-0.016573,-0.482505
298,-0.174005,-0.596544,-1.43881,-0.439151,-0.546858,130.0,-0.065038,-0.016599,0.133163
3670,-0.19009,-0.596544,-0.249467,-0.439151,-0.546858,90.0,-0.341375,-0.016573,-0.482505


In [27]:
# Number of rows available for the train/test split.
total_rows = len(normalized)

In [29]:
# Positional 75% / 25% split: the first three quarters of the rows are used
# for training and the remainder for testing.
split_at = round(total_rows * 0.75)
train_data = normalized.iloc[:split_at]
test_data = normalized.iloc[split_at:]

In [35]:
# Predictor columns fed to the model.
features = [
    "accommodates",
    "bedrooms",
    "bathrooms",
    "beds",
]
# Column the model is trained to predict.
target = "price"

In [31]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
# Fit a brute-force k-nearest-neighbours regressor (default number of
# neighbours) on the training rows, then predict prices for the test rows.
knn = KNeighborsRegressor(algorithm="brute").fit(
    train_data[features], train_data[target]
)
prediction = knn.predict(test_data[features])

### When fitting, the model expects the features as 2-D data, so we select them with a list of column names (which returns a DataFrame). The target is one-dimensional, so we select it with a single column name (which returns a Series) rather than a list

In [39]:
from sklearn.metrics import mean_squared_error

# Score the predictions: mean squared error first, then its square root so the
# error is expressed in the same units as the price.
mse = mean_squared_error(y_true=test_data[target], y_pred=prediction)
rmse = np.sqrt(mse)
rmse

135.69156618190243

### This is not a good model, as the root mean squared error is very high

## Hyperparameter Optimization

In [48]:
# Candidate neighbour counts to compare.
k = [1, 5, 10, 15, 20]
# Test-set RMSE for each candidate, appended in the same order as `k`.
rmse_values = []
for i in k:
    # Refit from scratch for each candidate so the runs are independent.
    knn = KNeighborsRegressor(n_neighbors=i, algorithm="brute").fit(
        train_data[features], train_data[target]
    )
    prediction = knn.predict(test_data[features])
    mse = mean_squared_error(y_true=test_data[target], y_pred=prediction)
    rmse = np.sqrt(mse)
    rmse_values.append(rmse)

In [49]:
# Show the test RMSE for each candidate k, in the same order as the list `k`.
rmse_values

[179.78456909702527,
 135.69156618190243,
 126.45219114305681,
 127.54350289971384,
 125.73197388308174]