In [16]:
# Use the DataFrame.info() method to return the number of non-null values in each column.
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the row shuffle below is reproducible.
np.random.seed(1)

dc_listings = pd.read_csv('dc_airbnb.csv')

# Shuffle the rows. The permutation holds row *positions*, so index with .iloc
# rather than relying on the labels of the default index matching positions.
dc_listings = dc_listings.iloc[np.random.permutation(len(dc_listings))]

# Strip the thousands separators and the currency symbol so that price can be
# cast to float. regex=False forces literal matching: '$' is a regex anchor,
# and the default value of `regex` is not the same across pandas versions.
price_text = dc_listings['price'].str.replace(',', '', regex=False)
price_text = price_text.str.replace('$', '', regex=False)
dc_listings['price'] = price_text.astype('float')

dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3723 entries, 574 to 1061
Data columns (total 19 columns):
host_response_rate      3289 non-null object
host_acceptance_rate    3109 non-null object
host_listings_count     3723 non-null int64
accommodates            3723 non-null int64
room_type               3723 non-null object
bedrooms                3702 non-null float64
bathrooms               3696 non-null float64
beds                    3712 non-null float64
price                   3723 non-null float64
cleaning_fee            2335 non-null object
security_deposit        1426 non-null object
minimum_nights          3723 non-null int64
maximum_nights          3723 non-null int64
number_of_reviews       3723 non-null int64
latitude                3723 non-null float64
longitude               3723 non-null float64
city                    3723 non-null object
zipcode                 3714 non-null object
state                   3723 non-null object
dtypes: float64(6), int64(5), object(8)

##  Removing features
`df.drop(cols, axis=1)` returns a new DataFrame without the columns listed in `cols`.

In [17]:
# Columns to exclude from the feature set.
drop_columns = [
    'room_type', 'city', 'state',
    'latitude', 'longitude', 'zipcode',
    'host_response_rate', 'host_acceptance_rate', 'host_listings_count',
]
dc_listings = dc_listings.drop(columns=drop_columns)

# Count the missing values left in each remaining column.
print(dc_listings.isna().sum())

accommodates            0
bedrooms               21
bathrooms              27
beds                   11
price                   0
cleaning_fee         1388
security_deposit     2297
minimum_nights          0
maximum_nights          0
number_of_reviews       0
dtype: int64


In [18]:
# cleaning_fee and security_deposit are missing in a large share of the rows
# (1388 and 2297 nulls in the counts printed earlier), so drop the columns
# themselves rather than every row that lacks them.
dc_listings = dc_listings.drop(columns=['cleaning_fee', 'security_deposit'])

In [19]:
# Discard every row that still has at least one missing value.
dc_listings = dc_listings.dropna()

In [20]:
# Re-count missing values per column to check the result of the cleanup.
print(dc_listings.isna().sum())

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64


## Normalize columns

In [21]:
# Print the shape of the frame, then the shape of its per-column mean.
column_means = dc_listings.mean()
for shape in (dc_listings.shape, column_means.shape):
    print(shape)

(3671, 8)
(8,)


In [22]:
# Standardize every column: subtract the column mean, then divide by the
# column standard deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = (
    dc_listings
    .sub(column_means, axis='columns')
    .div(column_stds, axis='columns')
)

In [23]:
# Put the un-normalized price back in place of the standardized one, then
# preview the first three rows.
normalized_listings = normalized_listings.assign(price=dc_listings['price'])
print(normalized_listings.iloc[:3])

      accommodates  bedrooms  bathrooms      beds  price  minimum_nights  \
574      -0.596544 -0.249467  -0.439151 -0.546858  125.0       -0.341375   
1593     -0.596544 -0.249467   0.412923 -0.546858   85.0       -0.341375   
3091     -1.095499 -0.249467  -1.291226 -0.546858   50.0       -0.341375   

      maximum_nights  number_of_reviews  
574        -0.016604           4.579650  
1593       -0.016603           1.159275  
3091       -0.016573          -0.482505  


## Scipy Euclidean distance


In [25]:
from scipy.spatial import distance

# Euclidean distance between the first and the fifth row, measured on the
# two feature columns listed in `cols`.
cols = ['accommodates', 'bathrooms']
first_listing = normalized_listings.iloc[0][cols]
fifth_listing = normalized_listings.iloc[4][cols]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)

In [26]:
# Show the Euclidean distance computed between the first and fifth listings.
print(first_fifth_distance)

5.272543124668404



<h2><b>The scikit-learn workflow consists of 4 main steps:</b></h2>

<ol>
<li>instantiate the specific machine learning model you want to use</li>
<li>fit the model to the training data</li>
<li>use the model to make predictions</li>
<li>evaluate the accuracy of the predictions</li>
</ol>

## The `fit` method takes 2 required parameters:

<ol>
<li>a matrix-like object containing the feature columns we want to use from the training set.</li>
<li>a list-like object containing the correct target values.</li>
</ol>

In [36]:
# scikit-learn estimator calls used below: fit(x, y), then predict(x)
from sklearn.neighbors import KNeighborsRegressor

# Rows before `split_row` form the training set; the rest form the test set.
split_row = 2792
feature_cols = ['accommodates', 'bathrooms']

train_df = normalized_listings.iloc[:split_row]
test_df = normalized_listings.iloc[split_row:]

# Fit a 5-nearest-neighbours regressor on the two features and predict the
# price of every test row.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[feature_cols], train_df['price'])
predictions = knn.predict(test_df[feature_cols])

# sklearn.metrics.mean_squared_error
The `mean_squared_error()` function takes 2 inputs, in this order:

<ol>
<li>a list-like object representing the true values</li>
<li>a list-like object representing the values predicted by the model</li>
</ol>

In [42]:
from sklearn.metrics import mean_squared_error

# Error of the two-feature model on the test set: true values first,
# predicted values second.
two_features_mse = mean_squared_error(test_df['price'], predictions)

# Take the root with np.sqrt, as the later cells do. The previous `** (1/2)`
# depends on true division and would evaluate to `** 0` under Python 2.
two_features_rmse = np.sqrt(two_features_mse)

print(two_features_mse)
print(two_features_rmse)

15660.3979522
125.141511707


## Using more features

In [43]:
# Refit the 5-nearest-neighbours regressor using four feature columns.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
cols = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'number_of_reviews',
]
knn.fit(train_df[cols], train_df['price'])
four_predictions = knn.predict(test_df[cols])

# mean_squared_error takes the true values first and the predictions second.
# MSE is symmetric, so the value is unchanged, but the documented order is
# kept so the call matches the two-feature cell.
four_mse = mean_squared_error(test_df['price'], four_predictions)
four_rmse = np.sqrt(four_mse)
print(four_mse)
print(four_rmse)

13425.5679181
115.868752984


In [47]:
# Refit the 5-nearest-neighbours regressor on every column except the target.
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
train_x = train_df.drop('price', axis=1)
test_x = test_df.drop('price', axis=1)
knn.fit(train_x, train_df['price'])
all_features_predictions = knn.predict(test_x)

# mean_squared_error takes the true values first and the predictions second.
# MSE is symmetric, so the value is unchanged, but the documented order is
# kept so the call matches the two-feature cell.
all_features_mse = mean_squared_error(test_df['price'], all_features_predictions)
all_features_rmse = np.sqrt(all_features_mse)
print(all_features_mse)
print(all_features_rmse)

15455.1684642
124.31881782
