In [118]:
import pandas as pd
import numpy as np

airbnb = pd.read_csv('paris_airbnb.csv')

pd.options.mode.chained_assignment = None
np.random.seed(1)
random_indexes = np.random.permutation(len(airbnb))
airbnb = airbnb.iloc[random_indexes]
cleaned_price = airbnb['price'].str.replace('$','').replace(',','')
airbnb['price'] = pd.to_numeric(cleaned_price, errors='coerce')

In [120]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 4740 to 5157
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   host_response_rate    5000 non-null   object 
 1   host_acceptance_rate  0 non-null      float64
 2   host_listings_count   7999 non-null   float64
 3   latitude              8000 non-null   float64
 4   longitude             8000 non-null   float64
 5   city                  7997 non-null   object 
 6   zipcode               7930 non-null   object 
 7   state                 7977 non-null   object 
 8   accommodates          8000 non-null   int64  
 9   room_type             8000 non-null   object 
 10  bedrooms              7976 non-null   float64
 11  bathrooms             7942 non-null   float64
 12  beds                  7986 non-null   float64
 13  price                 7989 non-null   float64
 14  cleaning_fee          6250 non-null   object 
 15  security_deposit      6

In [122]:
numerical_cols = airbnb.select_dtypes(include=['float', 'int']).columns.to_list()

In [124]:
numerical_cols = numerical_cols[4:]

In [126]:
num_df = airbnb[numerical_cols].copy()

In [128]:
num_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 4740 to 5157
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       8000 non-null   int64  
 1   bedrooms           7976 non-null   float64
 2   bathrooms          7942 non-null   float64
 3   beds               7986 non-null   float64
 4   price              7989 non-null   float64
 5   minimum_nights     8000 non-null   int64  
 6   maximum_nights     8000 non-null   int64  
 7   number_of_reviews  8000 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 562.5 KB


In [130]:
num_df.isna().sum()

accommodates          0
bedrooms             24
bathrooms            58
beds                 14
price                11
minimum_nights        0
maximum_nights        0
number_of_reviews     0
dtype: int64

In [132]:
num_df = num_df.dropna(how='any',axis=0)

In [134]:
num_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7906 entries, 4740 to 5157
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       7906 non-null   int64  
 1   bedrooms           7906 non-null   float64
 2   bathrooms          7906 non-null   float64
 3   beds               7906 non-null   float64
 4   price              7906 non-null   float64
 5   minimum_nights     7906 non-null   int64  
 6   maximum_nights     7906 non-null   int64  
 7   number_of_reviews  7906 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 555.9 KB


In [136]:
num_df.isna().sum()

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

In [138]:
normalized_df=(num_df-num_df.mean())/num_df.std()

In [140]:
normalized_df.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
4740,0.507237,-0.295603,-0.29239,0.208665,-0.539601,-0.184459,1.062887,-0.56483
5606,-0.130317,0.896764,0.855964,0.208665,-0.162191,-0.100044,1.061047,-0.637171
4824,-0.767871,-1.48797,-0.29239,-0.645219,-0.539601,0.040649,1.062887,-0.651639
4205,-0.767871,-0.295603,-0.29239,-0.645219,-0.768334,-0.100044,1.062887,0.549223
3228,-0.130317,-0.295603,-0.29239,0.208665,-0.539601,-0.100044,-0.335444,-0.579298


In [142]:
normalized_df['price'] = num_df['price']

In [144]:
normalized_df.head()

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
4740,0.507237,-0.295603,-0.29239,0.208665,65.0,-0.184459,1.062887,-0.56483
5606,-0.130317,0.896764,0.855964,0.208665,98.0,-0.100044,1.061047,-0.637171
4824,-0.767871,-1.48797,-0.29239,-0.645219,65.0,0.040649,1.062887,-0.651639
4205,-0.767871,-0.295603,-0.29239,-0.645219,45.0,-0.100044,1.062887,0.549223
3228,-0.130317,-0.295603,-0.29239,0.208665,65.0,-0.100044,-0.335444,-0.579298


In [188]:
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')

In [190]:
train_df = normalized_df.iloc[:6000]
test_df = normalized_df.iloc[6000:]

features = train_df.columns.tolist()
features.remove('price')

knn.fit(train_df[features], train_df['price'])

In [192]:
predictions = knn.predict(test_df[features])

In [194]:
predictions

array([ 46.8, 260.8, 141.4, ...,  54.8, 159.4, 163.6])

In [196]:
from sklearn.metrics import mean_squared_error

In [198]:
mse = mean_squared_error(test_df['price'], predictions)
rmse = np.sqrt(mse)

In [200]:
print(mse, rmse)

3732.812675760755 61.0967484876303
