In [1]:
import numpy as np 
import pandas as pd 

In [2]:
file = pd.read_csv("dc_airbnb.csv")
file.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


## Holdout Validation - type of cross validation

split the data equally in train and test data. Then train the model on first split and test on split two. Repeat with training split two and testing on split one. Calculate the mean RMSE of both the predictions

In [3]:
file["price"] = file["price"].str.replace("$","").str.replace(",","").astype(float)



In [4]:
np.random.seed(123)
shuffle = np.random.permutation(file.index)
file = file.loc[shuffle]

In [5]:
obj = ["host_response_rate","host_acceptance_rate","room_type","city","zipcode","state","cleaning_fee","security_deposit"]
file.drop(obj, axis=1, inplace= True)

In [6]:
file.dropna(axis=0,inplace=True)
file.isnull().sum()

host_listings_count    0
accommodates           0
bedrooms               0
bathrooms              0
beds                   0
price                  0
minimum_nights         0
maximum_nights         0
number_of_reviews      0
latitude               0
longitude              0
dtype: int64

In [7]:
file.drop(["longitude","latitude"], axis=1, inplace=True)

In [8]:
normalized = (file - file.mean())/file.std()
normalized.head()

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
1657,-0.19009,0.401366,-0.249467,-0.439151,0.297345,-0.173345,-0.065038,-0.016573,-0.482505
820,-0.109668,-1.095499,-1.43881,-0.439151,-0.546858,-0.391448,0.763971,-0.016573,-0.516709
2405,-0.174005,1.399275,0.939875,2.117072,0.297345,1.818657,-0.065038,-0.016573,-0.482505
298,-0.174005,-0.596544,-1.43881,-0.439151,-0.546858,-0.136995,-0.065038,-0.016599,0.133163
3670,-0.19009,-0.596544,-0.249467,-0.439151,-0.546858,-0.427798,-0.341375,-0.016573,-0.482505


In [9]:
normalized["price"] = file["price"]

In [12]:
features = ["accommodates","bedrooms","bathrooms","beds"]
target = "price"

In [11]:
total_rows = normalized.shape[0]
split_one = normalized[:int(total_rows*0.5)]
split_two = normalized[int(total_rows*0.5):]

In [13]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(algorithm="brute")
knn.fit(split_one[features],split_one[target])
prediction = knn.predict(split_two[features])

In [15]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(split_two[target],prediction)
rmse_one = np.sqrt(mse)
rmse_one

120.48133720364778

In [16]:
knn = KNeighborsRegressor(algorithm="brute")
knn.fit(split_two[features],split_two[target])
prediction = knn.predict(split_one[features])

mse = mean_squared_error(split_one[target],prediction)
rmse_two = np.sqrt(mse)
rmse_two


112.10482113174672

In [17]:
mean_rmse = np.mean([rmse_one,rmse_two])
mean_rmse

116.29307916769724

### We considered only two partitions/folds of the data. We can increase the number of folds to evaluate the efficiency of our model

Diving in 5 folds: 

In [27]:
total_rows/5

734.2

In [26]:
file = pd.read_csv("dc_airbnb.csv")
file.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


In [28]:
file["price"] = file["price"].str.replace("$","").str.replace(",","").astype(float)



In [29]:
obj = ["host_response_rate","host_acceptance_rate","room_type","city","zipcode","state","cleaning_fee","security_deposit"]
file.drop(obj, axis=1, inplace= True)

In [30]:
file.dropna(axis=0,inplace=True)
file.isnull().sum()

host_listings_count    0
accommodates           0
bedrooms               0
bathrooms              0
beds                   0
price                  0
minimum_nights         0
maximum_nights         0
number_of_reviews      0
latitude               0
longitude              0
dtype: int64

In [31]:
file.drop(["longitude","latitude"], axis=1, inplace=True)

In [32]:
normalized = (file - file.mean())/file.std()
normalized.head()

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,0.212019,0.401366,-0.249467,-0.439151,0.297345,0.081108,-0.341375,-0.016573,-0.516709
1,-0.19009,1.399275,2.129218,2.969147,1.141549,1.462423,-0.065038,-0.016603,1.706535
2,-0.174005,-1.095499,-0.249467,1.264998,-0.546858,-0.718601,-0.065038,-0.016573,-0.482505
3,-0.19009,-0.596544,-0.249467,-0.439151,-0.546858,-0.391448,-0.341375,-0.016573,-0.516709
4,-0.19009,0.401366,-0.249467,-0.439151,-0.546858,-0.718601,1.316644,-0.016573,-0.516709


In [33]:
normalized["price"] = file["price"]

In [34]:
normalized.loc[0:734,"fold"] = 1
normalized.loc[734:1468,"fold"] = 2
normalized.loc[1468:2202,"fold"] = 3
normalized.loc[2202:2936,"fold"] = 4
normalized.loc[2936:,"fold"] = 5

In [35]:
normalized["fold"].value_counts()

5.0    784
4.0    728
3.0    721
1.0    720
2.0    718
Name: fold, dtype: int64

In [38]:
folds = [1,2,3,4,5]
rmse_values = []
for i in folds:
    train = normalized[normalized["fold"] != i]
    test = normalized[normalized["fold"] == i]
    knn = KNeighborsRegressor(algorithm="brute")
    knn.fit(train[features],train[target])
    prediction = knn.predict(test[features])
    mse = mean_squared_error(test[target],prediction)
    rmse = np.sqrt(mse)
    rmse_values.append(rmse)

In [39]:
rmse_values

[122.03448442505457,
 103.78662088195524,
 152.9978597114448,
 115.51017556622732,
 117.59573802668537]

In [41]:
rmse_mean = np.mean(rmse_values)
rmse_mean

122.38497572227345

In [42]:
from sklearn.model_selection import KFold, cross_val_score

In [50]:
kf = KFold(n_splits=5,shuffle=True,random_state=1)
knn = KNeighborsRegressor()

rmse_values = []
mse = cross_val_score(knn, normalized[features], normalized[target],scoring="neg_mean_squared_error", cv= kf)
rmse = np.sqrt(np.abs(mse))
mean_rmse = np.mean(rmse)
mean_rmse

121.79153057858392

#### In cross_val_score you tell estimator i.e. which algorithm is used. knn (instance of K Neighbor). cv stands for cross validation and either you can pass instance of K-fold(which automatically creates any number of folds) or any integer can be passed

In [59]:
f = [2,3,5,10,15]

fold_rmse = {}
for i in f:
    kf = KFold(n_splits=i,shuffle=True,random_state=1)
    knn = KNeighborsRegressor()

    
    mse = cross_val_score(knn, normalized[features], normalized[target],scoring="neg_mean_squared_error", cv= i)
    rmse = np.sqrt(np.abs(mse))
    mean_rmse = np.mean(rmse)
    mean_rmse
    fold_rmse[i] = mean_rmse

In [60]:
fold_rmse

{2: 126.69623109562984,
 3: 127.05321441401254,
 5: 122.95875558533496,
 10: 118.10110528780865,
 15: 118.64238757417046}

## Conclusion: Cross Validation is the best form of validation. Better than holdout validation