# Evaluate the performance of the model

## Test the quality of predictions

+ Separate the dataframe in 2: 
 - train_df containing 6000 lines of paris_listings
 - test_df containing the rest
+ Modify the predict_price function, change the DataFrame temp_df. Change the DataFrame paris_listings to train_df, so that only the training dataset is used.
+ Use the Series apply method to apply the predict_price function to the values in the'accommodates' column of the DataFrame test_df.
+ Assign the resulting Series object to the'predicted_price' column of test_df.

In [4]:
import numpy as np
import pandas as pd
from pandas import DataFrame
np.random.seed(1)


# Read, prices to floats, randomize
paris_listings = pd.read_csv('./datasets/paris_airbnb.csv')
stripped_commas = paris_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
paris_listings['price'] = stripped_dollars.astype('float')
paris_listings = paris_listings.loc[np.random.permutation(len(paris_listings))]

def predict_price(new_listing: int):
    temp_df = train_df.copy() # Train dataset instead of whole dataset
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors = temp_df.iloc[0:5]['price'] # k = 5
    predicted_price = nearest_neighbors.mean()
    return predicted_price

separator = np.math.floor(len(paris_listings) * 0.75)
train_df: DataFrame = paris_listings[0:separator]
test_df: DataFrame = paris_listings[separator:]

test_df['predicted_price'] = test_df['accommodates'].apply(lambda x: predict_price(x))
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_price'] = test_df['accommodates'].apply(lambda x: predict_price(x))


Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,city,zipcode,state,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,predicted_price
4465,96%,,28.0,48.89318,2.31884,Paris,75017,Paris province,2,Entire home/apt,0.0,1.0,1.0,63.0,$100.00,$400.00,14,180,61,90.4
3154,100%,,2.0,48.85114,2.29728,Paris,75015,Île-de-France,5,Entire home/apt,1.0,1.0,4.0,120.0,$50.00,$200.00,1,120,102,176.8
1969,65%,,93.0,48.85134,2.33359,Paris,75006,Île-de-France,3,Entire home/apt,1.0,1.0,2.0,149.0,$60.00,$600.00,30,1125,5,96.2
2713,,,1.0,48.87320,2.38649,Paris,75020,Île-de-France,4,Entire home/apt,2.0,1.0,1.0,79.0,$65.00,$299.00,2,1125,5,173.0
5578,100%,,1.0,48.85691,2.36013,Paris,75004,Île-de-France,2,Entire home/apt,1.0,1.0,1.0,51.0,$25.00,$150.00,4,15,17,90.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7935,,,1.0,48.84012,2.39003,Paris,75012,Île-de-France,4,Entire home/apt,1.0,1.0,2.0,72.0,$0.00,$400.00,7,7,0,173.0
5192,,,1.0,48.87754,2.35318,Paris,75010,Île-de-France,2,Entire home/apt,0.0,1.0,1.0,49.0,$50.00,,30,1125,2,90.4
3980,,,1.0,48.84617,2.37121,Paris,75012,Île-de-France,2,Entire home/apt,0.0,1.0,1.0,99.0,$20.00,$100.00,2,1125,2,90.4
235,65%,,93.0,48.85522,2.36382,Paris,75004,Île-de-France,6,Entire home/apt,2.0,1.0,4.0,230.0,$90.00,$900.00,2,25,98,256.8


## Error metrics

+ Use the numpy.absolute() method to calculate the average absolute error MAE between predicted_price and price.
+ Assign the result to the variable mae.

In [16]:
np.sqrt(np.abs((test_df['price'] - test_df['predicted_price']) ** 2).mean()) # Root Mean Square Error
np.abs((test_df['price'] - test_df['predicted_price']) ** 2).mean() # Mean Square Error
np.abs((test_df['price'] - test_df['predicted_price'])).mean() # Mean Square Error

52.562200000000004

## Mean square error

+ Calculate the MSE value between the columns'predicted_price' and'price'.
+ Assign the result to the variable mse.
+ Display the result.

## Train another model

+ Modify the following predict_price function by using the'bedrooms' column instead of the'accommodates' column to make our predictions.
+ Apply the function to the test_df dataframe and assign the resulting Series object containing the predicted price values to the'predicted_price' column of test_df.
+ Calculate the squared error between the'price' and'predicted_price' columns of test_df and assign the resulting Series object to the'squared_error' column of test_df.
+ Calculate the average of the'squared_error' column of test_df and assign it to the variable mse.
+ Display the MSE value.

In [17]:
import numpy as np
import pandas as pd
from pandas import DataFrame
np.random.seed(1)


# Read, prices to floats, randomize
paris_listings = pd.read_csv('./datasets/paris_airbnb.csv')
stripped_commas = paris_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')
paris_listings['price'] = stripped_dollars.astype('float')
paris_listings = paris_listings.loc[np.random.permutation(len(paris_listings))]

def predict_price(new_listing: int):
    temp_df = train_df.copy() # Train dataset instead of whole dataset
    temp_df['distance'] = temp_df['bedrooms'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors = temp_df.iloc[0:5]['price'] # k = 5
    predicted_price = nearest_neighbors.mean()
    return predicted_price

separator = np.math.floor(len(paris_listings) * 0.75)
train_df: DataFrame = paris_listings[0:separator]
test_df: DataFrame = paris_listings[separator:]

test_df['predicted_price'] = test_df['bedrooms'].apply(lambda x: predict_price(x))
test_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_price'] = test_df['bedrooms'].apply(lambda x: predict_price(x))


Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,city,zipcode,state,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,predicted_price
4465,96%,,28.0,48.89318,2.31884,Paris,75017,Paris province,2,Entire home/apt,0.0,1.0,1.0,63.0,$100.00,$400.00,14,180,61,77.4
3154,100%,,2.0,48.85114,2.29728,Paris,75015,Île-de-France,5,Entire home/apt,1.0,1.0,4.0,120.0,$50.00,$200.00,1,120,102,68.4
1969,65%,,93.0,48.85134,2.33359,Paris,75006,Île-de-France,3,Entire home/apt,1.0,1.0,2.0,149.0,$60.00,$600.00,30,1125,5,68.4
2713,,,1.0,48.87320,2.38649,Paris,75020,Île-de-France,4,Entire home/apt,2.0,1.0,1.0,79.0,$65.00,$299.00,2,1125,5,121.8
5578,100%,,1.0,48.85691,2.36013,Paris,75004,Île-de-France,2,Entire home/apt,1.0,1.0,1.0,51.0,$25.00,$150.00,4,15,17,68.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7935,,,1.0,48.84012,2.39003,Paris,75012,Île-de-France,4,Entire home/apt,1.0,1.0,2.0,72.0,$0.00,$400.00,7,7,0,68.4
5192,,,1.0,48.87754,2.35318,Paris,75010,Île-de-France,2,Entire home/apt,0.0,1.0,1.0,49.0,$50.00,,30,1125,2,77.4
3980,,,1.0,48.84617,2.37121,Paris,75012,Île-de-France,2,Entire home/apt,0.0,1.0,1.0,99.0,$20.00,$100.00,2,1125,2,77.4
235,65%,,93.0,48.85522,2.36382,Paris,75004,Île-de-France,6,Entire home/apt,2.0,1.0,4.0,230.0,$90.00,$900.00,2,25,98,121.8


## Square root of the mean square error

+ Calculate the RMSE value of the driven model using the'bedrooms' column.
+ Assign the result to the rmse variable.
+ Display the result

In [18]:
np.sqrt(np.abs((test_df['price'] - test_df['predicted_price']) ** 2).mean()) # Root Mean Square Error

79.25390829479642

## Compare the absolute mean error and the square root of the mean square error

+ Calculate MAE for errors_one and assign it to the mae_one variable.
+ Calculate RMSE for errors_one and assign it to the variable rmse_one.
+ Calculate MAE for errors_two and assign it to mae_two.
+ Calculate RMSE errors_two and assign it to rmse_two.

In [None]:
errors_one = pd.Series([5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10])
errors_two = pd.Series([5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 10, 5, 1000])

In [16]:
# .sum() 
# numpy.sqrt()