# Multivariate k-nearest neighbors model

## Summary

+ Use the DataFrame.info() method to return the number of non-null values in each column.

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
np.random.seed(1)  # fixed seed so the shuffle below gives the same order on every run


# Read the listings, convert the price strings to floats, then shuffle the rows.
paris_listings = pd.read_csv('./datasets/paris_airbnb.csv')
# regex=False makes both replacements literal. This matters for '$': read as a
# regular expression it is the end-of-string anchor, and the default value of
# the `regex` argument is not the same in every pandas version.
stripped_commas = paris_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
paris_listings['price'] = stripped_dollars.astype('float')
# .loc selects by label, so this reorders the rows by a random permutation of
# the labels 0..n-1 (the index read_csv assigns when no index column is given).
paris_listings = paris_listings.loc[np.random.permutation(len(paris_listings))]

def predict_price(new_listing: float, k: int = 5, train=None):
    """Predict a price as the mean price of the k nearest training listings.

    Closeness is the absolute difference between the ``bedrooms`` value of a
    training row and ``new_listing``.

    Args:
        new_listing: Number of bedrooms of the listing to price.
        k: Number of nearest neighbours to average. Defaults to 5, the value
            that used to be hard-coded.
        train: DataFrame with ``bedrooms`` and ``price`` columns to search.
            When omitted, the module-level ``train_df`` is used.

    Returns:
        The mean ``price`` of the ``k`` rows closest to ``new_listing``.
    """
    if train is None:
        train = train_df  # training split, not the whole dataset
    temp_df = train.copy()  # work on a copy so the source frame is not modified
    # Vectorised equivalent of applying np.abs(x - new_listing) to every row.
    temp_df['distance'] = (temp_df['bedrooms'] - new_listing).abs()
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors = temp_df.iloc[:k]['price']
    return nearest_neighbors.mean()

# The first 75 % of the rows form the training set, the remainder the test set.
# int() truncates, which equals floor for a non-negative value. np.math was
# only an alias of the standard-library math module; it was deprecated in
# NumPy 1.25 and no longer exists in NumPy 2.0.
separator = int(len(paris_listings) * 0.75)
train_df: DataFrame = paris_listings[0:separator]
test_df: DataFrame = paris_listings[separator:]

# Disabled: single-feature prediction for every test row.
#test_df['predicted_price'] = test_df['bedrooms'].apply(lambda x: predict_price(x))
#test_df

# Column dtypes and non-null counts of the test set.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 4465 to 5157
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   host_response_rate    1253 non-null   object 
 1   host_acceptance_rate  0 non-null      float64
 2   host_listings_count   1999 non-null   float64
 3   latitude              2000 non-null   float64
 4   longitude             2000 non-null   float64
 5   city                  1998 non-null   object 
 6   zipcode               1982 non-null   object 
 7   state                 1990 non-null   object 
 8   accommodates          2000 non-null   int64  
 9   room_type             2000 non-null   object 
 10  bedrooms              1998 non-null   float64
 11  bathrooms             1987 non-null   float64
 12  beds                  1996 non-null   float64
 13  price                 2000 non-null   float64
 14  cleaning_fee          1588 non-null   object 
 15  security_deposit  

## Delete features

+ Delete the 9 columns listed below from the DataFrame paris_listings:
 - 3 containing non-numeric values (room_type, city and state)
 - 3 containing numerical but not ordinal values (longitude, latitude and zipcode)
 - 3 describing the host rather than the accommodation (host_response_rate, host_acceptance_rate and host_listings_count)
+ Display the sum of the missing values of the resulting DataFrame.

In [12]:
# Dataframe.drop() to specify the columns to be deleted

# isnull() and sum() to obtain the sum of the missing values

In [36]:
# Remove the listed columns, then count the missing values that remain in
# each of the columns that are kept.
dropped_columns = [
    'host_listings_count', 'city', 'zipcode', 'state', 'room_type',
    'host_acceptance_rate', 'host_response_rate',
    'cleaning_fee', 'security_deposit',
    'latitude', 'longitude',
]
tmp = paris_listings.drop(columns=dropped_columns)
tmp.isna().sum()


accommodates          0
bedrooms             24
bathrooms            58
beds                 14
price                 0
minimum_nights        0
maximum_nights        0
number_of_reviews     0
dtype: int64

## Handle missing values

+ Remove the cleaning_fee and security_deposit columns from the DataFrame paris_listings.
+ Then, delete all rows containing a missing value in the 'bedrooms', 'bathrooms' and 'beds' columns of the paris_listings DataFrame.
 - You can do this by using the dropna() method with the axis parameter set to 0.
 - Since only the bedrooms, bathrooms, and beds columns contain missing values, only the rows with missing values in these columns will be deleted.
+ Display the number of null values in the updated DataFrame to confirm that there are no more missing values.

In [16]:
# paris_listings.dropna(axis=0) to delete all rows containing
# missing values

In [37]:
# Discard, in place, every row that has at least one missing value.
tmp.dropna(axis='index', how='any', inplace=True)
# Per-column count of missing values that are left.
tmp.isna().sum()

accommodates         0
bedrooms             0
bathrooms            0
beds                 0
price                0
minimum_nights       0
maximum_nights       0
number_of_reviews    0
dtype: int64

## Normalize columns

+ Normalize all remaining columns of paris_listings and assign the new DataFrame containing just the normalized columns to the normalized_listings variable.
+ Add the 'price' column from paris_listings to normalized_listings.
+ Display the first 3 rows of normalized_listings.


In [38]:
# z-score: (x - mean_x) / std_x, where std_x is the standard deviation (not the variance)

import matplotlib

tmp.info()  # only numeric columns are left at this point

# Keep the raw prices: the loop below standardises every column, price
# included, and the original values are put back afterwards.
price_copy = paris_listings['price'].copy()

# Standardise each column: (x - mean) / standard deviation.
# DataFrame.items() yields a (column name, Series) pair per column; it replaces
# DataFrame.iteritems(), which was removed in pandas 2.0.
for column_name, column_data in tmp.items():
    tmp[column_name] = (column_data - column_data.mean()) / column_data.std()

# The assignment aligns on the index, so each row still present in tmp gets
# its own original price back.
tmp['price'] = price_copy

normalized_listings = tmp  # second name for the same object, not a copy
tmp

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7917 entries, 4740 to 5157
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       7917 non-null   int64  
 1   bedrooms           7917 non-null   float64
 2   bathrooms          7917 non-null   float64
 3   beds               7917 non-null   float64
 4   price              7917 non-null   float64
 5   minimum_nights     7917 non-null   int64  
 6   maximum_nights     7917 non-null   int64  
 7   number_of_reviews  7917 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 556.7 KB


Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
4740,0.503868,-0.296884,-0.293537,0.20531,65.0,-0.184601,1.062858,-0.564545
5606,-0.131849,0.892605,0.843973,0.20531,98.0,-0.101183,1.061018,-0.636924
4824,-0.767566,-1.486372,-0.293537,-0.64526,65.0,0.037847,1.062858,-0.651400
4205,-0.767566,-0.296884,-0.293537,-0.64526,45.0,-0.101183,1.062858,0.550105
3228,-0.131849,-0.296884,-0.293537,0.20531,65.0,-0.101183,-0.335685,-0.579021
...,...,...,...,...,...,...,...,...
7935,0.503868,-0.296884,-0.293537,0.20531,72.0,-0.045571,-0.994472,-0.651400
5192,-0.767566,-1.486372,-0.293537,-0.64526,49.0,0.593969,1.062858,-0.622448
3980,-0.767566,-1.486372,-0.293537,-0.64526,99.0,-0.184601,1.062858,-0.622448
235,1.775303,0.892605,-0.293537,1.90645,230.0,-0.184601,-0.961349,0.767245


## Euclidean distance for the multivariate case


+ Calculate the Euclidean distance between the first row and the fifth row of normalized_listings, using only the accommodates and bedrooms features, with the distance.euclidean() function.
+ Assign the distance value to the first_fifth_distance variable and display the result.



In [20]:
# normalized_listings.iloc[0][['accommodates', 'bedrooms']]

In [26]:
from scipy.spatial import distance

# Features used for the distance.
columns = ['accommodates', 'bedrooms']
first_listing = normalized_listings.iloc[0][columns]
fifth_listing = normalized_listings.iloc[4][columns]  # position 4 is the fifth row
# Keep the distance in a variable so it can be reused; the bare name on the
# last line makes the notebook display it.
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
first_fifth_distance

1.0

## Fit a model and make predictions


+ Create an instance of the class **KNeighborsRegressor** with the following parameters:
 - n_neighbors: 5
 - algorithm: brute
+ Use the **fit** method to specify the data we want to use for our k-nearest neighbors model. Use the following parameters:
 - The training data, the feature columns: only the 'accommodates' and 'bedrooms' columns, in this order, from the DataFrame train_df.
 - The target column: the 'price' column of the DataFrame train_df.
+ Call the **predict** method to make predictions on:
 - The 'accommodates' and 'bedrooms' columns of the DataFrame test_df
 - Assign the NumPy array of predicted price values to the predictions variable.

In [53]:
from sklearn.neighbors import KNeighborsRegressor

# Positional split: the first 6000 rows train the model, the rest test it.
train_df = normalized_listings.iloc[:6000]
test_df = normalized_listings.iloc[6000:]

# NOTE: every column except the target 'price' is used as a feature here,
# not only 'accommodates' and 'bedrooms'.
train_columns = [name for name in normalized_listings if name != 'price']
print(train_columns)

# Instantiate the model.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# Fit the model on the training features and prices.
train_features = train_df[train_columns]
train_target = train_df['price']
knn.fit(train_features, train_target)

# Predict a price for every row of the test set.
predictions = knn.predict(test_df[train_columns])

predictions

['accommodates', 'bedrooms', 'bathrooms', 'beds', 'minimum_nights', 'maximum_nights', 'number_of_reviews']


array([239.8,  70.6, 106.8, ...,  56. , 159.4, 163.6])

In [40]:
# Average of all predicted prices.
np.mean(predictions)

123.98163797600417

## Calculate the mean square error

+ Use the mean_squared_error function to calculate the MSE (mean squared error) value for the predictions we have made.
+ Assign the MSE value to the variable two_features_mse.
+ Calculate the RMSE (root mean squared error) by taking the square root of the MSE value and assign the result to the variable two_features_rmse.
+ Display these 2 error results.


In [45]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the true test prices and the predicted prices.
two_features_mse = mean_squared_error(test_df['price'], predictions)
# The RMSE is the square root of the MSE. It is stored in a variable rather
# than discarded after display; the bare name on the last line displays it.
two_features_rmse = two_features_mse ** 0.5
two_features_rmse

86.43134176217931

## Use more features

+ Create a new instance of the KNeighborsRegressor class with the same parameters as before.
+ Fit the model so that it uses the following columns of our training set (train_df):
 - accommodates
 - bedrooms
 - bathrooms
 - number_of_reviews
+ Use the model to make predictions on the test set (test_df) using the same columns. Assign the NumPy array of predictions to the four_predictions variable.
+ Use the mean_squared_error() function to calculate the MSE value for these predictions by comparing the four_predictions values with the price column of the DataFrame test_df. Assign the calculated MSE value to the variable four_mse.
+ Calculate the RMSE value and assign the result to the variable four_rmse.
+ Display the results four_mse and four_rmse.

## Use all features

+ Use all columns, except the 'price' column, to train our k-nearest neighbors model using the same parameters for the KNeighborsRegressor class as in the previous steps.
+ Use the model to make predictions on the test set and assign the resulting Numpy array to the all_features_predictions variable.
+ Calculate the MSE and RMSE values and assign the results to the variables all_features_mse and all_features_rmse.
+ Display error scores.


In [26]:
# df.columns.tolist() to retrieve all columns in a list 
# remove() to delete a column