# Modelling using supervised learning

The end goal is to predict the price per night of an Airbnb location from several of its features. 

We will use the dataset cleaned and preprocessed for modelling (scaling and RFE feature engineering methods). 

Models to build and compare:
- Linear Regression
- KNN
- RandomForest

**Steps:**
- Build train/test sample data 
- Build models
- Get evaluation metrics for each model
- Compare them

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Notebook setup: IPython magic selecting the inline matplotlib backend,
# followed by seaborn's set() call to apply its plot styling.
%matplotlib inline
sns.set()

In [None]:
# Load the feature dataset from CSV into a DataFrame.
df = pd.read_csv('../data/airbnb_paris_clean_feat.csv')
# Print (rows, columns), then show the first rows as the cell output.
print(df.shape)
df.head()

__________________
### Models set-up & train/test sampling

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import explained_variance_score
from sklearn import preprocessing

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computed as ``mean(|y_true - y_pred| / |y_true|) * 100``: each error is
    measured relative to the observed value, not the prediction.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float or str
        The MAPE as a percentage, or the message
        ``"dividing by 0 is impossible"`` when any observed value is zero,
        since the metric is undefined in that case.
    """
    # Work on plain arrays so the arithmetic is positional and cannot be
    # affected by pandas index alignment between the two inputs.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # A single zero among the observed values is enough to make the
    # percentage error undefined, so test element-wise.
    if (actual == 0).any():
        return "dividing by 0 is impossible"
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
# Separate the target column ('price') from the feature columns.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=8,
)

# Report the dimensions of the training features and training target.
print(X_train.shape, y_train.shape)

_________________
### Linear Regression

In [None]:
# Build the model and fit it on the training split.
model_lin = LinearRegression()
model_lin.fit(X_train, y_train)

# Predict the target for the held-out test split.
y_pred_lin = model_lin.predict(X_test)

# Evaluation metrics on the test split.
print('R2 for Linear Regression:',r2_score(y_test,y_pred_lin))
print('MSE:', mean_squared_error(y_test,y_pred_lin))
# RMSE is taken as the square root of the MSE instead of passing
# squared=False, a keyword that scikit-learn deprecated and later removed.
print('RMSE:', mean_squared_error(y_test,y_pred_lin) ** 0.5)
print('MAPE:',mape(y_test,y_pred_lin))
# NOTE(review): abs() turns negative predictions into positive ones before the
# log error is computed -- presumably to satisfy mean_squared_log_error's
# non-negative input requirement. This can understate the error; confirm it
# is intended.
print('RMSLE:',(mean_squared_log_error(y_test,abs(y_pred_lin))**0.5),'\n')


In [None]:
# Scatter plot of the actual test targets (x-axis) against the linear
# regression predictions (y-axis).
plt.scatter(y_test,y_pred_lin);

___________________
### KNN

In [None]:
# Build a 3-nearest-neighbours regressor and fit it on the training split.
# The target is passed as-is: no label encoding is applied to y_train.
model_knn = KNeighborsRegressor(n_neighbors=3)
model_knn.fit(X_train, y_train)

# Predict the target for the held-out test split.
y_pred_knn = model_knn.predict(X_test)

# Evaluation metrics on the test split.
print('R2 for KNN:', r2_score(y_test,y_pred_knn))
print('MSE:', mean_squared_error(y_test,y_pred_knn))
# RMSE is taken as the square root of the MSE instead of passing
# squared=False, a keyword that scikit-learn deprecated and later removed.
print('RMSE:', mean_squared_error(y_test,y_pred_knn) ** 0.5)
print('MAPE:',mape(y_test,y_pred_knn))
print('RMSLE:',(mean_squared_log_error(y_test, y_pred_knn)**0.5),'\n')

In [None]:
# Scatter plot of the actual test targets (x-axis) against the KNN
# predictions (y-axis).
plt.scatter(y_test,y_pred_knn);

____________________
### RandomForest

In [None]:
# Build the model and fit it on the training split.
# The price target is continuous, so the regressor variant of the random
# forest is used; it is also the estimator this notebook imports from
# sklearn.ensemble (RandomForestClassifier is never imported).
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)