# Data preprocessing

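- Min-max scaling of the numeric columns to the [0, 1] range, because most of the other columns are dummies / boolean values
- Feature engineering (feature selection) using filter methods (`SelectKBest`) and a wrapper method (RFE)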

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

%matplotlib inline
sns.set()

# Use the fully qualified option name: the short pattern 'max_columns' is ambiguous on
# recent pandas versions (it also matches 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns',70)

In [None]:
# Load the listings table (the file name suggests it is already cleaned and dummy-encoded)
csv_path = '../data/airbnb_paris_clean_dummies.csv'
df = pd.read_csv(csv_path)
print(df.shape)
df.head()

In [None]:
# Report how many rows are exact duplicates of an earlier row, then keep first occurrences only

is_duplicate = df.duplicated()
print("duplicates found:",is_duplicate.sum())
df = df[~is_duplicate]
df.shape

_________________________
### Scaling

In [None]:
# Columns that will be min-max scaled further down
scale_columns = [
    'time_since_host', 'number_of_reviews', 'availability_365',
    'extra_people', 'guests_included', 'price',
    'bedrooms', 'bathrooms', 'accommodates',
]
to_scale = df[scale_columns]

In [None]:
# Checking distribution and value range before scaling

# One panel per column of `to_scale`. The grid has 2x5 = 10 panels while `to_scale`
# holds 9 columns, so the panels that receive no plot are hidden at the end.
fig, axs = plt.subplots(2, 5, figsize=(17, 8))
panels = axs.ravel()

for ax, column in zip(panels, to_scale.columns):
    if hasattr(sns, "histplot"):
        # seaborn >= 0.11: distplot is deprecated, histplot + kde is its replacement
        sns.histplot(to_scale[column], kde=True, stat="density", ax=ax)
    else:
        # older seaborn installs that do not provide histplot yet
        sns.distplot(to_scale[column], ax=ax)
    ax.set_title(column)

# Hide the panels left without a plot
for ax in panels[to_scale.shape[1]:]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Work on an independent deep copy so the unscaled `df` stays untouched by the scaling step

df_scaled = df.copy(deep=True)

In [None]:
# Min-max scale the selected columns to [0, 1] so they share the range of the dummy columns.
# Feature scaling matters for distance-based models such as KNN; tree-based models
# (random forest, XGBoost) are insensitive to it.
# NOTE(review): `price`, the target, is part of `to_scale` and is therefore scaled as well.

scale_cols = list(to_scale.columns)
col_min = df_scaled[scale_cols].min()
col_range = df_scaled[scale_cols].max() - col_min

# A constant column has a range of 0; dividing by it would turn the whole column into NaN.
# Using 1 as the divisor instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)

df_scaled[scale_cols] = (df_scaled[scale_cols] - col_min) / col_range

df_scaled.head()


_______________________
### Feature Engineering

In [None]:
# Inputs for feature selection with the filter method:
# f_regression scores for the Linear Regression model,
# mutual_info_regression scores for the KNN and XGboost models

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Unscaled features and target
X = df.drop(columns='price')
y = df['price']

# Min-max scaled features and target
X_scaled = df_scaled.drop(columns='price')
y_scaled = df_scaled['price']

In [None]:
# Keep the k best features according to the f_regression score (k=10 is SelectKBest's default)
selection_freg = SelectKBest(score_func=f_regression, k=10)
selection_freg.fit(X_scaled, y_scaled)
X_new_freg = selection_freg.transform(X_scaled)

In [None]:
# Wrap the selected values in a DataFrame, restoring the names of the kept columns and the
# row index of X_scaled. Without `columns=` the frame would only carry positional labels
# (0, 1, ...), which hides which features the filter actually selected.
X_new_freg = pd.DataFrame(
    X_new_freg,
    columns=X_scaled.columns[selection_freg.get_support()],
    index=X_scaled.index,
)
print(X_new_freg.shape)
X_new_freg.head()

In [None]:
def _seeded_mutual_info(features, target):
    """Score features with mutual_info_regression using a fixed random_state.

    mutual_info_regression draws random numbers internally, so without a seed the
    scores (and therefore the selected columns) can change from one run to the next.
    """
    return mutual_info_regression(features, target, random_state=42)


# Keep the best-scoring features (SelectKBest default k) according to mutual information
selection_mireg = SelectKBest(_seeded_mutual_info)
X_new_mireg = selection_mireg.fit_transform(X_scaled,y_scaled)

In [None]:
# Wrap the selected values in a DataFrame, restoring the names of the kept columns and the
# row index of X_scaled. Without `columns=` the frame would only carry positional labels
# (0, 1, ...), which hides which features the filter actually selected.
X_new_mireg = pd.DataFrame(
    X_new_mireg,
    columns=X_scaled.columns[selection_mireg.get_support()],
    index=X_scaled.index,
)
print(X_new_mireg.shape)
X_new_mireg.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) of the predictions, in percent.

    MAPE = mean(|y_true - y_pred| / |y_true|) * 100

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float or str
        The MAPE in percent, or the string "dividing by 0 is impossible" when
        ``y_true`` contains at least one zero, because the error relative to a
        zero observation is undefined. Note that a min-max scaled target contains
        a zero by construction (its minimum), so MAPE is not defined for it.
    """
    # Work on plain arrays so values are compared by position, whatever index y_true carries.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Guard against ANY zero in the observed values (not only the all-zero case).
    if (actual == 0).any():
        return "dividing by 0 is impossible"

    # The error is expressed relative to the observed value, not the prediction.
    return np.mean(np.abs((actual - predicted) / actual)) * 100

# No call here: predictions do not exist yet at this point of the notebook.
# mape() is used in the evaluation cells below, right after each model is fitted.

In [None]:
# Test using RFE method (recursive feature elimination around a linear regression)
# n_features_to_select is passed by keyword: recent scikit-learn versions declare it
# keyword-only, so RFE(estimator, 10) raises a TypeError there.
# The scores below are computed on the same data the selector was fitted on.

# build RFE on non-scaled df
estimator = LinearRegression()
selection = RFE(estimator, n_features_to_select=10)
selection.fit(X,y)
print(X.columns[selection.support_])
y_pred_rfe = selection.predict(X)
print('R2 using RFE method:',r2_score(y,y_pred_rfe))
print('MAPE:',mape(y,y_pred_rfe),'\n')

# build RFE using scaled df
estimator2 = LinearRegression()
selection2 = RFE(estimator2, n_features_to_select=10)
selection2.fit(X_scaled,y_scaled)
print(X_scaled.columns[selection2.support_])
y_pred_rfe2 = selection2.predict(X_scaled)
print('R2 using RFE method:',r2_score(y_scaled,y_pred_rfe2))
print('MAPE:',mape(y_scaled,y_pred_rfe2),'\n')

In [None]:
# Scaled feature matrix restricted to the columns kept by the RFE fitted on the scaled data
rfe_columns = X_scaled.columns[selection2.support_]
X_new_RFE = X_scaled[rfe_columns]
X_new_RFE.head()

In [None]:
# Testing the performance of model depending on columns used (w/ and w/o filter method)

def _fit_and_report(r2_label, features, target):
    """Fit a LinearRegression on (features, target) and print its R2 and MAPE.

    Both metrics are computed on the same data the model was fitted on.

    Parameters
    ----------
    r2_label : str
        Text printed in front of the R2 value.
    features : DataFrame or array
        Feature matrix used for fitting and predicting.
    target : Series or array
        Target values.

    Returns
    -------
    tuple
        ``(fitted_model, predictions)``.
    """
    model = LinearRegression().fit(features, target)
    predictions = model.predict(features)
    print(r2_label, r2_score(target, predictions))
    print('MAPE:', mape(target, predictions), '\n')
    return model, predictions


# Use df scaled without any feature engineering method (so all columns)
lin_model2, y_pred_lin2 = _fit_and_report('R2 w/o feature engineering methods', X_scaled, y_scaled)
lin_model_fitted2 = lin_model2  # fit() returns the estimator itself

# Use df non-scaled without any feature engineering method (so all columns)
lin_model3, y_pred_lin3 = _fit_and_report('R2 w/o feature engineering methods nor scaling', X, y)
lin_model_fitted3 = lin_model3

# Use filter method
lin_model, y_pred_lin = _fit_and_report('R2 using filter method:', X_new_freg, y_scaled)
lin_model_fitted = lin_model

# Use RFE method
lin_model4, y_pred_lin4 = _fit_and_report('R2 using RFE method:', X_new_RFE, y_scaled)
lin_model_fitted4 = lin_model4

### Comments

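- Scaling gives better results for linear regression, especially when the RFE method is used for feature engineering
- The filter method seems to perform better than RFE (its R2 is higher), but the errors are the same
- In any case it is better to use a feature engineering method, because the errors are smaller
- Note: all scores above are computed on the same data the models were fitted on, so they measure fit rather than generalisation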

**Possible improvements:**
- test Sequential Selection to find out the best number of features needed and compare results with other methods
- test with different number of features
- run PCA

**Next steps:**
- export new csv with scaled data and RFE method 
- run LinearRegression, KNN and RandomForest models to compare results using evaluation metrics for regression

In [None]:
# Export the scaled data restricted to the RFE-selected features

# Columns kept by the RFE fitted on the scaled data, followed by the (scaled) target column
kept_features = df_scaled.drop(columns='price').columns[selection2.support_]
feat_df = df_scaled[[*kept_features, 'price']]
print(feat_df.shape)

feat_df.to_csv('../data/airbnb_paris_clean_feat.csv',index=False)