# Data preprocessing

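- Scaling the numeric features to the [0, 1] range, because most of the other columns are dummies and boolean values
- Feature engineering (feature selection) using a filter method (SelectKBest with f_regression) and RFE (recursive feature elimination)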

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

%matplotlib inline
sns.set()

# Show up to 70 columns when displaying a dataframe.
# The fully qualified key is required: the short form 'max_columns' is treated as a pattern and
# is ambiguous on recent pandas versions (it also matches 'styler.render.max_columns').
pd.set_option('display.max_columns', 70)

In [None]:
# Load the cleaned listings file that contains the dummy-encoded columns
dummies_csv_path = '../data/airbnb_paris_clean_dummies.csv'
df = pd.read_csv(dummies_csv_path)
print(df.shape)
df.head()

In [None]:
# Load the cleaned listings file without dummy-encoded columns
wo_dummies_csv_path = '../data/airbnb_paris_clean_wo_dummies.csv'
df_wo_dummies = pd.read_csv(wo_dummies_csv_path)
print(df_wo_dummies.shape)
df_wo_dummies.head()

In [None]:
# Report the number of fully duplicated rows in each dataframe, then remove them

n_dupes_dummies = df.duplicated().sum()
print("duplicates found in dummies csv:",n_dupes_dummies)
df = df.drop_duplicates()
print("shape df with dummies",df.shape,'\n')

n_dupes_wo_dummies = df_wo_dummies.duplicated().sum()
print("duplicates found in other csv:",n_dupes_wo_dummies)
df_wo_dummies = df_wo_dummies.drop_duplicates()
print("shape df without dummies",df_wo_dummies.shape)

_________________________
### Scaling

In [None]:
# Numeric columns that will be min-max scaled further down
numeric_cols = [
    'time_since_host',
    'number_of_reviews',
    'availability_365',
    'extra_people',
    'guests_included',
    'bedrooms',
    'bathrooms',
    'accommodates',
]
minmax_scale = df[numeric_cols]

In [None]:
# Inspect the distribution and value range of every numeric column before scaling

fig, axs = plt.subplots(2, 4, figsize=(17, 8))

# axs.flat walks the 2x4 grid row by row, so the panels follow the column order
for ax, (col_name, col_values) in zip(axs.flat, minmax_scale.items()):
    sns.distplot(col_values, ax=ax)

plt.show()

In [None]:
# Work on independent copies so the loaded dataframes stay intact if a scaling step goes wrong

df_scaled = df.copy(deep=True)
df_scaled_wo_dummies = df_wo_dummies.copy(deep=True)

In [None]:
# Scaling for dataframe with dummies
#
# Min-max scaling maps the numeric columns to [0, 1], the same range as the many dummy columns.
# Feature scale matters for distance-based models such as KNN; tree-based models such as
# XGBoost are largely insensitive to it.
#
# All values are computed from the untouched `df` rather than from `df_scaled` itself, so
# re-running this cell gives the same result instead of dividing `price` by 100 a second time.

scale_cols = list(minmax_scale.columns)
col_min = df[scale_cols].min()
# A constant column has a zero range; dividing by 1 instead maps it to 0 rather than NaN
col_range = (df[scale_cols].max() - col_min).replace(0, 1)
df_scaled[scale_cols] = (df[scale_cols] - col_min) / col_range

# Price is only rescaled by dividing by 100 (it is not min-max scaled)
df_scaled['price'] = df['price'] / 100

df_scaled.head()

In [None]:
# Scaling data for dataframe without dummies
#
# Min-max scaling of the numeric columns to [0, 1]. The column list comes from `minmax_scale`.
# All values are computed from the untouched `df_wo_dummies` rather than from
# `df_scaled_wo_dummies` itself, so re-running this cell gives the same result instead of
# dividing `price` by 100 a second time.

scale_cols_wo_dummies = list(minmax_scale.columns)
col_min_wo_dummies = df_wo_dummies[scale_cols_wo_dummies].min()
# A constant column has a zero range; dividing by 1 instead maps it to 0 rather than NaN
col_range_wo_dummies = (df_wo_dummies[scale_cols_wo_dummies].max() - col_min_wo_dummies).replace(0, 1)
df_scaled_wo_dummies[scale_cols_wo_dummies] = (
    (df_wo_dummies[scale_cols_wo_dummies] - col_min_wo_dummies) / col_range_wo_dummies
)

# Price is only rescaled by dividing by 100 (it is not min-max scaled)
df_scaled_wo_dummies['price'] = df_wo_dummies['price'] / 100

df_scaled_wo_dummies.head()


_______________________
### Feature Engineering

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Separate the target (price) from the features, for both the raw and the scaled dataframe.
# The filter method applied below scores features with f_regression (for the Linear Regression
# model); the original note also planned mutual_info_regression for the KNN and XGBoost models.

y = df['price']
X = df.drop(columns='price')

y_scaled = df_scaled['price']
X_scaled = df_scaled.drop(columns='price')

In [None]:
# Filter method: rank features by their f_regression score against the target and keep
# SelectKBest's default number of top-scoring features
selection_freg = SelectKBest(score_func=f_regression)
selection_freg.fit(X_scaled, y_scaled)
X_new_freg = selection_freg.transform(X_scaled)

In [None]:
# Wrap the selected features in a DataFrame, keeping the original feature names and row index.
# A bare pd.DataFrame(array) would relabel both as 0..n-1 and hide which features were kept.
# np.asarray keeps the cell safe to re-run when X_new_freg is already a DataFrame.
selected_freg_cols = X_scaled.columns[selection_freg.get_support()]
X_new_freg = pd.DataFrame(np.asarray(X_new_freg), columns=selected_freg_cols, index=X_scaled.index)
print(X_new_freg.shape)
X_new_freg.head()

In [None]:
def mape(y_true,y_pred):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_true``.

    Returns
    -------
    float or str
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
        If any observed value is 0 the percentage error is undefined, and the message
        ``"dividing by 0 is impossible"`` is returned instead. The message is returned
        rather than raised so existing callers that simply print the result keep working.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # The error is relative to the observed value, so a single zero in y_true makes it undefined
    if np.any(y_true == 0):
        return "dividing by 0 is impossible"
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Test using RFE method
# Each model is scored on the same rows it was fitted on, so R2 and MAPE are in-sample figures.

estimator = LinearRegression()

# build RFE on non-scaled df
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally
selection = RFE(estimator, n_features_to_select=10)
selection.fit(X,y)
print(X.columns[selection.support_])
y_pred_rfe = selection.predict(X)
print('R2 using RFE method without scale:',r2_score(y,y_pred_rfe))
print('MAPE:',mape(y,y_pred_rfe),'\n')

# build RFE using scaled df
selection2 = RFE(estimator, n_features_to_select=10)
selection2.fit(X_scaled,y_scaled)
print(X_scaled.columns[selection2.support_])
y_pred_rfe2 = selection2.predict(X_scaled)
print('R2 using RFE method:',r2_score(y_scaled,y_pred_rfe2))
print('MAPE:',mape(y_scaled,y_pred_rfe2),'\n')

In [None]:
# Fit RFE for every candidate number of features (2 to 29) on the scaled data and record
# the in-sample R2 and MAPE of each fit
r_score = []
mape_list = []

for i in range(2,30):
    # n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally
    selection = RFE(estimator, n_features_to_select=i)
    selection.fit(X_scaled,y_scaled)
    print("Number of columns",i)
    y_pred_rfe = selection.predict(X_scaled)
    r_score.append(r2_score(y_scaled,y_pred_rfe))
    mape_list.append(mape(y_scaled,y_pred_rfe))
    

In [None]:
# Plot R2 (left axis) and MAPE (right axis) against the number of columns kept by RFE

nb_cols = list(range(2,30))

fig, ax1 = plt.subplots(figsize=(14,5))

# Left axis: R^2 score
r2_color = 'tab:red'
ax1.plot(nb_cols, r_score, color=r2_color, marker='o', linestyle='-')
y0, y1 = ax1.get_ylim()
# Dashed vertical markers at 23 and 20 columns; only the first one carries the legend label
marker_style = dict(ymin=y0, ymax=y1, linestyle='dashed')
ax1.vlines(x=23, label='Best Shape = [20,23]', **marker_style)
ax1.vlines(x=20, **marker_style)
ax1.set_xlabel('Number of columns')
ax1.set_ylabel('R^2 Score', color=r2_color)
ax1.tick_params(axis='y', labelcolor=r2_color)
ax1.legend()

# Right axis: MAPE, drawn on a second axes sharing the same x-axis
ax2 = ax1.twinx()
mape_color = 'tab:blue'
ax2.set_ylabel('MAPE', color=mape_color)
ax2.plot(nb_cols, mape_list, color=mape_color, marker='o', linestyle='-')
ax2.tick_params(axis='y', labelcolor=mape_color)

fig.suptitle('Model performance depending on the number of columns')
# fig.tight_layout() is left disabled; enable it if the right y-label gets clipped
fig.savefig('../img/rfe_method_model_performance.png')
plt.show()

In [None]:
# Building best selection
# 20 features is the lower bound of the [20,23] range highlighted in the performance plot.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 no longer accepts it positionally.

best_selection = RFE(estimator, n_features_to_select=20)
best_selection.fit(X_scaled,y_scaled)


# Keep only the scaled columns that RFE retained
X_new_RFE = X_scaled[X_scaled.columns[best_selection.support_]]
X_new_RFE.head()

In [None]:
# Compare in-sample LinearRegression performance across feature sets
# (all columns non-scaled, all columns scaled, filter-selected, RFE-selected)

def _fit_and_report(features, target, r2_label, *mape_suffix):
    """Fit a LinearRegression on (features, target) and print its R2 and MAPE on the same data.

    Any extra positional arguments are appended to the MAPE print call.
    Returns the tuple (model, fitted_model, predictions).
    """
    model = LinearRegression()
    fitted = model.fit(features, target)
    predictions = fitted.predict(features)
    print(r2_label, r2_score(target, predictions))
    print('MAPE:', mape(target, predictions), *mape_suffix)
    return model, fitted, predictions


# Non-scaled df, every column (no feature engineering method)
lin_model3, lin_model_fitted3, y_pred_lin3 = _fit_and_report(
    X, y, 'R2 w/o feature engineering methods nor scaling:', '\n')

# Scaled df, every column (no feature engineering method)
lin_model2, lin_model_fitted2, y_pred_lin2 = _fit_and_report(
    X_scaled, y_scaled, 'R2 w/o feature engineering methods:', '\n')

# Filter method features
lin_model, lin_model_fitted, y_pred_lin = _fit_and_report(
    X_new_freg, y_scaled, 'R2 using filter method:', '\n')

# RFE method features
lin_model4, lin_model_fitted4, y_pred_lin4 = _fit_and_report(
    X_new_RFE, y_scaled, 'R2 using RFE method:')


# RMSLE is not reported here: computing it raised an error because some predicted values are negative.
# Attempted call, kept for reference:
#   print('RMSLE:',(mean_squared_log_error(y_scaled,abs(y_pred_lin4))**0.5),'\n')


In [None]:
# Show the negative predictions of the RFE model, multiplied by 100 (price was divided by 100 when scaling)
is_negative_pred = y_pred_lin4 < 0
y_pred_lin4[is_negative_pred] * 100

### Comments

- Scaling gives better results for linear regression, especially when the RFE method is used for feature engineering
- The filter method seems to perform better than RFE (its R2 is higher), but the errors are the same
- In any case, it is better to use a feature engineering method because the errors are smaller

**Possible improvements:**
- test Sequential Selection to find out the best number of features needed and compare results with other methods
- run PCA

**Next steps:**
- ~~test with different number of features~~
- ~~export new csv with scaled data and RFE method~~
- ~~run LinearRegression, KNN and RandomForest models to compare results using evaluation metrics for regression~~

In [None]:
# Export the scaled data restricted to the RFE-selected columns plus the price target

export_cols = list(X_new_RFE.columns) + ['price']
feat_df = df_scaled[export_cols]
print(feat_df.shape)

feat_df.to_csv('../data/airbnb_paris_clean_RFE.csv',index=False)

In [None]:
# Export the scaled dataframe without dummies

scaled_wo_dummies_path = '../data/airbnb_paris_clean_wo_dummies_feat.csv'
df_scaled_wo_dummies.to_csv(scaled_wo_dummies_path, index=False)