# Airbnb price prediction

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Cargar-data" data-toc-modified-id="Cargar-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Cargar data</a></span></li><li><span><a href="#Feature-Engineering-(determinar-importancia-de-variables)" data-toc-modified-id="Feature-Engineering-(determinar-importancia-de-variables)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Feature Engineering (determinar importancia de variables)</a></span><ul class="toc-item"><li><span><a href="#Transformar-categóricas" data-toc-modified-id="Transformar-categóricas-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Transformar categóricas</a></span><ul class="toc-item"><li><span><a href="#room_type" data-toc-modified-id="room_type-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>room_type</a></span></li><li><span><a href="#city" data-toc-modified-id="city-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>city</a></span></li></ul></li></ul></li><li><span><a href="#train-/-test-split" data-toc-modified-id="train-/-test-split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>train / test split</a></span></li><li><span><a href="#Modelado" data-toc-modified-id="Modelado-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelado</a></span><ul class="toc-item"><li><span><a href="#Ajuste-de-los-Hiperparametros-de-un-Modelo" data-toc-modified-id="Ajuste-de-los-Hiperparametros-de-un-Modelo-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Ajuste de los Hiperparametros de un Modelo</a></span><ul class="toc-item"><li><span><a href="#Random-Search" data-toc-modified-id="Random-Search-4.1.1"><span class="toc-item-num">4.1.1&nbsp;&nbsp;</span>Random Search</a></span></li><li><span><a href="#GridSearch" data-toc-modified-id="GridSearch-4.1.2"><span class="toc-item-num">4.1.2&nbsp;&nbsp;</span>GridSearch</a></span></li><li><span><a href="#GridSearching-Bayesiano" data-toc-modified-id="GridSearching-Bayesiano-4.1.3"><span 
class="toc-item-num">4.1.3&nbsp;&nbsp;</span>GridSearching Bayesiano</a></span></li></ul></li></ul></li><li><span><a href="#Exportar-csv-predicciones" data-toc-modified-id="Exportar-csv-predicciones-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Exportar csv predicciones</a></span></li></ul></div>

## Cargar data

In [None]:
import pandas as pd

In [None]:
# Load the Airbnb listings CSV into a pandas DataFrame.
df = pd.read_csv('../datasets/airbnb_data.csv')

In [None]:
# Display the (n_rows, n_columns) size of the loaded data.
df.shape

In [None]:
# Display the column labels.
df.columns

We want to predict the `price` variable.

In [None]:
# Display the dtype of every column (shows which ones are not numeric).
df.dtypes

## Feature Engineering (determinar importancia de variables)

In [None]:
# Correlation heatmap of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() raises on
# non-numeric columns instead of silently dropping them.
# NOTE(review): `sns` is not imported in any visible cell -- presumably
# `import seaborn as sns` is missing from the notebook; confirm and add it.
sns.heatmap(df.corr(numeric_only=True))

### Transformar categóricas

#### room_type

In [None]:
# Display the distinct values of the `room_type` column.
df.room_type.unique()

#### city

## train / test split

In [None]:
# Target: the `price` column.
y = df['price']

# Features: every remaining column.
X = df.drop(columns='price')

In [None]:
from sklearn.model_selection import train_test_split as tts

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

## Modelado

In [None]:
from sklearn.linear_model import LinearRegression as LinReg

from sklearn.ensemble import RandomForestRegressor as RFR

import xgboost as xgb

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
from sklearn.metrics import r2_score

In [None]:
# Linear regression

linreg=LinReg()

# Fit on the training split.
linreg.fit(X_train, y_train)

In [None]:
linreg.score(X_test, y_test)   # R2 on the test split

In [None]:
# Predict prices for the test split with the linear model.
y_pred=linreg.predict(X_test)

In [None]:
# RMSE of the linear model on the test split.
# Arguments follow sklearn's (y_true, y_pred) order, matching the other RMSE cells.
mse(y_test, y_pred) ** 0.5

In [None]:
# random forest

In [None]:
# Random forest regressor with default hyperparameters.
rf=RFR()


# Fit on the training split.
rf.fit(X_train, y_train)

In [None]:
rf.score(X_test, y_test)   # R2 on the test split

In [None]:
# Predict prices for the test split with the random forest (overwrites the previous y_pred).
y_pred=rf.predict(X_test)

In [None]:
# RMSE of the random forest on the test split.
# Arguments follow sklearn's (y_true, y_pred) order, matching the other RMSE cells.
mse(y_test, y_pred) ** 0.5

### Ajuste de los Hiperparametros de un Modelo

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

#### Random Search

In [None]:
# Unfitted random forest used as the base estimator for the searches below.
rfr=RFR()

In [None]:
# Candidate values for each random-forest hyperparameter.
# A wider sweep would be e.g. [int(x) for x in np.linspace(200, 2000, 10)];
# two values keep the demo fast.
n_estimators = [500, 1000]

min_samples_split = [2, 5, 10]

# 'auto' was removed in scikit-learn 1.3. For regressors it meant "use all
# features", which is spelled 1.0 (a fraction of n_features).
max_features = [1.0, 'sqrt']

bootstrap = [True, False]

In [None]:
# Search space: hyperparameter name -> list of candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_features=max_features,
    bootstrap=bootstrap,
)

In [None]:
# Randomized search: samples n_iter hyperparameter combinations and scores
# each one with cross-validation.
rfr_random = RandomizedSearchCV(
    estimator=rfr,                    # model to tune
    param_distributions=random_grid,  # dict of hyperparameter candidates
    n_iter=3,                         # number of sampled combinations
    cv=3,                             # cross-validation folds
    verbose=1,                        # print progress
    n_jobs=-1,                        # -1 uses every available core
    scoring='r2',                     # metric used to rank candidates
    random_state=42,                  # same seed as the train/test split, so the sampled candidates are repeatable
)

In [None]:
%%time

# Run the search on the first 5000 training rows only (presumably to keep the runtime short).
rfr_random.fit(X_train[:5000], y_train[:5000])

In [None]:
rfr_random.best_estimator_ # the winning estimator; it is a regular model, so .fit(X, y) works on it

In [None]:
# Refit the winning configuration on training rows only. Fitting on
# X[:5000] / y[:5000] would include rows that belong to the test split
# and leak them into the model.
rfr_random.best_estimator_.fit(X_train[:5000], y_train[:5000])

In [None]:
# Mean cross-validated score (R2, per the `scoring` argument) of the best candidate.
rfr_random.best_score_

In [None]:
# Hyperparameter values of the best candidate.
rfr_random.best_params_

#### GridSearch

In [None]:
# Exhaustive search over every combination in random_grid.
# The former `iid=True` argument is gone: it was deprecated in scikit-learn
# 0.22 and removed in 0.24, where passing it raises a TypeError.
modelo = GridSearchCV(
    rfr,                        # model to tune
    random_grid,                # dict of hyperparameter candidates
    cv=3,                       # cross-validation folds
    return_train_score=True,    # keep train scores to spot possible overfitting
    n_jobs=2,
)

In [None]:
%%time

# Run the grid search on the first 2000 training rows only (presumably to keep the runtime short).
modelo.fit(X_train[:2000], y_train[:2000])

In [None]:
# Estimator built with the best hyperparameter combination found by the grid search.
modelo.best_estimator_

In [None]:
# Hyperparameter values of the best combination.
modelo.best_params_

#### GridSearching Bayesiano

In [None]:
# NOTE(review): `!pip` runs whichever pip the shell finds, which may not belong to
# this kernel's environment; `%pip install hyperopt` is the safer notebook form.
!pip install hyperopt

In [None]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from hyperopt.pyll import scope as ho_scope
from hyperopt.pyll.stochastic import sample as ho_sample

In [None]:
# XGBoost regressor with default settings (only inspected via help() in the next cell).
xgbr=xgb.XGBRegressor()

In [None]:
# Print the XGBRegressor documentation to see the tunable hyperparameters.
help(xgbr)

In [None]:
# Hyperopt search space for the XGBoost regressor.
space = {
    # quniform returns round(uniform(low, high) / q) * q. With low=10 and q=25
    # the draw could round down to 0 trees, so the lower bound is one full
    # step: values are 25, 50, ..., 1000.
    'n_estimators': hp.quniform('n_estimators', 25, 1000, 25),

    # continuous uniform between 0.0001 and 1
    'learning_rate': hp.uniform('learning_rate', 0.0001, 1.0),

    # The hyperopt label is 'x_gamma', not 'gamma': fmin() returns its result
    # keyed by label, and the final model cell reads mejor['x_gamma'].
    'gamma': hp.uniform('x_gamma', 0.1, 0.5),
}

In [None]:
def objetivo(space):
    """Hyperopt objective: train an XGBoost regressor and return its validation RMSE.

    Args:
        space: dict sampled by hyperopt with the keys 'n_estimators',
            'learning_rate' and 'gamma'.

    Returns:
        dict with 'loss' (RMSE on the validation rows) and 'status'
        (hyperopt's STATUS_OK).
    """
    modelo = xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),  # quniform yields floats
        learning_rate=space['learning_rate'],
        gamma=space['gamma'],
        objective='reg:squarederror',
        # eval_metric is set on the estimator: passing it to fit() is
        # deprecated since XGBoost 1.6 and removed in 2.0.
        eval_metric='rmse',
    )

    # Score candidates on a validation slice carved out of the training data.
    # Scoring them on X_test would tune the hyperparameters on the test set
    # and make the final test RMSE optimistic.
    X_fit, X_val, y_fit, y_val = tts(X_train, y_train, test_size=0.2, random_state=42)

    eval_set = [(X_fit, y_fit), (X_val, y_val)]

    modelo.fit(X_fit, y_fit, eval_set=eval_set, verbose=False)

    y_pred = modelo.predict(X_val)

    rmse = mse(y_val, y_pred) ** 0.5

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Minimise the objective with the TPE algorithm for 10 evaluations.
mejor = fmin(
    fn=objetivo,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials(),
)

# Echo the best values found (keyed by hyperopt label).
mejor

In [None]:
# Build the final XGBoost regressor from the best values returned by fmin().
modelo = xgb.XGBRegressor(
    objective='reg:squarederror',
    gamma=mejor['x_gamma'],                   # the label used in `space` for gamma
    learning_rate=mejor['learning_rate'],
    n_estimators=int(mejor['n_estimators']),  # hyperopt returns a float
)

In [None]:
# Train the tuned XGBoost model on the full training split.
modelo.fit(X_train, y_train)

In [None]:
# Predict the test split with the tuned model.
y_pred = modelo.predict(X_test)

# RMSE on the test split.
np.sqrt(mse(y_test, y_pred))

# H2O (ML for dummies)

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
h2o.init()  # start (or connect to) the H2O server

In [None]:
# Load the data as an H2OFrame. The path matches the one passed to
# pd.read_csv at the top of the notebook, so both loads read the same file.
# This rebinds `df`, replacing the pandas DataFrame.
df = h2o.import_file('../datasets/airbnb_data.csv')

In [None]:
# Predictor column names for H2O. This rebinds X and y, which until now held
# the pandas feature frame and target series.
X=df.columns

# Name of the response column.
y='price'

# Every column except the response is a predictor.
X.remove(y)

In [None]:
# `price` is a continuous target, so it is left numeric and H2O AutoML trains
# regression models. Converting it with .asfactor() would make every distinct
# price a class label and turn the task into multiclass classification.

In [None]:
# AutoML run limited to a single model, with a fixed seed.
aml=H2OAutoML(max_models=1, seed=1)

# Train using the predictor names in X and the response column named by y.
aml.train(x=X, y=y, training_frame=df)

In [None]:
# Leaderboard of the models trained by AutoML.
lb=aml.leaderboard

# Show every row of the leaderboard rather than only the default first few.
lb.head(rows=lb.nrows)

In [None]:
aml.leader  # best model on the leaderboard

In [None]:
# predict() needs an H2OFrame holding the predictor columns. X is only the
# list of column names, so the frame itself is passed.
pred = aml.leader.predict(df)

pred

## Exportar csv predicciones

In [None]:
# Number of predictions to export.
len(y_pred)

In [None]:
# Empty frame that will hold the submission columns.
resultados=pd.DataFrame()

In [None]:
# Sequential ids 0..n-1, one per test row.
resultados['id'] = list(range(len(y_test)))

In [None]:
# Predictions from the most recently evaluated model.
resultados['y_pred']=y_pred

In [None]:
resultados.to_csv('my_submit.csv', index=False)  # index=False is needed so the row index is not written as an extra column

In [None]:
pd.read_csv('my_submit.csv').head()   # this is how the submission file should look

In [None]:
# Counter-example: without index=False the row index is written too, producing an extra unnamed column.
resultados.to_csv('my_submit_malo.csv')

In [None]:
# Read the counter-example back to see the extra index column.
pd.read_csv('my_submit_malo.csv').head()