# Kaggle Competition

In [None]:
# Librerías
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error as mse
from xgboost import XGBRegressor as XGBR
from sklearn.linear_model import LinearRegression as LinReg
from sklearn.metrics import r2_score as r2
from catboost import CatBoostRegressor as CTR

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
datos = pd.read_csv('data/train.csv')

In [None]:
# Observamos las columnas numéricas para ver cuantas podemos utilizar sin modificar
# Show the numeric columns to see how many can be used without modification.
# Uses the public select_dtypes API instead of the private _get_numeric_data
# helper; 'bool' is listed because the private helper also keeps boolean columns.
datos.select_dtypes(include=['number', 'bool'])

In [None]:
# Columnas con las que nos quedamos
# Columns kept for modelling. 'neighbourhood' is included because the cleaning
# and dummy-encoding cells further down read datos_ok['neighbourhood'] and would
# otherwise fail with KeyError (assumes train.csv has that column - confirm).
# .copy() makes datos_ok an independent frame, so later column assignments do
# not act on a view of `datos`.
datos_ok = datos.loc[:, ['host_is_superhost', 'latitude', 'longitude', 'neighbourhood',
                         'property_type', 'accommodates', 'bedrooms', 'price',
                         'minimum_nights', 'maximum_nights']].copy()


In [None]:
#datos_ok

In [None]:
# ARREGLAR:  
# property_type, neighbourhood

### BUSCAR NANs

In [None]:
# Count missing values per selected column.
datos_ok.isna().sum()

### LIMPIEZA 'bedrooms', 'neighbourhood', 'property_type'

In [None]:
# Tener en cuenta que NO se pueden eliminar filas! 
# Las filas deben ser las mismas en los datos 'train.csv' y 'test.csv'

##### Limpieza 'bedrooms'

In [None]:
#reemplazo los Nans de bedrooms por la mediana
# Replace missing 'bedrooms' values with the median, truncated to an integer.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the Series returned by attribute access modifies a temporary object and does
# not update datos_ok under pandas Copy-on-Write.
datos_ok['bedrooms'] = datos_ok['bedrooms'].fillna(int(datos_ok['bedrooms'].median()))

In [None]:
# Re-count missing values per column after filling 'bedrooms'.
datos_ok.isna().sum()

##### Limpieza 'neighbourhood'

In [None]:
# Reemplazo los Nans de 'neighbourhood' por 'Unknown'

In [None]:
# Label missing neighbourhoods as 'Unknown'. Assigning the result back (rather
# than fillna(inplace=True) on the selected column) guarantees the frame itself
# is updated, including under pandas Copy-on-Write.
datos_ok['neighbourhood'] = datos_ok['neighbourhood'].fillna('Unknown')

In [None]:
# Re-count missing values per column after filling 'neighbourhood'.
datos_ok.isna().sum()

In [None]:
# Creación de un diccionario para pasarlo dentro de la función indicada abajo. 
# La finalidad es agrupar las diferentes localizaciones de los pisos para luego crear variables dummies.

In [None]:
# Maps a location group label to the substrings that identify it.
# Order matters: place() returns the first group with a matching substring,
# so 'Zuidoost' must come before 'Amsterdam' ('Amsterdam-Zuidoost' contains both).
dic_lugar = {
    'Zuidoost':['Amsterdam-Zuidoost', 'Zuid-Oost'],
    'Ouder-Amstel':['Ouder-Amstel', 'Amstel', 'Amstelveen'],
    'No-centro':['Badhoevedorp', 'Landsmeer', 'Oostzaan', 'Lijnden','Zwanenburg','Duivendrecht'],
    'Unknown':['Unknown'],
    'Amsterdam':['Amsterdam', '1054 ZV', 'Watergraafsmeer', 'Jordaan', 'Diemen','Zeeburg','Netherlands']
}

In [None]:
def place(p, mapping=None):
    """Return the location group whose substrings match ``p``.

    Args:
        p: Raw neighbourhood value. Non-string values (e.g. NaN or None) are
            returned unchanged instead of raising TypeError on the substring
            test.
        mapping: Optional ``{group_label: [substring, ...]}`` dict. Defaults to
            the module-level ``dic_lugar``.

    Returns:
        The label of the first group (in dict order) that has a substring
        contained in ``p``; ``p`` itself when no group matches.
    """
    if not isinstance(p, str):
        return p
    if mapping is None:
        mapping = dic_lugar
    for label, substrings in mapping.items():
        if any(fragment in p for fragment in substrings):
            return label
    return p

In [None]:
# Collapse every raw neighbourhood value into its location group.
datos_ok['neighbourhood'] = datos_ok['neighbourhood'].map(place)

##### Limpieza 'property_type'

In [None]:
# Creación de un diccionario y una función para agrupar los tipos de viviendas

In [None]:
# List the distinct raw property types before grouping them.
datos_ok.property_type.unique().tolist()

In [None]:
# Lower-case the property types; the substrings in dic_rooms are all lower-case.
datos_ok['property_type'] = datos_ok['property_type'].str.lower()

In [None]:
# Maps a property group label to the lower-case substrings that identify it.
# Order matters: transformar() returns the first group with a matching substring.
dic_rooms = {
    'hotel':['hotel'],
    'BB':['bed and breakfast'],
    'private_room':['room'],
    'boat':['boat','houseboat'],
    'villa/cottage/entire_place':['entire','cottage', 'house','casa'],
    'Other':['camper', 'barn', 'cave', 'tower','bus']
}

In [None]:
def transformar(x, mapping=None):
    """Return the property group whose substrings match ``x``.

    Args:
        x: Property type text (expected to be lower-cased already). Non-string
            values (e.g. NaN or None) are returned unchanged instead of raising
            TypeError on the substring test.
        mapping: Optional ``{group_label: [substring, ...]}`` dict. Defaults to
            the module-level ``dic_rooms``.

    Returns:
        The label of the first group (in dict order) that has a substring
        contained in ``x``; ``x`` itself when no group matches.
    """
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = dic_rooms
    for label, substrings in mapping.items():
        if any(fragment in x for fragment in substrings):
            return label
    return x

In [None]:
# Collapse every property type into its group label.
datos_ok['property_type'] = datos_ok['property_type'].map(transformar)

In [None]:
# Show how many rows fall into each property group after the mapping.
datos_ok.property_type.value_counts()

### CREAR VARIABLES DUMMIES EN:  'property_type', 'host_is_superhost', 'neighbourhood'

#####  'property_type'

In [None]:
# One-hot encode 'property_type': append the indicator columns, then remove the
# original categorical column.
dummies_tipo = pd.get_dummies(datos_ok['property_type']).reset_index(drop=True)
datos_ok = pd.concat([datos_ok, dummies_tipo], axis=1)
datos_ok = datos_ok.drop(columns='property_type')
datos_ok.head()

##### 'host_is_superhost'

In [None]:
# One-hot encode 'host_is_superhost': append the indicator columns, then remove
# the original column.
dummies_superhost = pd.get_dummies(datos_ok['host_is_superhost']).reset_index(drop=True)
datos_ok = pd.concat([datos_ok, dummies_superhost], axis=1)
datos_ok = datos_ok.drop(columns='host_is_superhost')
datos_ok.head()

##### 'neighbourhood'

In [None]:
# One-hot encode 'neighbourhood': append the indicator columns, then remove the
# original column.
dummies_lugar = pd.get_dummies(datos_ok['neighbourhood']).reset_index(drop=True)
datos_ok = pd.concat([datos_ok, dummies_lugar], axis=1)
datos_ok = datos_ok.drop(columns='neighbourhood')
datos_ok.head()

###### Matriz de correlación

In [None]:
# A medida que avanzamos en el estudio y se incorporan variables, la matriz se va ampliando y modificando.

In [None]:
# Pairwise correlation of every column in datos_ok, drawn as an annotated heatmap.
corr_datos = datos_ok.corr()
# Boolean mask hiding the upper triangle (diagonal included), so each pair
# appears only once.
mascara = np.triu(np.ones(corr_datos.shape, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(corr_datos, mask=mascara, annot=True)
plt.show()

### MODELOS

In [None]:
# Features: every column except the target. Target: 'price'.
X = datos_ok.drop(columns='price')
y = datos_ok['price']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=8)

#### Función probar modelos:

In [None]:
def pruebas(modelo):
    """Fit ``modelo`` on the training split and print its evaluation metrics.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``,
    so it evaluates against whichever split was created most recently.

    Args:
        modelo: Unfitted estimator exposing ``fit``, ``predict`` and ``score``.

    Returns:
        The same estimator instance after fitting.
    """
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    # RMSE is computed explicitly: the ``squared=False`` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    # Arguments follow the documented (y_true, y_pred) order.
    error = np.sqrt(mse(y_test, y_pred))

    train_score = modelo.score(X_train, y_train)
    test_score = modelo.score(X_test, y_test)

    print('Train R2:', train_score)
    print('Test R2:', test_score)
    print('rmse:', error)

    return modelo

In [None]:
# Regresión lineal múltiple

In [None]:
# Fit and evaluate a linear regression; pruebas() prints the metrics.
linreg=LinReg()
linreg=pruebas(linreg)
# Keep the hold-out predictions for later inspection.
y_pred=linreg.predict(X_test)

In [None]:
# Random Forest Regressor

In [None]:
# Fit and evaluate a random forest with default hyper-parameters.
rfr = RFR()
rfr = pruebas(rfr)
y_pred = rfr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

In [None]:
# Extreme Gradient Boosting

In [None]:
# Fit and evaluate an XGBoost regressor with default hyper-parameters.
xgbr = XGBR()
xgbr = pruebas(xgbr)
y_pred = xgbr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

In [None]:
# Catboost

In [None]:
# Fit and evaluate a CatBoost regressor with training output silenced.
ctr = CTR(verbose=0)
ctr = pruebas(ctr)
y_pred = ctr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

### Modificar parámetros:

In [None]:
# Al añadir/eliminar variables de los datos el resultado siempre ha sido peor que utilizándolos todos.

In [None]:
# Reduced feature set: the target and three further columns are excluded.
columnas_excluidas = ['price', 'bedrooms', 'Unknown', 'private_room']
X = datos_ok.drop(columns=columnas_excluidas)
y = datos_ok['price']

In [None]:
# New 80/20 split on the reduced feature set. The seed differs from the first
# split (9 vs 8), so the held-out rows differ as well.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=9)

In [None]:
# Random Forest Regressor

In [None]:
# Refit the random forest on the reduced feature set. This is the `rfr`
# instance used later for the final prediction.
rfr = RFR()
rfr = pruebas(rfr)
y_pred = rfr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

In [None]:
# Extreme Gradient Boosting

In [None]:
# Refit the XGBoost regressor on the reduced feature set.
xgbr = XGBR()
xgbr = pruebas(xgbr)
y_pred = xgbr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

In [None]:
# Catboost

In [None]:
# Refit the CatBoost regressor on the reduced feature set.
ctr = CTR(verbose=0)
ctr = pruebas(ctr)
y_pred = ctr.predict(X_test)
# Hold-out RMSE. np.sqrt replaces mean_squared_error's ``squared=False`` flag,
# which was deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mse(y_test, y_pred))

## Predicción sobre test

In [None]:
#Hacemos la predicción sobre los datos 'test.csv'

In [None]:
# Shape of the training feature matrix; the test features must have the same number of columns.
X.shape

In [None]:
# Load the data to predict on; the path is relative to the notebook's working directory.
new_data=pd.read_csv('data/test.csv')

In [None]:
# Keep the feature columns (no 'price' is selected here). 'neighbourhood' is
# included because the cleaning cells below read new_data['neighbourhood'] and
# would otherwise fail with KeyError (assumes test.csv has that column - confirm).
# .copy() makes the selection an independent frame for the assignments below.
new_data = new_data.loc[:, ['host_is_superhost', 'latitude', 'longitude', 'neighbourhood',
                            'property_type', 'accommodates', 'bedrooms',
                            'minimum_nights', 'maximum_nights']].copy()

In [None]:
# Se realiza exactamente la misma limpieza que en los datos 'train.csv'.
# OJO! Las filas NO pueden variar, deben ser las mismas. 

###### 'bedrooms'

In [None]:
# Reemplazar Nans

In [None]:
# Count missing values per column in the test features.
new_data.isna().sum()

In [None]:
# Fill missing 'bedrooms' with the TRAINING median (truncated to an integer),
# so test rows are imputed with the statistic taken from the training data
# rather than one recomputed on test.csv.
# The result is assigned back because fillna(inplace=True) on the Series
# returned by attribute access does not update new_data under pandas
# Copy-on-Write.
new_data['bedrooms'] = new_data['bedrooms'].fillna(int(datos_ok['bedrooms'].median()))

In [None]:
# Re-count missing values per column after filling 'bedrooms'.
new_data.isna().sum()

###### 'neighbourhood'

In [None]:
# Reemplazar Nans y pasar la función (place)

In [None]:
# Label missing neighbourhoods as 'Unknown'. Assigning the result back (rather
# than fillna(inplace=True) on the selected column) guarantees the frame itself
# is updated, including under pandas Copy-on-Write.
new_data['neighbourhood'] = new_data['neighbourhood'].fillna('Unknown')

In [None]:
# Re-count missing values per column after filling 'neighbourhood'.
new_data.isna().sum()

In [None]:
# Collapse every raw neighbourhood value into its location group.
new_data['neighbourhood'] = new_data['neighbourhood'].map(place)

###### 'property_type'

In [None]:
# Minúscula y función (transformar) para agrupar los tipos de propiedades

In [None]:
# Lower-case the property types, then collapse them into their group labels.
tipos_minuscula = new_data['property_type'].str.lower()
new_data['property_type'] = tipos_minuscula.map(transformar)

###### Crear variables dummies para 'property type' , 'neighbourhood' y 'superhost' y eliminar las originales

In [None]:
# property_type

In [None]:
# One-hot encode 'property_type': append the indicator columns, then remove the
# original column.
dummies_tipo_test = pd.get_dummies(new_data['property_type']).reset_index(drop=True)
new_data = pd.concat([new_data, dummies_tipo_test], axis=1)
new_data = new_data.drop(columns='property_type')

In [None]:
# host_is_superhost

In [None]:
# One-hot encode 'host_is_superhost': append the indicator columns, then remove
# the original column.
dummies_superhost_test = pd.get_dummies(new_data['host_is_superhost']).reset_index(drop=True)
new_data = pd.concat([new_data, dummies_superhost_test], axis=1)
new_data = new_data.drop(columns='host_is_superhost')

In [None]:
## 'neighbourhood'

In [None]:
# One-hot encode 'neighbourhood': append the indicator columns, then remove the
# original column.
dummies_lugar_test = pd.get_dummies(new_data['neighbourhood']).reset_index(drop=True)
new_data = pd.concat([new_data, dummies_lugar_test], axis=1)
new_data = new_data.drop(columns='neighbourhood')

In [None]:
# Preview the training features as the reference for the expected columns.
X.head()

In [None]:
## new_data y X deben tener las mismas columnas

In [None]:
# Shape of the processed test features, to compare against X.shape.
new_data.shape

In [None]:
# Preview the processed test features.
new_data.head()

In [None]:
# Preview the training features again for a side-by-side column comparison.
X.head()

In [None]:
# Shape of the training features, to compare against new_data.shape.
X.shape

In [None]:
## .predict de new_data

In [None]:
# Align the test features with the exact columns, in the same order, that the
# model was fitted on. Columns not present in X are discarded, and any dummy
# column that never occurs in test.csv is added filled with zeros. Without this
# alignment predict() receives a feature set different from the one used in fit().
y_pred = rfr.predict(new_data.reindex(columns=X.columns, fill_value=0))

In [None]:
# Number of predictions produced; there should be one per row of new_data.
y_pred.shape

In [None]:
# Build the submission frame: sequential string ids (0..n-1) next to the
# predicted prices.
sample_df = pd.DataFrame({
    'id': list(map(str, range(len(y_pred)))),
    'price': y_pred,
})

sample_df.shape

In [None]:
# Preview the first rows of the submission frame.
sample_df.head()

In [None]:
# Guardamos en un csv para subirlo a Kaggle y ver las predicciones

In [None]:
 # Write the submission file without the index column and with a header row.
 sample_df.to_csv('data/prediccion_try4.csv', index=False, header=True)