In [17]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Importamos utilidades y modelos de sklearn
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

import category_encoders as ce
from sklearn.metrics import mean_absolute_error

# Cargo los datos

In [18]:
train = pd.read_csv('./train.csv'
            ,dtype={
             "id":np.int32
            ,"titulo":str
            ,"fecha":str 
            ,"ciudad": str
            ,"provincia": str
            ,"descripcion": str  
            ,"direccion": str  
            ,"tipodepropiedad": 'category' })

test = pd.read_csv('./test.csv'
            ,dtype={
             "id":np.int32
            ,"titulo":str
            ,"fecha":str 
            ,"ciudad": str
            ,"provincia": str
            ,"descripcion": str  
            ,"direccion": str 
            ,"tipodepropiedad": 'category' })

cleanup_nums = {"tipodepropiedad":     {
    "Casa": 1,
    "Apartamento": 2, 
    "Casa en condominio": 3,
    "Terreno": 4,
    "Local Comercial": 5,
    "Oficina comercial": 6,
    "Bodega comercial": 7,
    "Edificio": 8,
    "Terreno comercial": 9,
    "Casa uso de suelo": 10,
    "Quinta Vacacional": 11,
    "Duplex": 12,
    "Villa": 13,
    "Inmuebles productivos urbanos": 14,
    "Rancho": 15,
    "Local en centro comercial": 16,
    "Departamento Compartido": 17,
    "Otros": 18,
    "Nave industrial": 19,
    "Terreno industrial": 20,
    "Huerta": 21,
    "Lote": 22,
    "Hospedaje": 23,
    "Garage": 24
                                   
}  }

# Métrica de evaluación
def RMSLE(actual, pred):
    return (np.mean((np.log(actual + 1) - np.log(pred + 1)) ** 2)) **.5

In [19]:
def fill_nulls( a ):

    a[['metroscubiertos']] = a[['metroscubiertos']].fillna(value= 174.16  )
    a[['antiguedad']] = a[['antiguedad']].fillna(value= 8.09 )
    a[['habitaciones']] = a[['habitaciones']].fillna(value= 2.90 )
    a[['banos']] = a[['banos']].fillna(value= 2.13)
    a[['idzona']] = a[['idzona']].fillna(value=2438952.04)
    a[['garages']] = a[['garages']].fillna(value=1.55)
    a[['metrostotales']] = a[['metrostotales']].fillna(value=176.92)
    a[['lng']] = a[['lng']].fillna(value=20.70)
    a[['lat']] = a[['lat']].fillna(value=-99.53)
    a[['centroscomercialescercanos']] = a[['centroscomercialescercanos']].fillna(value=0.40 )
    a[['escuelascercanas']] = a[['escuelascercanas']].fillna(value=0.44 )
    a[['piscina']] = a[['piscina']].fillna(value= 0.09 )
    a[['gimnasio']] = a[['gimnasio']].fillna(value= 0.06 )
    a[['usosmultiples']] = a[['usosmultiples']].fillna(value= 0.06 )
    a[['tipodepropiedad']] = a[['tipodepropiedad']].fillna(value= 1.87 )
    
    a[['provincia']] = a[['provincia']].fillna(value= str(a['provincia'].mode()) )
    a[['ciudad']] = a[['ciudad']].fillna(value= str(a['ciudad'].mode()) )
    a[['descripcion']] = a[['descripcion']].fillna(value= str(a['descripcion'].mode()) )
    a[['titulo']] = a[['titulo']].fillna(value= str(a['titulo'].mode()) )
    a[['direccion']] = a[['direccion']].fillna(value= str(a['direccion'].mode()) )
    a[['fecha']] = a[['fecha']].fillna(value= '2014-07-31 00:00:00' )
    
    return a

# Utilizo random forest 

In [10]:

a = train
a = a.replace(cleanup_nums)
a = fill_nulls(a)

a['fecha'] = pd.to_datetime(a['fecha'])
a['anio'] = a['fecha'].dt.year
a['mes'] = a['fecha'].dt.month
a['dia'] = a['fecha'].dt.dayofweek


X = a[['tipodepropiedad','anio','metrostotales','antiguedad','centroscomercialescercanos','garages','piscina','metroscubiertos','banos','habitaciones','lat','lng','idzona','gimnasio','usosmultiples']]
y = a['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Modelo
regr = RandomForestRegressor(random_state=0,bootstrap =False,min_samples_leaf=2 ,n_jobs=-1,max_depth = 100,max_features = 5 ,min_samples_split = 2 ,n_estimators = 400)

# Entreno modelo
model = regr.fit(X_train, y_train)

#Calculo de Errores contra mi set de test
pred = model.predict(X_test)

#Calculo de Errores contra mi set de entrenamiento
dummy_rmsle = RMSLE(y_test, pred)
dummy_rmsle_train = RMSLE(y_train, model.predict(X_train) )
print(f"RMSLE Error (train): {dummy_rmsle_train:.5f}")
print(f"RMSLE Error (Test): {dummy_rmsle:.5f}")

dummy_rmsle = mean_absolute_error(y_test, pred)
dummy_rmsle_train = mean_absolute_error(y_train, model.predict(X_train) )

print(f"MAE Error (train): {dummy_rmsle_train:.5f}")
print(f"MAE Error (Test): {dummy_rmsle:.5f}")


RMSLE Error (train): 0.13883
RMSLE Error (Test): 0.35895
MAE Error (train): 183151.29511
MAE Error (Test): 599490.25474


# Utilizo Random Forest + Random Search + Cross Validation

###

In [20]:

a = train
a = a.replace(cleanup_nums)
a = fill_nulls(a)

a['fecha'] = pd.to_datetime(a['fecha'])
a['anio'] = a['fecha'].dt.year
a['mes'] = a['fecha'].dt.month
a['dia'] = a['fecha'].dt.dayofweek


X = a[['tipodepropiedad','anio','metrostotales','antiguedad','centroscomercialescercanos','garages','piscina',
       'metroscubiertos','banos','habitaciones','lat','lng','idzona','gimnasio','usosmultiples']]
y = a['precio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


n_estimators = [int(x) for x in np.linspace(start = 50, stop = 400, num = 15)]
max_features = [int(x) for x in np.linspace(start = 4, stop = 15, num = 15)]
max_depth = [int(x) for x in np.linspace(50, 100, num = 10)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [ False]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 5, random_state=42, n_jobs = -1)


# Fit the random search model
model = rf_random.fit(X_train, y_train)


print(rf_random.best_params_)


#Calculo de Errores contra mi set de test
pred = model.predict(X_test)

#Calculo de Errores contra mi set de entrenamiento
dummy_rmsle = RMSLE(y_test, pred)
dummy_rmsle_train = RMSLE(y_train, model.predict(X_train) )
print(f"RMSLE Error (train): {dummy_rmsle_train:.5f}")
print(f"RMSLE Error (Test): {dummy_rmsle:.5f}")

dummy_rmsle = mean_absolute_error(y_test, pred)
dummy_rmsle_train = mean_absolute_error(y_train, model.predict(X_train) )

print(f"MAE Error (train): {dummy_rmsle_train:.5f}")
print(f"MAE Error (Test): {dummy_rmsle:.5f}")


{'n_estimators': 175, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 8, 'max_depth': 50, 'bootstrap': False}
RMSLE Error (train): 0.11120
RMSLE Error (Test): 0.35721
MAE Error (train): 139121.64663
MAE Error (Test): 592872.64586


In [15]:
#Genero el csv 

c = test
c = c.replace(cleanup_nums)
c = fill_nulls(c)

c['fecha'] = pd.to_datetime(c['fecha'])
c['anio'] = c['fecha'].dt.year

b = c[['tipodepropiedad','anio','metrostotales','antiguedad','centroscomercialescercanos','garages','piscina','metroscubiertos','banos','habitaciones','lat','lng','idzona','gimnasio','usosmultiples']]
b = b.fillna(b.mean())

#Exporto csv con la prediccion
pred = model.predict(b) 
#Creo un dataframe con el formato ( id , precio (El precio predecido ))
res = test['id'].to_frame()
res.insert(1,'target', pred , True) 
#Exportamos la prediccion en formato csv sin labels
res.to_csv(r'C:\Users\Giova\JupiterLabFolder\Archivos CSV\resMejorScore.csv',index=False , header=True)

print('csv generado')


csv generado


# Score en kaggle fue de 599011.57620