## Importacion de Librerias

In [124]:
from wikiframe import Say, Extractor
import numpy as np
import pandas as pd



from transform import one_hot_transform,normalize
from utils import txt_list
import warnings
warnings.simplefilter("ignore")

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.linear_model import HuberRegressor 
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import cross_val_score

## Variables Principales

Para el desarrollo del modelo , se tomó 18 columnas del dataset original que consideramos relevantes , que se encuentran en el ./features.txt.

Se observó que en el los archivos data\houses_test_raw.csv había nuevas variables categorícas , por lo que se decidió concatenar ambos archivos para extraer todas las variables que conforma el dataset.

In [79]:
#Instanciar el objeto Extractor
extractor = Extractor('data')

#Extraer en df de ./data
df_dict = extractor.extract_from_csv()

#Concatenar train y test para Hacer one hot encoding 
df_all = pd.concat([df_dict['house_train_raw'],df_dict['houses_test_raw']])

#Extraer la lista de features elegidos en ./features.txt
features = txt_list('features.txt')

#Elegimos los features 
y = df_all['SalePrice']
df_all = df_all[features]


## Procesamiento de datos

Debido a que las columnas elegidas, algunas son de variable categórica, se debe hacer una transformacion de datos. Se propone One Hot Encoding ya que soporta la entradas de valores nulos.

Después de transformar las variables categóricas, se normalizará la funcion con (z-score)

Nota: El código de las funciones se encuentran en ./transform.py

In [80]:
df_all = one_hot_transform(df_all,features)

In [81]:
#Normalizar los datos
df_all = normalize(df_all)

In [82]:
df_all['SalePrice'] = list(y)

## Train - Test
Se separó en Train -Test como estuvo al principio de la extraccion de datos.
Para la validacion de nuestro modelo se optó por Cross Validation.

In [83]:
#Separando los datos como teníamos antes
train = df_all.iloc[:1460,:]
test = df_all.iloc[1460:,:]

In [241]:
X = train.drop(['SalePrice'],axis=1)
y = train['SalePrice']

In [242]:
X.head()

Unnamed: 0,LotArea,OverallCond,YearBuilt,FullBath,TotRmsAbvGrd,OpenPorchSF,PoolArea,C (all),FV,RH,...,Y,COD,CWD,Con,ConLD,ConLI,ConLw,New,Oth,WD
0,-0.217879,-0.507284,1.046258,0.781366,0.986849,0.200006,-0.06315,-0.092944,-0.223607,-0.094801,...,0.324443,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018
1,-0.072044,2.188279,0.154764,0.781366,-0.287758,-0.702843,-0.06315,-0.092944,-0.223607,-0.094801,...,0.324443,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018
2,0.137197,-0.507284,0.980221,0.781366,-0.287758,-0.081209,-0.06315,-0.092944,-0.223607,-0.094801,...,0.324443,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018
3,-0.078385,-0.507284,-1.859351,-1.027363,0.349546,-0.184815,-0.06315,-0.092944,-0.223607,-0.094801,...,0.324443,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018
4,0.518903,-0.507284,0.947203,0.781366,1.624153,0.540424,-0.06315,-0.092944,-0.223607,-0.094801,...,0.324443,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018


In [243]:
test.head()

Unnamed: 0,LotArea,OverallCond,YearBuilt,FullBath,TotRmsAbvGrd,OpenPorchSF,PoolArea,C (all),FV,RH,...,COD,CWD,Con,ConLD,ConLI,ConLw,New,Oth,WD,SalePrice
1460,0.184371,0.391237,-0.34051,-1.027363,-0.925062,-0.702843,-0.06315,-0.092944,-0.223607,10.548423,...,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018,
1461,0.519791,0.391237,-0.439565,-1.027363,-0.287758,-0.170014,-0.06315,-0.092944,-0.223607,-0.094801,...,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018,
1462,0.464374,-0.507284,0.848148,0.781366,-0.287758,-0.199616,-0.06315,-0.092944,-0.223607,-0.094801,...,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018,
1463,-0.024109,0.391237,0.881166,0.781366,0.349546,-0.170014,-0.06315,-0.092944,-0.223607,-0.094801,...,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018,
1464,-0.654748,-0.507284,0.683057,0.781366,-0.925062,0.510823,-0.06315,-0.092944,-0.223607,-0.094801,...,-0.175272,-0.064249,-0.041423,-0.094801,-0.055613,-0.052423,-0.298629,-0.049029,0.395018,


## Modelo

Ya que existen distintos tipos de regresores, se decidió colocarlos en un diccionario para automatizar la regresión de todos en pocas lineas de código.

La métrica que propuesta fue "Raíz Cuadrada del Error Logarítmico Medio" , con el que se comprabará la precisión del modelo.

In [198]:
estimators = {
    'ElasticNet': ElasticNet(random_state=0),
    'Lasso': Lasso(alpha=0.2),
    'Ridge': Ridge(alpha=1),
    'Huber': HuberRegressor(epsilon=1.46,fit_intercept=True),
    'RandomForest': RandomForestRegressor(),
    'DecisionTree': DecisionTreeRegressor()
}

In [199]:
for name, estimator in estimators.items():
    score = cross_val_score(estimator, X, y, cv=5, scoring='neg_mean_squared_log_error')
    print(name, np.abs(score).min())

ElasticNet 0.0279413922934663
Lasso 0.02834341817368946
Ridge 0.0283199250124809
Huber 0.02755907651671961
RandomForest 0.027036733139038933
DecisionTree 0.052303129468979685


#### Mejor Modelo
El mejor modelo fue RandomForest.
Se optimizará el resultado con los siguientes parámetros.

In [223]:
random_params = {'n_estimators': [300,400,500,600],  
               'max_features': ['sqrt'],  
               'max_depth': [ 60, 70, 80, 90, 100,], 
               'min_samples_split':  [2, 5, 10], 
               'min_samples_leaf': [1, 2, 4], 
               'bootstrap': [True, False]}

score_rand = RandomizedSearchCV(RandomForestRegressor(), random_params, cv=3, scoring='neg_mean_squared_log_error',n_iter=20).fit(X, y)

In [224]:
score_rand.best_params_

{'n_estimators': 600,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

In [228]:
random_forest=RandomForestRegressor(n_estimators=600,max_features='sqrt',max_depth=100,min_samples_split=2,min_samples_leaf=1,bootstrap=False)

In [229]:
score = cross_val_score(random_forest, X, y, cv=5, scoring='neg_mean_squared_log_error')
print('RMSLE: ',np.abs(score).min())

RMSLE:  0.025712706115561878


## Resultados

Se obtuvo una métrica del 0.02571 gracias a elegir correctamente los hyperparámetros del RandomForestRegressor. 

Se procederá a predecir el dataset de Test

In [246]:
X_test = test.drop(['SalePrice'],axis=1)
random_forest_model = random_forest.fit(X,y)
y_pred = random_forest_model.predict(X_test)

In [247]:
X_test['SalePrice'] = y_pred
X_test.to_csv('./out/predictions.csv',index=False)