# House Prices - Keagle

## Dataset

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

http://jse.amstat.org/v19n3/decock.pdf

In [1]:
# Imports
import numpy as np 
import pandas as pd 
from scipy.stats import skew
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score,GridSearchCV
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [26]:
train = pd.read_csv('treino.csv')
test = pd.read_csv("teste.csv")
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'], 
                      test.loc[:,'MSSubClass':'SaleCondition']))
all_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [27]:
#Missing Values and log transform
train["SalePrice"] = np.log1p(train["SalePrice"])
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 
skewed_feats = skewed_feats[skewed_feats > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

numeric_feats = test.dtypes[test.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 
skewed_feats = skewed_feats[skewed_feats > 0.75].index
test[skewed_feats] = np.log1p(test[skewed_feats])
test = pd.get_dummies(test)
test = test.fillna(test.mean())

all_col = list(all_data.columns)
test_col = list(test.columns)
eq = [i for i in all_col if i in test_col]
diff = [i for i in all_col if i not in test_col]
test_model = test[eq]
[all_data.drop(item,axis=1,inplace=True) for item in diff]

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y_train = train.SalePrice
    #There isn't 'Sale Price' in test dataset

In [28]:
modelo = LassoCV(alphas = [0.0000000001]).fit(X_train, y_train)


In [29]:
def rmse_cv(modelo):
    rmse = np.sqrt(-cross_val_score(modelo, 
                                    X_train, 
                                    y_train, 
                                    scoring = "neg_mean_squared_error", 
                                    cv = 5))
    return(rmse)

In [30]:
rmse_cv(modelo).mean()

0.14825369470324215

In [31]:
numeric_feats = test.dtypes[test.dtypes != "object"].index
skewed_feats = test[numeric_feats].apply(lambda x: skew(x.dropna())) 
skewed_feats = skewed_feats[skewed_feats > 0.75].index

test[skewed_feats] = np.log1p(test[skewed_feats])
test = pd.get_dummies(test)
test = test.fillna(test.mean())

test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,3.044522,4.394449,9.360741,5,6,1961,1961,0.0,6.150603,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0
1,1462,3.044522,4.406719,9.565775,6,6,1958,1958,4.691348,6.828712,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0
2,1463,4.110874,4.317488,9.534668,5,5,1997,1998,0.0,6.674561,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0
3,1464,4.110874,4.369448,9.208238,6,6,1998,1998,3.044522,6.401917,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0
4,1465,4.795791,3.78419,8.518392,8,5,1992,1992,0.0,5.575949,...,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,1,0.0


In [32]:
modelo.predict(test_model)

array([11.69366131, 11.90978634, 12.13928387, ..., 12.08495142,
       11.6992672 , 12.31115176])