# House Price - Model
## Data Loading

In [60]:
# Load the training and test CSV files into DataFrames
import pandas as pd

# Print up to 100 rows before pandas truncates a frame or series
pd.set_option('display.max_rows', 100)

df_train, df_test = (
    pd.read_csv(csv_path, sep=',') for csv_path in ("train.csv", "test.csv")
)

# Report (rows, columns) of each file
print("Training dataset dimension : {}".format(df_train.shape))
print("Test dataset dimension : {}".format(df_test.shape))

Training dataset dimension : (1460, 81)
Test dataset dimension : (1459, 80)


In [61]:
# Data combination
# Separate the target from the predictors. The drop is non-mutating so this
# cell can be re-run without a KeyError, and `columns=` replaces the positional
# axis argument that pandas 2.0 no longer accepts.
targets = df_train['SalePrice']
train_features = df_train.drop(columns=['SalePrice'])
N_train = train_features.shape[0]

# Stack the training rows on top of the test rows so both receive the same
# feature engineering. DataFrame.append was removed in pandas 2.0; pd.concat is
# the equivalent. reset_index() keeps the previous row labels in an 'index'
# column, as before.
df_combined = pd.concat([train_features, df_test]).reset_index()

# Data description
print("Data description :")
print(df_combined.dtypes)
print(df_combined.describe(include="all"))
print("\n")

# Missing data: percentage of null values per column, largest first
print("Missing data :")
missing_pct = (df_combined.isnull().sum() * 100 / len(df_combined)).round(2)
print(missing_pct.sort_values(ascending=False))
print("Combined dataset dimension : {}".format(df_combined.shape))

Data description :
index              int64
Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1       float64
BsmtFinType2      object
BsmtFinSF2       float64
BsmtUnfSF        float64
TotalB

## Feature Engineering

In [62]:
# One Hot Encoding
# Collect every column stored with the generic object dtype.
variables_to_encode = [
    attribute
    for attribute in df_combined.columns
    if df_combined[attribute].dtypes == "object"
]

# Encode all of them in a single call instead of concatenating and dropping
# once per column. The untouched columns come first, followed by the indicator
# columns in the order of `variables_to_encode`, each prefixed with its source
# column name. dummy_na=True adds a "<column>_nan" indicator for missing values.
# The guard keeps a re-run (nothing left to encode) a no-op.
if variables_to_encode:
    df_combined = pd.get_dummies(
        df_combined, columns=variables_to_encode, dummy_na=True
    )

In [63]:
# Remove the 'index' and 'Id' columns from the combined frame
df_combined = df_combined.drop(columns=['index', 'Id'])

In [64]:
# Preview the first rows of the encoded frame
print("Data :")
print(df_combined.head())

# Raise the row display limit before listing the dtype of each column
print("Data types:")
pd.set_option('display.max_rows', 1000)
print(df_combined.dtypes)

# Percentage of missing values per column, largest first
print("Missing data :")
missing_share = df_combined.isnull().sum() * 100 / len(df_combined)
print(missing_share.round(2).sort_values(ascending=False))

Data :
   MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0          60         65.0     8450            7            5       2003   
1          20         80.0     9600            6            8       1976   
2          60         68.0    11250            7            5       2001   
3          70         60.0     9550            7            5       1915   
4          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  SaleType_Oth  \
0          2003       196.0       706.0         0.0  ...             0   
1          1976         0.0       978.0         0.0  ...             0   
2          2002       162.0       486.0         0.0  ...             0   
3          1970         0.0       216.0         0.0  ...             0   
4          2000       350.0       655.0         0.0  ...             0   

   SaleType_WD  SaleType_nan  SaleCondition_Abnorml  SaleCondition_AdjLand  \
0            

In [65]:
# NaN values to fill with the mean
variables_to_fill = ["LotFrontage","GarageYrBlt","MasVnrArea","BsmtHalfBath","BsmtFullBath","TotalBsmtSF","BsmtUnfSF","BsmtFinSF2","BsmtFinSF1","GarageCars","GarageArea"]

# Fill every listed column with its own mean and assign the result back.
# The previous `df[col].fillna(..., inplace=True)` operated on a column
# selection, which does not reliably write back to the frame (and never does
# under pandas Copy-on-Write).
# NOTE: the means are taken over the combined train + test rows.
column_means = df_combined[variables_to_fill].mean()
df_combined[variables_to_fill] = df_combined[variables_to_fill].fillna(column_means)

# Check that we do not have any missing data left
print("Missing data :")
missing_pct = (df_combined.isnull().sum() * 100 / len(df_combined)).round(2)
print(missing_pct.sort_values(ascending=False))
# Fail loudly instead of relying on someone reading the printout.
assert not df_combined.isnull().values.any(), "df_combined still contains missing values"

Missing data :
SaleCondition_nan        0.0
Condition1_RRAn          0.0
Condition1_RRNn          0.0
Condition1_nan           0.0
Condition2_Artery        0.0
Condition2_Feedr         0.0
Condition2_Norm          0.0
Condition2_PosA          0.0
Condition2_PosN          0.0
Condition2_RRAe          0.0
Condition2_RRAn          0.0
Condition2_RRNn          0.0
Condition2_nan           0.0
BldgType_1Fam            0.0
BldgType_2fmCon          0.0
BldgType_Duplex          0.0
BldgType_Twnhs           0.0
BldgType_TwnhsE          0.0
BldgType_nan             0.0
Condition1_RRNe          0.0
Condition1_RRAe          0.0
Neighborhood_Mitchel     0.0
Condition1_PosN          0.0
Neighborhood_NPkVill     0.0
Neighborhood_NWAmes      0.0
Neighborhood_NoRidge     0.0
Neighborhood_NridgHt     0.0
Neighborhood_OldTown     0.0
Neighborhood_SWISU       0.0
Neighborhood_Sawyer      0.0
Neighborhood_SawyerW     0.0
Neighborhood_Somerst     0.0
Neighborhood_StoneBr     0.0
Neighborhood_Timber      0.0

## Model

In [66]:
from sklearn.model_selection import train_test_split

# Split the combined frame back apart: the first N_train rows are the
# training rows, the remainder are the rows to predict.
train, test = df_combined.iloc[:N_train], df_combined.iloc[N_train:]

# Hold out 30% of the training rows, with a fixed seed for the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    train,
    targets,
    test_size=0.3,
    random_state=5,
)

In [92]:
# We test several regression methods
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor

# Candidate regressors, built directly in the list so no scratch variable is
# left bound to whichever estimator happened to be created last.
Models = [
    # Random Forest Regressor
    RandomForestRegressor(random_state=0),
    # Lasso
    linear_model.Lasso(alpha=10, max_iter=10000),
    # MLP: `(5,)` is a real one-element tuple (`(5)` is just the int 5);
    # random_state makes the weight initialisation repeatable, matching the
    # seeded random forest above.
    MLPRegressor(
        hidden_layer_sizes=(5,),
        activation='logistic',
        solver='sgd',
        max_iter=10000,
        learning_rate_init=0.001,
        random_state=0,
    ),
    # Gaussian Process
    GaussianProcessRegressor(),
]

In [93]:
import random
import numpy as np
from sklearn.model_selection import KFold

# Cross validation for each model
def learn(models, N, X=None, y=None, random_state=None):
    """Print the N-fold cross-validation score of each model.

    Each model is fitted on the training part of every fold and scored with
    its own ``score`` method on the held-out part. The mean and standard
    deviation of the fold scores are printed; nothing is returned.

    Parameters
    ----------
    models : iterable of estimators exposing ``fit(X, y)`` and ``score(X, y)``.
    N : int, number of folds.
    X : DataFrame of features; defaults to the notebook-level ``train``.
    y : Series of targets; defaults to the notebook-level ``targets``.
    random_state : seed for the fold shuffle. ``None`` (the default) keeps the
        previous unseeded behaviour; an int gives every model identical folds.
    """
    X = train if X is None else X
    y = targets if y is None else y
    folds = KFold(n_splits=N, shuffle=True, random_state=random_state)
    for model in models:
        print(model)
        scores = []
        for train_index, test_index in folds.split(X, y):
            # KFold yields positional indices, so select with .iloc; .loc only
            # worked while the index happened to be a 0-based range.
            Xtrain, Xtest = X.iloc[train_index], X.iloc[test_index]
            ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
            model.fit(Xtrain, ytrain)
            scores.append(model.score(Xtest, ytest))
            print('*', end='')  # one star per completed fold
        print(" done!")
        print("Average generalization score:", np.mean(scores))
        print("Standard deviation:", np.std(scores))
        print()
        
# Score every candidate model with 3-fold cross validation
learn(Models, N=3)

RandomForestRegressor(random_state=0)
*** done!
Average generalization score: 0.8538601360603179
Standard deviation: 0.016968000255485467

Lasso(alpha=10, max_iter=10000)
*** done!
Average generalization score: 0.8219961374142243
Standard deviation: 0.06314240494006386

MLPRegressor(activation='logistic', hidden_layer_sizes=5, max_iter=10000,
             solver='sgd')
*** done!
Average generalization score: -0.0009158497758523509
Standard deviation: 0.0005534134567801709

GaussianProcessRegressor()
*** done!
Average generalization score: -5.21359046390218
Standard deviation: 0.22474984165366596



In [89]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters of Lasso tuning using GridSearchCV
# Candidate alphas are the six powers of ten from 1 to 100000.
parameters = {'alpha': np.logspace(0, 5, 6)}
Lreg = linear_model.Lasso(max_iter=100000)
reg = GridSearchCV(Lreg, parameters).fit(X_train, y_train)

# Headline result, with blank lines between the sections
print(
    "Best parameters set found on development set:",
    "",
    reg.best_params_,
    "",
    "Grid scores on development set:",
    "",
    sep="\n",
)

# One line per candidate: mean score, twice its standard deviation, parameters
means = reg.cv_results_['mean_test_score']
stds = reg.cv_results_['std_test_score']
for position, params in enumerate(reg.cv_results_['params']):
    mean, std = means[position], stds[position]
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

Best parameters set found on development set:

{'alpha': 100.0}

Grid scores on development set:

0.761 (+/-0.213) for {'alpha': 1.0}
0.786 (+/-0.340) for {'alpha': 10.0}
0.802 (+/-0.365) for {'alpha': 100.0}
0.756 (+/-0.398) for {'alpha': 1000.0}
0.696 (+/-0.411) for {'alpha': 10000.0}
0.672 (+/-0.419) for {'alpha': 100000.0}


In [90]:
from sklearn import linear_model
import numpy as np
# Fit a Lasso with alpha = 100 on the development split and inspect it
Lreg = linear_model.Lasso(alpha = 100, max_iter = 10000)
Lreg.fit(X_train,y_train)
# The hold-out score used to be computed and thrown away; show it.
print("Hold-out score:", Lreg.score(X_test, y_test))
# Sanity check: one coefficient per feature column
print(Lreg.coef_.shape)
print(train.columns.shape)
# Build the table column by column so 'Coeff' stays numeric rather than
# becoming an object column mixed with the attribute names.
lasso_coefs = pd.DataFrame({'Attribute': train.columns, 'Coeff': Lreg.coef_})
# Features the Lasso kept (non-zero weight), most negative first
lasso_coefs[lasso_coefs['Coeff'] != 0].sort_values(by='Coeff')

(331,)
(331,)


Unnamed: 0,Attribute,Coeff
138,RoofMatl_ClyTile,-475965.0
21,KitchenAbvGr,-9022.39
82,Neighborhood_Mitchel,-6910.68
216,BsmtExposure_No,-6779.84
85,Neighborhood_NWAmes,-6173.45
20,BedroomAbvGr,-6128.22
143,RoofMatl_Tar&Grv,-5384.47
221,BsmtFinType1_LwQ,-5263.58
180,MasVnrType_BrkCmn,-5209.7
78,Neighborhood_Edwards,-4830.11


## Submission

In [91]:
# Training of the model
# NOTE(review): `reg` is the name the Lasso tuning cell binds to its
# GridSearchCV object; if it still refers to that object, fitting here re-runs
# the search on all training rows. Confirm this is the intended final model.
model = reg
# fit() returns the estimator itself, so the prediction on the test dataset
# can be chained onto it
ypred = model.fit(train, targets).predict(test)

In [71]:
# Formatting of the output
# Pair each test Id with its prediction by position. The previous
# pd.concat(axis=1) matched rows on index labels, which silently produces NaN
# rows whenever df_test does not carry a default 0..n-1 index.
assert len(ypred) == len(df_test), "one prediction is expected per test row"
dfsubmission = pd.DataFrame({'Id': df_test['Id'].to_numpy(), 'SalePrice': ypred})
# CSV creation
dfsubmission.to_csv('out.csv', index = False)