In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
#Let us read the data
df = pd.read_csv('train.csv')
df.shape

(1460, 81)

In [3]:
#choose target as SalePrice
y = df.SalePrice
#Now set features except target
X_possible = df.drop(['SalePrice'], axis=1)
#To make simple letus drop colums having srting
X = X_possible.select_dtypes(exclude = ['object'])
X.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [4]:
X_train, X_val, y_train, y_val = train_test_split(X,y, random_state=0)
X_train.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
1292,1293,70,60.0,6600,5,4,1892,1965,0.0,0,...,432,0,287,0,0,0,0,0,12,2009
1018,1019,80,,10784,7,5,1991,1992,76.0,0,...,402,164,0,0,0,0,0,0,5,2007
1213,1214,80,,10246,4,9,1965,2001,0.0,648,...,364,88,0,0,0,0,0,0,5,2006
1430,1431,60,60.0,21930,5,5,2005,2005,0.0,0,...,372,100,40,0,0,0,0,0,7,2006
810,811,20,78.0,10140,6,6,1974,1999,99.0,663,...,484,265,0,0,0,0,648,0,1,2006


In [5]:
#Many columns has missing values so let us find those column which has missing values
X_missing = [ col for col in X_train.columns if X_train[col].isnull().any()]
#now drops the colums which has missing value in train and test set
X_reduced_train = X_train.drop(X_missing, axis=1)
X_reduced_val = X_val.drop(X_missing, axis=1)
X_missing

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [6]:
#Now fit the model
model_1 = RandomForestRegressor(n_estimators=10, random_state=0)
model_1.fit(X_reduced_train, y_train)
predict_1 = model_1.predict(X_reduced_val)
mae_1 = mean_absolute_error(predict_1, y_val)
mae_1

19895.280821917808

# Let us use imputation to replace the missing value

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
my_imputer = SimpleImputer()
X_impute_train = pd.DataFrame(my_imputer.fit_transform(X_train))
X_impute_val = pd.DataFrame(my_imputer.fit_transform(X_val))

In [9]:
#Imputation removed column names so that need to put them back
X_impute_train.columns = X_train.columns
X_impute_val.columns = X_val.columns

In [10]:
#Now fit the model 
model_2 = RandomForestRegressor(n_estimators=10, random_state=1)
model_2.fit(X_impute_train, y_train)
predict_2 = model_2.predict(X_impute_val)
mae_2 = mean_absolute_error(predict_2, y_val)
mae_2

18909.105205479453

In [11]:
#Another way to impute

In [12]:
#First copy data
X_train_new = X_train.copy()
X_val_new = X_val.copy()

In [13]:
for col in X_missing:
    X_train_new[col + '_was_missing'] = X_train_new[col].isnull()
    X_val_new[col + '_was_missing'] = X_val_new[col].isnull()
    

In [14]:
X_train_new.head(5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,LotFrontage_was_missing,MasVnrArea_was_missing,GarageYrBlt_was_missing
1292,1293,70,60.0,6600,5,4,1892,1965,0.0,0,...,0,0,0,0,0,12,2009,False,False,False
1018,1019,80,,10784,7,5,1991,1992,76.0,0,...,0,0,0,0,0,5,2007,True,False,False
1213,1214,80,,10246,4,9,1965,2001,0.0,648,...,0,0,0,0,0,5,2006,True,False,False
1430,1431,60,60.0,21930,5,5,2005,2005,0.0,0,...,0,0,0,0,0,7,2006,False,False,False
810,811,20,78.0,10140,6,6,1974,1999,99.0,663,...,0,0,0,648,0,1,2006,False,False,False


In [15]:
my_imputer = SimpleImputer()
X_impute_train_new = pd.DataFrame(my_imputer.fit_transform(X_train_new))
X_impute_val_new = pd.DataFrame(my_imputer.fit_transform(X_val_new))

In [16]:
X_impute_train_new.columns = X_train_new.columns
X_impute_val_new.columns = X_val_new.columns

In [17]:
model_3 = RandomForestRegressor(n_estimators=10, random_state=1)
model_3.fit(X_impute_train_new, y_train)
predict_3 = model_3.predict(X_impute_val_new)
mae_3 = mean_absolute_error(predict_3, y_val)
mae_3

19125.07698630137