In [8]:
# Code you have previously used to load data
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Path of the file to read. We changed the directory structure to simplify submitting to a competition
iowa_file_path = './train.csv'

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Using best value for max_leaf_nodes
iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

# RF trained with all the data inside features 
rf_model_on_full_data = RandomForestRegressor(random_state=1)
rf_model_on_full_data.fit(X,y)

test_preds = rf_model_on_full_data.predict(val_X)
rf_full_mae = mean_absolute_error(test_preds, val_y)

print("Validation MAE for Full Random Forest Model: {:,.0f}".format(rf_full_mae))

# RF dropping columns with missing values
cols_with_missing = [col for col in home_data.columns 
                                 if home_data[col].isnull().any()]
X = home_data.drop(['SalePrice'], axis=1)
X = X.drop(cols_with_missing, axis=1)
X = X.select_dtypes(exclude=['object'])

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

rf_nocol = RandomForestRegressor(random_state=1)
rf_nocol.fit(X,y)

test_nocol = rf_nocol.predict(val_X)
rf_nocol_mae = mean_absolute_error(test_nocol, val_y)

print("Validation MAE for NoCol Random Forest Model: {:,.0f}".format(rf_nocol_mae))

# RF using imputation
from sklearn.impute import SimpleImputer

X = home_data.drop(['SalePrice'], axis=1)
X = X.select_dtypes(exclude=['object'])

my_imputer = SimpleImputer()
imputed_X = my_imputer.fit_transform(X)

train_X, val_X, train_y, val_y = train_test_split(imputed_X, y, random_state=1)

rf_imp = RandomForestRegressor(random_state=1)
rf_imp.fit(imputed_X,y)

test_imp = rf_imp.predict(val_X)
rf_imp_mae = mean_absolute_error(test_imp, val_y)

print("Validation MAE for Imputated Random Forest Model: {:,.0f}".format(rf_imp_mae))

#RF with extra columns for imputed values
imputed_X_plus = X.copy()

cols_with_missing = (col for col in X.columns 
                                 if X[col].isnull().any())
print(type(cols_with_missing))
for col in cols_with_missing:
    imputed_X_plus[col + '_was_missing'] = imputed_X_plus[col].isnull()

my_imputer = SimpleImputer()
imputed_X_plus = my_imputer.fit_transform(imputed_X_plus)

train_X, val_X, train_y, val_y = train_test_split(imputed_X_plus, y, random_state=1)

rf_imp_col = RandomForestRegressor(random_state=1)
rf_imp_col.fit(imputed_X_plus,y)

test_imp_col = rf_imp_col.predict(val_X)
rf_imp_col_mae = mean_absolute_error(test_imp_col, val_y)

print("Validation MAE for Imputated extra Columns Random Forest Model: {:,.0f}".format(rf_imp_col_mae))


Validation MAE when not specifying max_leaf_nodes: 29,653
Validation MAE for best value of max_leaf_nodes: 27,283
Validation MAE for Random Forest Model: 22,762
Validation MAE for Full Random Forest Model: 8,809
Validation MAE for NoCol Random Forest Model: 7,196
Validation MAE for Imputated Random Forest Model: 7,340
<type 'generator'>
Validation MAE for Imputated extra Columns Random Forest Model: 7,006
