In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Read the training CSV from the Kaggle input directory
iowa_file_path = '../input/home-data-for-ml-course/train.csv'
home_data = pd.read_csv(iowa_file_path)

# Derived columns:
#   TotalBathrooms - full baths plus half baths weighted at 0.5
#   TotalSF        - 1stFlrSF + 2ndFlrSF + GrLivArea
# NOTE(review): neither derived column appears in the `features` list used for
# modelling, so the models never see them - confirm whether that is intended.
# NOTE(review): GrLivArea presumably already includes the floor areas, which
# would make TotalSF double-count them - verify against the data dictionary.
home_data = home_data.assign(
    TotalBathrooms=lambda df: df['FullBath'] + 0.5 * df['HalfBath'],
    TotalSF=lambda df: df['1stFlrSF'] + df['2ndFlrSF'] + df['GrLivArea'],
)

# Target: log(1 + SalePrice); predictions must be inverted with np.expm1
y = np.log1p(home_data['SalePrice'])

# Numeric predictor columns fed to both models
features = ['MSSubClass','LotArea','OverallQual','OverallCond','YearBuilt',
            'YearRemodAdd','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
            'FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd',
            'Fireplaces','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch',
            'ScreenPorch','PoolArea','MiscVal','MoSold','YrSold'
]

# Raw (un-imputed) feature matrix
X_raw = home_data[features]

# Split BEFORE imputing: fitting the imputer on all rows would let validation
# rows influence the medians used to fill training rows (train/validation
# leakage), making the validation score optimistic.
train_X_raw, val_X_raw, train_y, val_y = train_test_split(X_raw, y, random_state=1)

# Medians learned from the training fold only, then applied to both folds.
# The original row index is kept so features stay aligned with train_y / val_y.
split_imputer = SimpleImputer(strategy='median')
train_X = pd.DataFrame(split_imputer.fit_transform(train_X_raw),
                       columns=features, index=train_X_raw.index)
val_X = pd.DataFrame(split_imputer.transform(val_X_raw),
                     columns=features, index=val_X_raw.index)

# `imputer` and `X` are fitted/built from every labelled row: they are what the
# final refit on the full training set and the test-set transform should use.
imputer = SimpleImputer(strategy='median')
X = pd.DataFrame(imputer.fit_transform(X_raw), columns=features, index=X_raw.index)

# Random forest with 700 estimators and a fixed seed; fit() returns the estimator
rf_model = RandomForestRegressor(n_estimators=700, random_state=1).fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)

# Gradient boosting with 500 estimators and the same seed
gbm_model = GradientBoostingRegressor(n_estimators=500, random_state=1).fit(train_X, train_y)
gbm_val_predictions = gbm_model.predict(val_X)

# Equal-weight blend of the two models, scored as RMSE against val_y
# (both are on the transformed-target scale, not in currency units)
val_preds = (rf_val_predictions + gbm_val_predictions) / 2
val_rmse = np.sqrt(mean_squared_error(val_y, val_preds))
print("Validation RMSE:", val_rmse)

Validation RMSE: 0.14184472861566697


In [2]:
# Refit both models on every labelled row before predicting the test set
rf_model.fit(X, y)
gbm_model.fit(X, y)

# Load test data (read once)
test_data_path = '../input/home-data-for-ml-course/test.csv'
test_data = pd.read_csv(test_data_path)

# Missing values are deliberately NOT pre-filled here. A blanket fillna(-1)
# would leave the fitted imputer with nothing to impute and would feed the
# models a -1 sentinel they never saw during training.

# Add the same engineered columns as the training frame
# NOTE(review): these columns are not in `features`, so they do not reach the models.
test_data['TotalBathrooms'] = test_data['FullBath'] + 0.5 * test_data['HalfBath']
test_data['TotalSF'] = test_data['1stFlrSF'] + test_data['2ndFlrSF'] + test_data['GrLivArea']

# Select the model features in the same column order used for training
test_X = test_data[features]

# Fill missing values with the medians learned from the training data
test_X = pd.DataFrame(imputer.transform(test_X), columns=test_X.columns)

# Predict, then undo the log1p target transform with expm1
# NOTE(review): the validation blend averages the two models on the log scale,
# whereas this averages after expm1 (price scale) - confirm which is intended.
test_preds1 = np.expm1(rf_model.predict(test_X))
test_preds2 = np.expm1(gbm_model.predict(test_X))
test_preds = (test_preds1 + test_preds2) / 2

# Create submission
output = pd.DataFrame({'Id': test_data["Id"], 'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)

In [3]:

# Show the feature list the models were actually trained on, rather than a
# hand-copied literal that can silently drift from `features`.
features

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [4]:
# Preview the first five rows of the submission frame
output.head(n=5)

Unnamed: 0,Id,SalePrice
0,1461,125451.680656
1,1462,158010.052852
2,1463,177963.947741
3,1464,185192.347682
4,1465,191270.479038
