Importing Libararies

In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

Load data and seperate target

In [25]:
file_path = './train.csv'
data = pd.read_csv(file_path)
test_data = pd.read_csv('./test.csv')

y = data.SalePrice
data = data.drop('SalePrice', axis=1)

Split test data

In [26]:
numerical_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]
ordinal_cols = ['Street', 'Alley', 'LotShape', 'Utilities', 'HouseStyle', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 
                'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence']
onehot_cols = ['MSSubClass', 'MSZoning', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'RoofStyle', 'RoofMatl',
               'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']
#categorical_cols = [col for col in data.columns if data[col].nunique() < 10 and data[col].dtype == 'object']

my_cols = numerical_cols + ordinal_cols + onehot_cols
my_cols = list(set(my_cols))

data = data[my_cols].copy()
test_data = test_data[my_cols].copy()

trainX, validX, trainY, validY = train_test_split(data, y, random_state=0)

Pipelines

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

numerical_transformer = SimpleImputer(strategy='most_frequent')

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('one', onehot_transformer, onehot_cols)
])

model = RandomForestRegressor(n_estimators=100, random_state=0)

final_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model)
])

final_pipeline.fit(trainX, trainY)
predictions = final_pipeline.predict(validX)

print(mean_absolute_error(validY, predictions))

17052.46898630137


Run predictions

In [6]:
final_pipeline.fit(data, y)
predictions = final_pipeline.predict(test_data)

Generate Kaggle Housing Prices Competition Submission

In [7]:
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)