Importing Libararies

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

Cleaning Func

In [2]:
def rm_nan(df):
    for col in df:
        df.fillna({col: 0}, inplace=True)
    return df
    
def clean(df):
    df.fillna({'LotFrontage': 0}, inplace=True)
    df = df.drop(['MasVnrArea', 'GarageYrBlt'], axis=1)
    return df

def estimate_mae(trainX, validX, trainY, validY, n=100):
    model = RandomForestRegressor(n_estimators=n, random_state=0)
    model.fit(trainX, trainY)
    predictions = model.predict(validX)
    return mean_absolute_error(validY, predictions)

Load data and seperate target

In [3]:
file_path = './train.csv'
data = pd.read_csv(file_path)
test_data = pd.read_csv('./test.csv')

data = clean(data)
test_data = clean(test_data)

Clean column differences

In [4]:
cols_data = data.drop('SalePrice', axis=1)
cold_data = cols_data.columns
cols_test = test_data.columns

match_cols = [col for col in cols_test if col in cols_data]

bad_cols_data = list(set(cols_data) - set(match_cols))
bad_cols_test = list(set(cols_test) - set(match_cols))

data = data.drop(bad_cols_data, axis=1)
test_data = test_data.drop(bad_cols_test, axis=1)

Check cardinal columns

In [5]:
good_cols = [col for col in data.columns if data[col].dtype == 'object']
lcols = [col for col in good_cols if data[col].nunique() < 10]
hcols = list(set(good_cols) - set(lcols))

Split test data

In [6]:
trainX, validX, trainY, validY = train_test_split(data.drop('SalePrice', axis=1), data.SalePrice, random_state=0)

Naive Imputation

In [13]:
from sklearn.impute import SimpleImputer

IMP = SimpleImputer()

imp_data = data.drop('SalePrice', axis=1)
imp_cols = [col for col in imp_data.columns if imp_data[col].dtype != 'object']
nimp_cols = list(set(imp_data.columns) - set(imp_cols))

imp_trainX = pd.DataFrame(IMP.fit_transform(trainX[imp_cols]))
imp_validX = pd.DataFrame(IMP.transform(validX[imp_cols]))

imp_trainX.columns = trainX[imp_cols].columns
imp_validX.columns = validX[imp_cols].columns

trainX = imp_trainX.join(trainX[nimp_cols])
validX = imp_validX.join(validX[nimp_cols])

One-hot encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder

OH = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

OH_cols_train = pd.DataFrame(OH.fit_transform(trainX[lcols]))
OH_cols_valid = pd.DataFrame(OH.transform(validX[lcols]))

OH_cols_train.index = trainX.index
OH_cols_valid.index = validX.index

num_X_train = trainX.drop(lcols, axis=1)
num_X_valid = validX.drop(lcols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train = OH_X_train.drop(hcols, axis=1)
OH_X_valid = OH_X_valid.drop(hcols, axis=1)

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

Test model

In [15]:
print(estimate_mae(OH_X_train, OH_X_valid, trainY, validY, 500))

17649.415320547945


Encode test data

In [10]:
OH_cols_test = pd.DataFrame(OH.transform(test_data[lcols]))
OH_cols_test.index = test_data.index
num_X_test = test_data.drop(lcols, axis=1)
OH_test = pd.concat([num_X_test, OH_cols_test], axis=1)
OH_test = OH_test.drop(hcols, axis=1)
OH_test.columns = OH_test.columns.astype(str)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
[]


Run predictions

In [11]:
model = RandomForestRegressor(n_estimators=500, random_state=0)

#print(OH_X_train.columns, OH_test.columns)

model.fit(OH_X_train, trainY)
final_predictions = model.predict(OH_test)

Generate Kaggle Housing Prices Competition Submission

In [12]:
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': final_predictions})
output.to_csv('submission.csv', index=False)