In [1]:
# Load the training and test sets
import pandas as pd
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Rows without a sale price cannot be used for supervised training
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

target = train_data['SalePrice']

# Handling missing values is not the focus of this tutorial, so we take the
# simplest possible approach: drop every column that has a missing value in
# the training data.
# For more detail (and a better approach) to missing values, see
# https://www.kaggle.com/dansbecker/handling-missing-values
has_missing = train_data.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()
candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

# "Cardinality" is the number of unique values in a column.
# It is the only criterion used here to pick categorical columns: object
# columns with fewer than 10 distinct values are kept. This is convenient,
# though a little arbitrary.
column_dtypes = candidate_train_predictors.dtypes
distinct_counts = candidate_train_predictors.nunique()
low_cardinality_cols = [name for name in candidate_train_predictors.columns
                        if column_dtypes[name] == "object" and distinct_counts[name] < 10]
numeric_cols = [name for name in candidate_train_predictors.columns
                if column_dtypes[name] in ['int64', 'float64']]

# Categorical columns first, then numeric ones; the same selection is applied
# to both the training and the test predictors.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [2]:
# Spot-check the dtypes of five randomly chosen training predictor columns
train_predictors.dtypes.sample(n=5)

ExterCond       object
Street          object
PoolArea         int64
MSZoning        object
BsmtFullBath     int64
dtype: object

In [3]:
# One-hot encode the object (categorical) columns of the training predictors;
# numeric columns pass through unchanged.
one_hot_encoded_training_predictors = pd.get_dummies(data=train_predictors)

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y, n_estimators=50, random_state=0):
    """Return the cross-validated mean absolute error of a random forest.

    Parameters
    ----------
    X : predictors handed to ``cross_val_score``.
    y : target values handed to ``cross_val_score``.
    n_estimators : int, default 50
        Number of trees in the ``RandomForestRegressor``.
    random_state : int or None, default 0
        Seed for the forest. A fixed seed makes repeated calls comparable,
        so differences between feature sets are not just random-forest noise.
        Pass ``None`` for an unseeded forest.

    Returns
    -------
    float
        Mean over the cross-validation folds of the (positive) MAE.
    """
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    neg_mae_per_fold = cross_val_score(model, X, y,
                                       scoring='neg_mean_absolute_error')
    # sklearn scorers follow a "greater is better" convention, so the MAE is
    # returned negated; multiply by -1 to report a positive error.
    return -1 * neg_mae_per_fold.mean()

# Compare the cross-validated MAE of two feature sets: the predictors with all
# object (categorical) columns dropped versus the one-hot encoded predictors.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# int() truncates each score to a whole number for display
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

Mean Absolute Error when Dropping Categoricals: 18369
Mean Abslute Error with One-Hot Encoding: 18226


In [5]:
# Spot-check the dtypes of five randomly chosen test predictor columns
test_predictors.dtypes.sample(n=5)

LotConfig      object
HouseStyle     object
OverallCond     int64
PoolArea        int64
Functional     object
dtype: object

In [6]:
# One-hot encode the object (categorical) columns of the test predictors
one_hot_encoded_test_predictors = pd.get_dummies(data=test_predictors)

In [7]:
# Train and test were encoded separately, so their dummy columns can differ.
# An inner join on the column axis keeps only the columns present in both
# frames, giving the two frames an identical column index.
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    join='inner',
    axis=1,
)

In [54]:
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22.
# Prefer its replacement, SimpleImputer, and fall back to the old class only
# on installations that predate it.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Seed the forest so the submission is reproducible across runs
forest_model = RandomForestRegressor(n_estimators=50, random_state=0)
forest_model.fit(final_train, target)

# Learn the fill values from the training columns, then apply them to the test
# set. Fitting on the test set instead would derive statistics from the data
# being predicted, and the imputer would drop any column that is entirely
# missing there, changing the column count the forest expects.
my_imputer = Imputer()
my_imputer.fit(final_train)
predicted_prices = forest_model.predict(my_imputer.transform(final_test))
print(predicted_prices)

[ 127092.52  159418.    177000.5  ...,  158074.68  115162.    216911.04]


In [9]:
from xgboost import XGBRegressor

# Fit a gradient-boosted regressor with default hyper-parameters on the
# aligned training predictors.
my_model = XGBRegressor()
# verbose=False asks fit() not to print per-round progress messages
my_model.fit(X=final_train, y=target, verbose=False)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Estimate out-of-sample error on a hold-out split. The rows being scored must
# not have been seen during fitting, so a fresh model is trained on the
# remaining rows instead of reusing `my_model` (which was fitted on every row).
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

train_X, test_X, train_y, test_y = train_test_split(
    final_train, target, test_size=0.25, random_state=0)

holdout_model = XGBRegressor()
holdout_model.fit(train_X, train_y, verbose=False)

# make predictions
predictions = holdout_model.predict(test_X)

# sklearn's signature is mean_absolute_error(y_true, y_pred)
print("Mean Absolute Error : " + str(mean_absolute_error(test_y, predictions)))

In [56]:
# Assemble the submission table: one predicted SalePrice per test-set Id
my_submission = pd.DataFrame({
    'Id': test_data['Id'],
    'SalePrice': predicted_prices,
})
# The output file name is arbitrary
my_submission.to_csv('submission_ohe.csv', index=False)