In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the competition files; the 'Id' column becomes the row index of each frame
X_train = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/train.csv', index_col='Id')
X_test = pd.read_csv('../input/housing-prices-competition-for-kaggle-learn-users/test.csv', index_col='Id')

# Rows that have no SalePrice cannot be used for supervised training
X_train = X_train.dropna(axis=0, subset=['SalePrice'])

# Split the target off so that X_train holds predictors only
y_train = X_train['SalePrice']
X_train = X_train.drop(columns=['SalePrice'])

# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable
X_train_2, X_valid, y_train_2, y_valid = train_test_split(
    X_train, y_train, train_size=0.8, test_size=0.2, random_state=0
)

In [44]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit an XGBoost regressor on the training split and score it on the validation split.

    Args:
        X_train: Training predictors passed to ``fit``.
        X_valid: Validation predictors passed to ``predict``.
        y_train: Training target values.
        y_valid: Validation target values the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the model's predictions.
    """
    regressor = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
    regressor.fit(X_train, y_train)
    return mean_absolute_error(y_valid, regressor.predict(X_valid))

In [45]:
#Question 1: drop every categorical (object-dtype) column, impute, and score

from sklearn.impute import SimpleImputer

# Keep only the non-object columns of both splits
drop_X_train_2 = X_train_2.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

# The medians are learned from the training split only and reused on the validation split
my_imputer = SimpleImputer(strategy='median')
imputed_X_train_2 = pd.DataFrame(
    my_imputer.fit_transform(drop_X_train_2),
    columns=drop_X_train_2.columns,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(drop_X_valid),
    columns=drop_X_valid.columns,
)

print("MAE:")
print(score_dataset(imputed_X_train_2, imputed_X_valid, y_train_2, y_valid))

MAE:
17608.601228060787


MAE:
17608.601228060787

In [46]:
#Question 2: same drop-categoricals approach, refit on all training rows to predict the test set

# Non-object columns of the full training data and of the test data
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_test = X_test.select_dtypes(exclude=['object'])

# my_imputer is the median imputer created for Question 1; fit_transform refits it here
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(drop_X_train),
    columns=drop_X_train.columns,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(drop_X_test),
    columns=drop_X_test.columns,
)

model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
model.fit(imputed_X_train, y_train)
preds = model.predict(imputed_X_test)

# Submission file: one row per test Id with its predicted SalePrice
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds})
output.to_csv('Drop columns with categorical features.csv', index=False)



leaderboard score: 15977.47894

In [47]:
#Question 3: ordinal-encode the categorical columns, impute, and score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

# Categorical columns are the object-dtype columns of the training split
object_cols = [col for col in X_train_2.columns if X_train_2[col].dtype == "object"]

# A column is encoded only when every value seen in the validation split also occurs
# in the training split, so the fitted encoder never meets an unseen category
good_label_cols = [col for col in object_cols
                   if set(X_valid[col]).issubset(set(X_train_2[col]))]

# Every other categorical column is dropped from both splits
bad_label_cols = list(set(object_cols) - set(good_label_cols))

ordinal_encoded_X_train_2 = X_train_2.drop(bad_label_cols, axis=1)
ordinal_encoded_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Fit the encoder on the training split, then apply the same mapping to validation
ordinal_encoder = OrdinalEncoder()
ordinal_encoded_X_train_2[good_label_cols] = ordinal_encoder.fit_transform(X_train_2[good_label_cols])
ordinal_encoded_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

# Median imputation runs after encoding, so it covers every remaining column
my_imputer = SimpleImputer(strategy="median")
imputed_X_train_2 = pd.DataFrame(
    my_imputer.fit_transform(ordinal_encoded_X_train_2),
    columns=ordinal_encoded_X_train_2.columns,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(ordinal_encoded_X_valid),
    columns=ordinal_encoded_X_valid.columns,
)

print("MAE")
print(score_dataset(imputed_X_train_2, imputed_X_valid, y_train_2, y_valid))



MAE
16501.22931827911


MAE:
16501.22931827911

In [48]:
#Question 4: ordinal encoding refit on all training rows, used to predict the test set

# Object-dtype columns of the full training data
object_cols = [name for name in X_train.columns if X_train[name].dtype == "object"]

# Encode a column only when all of its test values also appear in the training data
good_label_cols = [name for name in object_cols
                   if set(X_test[name]) <= set(X_train[name])]

# The remaining categorical columns are removed from both frames
bad_label_cols = list(set(object_cols) - set(good_label_cols))

ordinal_encoded_X_train = X_train.drop(columns=bad_label_cols)
ordinal_encoded_X_test = X_test.drop(columns=bad_label_cols)

# Learn the category-to-integer mapping on the training data and reuse it for the test data
ordinal_encoder = OrdinalEncoder()
ordinal_encoded_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
ordinal_encoded_X_test[good_label_cols] = ordinal_encoder.transform(X_test[good_label_cols])

# Fill the remaining gaps with training-set medians
my_imputer = SimpleImputer(strategy="median")
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(ordinal_encoded_X_train),
    columns=ordinal_encoded_X_train.columns,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(ordinal_encoded_X_test),
    columns=ordinal_encoded_X_test.columns,
)

model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
model.fit(imputed_X_train, y_train)

test_preds = model.predict(imputed_X_test)

# Write the submission: one row per test Id with its predicted SalePrice
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': test_preds})
output.to_csv('Apply ordinal encoding to categorical features.csv', index=False)


leaderboard score: 14664.82491

In [49]:
#Question 5: one-hot encode the low-cardinality categorical columns, impute, and score

from sklearn.preprocessing import OneHotEncoder

categorical_cols = [col for col in X_train_2.columns if X_train_2[col].dtype == 'object']
# Only categorical columns with fewer than 10 distinct training values are one-hot encoded
low_cardinality_cols = [col for col in categorical_cols if X_train_2[col].nunique() < 10]

# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; the original keyword
# is kept here -- confirm against the installed scikit-learn version before upgrading.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder returns a bare array. Labelling the encoded frames with the index of their
# source rows lets the concat below line up row for row, without re-indexing X_train_2 or
# X_valid, which other cells also read.
X_train_encoded = pd.DataFrame(
    onehot_encoder.fit_transform(X_train_2[low_cardinality_cols]),
    index=X_train_2.index,
)
X_valid_encoded = pd.DataFrame(
    onehot_encoder.transform(X_valid[low_cardinality_cols]),
    index=X_valid.index,
)

# Selecting non-object columns also discards the categorical columns that were not encoded
X_train_numeric = X_train_2.select_dtypes(exclude=['object'])
X_valid_numeric = X_valid.select_dtypes(exclude=['object'])

OH_X_train_2 = pd.concat([X_train_numeric, X_train_encoded], axis=1)
OH_X_valid = pd.concat([X_valid_numeric, X_valid_encoded], axis=1)

# The one-hot columns carry integer labels; cast every label to str so the names are uniform
OH_X_train_2.columns = OH_X_train_2.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

# Medians are learned on the training split and reused on the validation split
imputer = SimpleImputer(strategy='median')
imputed_X_train_2 = pd.DataFrame(imputer.fit_transform(OH_X_train_2), columns=OH_X_train_2.columns)
imputed_X_valid = pd.DataFrame(imputer.transform(OH_X_valid), columns=OH_X_valid.columns)

print("MAE:")
print(score_dataset(imputed_X_train_2, imputed_X_valid, y_train_2, y_valid))



MAE:
16625.319670376713


MAE:
16625.319670376713

In [50]:
#Question 6: one-hot encoding refit on all training rows, used to predict the test set

categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
# Only categorical columns with fewer than 10 distinct training values are one-hot encoded
low_cardinality_cols = [col for col in categorical_cols if X_train[col].nunique() < 10]

# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; the original keyword
# is kept here -- confirm against the installed scikit-learn version before upgrading.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Label the encoded frames with the Id index of their source rows so the concat below
# aligns row for row. X_train and X_test are deliberately NOT re-indexed: X_test.index
# must still hold the real house Ids when the submission file is written.
X_train_encoded = pd.DataFrame(
    onehot_encoder.fit_transform(X_train[low_cardinality_cols]),
    index=X_train.index,
)
X_test_encoded = pd.DataFrame(
    onehot_encoder.transform(X_test[low_cardinality_cols]),
    index=X_test.index,
)

# Selecting non-object columns also discards the categorical columns that were not encoded
X_train_numeric = X_train.select_dtypes(exclude=['object'])
X_test_numeric = X_test.select_dtypes(exclude=['object'])

OH_X_train = pd.concat([X_train_numeric, X_train_encoded], axis=1)
OH_X_test = pd.concat([X_test_numeric, X_test_encoded], axis=1)

# The one-hot columns carry integer labels; cast every label to str so the names are uniform
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_test.columns = OH_X_test.columns.astype(str)

# Medians are learned on the training data and reused on the test data
imputer = SimpleImputer(strategy='median')
imputed_X_train = pd.DataFrame(imputer.fit_transform(OH_X_train), columns=OH_X_train.columns)
imputed_X_test = pd.DataFrame(imputer.transform(OH_X_test), columns=OH_X_test.columns)

model = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
model.fit(imputed_X_train, y_train)
preds = model.predict(imputed_X_test)

# Submission file: the 'Id' column comes from X_test's original Id index
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds})
output.to_csv('Apply one-hot encoding to categorical features.csv', index=False)



