In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Load the data
training_data_file_path = ('./data/4_housing_competition/train.csv')
home_data = pd.read_csv(training_data_file_path)

y = home_data.SalePrice

# Remove duplicate features from the list
feature_names = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    '1stFlrSF',
    '2ndFlrSF',
    'LowQualFinSF',
    'GrLivArea',
    'FullBath',
    'HalfBath',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'TotRmsAbvGrd',
    'Fireplaces',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal',
    'MoSold',
    'YrSold'
]

X = home_data[feature_names]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

print("=== APPROACH 1: DROPPING CATEGORICAL COLUMNS ===")
# Dropping categorical columns
model = RandomForestRegressor(random_state=1)

drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

# Create a model with dropped columns
model.fit(drop_X_train, y_train)
val_predictions = model.predict(drop_X_valid)  # Fixed: predict on validation data
mae = mean_absolute_error(y_valid, val_predictions)
print(f"Mean Absolute Error (Dropped Categorical): {mae}")

print("\n=== APPROACH 2: ORDINAL ENCODING ===")
# Ordinal Encoding - should be used when there's an inherent ordering to the categories
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Find categorical columns where all validation values exist in training data
good_label_cols = []
for col in object_cols:
    validation_values = set(X_valid[col])
    training_values = set(X_train[col])
    if validation_values.issubset(training_values):
        good_label_cols.append(col)

# Find problematic columns that have new categories in validation data
bad_label_cols = []
for col in object_cols:
    if col not in good_label_cols:
        bad_label_cols.append(col)

print('Categorical columns that will be ordinally encoded:', good_label_cols)
print('Categorical columns that will be dropped from the dataset:', bad_label_cols)

# Create copies and drop bad columns
ordinal_X_train = X_train.drop(bad_label_cols, axis=1).copy()
ordinal_X_valid = X_valid.drop(bad_label_cols, axis=1).copy()

# Apply ordinal encoder only if there are good columns
if good_label_cols:
    ordinal_encoder = OrdinalEncoder()
    ordinal_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
    ordinal_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

# Create a model with ordinal encoding
model = RandomForestRegressor(random_state=1)
model.fit(ordinal_X_train, y_train)
val_predictions = model.predict(ordinal_X_valid)  # Fixed: predict on validation data
mae = mean_absolute_error(y_valid, val_predictions)
print(f"Mean Absolute Error (Ordinal Encoding): {mae}")

print("\n=== APPROACH 3: ONE-HOT ENCODING ===")

# One-Hot Encoding
# This should only be used when there is no inherent order to the categories, and the cardinality of the categories is low (~<15)
if object_cols:  # Only proceed if there are categorical columns
    object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
    d = dict(zip(object_cols, object_nunique))

    # Print number of unique entries by column, in ascending order
    print("Unique values per categorical column:")
    for col, count in sorted(d.items(), key=lambda x: x[1]):
        print(f"  {col}: {count}")

    low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]
    high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

    print('\nCategorical columns that will be one-hot encoded:', low_cardinality_cols)
    print('Categorical columns that will be dropped from the dataset:', high_cardinality_cols)

    if low_cardinality_cols:  # Only proceed if there are low cardinality columns
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # Updated parameter name
        OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
        OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

        # One-hot encoding removed index; put it back
        OH_cols_train.index = X_train.index
        OH_cols_valid.index = X_valid.index

        # Remove categorical columns (will replace with one-hot encoding)
        num_X_train = X_train.drop(object_cols, axis=1)
        num_X_valid = X_valid.drop(object_cols, axis=1)

        # Add one-hot encoded columns to numerical features
        OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
        OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

        # Ensure all columns have string type
        OH_X_train.columns = OH_X_train.columns.astype(str)
        OH_X_valid.columns = OH_X_valid.columns.astype(str)

        # Create a model with one-hot encoding
        model = RandomForestRegressor(random_state=1)
        model.fit(OH_X_train, y_train)
        val_predictions = model.predict(OH_X_valid)  # Fixed: predict on validation data
        mae = mean_absolute_error(y_valid, val_predictions)
        print(f"Mean Absolute Error (One-Hot Encoding): {mae}")
    else:
        print("No low cardinality categorical columns found for one-hot encoding")
else:
    print("No categorical columns found in the dataset")

print("\n=== SUMMARY ===")
print("All three approaches have been tested successfully!")

=== APPROACH 1: DROPPING CATEGORICAL COLUMNS ===
Mean Absolute Error (Dropped Categorical): 17905.8978630137

=== APPROACH 2: ORDINAL ENCODING ===
Categorical columns that will be ordinally encoded: []
Categorical columns that will be dropped from the dataset: []
Mean Absolute Error (Ordinal Encoding): 17905.8978630137

=== APPROACH 3: ONE-HOT ENCODING ===
No categorical columns found in the dataset

=== SUMMARY ===
All three approaches have been tested successfully!
