# 🛠️ Feature Engineering – House Prices Dataset

In [8]:
# Import libraries
import pandas as pd
import numpy as np

# Read the training CSV (path is relative to the notebook's working directory)
train_csv_path = '../data/train.csv'
train = pd.read_csv(train_csv_path)

# Display (rows, columns) as the cell output
train.shape

(1460, 81)

## 🧩 Interpret missing values as absence of features

In [9]:
# Replace NaNs with 'None' or 0 where appropriate (not imputation, but semantic replacement).
# For the columns below, a missing value is interpreted as "the house does not have
# this feature" (no pool, no alley access, no garage, no basement, ...).
none_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'MasVnrType'
]
train[none_cols] = train[none_cols].fillna('None')

# Numeric columns describing the same optional features: absent feature -> 0.
# NOTE(review): GarageYrBlt = 0 acts as a sentinel for "no garage" and is far outside
# the range of real years; confirm this is acceptable for the downstream model.
zero_cols = ['GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
train[zero_cols] = train[zero_cols].fillna(0)

# 'Electrical' is NOT an "absence" column: the dataset's data dictionary defines no
# 'NA' level for it, so a NaN here is a genuinely missing record. Filling it with
# 'None' would invent an extra category; impute with the most frequent value instead.
electrical_mode = train['Electrical'].mode()
if not electrical_mode.empty:  # mode() is empty only if the whole column is NaN
    train['Electrical'] = train['Electrical'].fillna(electrical_mode.iloc[0])

# Check remaining missing values
train.isnull().sum().sort_values(ascending=False).head(10)

LotFrontage     259
Id                0
BedroomAbvGr      0
GarageYrBlt       0
GarageType        0
FireplaceQu       0
Fireplaces        0
Functional        0
TotRmsAbvGrd      0
KitchenQual       0
dtype: int64

## 🧪 Add binary features to indicate presence of key attributes

In [10]:
# Create binary indicator columns.
# Each flag is 1 when its source column holds anything other than the 'None'
# placeholder, and 0 otherwise.
presence_flags = {
    'HasPool': 'PoolQC',
    'HasGarage': 'GarageType',
    'HasBasement': 'BsmtQual',
    'HasFireplace': 'FireplaceQu',
    'HasFence': 'Fence',
}

for flag_name, source_col in presence_flags.items():
    train[flag_name] = train[source_col].ne('None').astype(int)

# Preview the new flags (dict order == column order)
train[list(presence_flags)].head()

Unnamed: 0,HasPool,HasGarage,HasBasement,HasFireplace,HasFence
0,0,1,1,0,0
1,0,1,1,1,0
2,0,1,1,1,0
3,0,1,1,1,0
4,0,1,1,1,0


## 🧮 Neighborhood-wise imputation of LotFrontage

In [11]:
# Fill LotFrontage by median value per Neighborhood.
# transform('median') returns, for every row, the median of the observed (non-NaN)
# LotFrontage values in that row's Neighborhood, aligned to train's index.
overall_median = train['LotFrontage'].median()
neighborhood_median = train.groupby('Neighborhood')['LotFrontage'].transform('median')

train['LotFrontage'] = train['LotFrontage'].fillna(neighborhood_median)

# Fallback: if a Neighborhood has no observed LotFrontage at all, its median is NaN
# and the rows above stay missing; use the overall median for those.
train['LotFrontage'] = train['LotFrontage'].fillna(overall_median)

# Confirm LotFrontage has no missing values
train['LotFrontage'].isnull().sum()

np.int64(0)

## 🔤 Encode Categorical Variables

In [12]:
# Collect the names of all object-dtype columns as a plain list
object_frame = train.select_dtypes(include='object')
cat_cols = list(object_frame.columns)
cat_cols

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [13]:
# Apply Label Encoding for simplicity (for baseline models)
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per categorical column; the fitted encoders are stored in
# label_encoders, keyed by column name.
label_encoders = {
    col_name: LabelEncoder().fit(train[col_name]) for col_name in cat_cols
}

# Replace each categorical column with its integer codes
for col_name, encoder in label_encoders.items():
    train[col_name] = encoder.transform(train[col_name])

# Preview encoded data
train[cat_cols].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,3,1,1,3,3,0,4,0,5,2,...,1,2,5,5,2,3,4,1,8,4
1,3,1,1,3,3,0,2,0,24,1,...,1,2,5,5,2,3,4,1,8,4
2,3,1,1,0,3,0,4,0,5,2,...,1,2,5,5,2,3,4,1,8,4
3,3,1,1,0,3,0,0,0,6,2,...,5,3,5,5,2,3,4,1,8,0
4,3,1,1,0,3,0,2,0,15,2,...,1,2,5,5,2,3,4,1,8,4


## 💾 Save Processed Data

In [14]:
# Write the processed frame to disk without the index column, then report success
clean_csv_path = '../data/train_clean.csv'
train.to_csv(clean_csv_path, index=False)
print("✅ Saved to ../data/train_clean.csv")

✅ Saved to ../data/train_clean.csv
