# 🛠️ Feature Engineering – House Prices Dataset

In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Read the training split from disk into a DataFrame
train = pd.read_csv(filepath_or_buffer='../data/train.csv')
# Cell output: (number of rows, number of columns)
train.shape

## 🧩 Interpret missing values as absence of features

In [None]:
# Replace NaNs with 'None' or 0 where a missing value means the feature is absent
# (semantic replacement, not imputation).
none_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
]
# One vectorised fill over all sentinel columns instead of a per-column loop.
train[none_cols] = train[none_cols].fillna('None')

zero_cols = ['GarageYrBlt', 'MasVnrArea', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
train[zero_cols] = train[zero_cols].fillna(0)

# 'Electrical' is deliberately NOT in none_cols: a missing electrical system is
# not "absence of a feature", so a 'None' label would create a spurious category
# (NOTE(review): confirm against the dataset's data description that NA is not a
# defined Electrical level). Treat it as a genuinely missing value and fill it
# with the most frequent category instead.
electrical_modes = train['Electrical'].mode()  # mode() ignores NaN by default
if not electrical_modes.empty:  # empty only if the whole column is NaN
    train['Electrical'] = train['Electrical'].fillna(electrical_modes.iloc[0])

# Check remaining missing values
train.isnull().sum().sort_values(ascending=False).head(10)

## 🧪 Add binary features to indicate presence of key attributes

In [None]:
# Build 0/1 flag columns: 1 when the source column holds anything other than
# the 'None' label, 0 when it holds 'None'.
indicator_sources = {
    'HasPool': 'PoolQC',
    'HasGarage': 'GarageType',
    'HasBasement': 'BsmtQual',
    'HasFireplace': 'FireplaceQu',
    'HasFence': 'Fence',
}
for flag_name, source_col in indicator_sources.items():
    train[flag_name] = train[source_col].ne('None').astype(int)

# Preview the new flag columns (dict order matches insertion order above)
train[list(indicator_sources)].head()

## 🧮 Neighborhood-wise imputation of LotFrontage

In [None]:
# Fill LotFrontage by median value per Neighborhood.
# The per-row neighborhood median is computed first and used only to fill the
# gaps, so rows that already have a value are never overwritten (this also
# protects rows whose Neighborhood key is itself missing).
neighborhood_median = train.groupby('Neighborhood')['LotFrontage'].transform('median')

# Fallback for neighborhoods with no observed LotFrontage at all: their group
# median is NaN, so use the overall median of the observed values instead.
overall_median = train['LotFrontage'].median()

train['LotFrontage'] = (
    train['LotFrontage']
    .fillna(neighborhood_median)
    .fillna(overall_median)
)

# Confirm LotFrontage has no missing values
train['LotFrontage'].isnull().sum()

## 🔤 Encode Categorical Variables

In [None]:
# Collect the names of all object-dtype columns as a plain Python list
object_frame = train.select_dtypes(include='object')
cat_cols = list(object_frame.columns)
cat_cols

In [None]:
# Label-encode every categorical column in place (simple baseline encoding)
from sklearn.preprocessing import LabelEncoder

# One independent encoder per column, keyed by column name, so the fitted
# classes of each column remain available after encoding.
label_encoders = {name: LabelEncoder() for name in cat_cols}

for name, encoder in label_encoders.items():
    # fit_transform learns the column's classes and returns their integer codes
    train[name] = encoder.fit_transform(train[name])

# Preview encoded data
train[cat_cols].head()

## 💾 Save Processed Data

In [None]:
# Write the processed frame to CSV; index=False leaves the row index out of the file
train.to_csv(path_or_buf='../data/train_clean.csv', index=False)
print("✅ Saved to ../data/train_clean.csv")