In [2]:
import pandas as pd

train = pd.read_csv('/content/drive/MyDrive/ames-housing-ml/data/processed/train_cleaned.csv')
test = pd.read_csv('/content/drive/MyDrive/ames-housing-ml/data/processed/test_cleaned.csv')

In [3]:
import numpy as np
train['SalePrice'] = np.log1p(train['SalePrice'])

In [4]:
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

In [5]:
train['TotalBath'] = train['FullBath'] + 0.5*train['HalfBath'] + train['BsmtFullBath'] + 0.5*train['BsmtHalfBath']
test['TotalBath'] = test['FullBath'] + 0.5*test['HalfBath'] + test['BsmtFullBath'] + 0.5*test['BsmtHalfBath']

In [6]:
train['OverallScore'] = train['OverallQual'] + train['OverallCond']
test['OverallScore'] = test['OverallQual'] + test['OverallCond']

In [7]:
train['YrSinceRemod'] = train['YrSold'] - train['YearRemodAdd']
test['YrSinceRemod'] = test['YrSold'] - test['YearRemodAdd']

In [8]:
train['HasPool'] = train['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
test['HasPool'] = test['PoolArea'].apply(lambda x: 1 if x > 0 else 0)

train['HasGarage'] = train['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
test['HasGarage'] = test['GarageArea'].apply(lambda x: 1 if x > 0 else 0)

train['HasBasement'] = train['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
test['HasBasement'] = test['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)

In [9]:
from scipy.stats import skew

numeric_feats = train.dtypes[train.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print(skewed_feats[skewed_feats > 0.75])

MiscFeature_TenC       35.763120
Condition2_RRAn        35.763120
Exterior1st_ImStucc    35.763120
Functional_Sev         35.763120
RoofMatl_Metal         35.763120
                         ...    
GarageFinish_RFn        0.891611
BsmtUnfSF               0.852684
BsmtFinType1_Unf        0.847116
GrLivArea               0.805649
2ndFlrSF                0.755857
Length: 223, dtype: float64


  skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)


In [10]:
for feat in skewed_feats[skewed_feats > 0.75].index:
    train[feat] = np.log1p(train[feat])
    test[feat] = np.log1p(test[feat])

In [12]:
#encoding categorical variables
train = pd.get_dummies(train)
test = pd.get_dummies(test)

train, test = train.align(test, join='left', axis=1)
test = test.fillna(0)

| Feature                                           | Reasoning                                                                                                 |
| ------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- |
| `HasPool` (from `PoolArea`)                       | Binary feature captures presence of a pool. Pool size is noisy (few large pools); presence alone is more predictive. |
| `HasGarage` (from `GarageArea`)                   | Some homes may not have garages; the model benefits from a binary signal.                                            |
| `HasFireplace` (from `Fireplaces`)                | Presence of a fireplace is more important than how many.                                                             |
| `HasBasement` (from `TotalBsmtSF`)                | Like garages and pools, presence of a basement can be more predictive than size.                                     |
| `IsRemodeled` (from `YearBuilt` & `YearRemodAdd`) | Whether a house was remodeled or not, gives insight into house condition.                                            |
| `Age` (from `YrSold` & `YearBuilt`)               | Instead of raw year, the age gives better numeric scale and direct interpretation.                                   |
| `RemodAge` (from `YrSold` & `YearRemodAdd`)       | Similar to age, but for renovations.                                                                                 |
| `TotalBathrooms`                                  | Combines full and half baths in a normalized way (half = 0.5), better than modeling them separately.                 |
| `TotalSF`                                         | Sum of all floor areas + basement area. This total size is often more relevant than individual floor measurements.   |
| `PorchArea`                                       | Sum of porch areas (`OpenPorchSF`, `EnclosedPorch`, etc.), simplified as one indicator of porch space.               |
