In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [2]:
from sklearn.feature_selection import SelectFromModel
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

In [3]:

# Feature engineering: load the cleaned train and test files.
# keep_default_na=False keeps pandas from turning placeholder strings such as
# 'NA' into NaN, so those labels stay as literal text.
# 'Id' becomes the index, so it is not treated as a feature column.
train1 = pd.read_csv('train_cleaned_knn.csv', keep_default_na=False)
train1 = train1.set_index('Id')
test1 = pd.read_csv('test_cleaned_knn.csv', keep_default_na=False)
test1 = test1.set_index('Id')

# Flag the origin of each row so the two sets can be separated again after
# the shared feature engineering.
train1['is_train'] = 1
test1['is_train'] = 0

# Stack train features on top of the test rows; the target column is dropped
# from the train part before stacking.
combined = pd.concat([train1.drop(columns='SalePrice'), test1])


In [4]:
# 1. create new features
# Every feature below is derived row-wise from columns already in `combined`.

# total area (basement + first floor + second floor)
combined['TotalSF'] = combined['TotalBsmtSF'] + combined['1stFlrSF'] + combined['2ndFlrSF']

# house age (year sold - year built)
combined['HouseAge'] = combined['YrSold'] - combined['YearBuilt']

# renewal age (year sold - year remodeled)
combined['RemodAge'] = combined['YrSold'] - combined['YearRemodAdd']

# total bathroom count (above ground + basement); a half bath counts as 0.5
combined['TotalBath'] = (
    combined['FullBath'] + 0.5 * combined['HalfBath']
    + combined['BsmtFullBath'] + 0.5 * combined['BsmtHalfBath']
)

# total rooms (excluding bathrooms)
# NOTE(review): if TotRmsAbvGrd already counts bedrooms, this sum counts them
# twice - confirm against the data dictionary.
combined['TotalRooms'] = combined['TotRmsAbvGrd'] + combined['BedroomAbvGr']

# garage age (0 when there is no garage)
# A missing or zero GarageYrBlt is treated as "no garage" and gets age 0.
# Subtracting a zero-filled year instead would yield the sale year (~2000).
garage_age = combined['YrSold'] - combined['GarageYrBlt']
garage_age.loc[~(combined['GarageYrBlt'] > 0)] = 0
combined['GarageAge'] = garage_age

# swimming pool, fence, miscellaneous features (1 = present, 0 = absent)
combined['HasPool'] = (combined['PoolArea'] > 0).astype(int)
combined['HasFence'] = (combined['Fence'] != 'None').astype(int)
combined['HasMisc'] = (combined['MiscFeature'] != 'None').astype(int)

# additional interaction features
combined['SF_Qual'] = combined['TotalSF'] * combined['OverallQual']
combined['LivArea_Bedroom'] = combined['GrLivArea'] * combined['BedroomAbvGr']

# financial crisis feature
# The crisis window runs from August 2007 through September 2008 inclusive.
# Sale dates are compared as a single month counter (year * 12 + month), which
# makes the three flags mutually exclusive and avoids mixing `&`/`|` with
# comparisons: `&` binds tighter than `<`/`>`, so an unparenthesised
# `mask & month < 8` compares the result of the `&`, not the month.
sale_month_index = combined['YrSold'] * 12 + combined['MoSold']
crisis_start = 2007 * 12 + 8   # August 2007
crisis_end = 2008 * 12 + 9     # September 2008
combined['beforeCrisis'] = (sale_month_index < crisis_start).astype(int)
combined['inCrisis'] = ((sale_month_index >= crisis_start) & (sale_month_index <= crisis_end)).astype(int)
combined['afterCrisis'] = (sale_month_index > crisis_end).astype(int)

# season feature
def get_season(month):
    """Return the season name for a month number.

    Months 3-5 give 'Spring', 6-8 give 'Summer', 9-11 give 'Fall'; any other
    value (including 12, 1 and 2) gives 'Winter'.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Everything that is not a spring, summer or fall month falls through here.
    return 'Winter'
# Label every sale with the season of its sale month.
combined['Season'] = combined['MoSold'].map(get_season)


In [5]:
# List the distinct FireplaceQu labels before ordinal encoding.
# A notebook cell echoes only its last expression, so the number of levels is
# read off the length of this array rather than from a separate nunique() call.
combined['FireplaceQu'].unique()

array(['None', 'TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [6]:
# List the distinct ExterQual labels before ordinal encoding.
combined.loc[:, 'ExterQual'].unique()

array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

In [7]:
# 2. categorical encoding
# ordinal variable mapping: replace ordered quality labels with integers so
# that a larger number means a better level.
ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
                'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond',
                'BsmtExposure', 'Functional', 'PoolQC']

# Default scale, used by every ordinal column without a dedicated map below.
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0, 'NA': 0}
# Both spellings of the "absent" label ('None' and 'NA') map to 0, matching
# quality_map, so an absent basement or pool cannot silently become NaN.
bsmt_exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0, 'NA': 0}
functional_map = {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4,
                  'Maj2': 3, 'Sev': 2, 'Sal': 1}
PoolQC_map = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0, 'NA': 0}

# Columns with their own scale; everything else falls back to quality_map.
column_specific_maps = {
    'BsmtExposure': bsmt_exposure_map,
    'Functional': functional_map,
    'PoolQC': PoolQC_map,
}

for col in ordinal_cols:
    level_map = column_specific_maps.get(col, quality_map)
    # Series.map turns every label missing from the dict into NaN without any
    # warning, so fail loudly instead. This also catches an accidental second
    # run of this cell, when the column already holds integers.
    unmapped = set(combined[col].unique()) - set(level_map)
    if unmapped:
        raise ValueError(
            f"Column {col!r} contains labels with no ordinal mapping: "
            f"{sorted(unmapped, key=str)}"
        )
    combined[col] = combined[col].map(level_map)

In [8]:
# label encoding
# Unordered categorical columns are replaced by integer codes. One fitted
# encoder per column is kept in `label_encoders` so codes can be translated
# back to the original labels later.
nominal_cols = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'GarageType',
    'GarageFinish', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType',
    'SaleCondition', 'Season',
]
label_encoders = {col: LabelEncoder() for col in nominal_cols}
for col, encoder in label_encoders.items():
    # Each encoder is fitted on the train and test rows together.
    combined[col] = encoder.fit_transform(combined[col])

# encoding Neighborhood: mean SalePrice per neighbourhood, from the training rows
# `combined['Neighborhood']` holds LabelEncoder integer codes at this point,
# while `train1['Neighborhood']` still holds the original strings. The means
# are therefore re-keyed by integer code before mapping; mapping the
# string-keyed Series directly matches no row and the fillna below would turn
# the whole column into a single constant.
# NOTE(review): the means come from the same training rows they are attached
# to, so this feature carries target information - consider out-of-fold means.
neighborhood_mean = train1.groupby('Neighborhood')['SalePrice'].mean()
neighborhood_codes = label_encoders['Neighborhood'].transform(neighborhood_mean.index)
neighborhood_mean_by_code = pd.Series(neighborhood_mean.to_numpy(), index=neighborhood_codes)
combined['Neighborhood_MeanPrice'] = combined['Neighborhood'].map(neighborhood_mean_by_code)
# Neighbourhoods with no training rows have no mean of their own; they receive
# the average of the per-neighbourhood means.
combined['Neighborhood_MeanPrice'] = combined['Neighborhood_MeanPrice'].fillna(neighborhood_mean.mean())

# 3. clustering
# Assign each house to one of five k-means clusters built from size and
# overall quality. The clustering runs on the values as they are at this
# point; standardisation only happens in a later step.
cluster_features = ['TotalSF', 'OverallQual', 'GrLivArea']
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(combined[cluster_features])
combined['Cluster'] = kmeans.labels_

# 4. delete low information columns
# These three columns are removed from the feature table.
low_info_cols = ['LowQualFinSF', 'MiscVal', 'PoolArea']
combined = combined.drop(columns=low_info_cols)

# 5. deal with skewness
# The target is replaced in place by log(1 + SalePrice); running this cell a
# second time would apply the log twice.
train1['SalePrice'] = np.log1p(train1['SalePrice'])

# Candidate columns for the skew correction: every numeric column except the
# train/test flag, the crisis indicators and the label-encoded nominal columns.
colums_to_drop = ['is_train', 'beforeCrisis', 'inCrisis', 'afterCrisis'] + nominal_cols
numeric_cols = combined.select_dtypes(include=[np.number]).columns.drop(colums_to_drop)

# Per-column skewness, largest first; keep the columns whose absolute
# skewness exceeds 0.75.
column_skewness = combined[numeric_cols].apply(pd.Series.skew).sort_values(ascending=False)
skewed_cols = column_skewness[column_skewness.abs() > 0.75].index

# log1p is applied after clipping negative values to 0.
for col in skewed_cols:
    clipped = combined[col].clip(lower=0)
    combined[col] = np.log1p(clipped)

# 6. standardize numeric features
# The scaler is fitted on the train and test rows together and then applied
# to the same columns.
scaler = StandardScaler()
scaler.fit(combined[numeric_cols])
combined[numeric_cols] = scaler.transform(combined[numeric_cols])

# 7. separate train and test sets
# Rows are split back by the is_train flag, which is then removed.
train_rows = combined['is_train'] == 1
test_rows = combined['is_train'] == 0
train_processed = combined[train_rows].drop(columns='is_train')
test_processed = combined[test_rows].drop(columns='is_train')
# Re-attach the target; the assignment aligns on the Id index.
train_processed['SalePrice'] = train1['SalePrice']

# 8. validate feature engineering
# Print the engineered feature names, then the shape and the largest
# per-column null count of each processed table.
print("New features created:", [
    'TotalSF', 'HouseAge', 'RemodAge', 'TotalBath',
    'TotalRooms', 'GarageAge', 'HasPool', 'HasFence',
    'HasMisc', 'SF_Qual', 'LivArea_Bedroom',
    'Neighborhood_MeanPrice', 'Season', 'Cluster',
    'beforeCrisis', 'inCrisis', 'afterCrisis',
])
for label, frame in (("Train processed shape:", train_processed),
                     ("Test processed shape:", test_processed)):
    print(label, frame.shape)
for label, frame in (("Missing values in train:", train_processed),
                     ("Missing values in test:", test_processed)):
    print(label, frame.isnull().sum().max())

# save processed data
# index=False means the Id index is not written to either file.
for frame, path in ((train_processed, 'train_processed.csv'),
                    (test_processed, 'test_processed.csv')):
    frame.to_csv(path, index=False)

New features created: ['TotalSF', 'HouseAge', 'RemodAge', 'TotalBath', 'TotalRooms', 'GarageAge', 'HasPool', 'HasFence', 'HasMisc', 'SF_Qual', 'LivArea_Bedroom', 'Neighborhood_MeanPrice', 'Season', 'Cluster', 'beforeCrisis', 'inCrisis', 'afterCrisis']
Train processed shape: (1456, 94)
Test processed shape: (1459, 93)
Missing values in train: 0
Missing values in test: 0


In [9]:
# Show the five training columns with the most null values.
train_null_counts = train_processed.isnull().sum()
print(train_null_counts.sort_values(ascending=False).head(5))

MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
dtype: int64


In [10]:
# Show the five test columns with the most null values.
test_null_counts = test_processed.isnull().sum()
print(test_null_counts.sort_values(ascending=False).head(5))

MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
dtype: int64
