## Categorical & Advanced Feature Engineering
### 1. Load and Merge Data
- Combine the categorical columns of the raw training data with the engineered numerical features



In [1]:
import pandas as pd
import numpy as np

# Read the raw training rows and the previously engineered numerical features
train_raw = pd.read_csv("train_data.csv")
train_num = pd.read_csv("engineered_numerical_train.csv")

# Categorical columns that the following steps will engineer
categorical_cols = ['street', 'city', 'statezip', 'country']
categorical_data = train_raw.loc[:, categorical_cols]

# Place the categorical columns to the right of the numerical ones.
# Both frames come straight from read_csv, so rows are matched on the
# default 0..n-1 index.
combined_data = pd.concat((train_num, categorical_data), axis="columns")




### 2. Encoding Categorical Variables
- Convert categorical data into numeric features

In [2]:
# Target encoding: mean training price per city and per statezip.
# NOTE(review): every row's own price is part of its group mean, so the
# encoded training columns carry some target information — confirm this is
# acceptable for the downstream model.
city_mean = train_raw['price'].groupby(train_raw['city']).mean()
statezip_mean = train_raw['price'].groupby(train_raw['statezip']).mean()

# Remove 'street' and 'country' (uninformative or too unique), then attach
# the per-row group means as two new columns.
combined_data = combined_data.drop(columns=['street', 'country']).assign(
    city_encoded=train_raw['city'].map(city_mean),
    statezip_encoded=train_raw['statezip'].map(statezip_mean),
)

# Persist the encoded training features
combined_data.to_csv("engineered_categorical_train.csv", index=False)




### 3. Apply Same Encoding to Test Set
- Prevent data leakage

In [3]:
# Load raw test data
test_raw = pd.read_csv("test_data.csv")

# Drop uninformative columns; drop() returns a new frame, so test_raw is
# left exactly as loaded.
test_cleaned = test_raw.drop(columns=["street", "country"])

# Target-encode the test set using statistics computed on the training set
# only (city_mean / statezip_mean come from the training encoding step), so
# no test prices influence the features.
# Categories absent from the training data map to NaN and are filled with
# the global mean training price.
global_mean = train_raw["price"].mean()
test_cleaned["city_encoded"] = test_raw["city"].map(city_mean).fillna(global_mean)
test_cleaned["statezip_encoded"] = test_raw["statezip"].map(statezip_mean).fillna(global_mean)

# Drop raw categorical columns now that their encoded versions exist
test_encoded = test_cleaned.drop(columns=["city", "statezip"])

# Save to file
test_encoded.to_csv("engineered_categorical_test.csv", index=False)


### 4. Create KNN-Based Feature
- Add an advanced feature: average price of 5 most similar homes
- In the training set, we used `n_neighbors=6` and manually excluded the first neighbor (which is the sample itself) when calculating `knn_avg_price_5`. This is necessary because each training point is always its own closest neighbor.

In [4]:
# Create `knn_avg_price_5` for training set
# Reload the feature files written by the earlier steps
train_num = pd.read_csv("engineered_numerical_train.csv")
categorical_features = pd.read_csv("engineered_categorical_train.csv")

# Remove the 'Unnamed: 0' column if the CSV has one (ignored when absent)
train_num = train_num.drop(columns=['Unnamed: 0'], errors='ignore')

# Reconstruct price column from raw data
price = train_raw['price'].reset_index(drop=True)

# Merge all components into one training set (rows matched on 0..n-1 index)
train_combined = pd.concat([
    train_num.reset_index(drop=True),
    categorical_features[['city_encoded', 'statezip_encoded']].reset_index(drop=True),
    price
], axis=1)

# Select features for KNN similarity
knn_features = [
    'house_age',
    'sqft_living_per_floor',
    'sqft_living_lot_ratio',
    'total_rooms_approx',
    'log_sqft_living',
    'bed_bath_interaction'
]

# Drop rows with missing values; the index is reset so that the positional
# indices returned by kneighbors() line up with the row labels.
train_knn = train_combined.dropna(subset=knn_features + ['price']).reset_index(drop=True)

# Build NearestNeighbors model: 6 = the 5 wanted neighbours + the row itself
from sklearn.neighbors import NearestNeighbors
knn_model = NearestNeighbors(n_neighbors=6, metric='euclidean')
knn_model.fit(train_knn[knn_features])

# Find nearest neighbors of every training row within the training set
distances, indices = knn_model.kneighbors(train_knn[knn_features])

# Exclude each row's OWN index rather than blindly dropping column 0:
# when several rows share identical feature values (distance 0 ties), the
# row itself is not guaranteed to be returned first, and dropping column 0
# would then keep the row's own price in its average.
row_positions = np.arange(len(train_knn))[:, None]
is_self = indices == row_positions
# If a row has so many exact duplicates that it is missing from its own
# 6-neighbour list, drop the farthest neighbour so exactly 5 remain.
self_missing = ~is_self.any(axis=1)
is_self[self_missing, -1] = True
neighbor_idx = indices[~is_self].reshape(len(train_knn), -1)

# Average price of the 5 remaining neighbours, computed for all rows at once
price_values = train_knn['price'].to_numpy()
neighbor_prices = price_values[neighbor_idx].mean(axis=1)

# Add as a new feature
train_knn['knn_avg_price_5'] = neighbor_prices

# Save the final version with the KNN feature
train_knn.to_csv("engineered_categorical_train_knn.csv", index=False)


However, when computing the KNN-based feature for the test set, we query neighbors from the training data — and the test instance does not exist in the training set. Therefore, it cannot be its own neighbor.\
As such, we set `n_neighbors=5` directly when constructing `knn_avg_price_5` for the test set, and we do not need to drop the first neighbor.\
This distinction ensures that the KNN feature reflects the local price environment of truly external neighboring homes — which is exactly how the model would behave in deployment.

In [5]:
# Read the test-side feature files (same sources as for training) and
# remove the 'Unnamed: 0' column when present.
test_num = pd.read_csv("engineered_numerical_test.csv").drop(
    columns=['Unnamed: 0'], errors='ignore'
)
test_cat = pd.read_csv("engineered_categorical_test.csv")

# Put the two encoded columns next to the numerical features; both sides
# get a fresh 0..n-1 index so rows are matched by position.
test_combined = pd.concat(
    [
        test_num.reset_index(drop=True),
        test_cat[['city_encoded', 'statezip_encoded']].reset_index(drop=True),
    ],
    axis=1,
)

# Training rows with price and the KNN feature, as saved by the training step
train_knn = pd.read_csv("engineered_categorical_train_knn.csv")

# Same similarity features as used for the training set
knn_features = [
    'house_age',
    'sqft_living_per_floor',
    'sqft_living_lot_ratio',
    'total_rooms_approx',
    'log_sqft_living',
    'bed_bath_interaction'
]

# Keep only test rows where every KNN feature is present
X_test_valid = test_combined.dropna(subset=knn_features).copy()

# Neighbours are searched in the training set, so a test row can never be
# its own neighbour: 5 neighbours are requested and all 5 are used.
knn_model = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn_model.fit(train_knn[knn_features])
distances, indices = knn_model.kneighbors(X_test_valid[knn_features])

# Mean training price over each test row's 5 nearest training rows
knn_avg_prices = []
for neighbor_rows in indices:
    knn_avg_prices.append(train_knn.loc[neighbor_rows, 'price'].mean())

# Attach the feature and write the final test file
X_test_valid['knn_avg_price_5'] = knn_avg_prices
X_test_valid.to_csv("engineered_categorical_test_knn.csv", index=False)
