# In [3]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the cleaned housing data.
df = pd.read_csv("./cleaned_house_dataset.csv")

### 1. Split statezip into separate state and zipcode columns
# The pattern captures a run of capital letters (state) followed by whitespace
# and a run of digits (zip code); the two capture groups come back as
# columns 0 and 1 of the extracted frame.
statezip_parts = df['statezip'].str.extract(r'([A-Z]+)\s+(\d+)', expand=True)
df['state'] = statezip_parts[0]
# Keep the zip code as text rather than a number.
df['zipcode'] = statezip_parts[1].astype(str)

### 2. Build target-encoding features for city / zipcode
# Every row receives the mean price of all rows that share its city
# (respectively its zip code).
# NOTE(review): the group mean includes the row's own price.
for group_key, feature_name in (
    ('city', 'city_avg_price'),
    ('zipcode', 'zipcode_avg_price'),
):
    df[feature_name] = df.groupby(group_key)['price'].transform('mean')

### 3. Collapse rare cities into 'Other'
# A city is treated as rare when it appears in fewer than 10 rows.
city_counts = df['city'].value_counts()
rare_cities = city_counts.index[city_counts.to_numpy() < 10]
# Replace rare city names with 'Other'; all other values pass through as-is.
df['city_grouped'] = df['city'].mask(df['city'].isin(rare_cities), 'Other')

### 4. One-hot encode the grouped city and the state
# drop_first=True omits the first level of each encoded column.
one_hot_columns = ['city_grouped', 'state']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

### 5. Build the KNN "similar homes" price feature
# Features used to decide which houses count as similar.
knn_features = ['sqft_living', 'bedrooms', 'bathrooms', 'zipcode_avg_price']
knn_data = df[knn_features].copy()

# Standardise the features so they are on a comparable scale before
# distances are computed.
scaler = StandardScaler()
knn_data_scaled = scaler.fit_transform(knn_data)

# Find the 5 most similar *other* houses for every house.
# Calling kneighbors() without a query matrix makes scikit-learn search the
# fitted data and leave each point out of its own neighbour list. Requesting
# 6 neighbours and dropping column 0 instead would be unreliable when several
# rows have identical features: the row itself is then not guaranteed to be
# returned first, so its own price could leak into the average.
knn = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(knn_data_scaled)
distances, indices = knn.kneighbors()

# Mean neighbour price for all rows at once. `indices` holds positional row
# numbers, so prices are looked up on the raw array; DataFrame.mean skips
# missing prices by default, just like Series.mean.
neighbor_price_table = pd.DataFrame(df['price'].to_numpy()[indices])
knn_avg_prices = neighbor_price_table.mean(axis=1).tolist()
df['knn_avg_price'] = knn_avg_prices

### 6. Bucket zipcode_avg_price into three tiers (low / medium / high)
# qcut derives the bin edges from the quantiles of the column.
tier_labels = ['Low', 'Medium', 'High']
df['zipcode_price_tier'] = pd.qcut(
    df['zipcode_avg_price'], q=len(tier_labels), labels=tier_labels
)

# One-hot encode the tier; drop_first=True omits the first level.
tier_columns = ['zipcode_price_tier']
df = pd.get_dummies(df, columns=tier_columns, drop_first=True)



### 7. Save the result
# index=False keeps the DataFrame's row index out of the CSV.
output_path = "./house_dataset_with_location_features.csv"
df.to_csv(output_path, index=False)
