In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot styling. `sns.set` is only a backwards-compatible alias of
# `sns.set_theme`, so call the current name directly.
sns.set_theme(style="whitegrid", palette="pastel")

# Raw data; left untouched so the cleaning steps below can be re-run safely.
df = pd.read_csv("house_dataset.csv")

# --- Data cleaning ----------------------------------------------------------
df_clean = df.copy()

# Convert the `date` column to datetime (a later cell reads `.dt.year` from it).
df_clean["date"] = pd.to_datetime(df_clean["date"])

# A yr_renovated of 0 means "never renovated"; store it as missing instead.
df_clean["yr_renovated"] = df_clean["yr_renovated"].replace(0, np.nan)

# 0/1 renovation flag. The width is pinned to int64 because `astype(int)` is
# platform-dependent (int32 on Windows with NumPy < 2), which would make the
# int64/float64 dtype filter used for the correlation heatmap skip this column.
df_clean["renovated"] = df_clean["yr_renovated"].notna().astype("int64")

# Drop bedroom-count outliers (> 10) and rows whose price is not positive.
# One combined mask keeps exactly the rows the two sequential filters kept.
# The explicit copy makes df_clean an independent frame, since a later cell
# adds a column to it.
keep_rows = (df_clean["bedrooms"] <= 10) & (df_clean["price"] > 0)
df_clean = df_clean.loc[keep_rows].copy()

# --- Overview figure: 2x2 grid of exploratory plots -------------------------
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_price, ax_area), (ax_city, ax_corr) = axes

# Top-left: price distribution (50-bin histogram with a KDE overlay).
sns.histplot(df_clean["price"], bins=50, kde=True, ax=ax_price)
ax_price.set_title("House Price Distribution")

# Top-right: living area vs. price scatter.
sns.scatterplot(data=df_clean, x="sqft_living", y="price", alpha=0.3, ax=ax_area)
ax_area.set_title("Price vs. Living Area (sqft)")

# Bottom-left: price box plots for the 10 cities with the most rows.
top_cities = df_clean["city"].value_counts().nlargest(10).index
in_top_city = df_clean["city"].isin(top_cities)
sns.boxplot(data=df_clean[in_top_city], x="city", y="price", ax=ax_city)
ax_city.tick_params(axis="x", rotation=45)
ax_city.set_title("House Prices by City (Top 10)")

# Bottom-right: correlation heatmap with `price` as the first row/column.
# Select every numeric dtype rather than only int64/float64, so columns stored
# as int32/float32 (e.g. an `astype(int)` result on Windows) are not silently
# left out, and so removing `price` does not fail when it has such a dtype.
numeric_cols = df_clean.select_dtypes(include="number").columns.drop("price")
corr_matrix = df_clean[["price", *numeric_cols]].corr()
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax_corr)
ax_corr.set_title("Correlation Heatmap")

plt.tight_layout()
plt.show()

In [None]:
# Price comparison: renovated vs. not-renovated houses.
renovation_ax = plt.figure(figsize=(8, 6)).gca()
sns.boxplot(data=df_clean, x="renovated", y="price", ax=renovation_ax)

# Relabel the two box positions.
# NOTE(review): assumes both flag values 0 and 1 occur in the data, so that
# position 0 is "not renovated" and position 1 is "renovated" - confirm.
renovation_ax.set_xticks([0, 1])
renovation_ax.set_xticklabels(["Not Renovated", "Renovated"])

renovation_ax.set(
    title="Price Comparison: Renovated vs Not Renovated",
    xlabel="Renovation Status",
    ylabel="Price",
)
plt.tight_layout()
plt.show()

# House age vs. price.
# House age is the year of the `date` column minus the construction year.
record_year = df_clean["date"].dt.year
df_clean["house_age"] = record_year - df_clean["yr_built"]

age_ax = plt.figure(figsize=(8, 6)).gca()
sns.scatterplot(data=df_clean, x="house_age", y="price", alpha=0.4, ax=age_ax)
age_ax.set(
    title="House Age vs Price",
    xlabel="House Age (years)",
    ylabel="Price",
)
plt.tight_layout()
plt.show()

# Price distribution for each number of floors.
floors_ax = plt.figure(figsize=(8, 6)).gca()
sns.boxplot(data=df_clean, x="floors", y="price", ax=floors_ax)
floors_ax.set(
    title="Price by Number of Floors",
    xlabel="Number of Floors",
    ylabel="Price",
)
plt.tight_layout()
plt.show()

In [None]:
# House age vs. price with a fitted regression line.
# Relies on the `house_age` column created in the previous cell.
point_style = {"alpha": 0.3}
trend_style = {"color": "red"}

trend_ax = plt.figure(figsize=(8, 6)).gca()
sns.regplot(
    data=df_clean,
    x="house_age",
    y="price",
    scatter_kws=point_style,
    line_kws=trend_style,
    ax=trend_ax,
)
trend_ax.set(
    title="House Age vs Price (with Trend Line)",
    xlabel="House Age (years)",
    ylabel="Price",
)
plt.tight_layout()
plt.show()

In [None]:
# Price per number of floors as a point plot: one point per floor count with
# standard-deviation error bars (this is a summary plot, not a regression fit).
mean_price_ax = plt.figure(figsize=(8, 6)).gca()
sns.pointplot(
    data=df_clean,
    x="floors",
    y="price",
    errorbar="sd",
    capsize=0.2,
    ax=mean_price_ax,
)
mean_price_ax.set(
    title="Average Price by Number of Floors",
    xlabel="Number of Floors",
    ylabel="Average Price ± Std",
)
plt.tight_layout()
plt.show()