<a href="https://colab.research.google.com/github/HakureiPOI/HousePrice_Homework2/blob/main/HousePrice_Prediction_Backup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *大数据导论 大作业2*
## ***HakureiPOI***

---

### 准备数据集

In [1]:
# 从 Github 上下载数据集
!git clone https://github.com/HakureiPOI/HousePrice_Homework2.git

Cloning into 'HousePrice_Homework2'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 9 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (9/9), 217.33 KiB | 5.05 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [2]:
!unzip /content/HousePrice_Homework2/house-prices-advanced-regression-techniques.zip

Archive:  /content/HousePrice_Homework2/house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
# 查看 data_description
!cat data_description.txt

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

---
### 使用的第三方库

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

---

### 数据预处理

In [5]:
# 加载数据集
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [8]:
print(f'train shape : {train.shape}\n test shape : {test.shape}')

train shape : (1460, 81)
 test shape : (1459, 80)


In [9]:
# 检查缺失值并删除缺失值较高的列
nnull = train.isnull().sum(axis=0).sort_values(ascending=False)
nnull_threshold = train.shape[0] * 0.1
null_cols = nnull[nnull > nnull_threshold].index
for col in null_cols:
    print(f'{col} : {train[col].isnull().sum()}')

train = train.drop(columns=null_cols)
test = test.drop(columns=null_cols)

PoolQC : 1453
MiscFeature : 1406
Alley : 1369
Fence : 1179
MasVnrType : 872
FireplaceQu : 690
LotFrontage : 259


In [10]:
# 分离特征和目标变量
y = train['SalePrice']
X = train.drop(columns=['SalePrice'])

In [11]:
# 对数值特征和分类特征进行分离
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [12]:
# 创建预处理流水线
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

---
### 模型选择与训练

In [13]:
# 创建完整的训练流水线，使用XGBoost回归器
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror', n_estimators=500, learning_rate=0.05, max_depth=4, random_state=42))
])

In [14]:
# 将数据分为训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# 模型训练
model.fit(X_train, y_train)

In [16]:
# 预测与评估
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"验证集 RMSE: {rmse}")

验证集 RMSE: 25349.228174458935


In [17]:
# 对测试集进行预测
test_predictions = model.predict(test)

In [18]:
# 创建提交文件
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_predictions})
submission.to_csv('house_price_submission.csv', index=False)
print("提交文件已创建: house_price_submission.csv")

提交文件已创建: house_price_submission.csv


---
### 查看 house_price_submission

In [19]:
submission

Unnamed: 0,Id,SalePrice
0,1461,127398.414062
1,1462,163515.890625
2,1463,176636.312500
3,1464,188362.515625
4,1465,195841.421875
...,...,...
1454,2915,80180.140625
1455,2916,71134.539062
1456,2917,153507.640625
1457,2918,125159.234375
