In [1]:
# 读取文件
import pandas as pd
train_path = r'C:\Users\NENOOD\Desktop\house-prices-advanced-regression-techniques\train.csv'
test_path = r'C:\Users\NENOOD\Desktop\house-prices-advanced-regression-techniques\test.csv'
train_data = pd.read_csv(train_path)
test_feature_org = pd.read_csv(test_path)

In [2]:
# 设置特征和目标
train_feature_org = train_data.copy()
train_label = train_feature_org.pop('SalePrice')
train_feature_org.pop("Id")
ID = test_feature_org[['Id']].copy()
test_feature_org.pop("Id")

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [3]:
# 对数据集进行预处理 train_feature and test_feature
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

cat_cols = [cname for cname in train_feature_org.columns if train_feature_org[cname].nunique() < 10 and train_feature_org[cname].dtype == 'object']
num_cols = [cname for cname in train_feature_org.columns if train_feature_org[cname].dtype in ['int64','float64']]

num_transformer = SimpleImputer(strategy='constant')
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohenc', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers = [
        ('num',num_transformer,num_cols),
        ('cat',cat_transformer,cat_cols)
    ]
)

train_feature = preprocessor.fit_transform(train_feature_org)
test_feature = preprocessor.transform(test_feature_org)

In [4]:
train_feature.shape

(1460, 232)

In [5]:
test_feature.shape

(1459, 232)

In [6]:
# 建立训练集和检验集
from sklearn.model_selection import train_test_split
train_X,val_X,train_y,val_y = train_test_split(train_feature,train_label,test_size = 0.3,random_state = 1091)

In [7]:
# 训练并优化模型
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def get_mae(nodes,train_X,val_X,train_y,val_y) :
    model = RandomForestRegressor(max_leaf_nodes = nodes,random_state = 1091)
    model.fit(train_X,train_y)
    preds = model.predict(val_X)
    mae = mean_absolute_error(val_y,preds)
    return mae

pre_nodes = [5, 25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]
scores = {nodes: get_mae(nodes,train_X,val_X,train_y,val_y) for nodes in pre_nodes}
best_node = min(scores, key=scores.get)
fin_model = RandomForestRegressor(max_leaf_nodes = best_node,random_state = 1091)
fin_model.fit(train_feature,train_label)

In [8]:
# 预测
fin_preds = fin_model.predict(test_feature)

In [9]:
frame_preds = pd.DataFrame(fin_preds,columns=['SalePrice'])
sub_preds = ID.join(frame_preds)

In [10]:
sub_preds.to_csv('submission.csv', index=False)