In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


def score_dataset(X, y, model=XGBRegressor()):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    Columns of dtype ``category`` or ``object`` are label-encoded with
    ``factorize`` before scoring. The encoding is applied to a copy, so the
    DataFrame passed in by the caller is left unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (must support ``select_dtypes``).
    y
        Target values, forwarded unchanged to ``cross_val_score``.
    model
        Estimator forwarded to ``cross_val_score``; defaults to an
        ``XGBRegressor`` created with default settings.

    Returns
    -------
    float
        Square root of the negated mean of the per-fold
        ``neg_mean_squared_log_error`` scores (lower is better).
    """
    # Encode on a copy: assigning the integer codes straight into the caller's
    # frame would silently replace its categorical columns after every call.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    # The scorer returns negated MSLE values; flip the sign before the root.
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


# Prepare data: load the Ames housing table, keep an untouched copy in `df`,
# and split the target column off the feature frame.
df = pd.read_csv("../input/ames.csv")
X = df.copy()
y = X.pop("SalePrice")  # pop removes SalePrice from X and returns it

# Preview the two columns used by the grouped-transform exercise further down
X[['Neighborhood', 'GrLivArea']]

Unnamed: 0,Neighborhood,GrLivArea
0,North_Ames,1656.0
1,North_Ames,896.0
2,North_Ames,1329.0
3,North_Ames,2110.0
4,Gilbert,1629.0
...,...,...
2925,Mitchell,1003.0
2926,Mitchell,902.0
2927,Mitchell,970.0
2928,Mitchell,1389.0


In [4]:
# Mathematical transforms: ratio and sum features derived from existing columns
X_1 = pd.DataFrame(
    {
        # Living area relative to lot area
        "LivLotRatio": X["GrLivArea"] / X["LotArea"],
        # Above-ground floor area per above-ground room
        "Spaciousness": (X["FirstFlrSF"] + X["SecondFlrSF"]) / X["TotRmsAbvGrd"],
        # Combined area of the deck and all porch columns
        "TotalOutsideSF": (
            X["WoodDeckSF"]
            + X["OpenPorchSF"]
            + X["EnclosedPorch"]
            + X["Threeseasonporch"]
            + X["ScreenPorch"]
        ),
    }
)



In [13]:
# Interaction feature: one-hot encode BldgType (column prefix "Bldg"), then
# multiply every indicator column row-wise by GrLivArea.
X_2 = (
    pd.get_dummies(X["BldgType"], prefix="Bldg")
    .mul(X["GrLivArea"], axis=0)
)

X_2

Unnamed: 0,Bldg_Duplex,Bldg_OneFam,Bldg_Twnhs,Bldg_TwnhsE,Bldg_TwoFmCon
0,0.0,1656.0,0.0,0.0,0.0
1,0.0,896.0,0.0,0.0,0.0
2,0.0,1329.0,0.0,0.0,0.0
3,0.0,2110.0,0.0,0.0,0.0
4,0.0,1629.0,0.0,0.0,0.0
...,...,...,...,...,...
2925,0.0,1003.0,0.0,0.0,0.0
2926,0.0,902.0,0.0,0.0,0.0
2927,0.0,970.0,0.0,0.0,0.0
2928,0.0,1389.0,0.0,0.0,0.0


Let's try creating a feature that describes how many kinds of outdoor areas a dwelling has. Create a feature PorchTypes that counts how many of the following are greater than 0.0:
让我们尝试创建一个特征来描述住宅有多少种户外区域。创建一个特征 PorchTypes，用于统计以下各列中取值大于 0.0 的个数：

WoodDeckSF
OpenPorchSF
EnclosedPorch
Threeseasonporch
ScreenPorch

In [14]:
# Count feature: `gt(0.0)` yields a boolean per cell, and summing across each
# row (axis=1) counts how many of the five outdoor-area columns are positive.
X_3 = (
    df[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "Threeseasonporch", "ScreenPorch"]]
    .gt(0.0)
    .sum(axis=1)
    .to_frame(name="PorchTypes")
)

In [15]:
# List the distinct MSSubClass labels to see their shared leading words
df["MSSubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

You can see that there is a more general categorization described (roughly) by the first word of each category. Create a feature containing only these first words by splitting MSSubClass at the first underscore _. (Hint: In the split method use an argument n=1.)
你可以看到，每个类别的第一个词（大致）描述了一个更通用的分类。通过在第一个下划线 _ 处拆分 MSSubClass，创建一个仅包含这些首个单词的特征。（提示：在 split 方法中使用参数 n=1。）

In [17]:
# Split MSSubClass at the first underscore only (n=1) and keep the leading
# piece (column 0 of the expanded result) as the MSClass feature.
X_4 = (
    df["MSSubClass"]
    .str.split("_", n=1, expand=True)[0]
    .to_frame(name="MSClass")
)

X_4

Unnamed: 0,MSClass
0,One
1,One
2,One
3,One
4,Two
...,...
2925,Split
2926,One
2927,Split
2928,One


- str.split("_", n=1, expand=True):

  - str.split("_", n=1): 这是 Pandas 提供的一个字符串操作方法，用于将字符串按照指定的分隔符（这里是 _）进行拆分。
    - "_": 代表分隔符，在这里使用下划线 _ 作为分隔符。
    - n=1: 指定最多只拆分一次，这意味着将字符串最多拆分为两部分。
  - expand=True: 表示将结果扩展成一个数据框，每个拆分的部分作为数据框中的一列。比如，如果 MSSubClass 是 "One_Story_1946_and_Newer_All_Styles"，则拆分后的结果会是一个两列的数据框，第一列是 "One"，第二列是 "Story_1946_and_Newer_All_Styles"。
- [0]:
  - 这是在对拆分结果进行索引。[0] 表示取拆分结果的第一列，即第一个下划线前面的部分。例如，对于字符串 "One_Story_1946_and_Newer_All_Styles"，结果为 "One"。这部分大致表示房屋的通用类别（如 One、Two、Split、Duplex 等）。

# 5) Use a Grouped Transform

The value of a home often depends on how it compares to typical homes in its neighborhood. Create a feature `MedNhbdArea` that describes the *median* of `GrLivArea` grouped on `Neighborhood`.

In [18]:
# Grouped transform: group rows by Neighborhood, take the median GrLivArea of
# each group, and broadcast that median back to every row as MedNhbdArea.
X_5 = (
    df.groupby("Neighborhood")["GrLivArea"]
    .transform("median")
    .to_frame(name="MedNhbdArea")
)

X_5.head()

Unnamed: 0,MedNhbdArea
0,1200.0
1,1200.0
2,1200.0
3,1200.0
4,1560.0


In [23]:
# Attach every engineered feature frame to the original features, then score.
X_new = X.join(
    [X_1, X_2, X_3, X_4, X_5]
)
score_dataset(X_new, y)  # root mean squared log error: lower is better

0.13954039790897127