In [None]:
# create features

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot styling.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and deprecated the old names (newer releases reject them), so use whichever
# spelling this installation actually provides.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

# Datasets used by the tutorial cells below.
accidents = pd.read_csv("../input/fe-course-data/accidents.csv")
autos = pd.read_csv("../input/fe-course-data/autos.csv")
concrete = pd.read_csv("../input/fe-course-data/concrete.csv")
customer = pd.read_csv("../input/fe-course-data/customer.csv")

In [None]:
# Mathematical Transforms
# Automobile dataset.
# The "stroke ratio" is a measure of how efficient an engine is versus how
# performant it is:
autos["stroke_ratio"] = autos["stroke"].div(autos["bore"])

autos[["stroke", "bore", "stroke_ratio"]].head()

# The more complicated a combination is, the harder it is for a model to learn,
# like this formula for an engine's "displacement", a measure of its power:
autos["displacement"] = (
    np.pi
    * ((0.5 * autos["bore"]) ** 2)
    * autos["stroke"]
    * autos["num_of_cylinders"]
)

In [None]:
# Data visualization can suggest transformations, often a "reshaping" of a
# feature through powers or logarithms. The distribution of WindSpeed in the
# US accidents data is highly skewed, for instance. In this case the logarithm
# is effective at normalizing it.

# If the feature has 0.0 values, use np.log1p (log(1+x)) instead of np.log.
# The ufunc is applied to the whole column at once; no per-row apply is needed.
accidents["LogWindSpeed"] = np.log1p(accidents["WindSpeed"])

# Plot a comparison.
# `fill=True` is the current spelling of seaborn's deprecated `shade=True`.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
sns.kdeplot(accidents["WindSpeed"], fill=True, ax=axs[0])
sns.kdeplot(accidents["LogWindSpeed"], fill=True, ax=axs[1])

# Alternatively we could use a Box-Cox normalizer, which is mentioned in the
# data-cleaning notes.

In [None]:
# Counts
# Features describing the presence or absence of something often come in sets,
# the set of risk factors for a disease, say. You can aggregate such features
# by creating a count.

# These features will be binary (1 for present, 0 for absent) or boolean
# (True or False). In Python, booleans can be added up just like integers.

# In the traffic accidents data, several features indicate whether some roadway
# object was near the accident. Summing across them gives a count of the total
# number of roadway features nearby:
roadway_features = [
    "Amenity",
    "Bump",
    "Crossing",
    "GiveWay",
    "Junction",
    "NoExit",
    "Railway",
    "Roundabout",
    "Station",
    "Stop",
    "TrafficCalming",
    "TrafficSignal",
]
accidents["RoadwayFeatures"] = accidents.loc[:, roadway_features].sum(axis="columns")

accidents[roadway_features + ["RoadwayFeatures"]].head(10)

# Features that are not boolean can be counted too, by first turning them into
# booleans. The concrete dataset holds the amounts of the components in a
# concrete formulation. Many formulations lack one or more components (that is,
# the component has a value of 0). A greater-than-zero comparison followed by a
# row-wise sum counts how many components each formulation contains:
components = [
    "Cement",
    "BlastFurnaceSlag",
    "FlyAsh",
    "Water",
    "Superplasticizer",
    "CoarseAggregate",
    "FineAggregate",
]
concrete["Components"] = (concrete[components] > 0).sum(axis="columns")

concrete[components + ["Components"]].head(10)

In [None]:
# Building-Up and Breaking-Down Features
# Complex strings can often be usefully broken into simpler pieces.
# Some common examples:
# ID numbers: '123-45-6789'
# Phone numbers: '(999) 555-0123'
# Street addresses: '8241 Kaggle Ln., Goose City, NV'
# Internet addresses: 'http://www.kaggle.com'
# Product codes: '0 36000 29145 2'
# Dates and times: 'Mon Sep 30 07:06:05 2013'

# The `str` accessor applies string methods such as `split` directly to a column.
# The Customer Lifetime Value dataset contains features describing customers of
# an insurance company. Splitting the Policy feature on " " and expanding the
# pieces into separate columns separates the Type from the Level of coverage:
customer[["Type", "Level"]] = customer["Policy"].str.split(" ", expand=True)

customer[["Policy", "Type", "Level"]].head(10)

# Going the other way: join two simple features into one combined feature.
autos["make_and_style"] = autos["make"].add("_").add(autos["body_style"])
autos[["make", "body_style", "make_and_style"]].head()

In [None]:
# Group Transforms
# Finally we have group transforms, which aggregate information across multiple
# rows grouped by some category. With a group transform you can create features
# like "the average income of a person's state of residence" or "the proportion
# of movies released on a weekday, by genre". If you have discovered a category
# interaction, a group transform over that category could be worth investigating.

# Using an aggregation function, a group transform combines two features: a
# categorical feature that provides the grouping and another feature whose
# values you wish to aggregate. For "average income by state", State is the
# grouping feature, mean is the aggregation function, and Income is the
# aggregated feature. In pandas this is done with groupby and transform:
customer["AverageIncome"] = customer.groupby("State")["Income"].transform("mean")

customer[["State", "Income", "AverageIncome"]].head(10)

# `mean` is a built-in dataframe method, so it can be passed to transform as a
# string. Other handy methods include max, min, median, var, std, and count.
# Here the per-state row count is divided by the total number of State values,
# giving the frequency with which each state occurs in the dataset:
customer["StateFreq"] = (
    customer.groupby("State")["State"]
    .transform("count")
    .div(customer["State"].count())
)

customer[["State", "StateFreq"]].head(10)
# A transform like this can serve as a "frequency encoding" for a categorical feature.

In [None]:
# If you're using training and validation splits, to preserve their independence,
# it's best to create a grouped feature using only the training set and then join
# it to the validation set. We can use the validation set's merge method after
# creating a unique set of values with drop_duplicates on the training set.

# Create splits. The seed is fixed so the split, and every number derived from
# it, is the same on each run.
df_train = customer.sample(frac=0.5, random_state=0)
df_valid = customer.drop(df_train.index)

# Create the average claim amount by coverage type, on the training set.
# `assign` builds a new frame instead of writing into the sampled one.
df_train = df_train.assign(
    AverageClaim=df_train.groupby("Coverage")["ClaimAmount"].transform("mean")
)

# Merge the values into the validation set. After drop_duplicates there is one
# row per coverage type; `validate` makes pandas raise if that ever stops being
# true, instead of silently duplicating validation rows.
df_valid = df_valid.merge(
    df_train[["Coverage", "AverageClaim"]].drop_duplicates(),
    on="Coverage",
    how="left",
    validate="many_to_one",
)

df_valid[["Coverage", "AverageClaim"]].head(10)

In [None]:
""" Tips on Creating Features
It's good to keep in mind your model's own strengths and weaknesses when creating features. Here are some guidelines:
Linear models learn sums and differences naturally, but can't learn anything more complex.
Ratios seem to be difficult for most models to learn. Ratio combinations often lead to some easy performance gains.
Linear models and neural nets generally do better with normalized features. Neural nets especially need features scaled to values not too far from 0. Tree-based models (like random forests and XGBoost) can sometimes benefit from normalization, but usually much less so.
Tree models can learn to approximate almost any combination of features, but when a combination is especially important they can still benefit from having it explicitly created, especially when data is limited.
Counts are especially helpful for tree models, since these models don't have a natural way of aggregating information across many features at once. """

In [None]:
# Exercise: creating features for the Ames housing data
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


def score_dataset(X, y, model=None):
    """Return the cross-validated RMSLE of `model` predicting `y` from `X`.

    Columns of dtype ``category`` or ``object`` are label-encoded with
    ``factorize`` before scoring. The encoding is done on a copy, so the
    caller's DataFrame is left unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target values.
    model : estimator, optional
        Estimator handed to ``cross_val_score``. When omitted, a fresh
        ``XGBRegressor()`` is created for this call.

    Returns
    -------
    float
        Square root of the mean squared log error averaged over 5 folds.
    """
    # Build the default lazily: a default written in the signature would be
    # created once, when the function is defined, and shared by every call.
    if model is None:
        model = XGBRegressor()

    # Label encoding for categoricals, on a copy so the input is not mutated
    X = X.copy()
    for colname in X.select_dtypes(["category", "object"]):
        codes, _ = X[colname].factorize()
        X[colname] = codes

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
    # The scorer returns negated MSLE values, hence the sign flip.
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * neg_msle.mean())


# Load the Ames housing data and separate the target column from the features
df = pd.read_csv("../input/fe-course-data/ames.csv")
X = df.drop(columns="SalePrice")
y = df["SalePrice"].copy()

In [None]:
# Let's start with a few mathematical combinations. We focus on features that
# describe areas -- having the same units (square feet) makes it easy to
# combine them in sensible ways.
# Since we're using XGBoost (a tree-based model), we focus on ratios and sums.
X_1 = pd.DataFrame(
    {
        # living area relative to lot size
        "LivLotRatio": X["GrLivArea"] / X["LotArea"],
        # floor area per above-ground room
        "Spaciousness": (X["FirstFlrSF"] + X["SecondFlrSF"]) / X["TotRmsAbvGrd"],
        # combined area of the deck and porch columns
        "TotalOutsideSF": (
            X["WoodDeckSF"]
            + X["OpenPorchSF"]
            + X["EnclosedPorch"]
            + X["Threeseasonporch"]
            + X["ScreenPorch"]
        ),
    }
)

In [None]:
# If you discover an interaction effect between a numeric feature and a
# categorical feature, you may want to model it explicitly with a one-hot
# encoding, as shown below.
# "Categorical" stands for the categorical feature and "Continuous" for the
# numeric one. They are placeholder names, so the pattern is demonstrated on a
# small stand-alone frame and leaves `df` and `X` untouched.
interaction_demo = pd.DataFrame(
    {
        "Categorical": ["a", "b", "a", "c"],
        "Continuous": [1.0, 2.0, 3.0, 4.0],
    }
)

# One-hot encode Categorical feature, adding a column prefix "Cat"
interaction_terms = pd.get_dummies(interaction_demo.Categorical, prefix="Cat")

# Multiply row-by-row
interaction_terms = interaction_terms.mul(interaction_demo.Continuous, axis=0)

# Join the new features to the feature set
interaction_demo = interaction_demo.join(interaction_terms)
interaction_demo

In [None]:
# Interaction between building type and living area: one-hot encode BldgType
# (column prefix "Bldg"), then scale every indicator column row-by-row by GrLivArea
X_2 = pd.get_dummies(X["BldgType"], prefix="Bldg").mul(X["GrLivArea"], axis="index")
print(X_2.tail())

In [None]:
# Count Feature
# PorchTypes counts, per row, how many of the deck/porch area columns are
# greater than 0.
components = [
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "Threeseasonporch",
    "ScreenPorch",
]
X_3 = pd.DataFrame({"PorchTypes": (X[components] > 0).sum(axis="columns")})
print(X_3.head())

In [None]:
# Break Down a Categorical Feature
df["MSSubClass"].unique()

# MSClass is the part of MSSubClass that precedes the first underscore
X_4 = pd.DataFrame(
    {"MSClass": X["MSSubClass"].str.split("_", n=1).str.get(0)}
)

In [None]:
# Use a Grouped Transform
# The value of a home often depends on how it compares to typical homes in its
# neighborhood. MedNhbdArea is the median of GrLivArea grouped on Neighborhood.
X_5 = pd.DataFrame(
    {
        "MedNhbdArea": X["GrLivArea"].groupby(X["Neighborhood"]).transform("median"),
    }
)

In [None]:
# Score the baseline features, then the baseline plus every engineered feature
# set. Both results are kept and shown together; a bare call in the middle of
# a cell would have its return value discarded.
# Copies are passed so that scoring can never alter X or X_new.
baseline_rmsle = score_dataset(X.copy(), y)

X_new = X.join([X_1, X_2, X_3, X_4, X_5])
engineered_rmsle = score_dataset(X_new.copy(), y)

pd.Series(
    {"baseline": baseline_rmsle, "with engineered features": engineered_rmsle},
    name="RMSLE",
)

In [None]:
# XGBoost
# NOTE(review): this re-defines `score_dataset`, which is already defined in an
# earlier cell; the later definition shadows the earlier one once this cell runs.
def score_dataset(X, y, model=None):
    """Return the cross-validated RMSLE of `model` predicting `y` from `X`.

    Columns of dtype ``category`` or ``object`` are label-encoded with
    ``factorize`` before scoring. The encoding is done on a copy, so the
    caller's DataFrame is left unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Target values.
    model : estimator, optional
        Estimator handed to ``cross_val_score``. When omitted, a fresh
        ``XGBRegressor()`` is created for this call.

    Returns
    -------
    float
        Square root of the mean squared log error averaged over 5 folds.
    """
    # Build the default lazily: a default written in the signature would be
    # created once, when the function is defined, and shared by every call.
    if model is None:
        model = XGBRegressor()

    # Label encoding for categoricals, on a copy so the input is not mutated
    X = X.copy()
    for colname in X.select_dtypes(["category", "object"]):
        codes, _ = X[colname].factorize()
        X[colname] = codes

    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
    # The scorer returns negated MSLE values, hence the sign flip.
    neg_msle = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * neg_msle.mean())