In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [2]:
def score_dataset(X, y, model=None):
    """Return the cross-validated RMSLE of `model` on features `X` and target `y`.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns of dtype "category" or "object" are
        label-encoded with `factorize` on an internal copy, so the caller's
        frame is left unchanged.
    y : array-like
        Target values handed to `cross_val_score`.
    model : estimator, optional
        Estimator handed to `cross_val_score`. When omitted, a fresh
        `XGBRegressor()` is created for this call.

    Returns
    -------
    float
        Square root of the mean, over 5 folds, of the mean squared log error.
    """
    # Build the default estimator per call instead of once at definition time,
    # so no estimator instance is shared between calls.
    if model is None:
        model = XGBRegressor()
    # Work on a copy: the label encoding below overwrites columns in place and
    # must not leak back into the caller's DataFrame.
    X = X.copy()
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error).
    # The scorer returns the negated MSLE, hence the sign flip before the root.
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

In [3]:
# Load the Ames housing data, then split the target column off the features.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that exact file exists; consider a configurable data directory.
ames_csv_path = r'C:\Users\Dell\Downloads\ames.csv'
df = pd.read_csv(ames_csv_path)
X = df.copy()            # keep `df` intact for the feature-engineering cells
y = X.pop("SalePrice")   # removes the target from X and returns it

1) Create Mathematical Transforms
Create the following features:

LivLotRatio: the ratio of GrLivArea to LotArea

TotalOutsideSF: the sum of WoodDeckSF, OpenPorchSF, EnclosedPorch, and ScreenPorch

In [4]:
# Mathematical transforms, collected in their own frame.
X_1 = pd.DataFrame()

# Above-ground living area relative to lot size.
X_1["LivLotRatio"] = df["GrLivArea"].div(df["LotArea"])

# Combined square footage of the four outdoor areas.
X_1["TotalOutsideSF"] = (
    df["WoodDeckSF"]
    .add(df["OpenPorchSF"])
    .add(df["EnclosedPorch"])
    .add(df["ScreenPorch"])
)

2) Interaction with a Categorical

We discovered an interaction between BldgType and GrLivArea. Now create their interaction features.

In [5]:
# Interaction of BldgType with GrLivArea: one-hot encode the building type
# (columns prefixed "Bldg"), then scale each indicator column row-wise by the
# living area.
X_2 = pd.get_dummies(df["BldgType"], prefix="Bldg").mul(df["GrLivArea"], axis=0)

3) Count Feature

Let's try creating a feature that describes how many kinds of outdoor areas a dwelling has. Create a feature PorchTypes that counts how many of the following are greater than 0.0:

WoodDeckSF
OpenPorchSF
EnclosedPorch
ScreenPorch

In [6]:
# Count, per row, how many of the outdoor-area columns are greater than 0.0.
porch_columns = ["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch"]

X_3 = pd.DataFrame()
X_3["PorchTypes"] = (df[porch_columns] > 0.0).sum(axis=1)

4) Break Down a Categorical Feature

MSSubClass describes the type of a dwelling:

In [7]:
# Inspect the distinct values stored in MSSubClass.
df["MSSubClass"].unique()

array([ 60,  20,  70,  50, 190,  45,  90, 120,  30,  85,  80, 160,  75,
       180,  40], dtype=int64)

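In this copy of the data, MSSubClass is stored as integer codes (dtype int64) rather than as descriptive text labels, so there is no string to break down into components; this step is skipped.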

5) Use a Grouped Transform

The value of a home often depends on how it compares to typical homes in its neighborhood. Create a feature MedNhbdArea that holds the median of GrLivArea within each Neighborhood.

In [8]:
# Grouped transform: give every row the median GrLivArea of its Neighborhood.
area_by_neighborhood = df.groupby("Neighborhood")["GrLivArea"]

X_5 = pd.DataFrame()
X_5["MedNhbdArea"] = area_by_neighborhood.transform("median")

In [9]:
# Join all engineered feature frames onto the original features, then score
# the augmented dataset (the score is the cell's displayed output).
engineered_frames = [X_1, X_2, X_3, X_5]
X_new = X.join(engineered_frames)
score_dataset(X_new, y)

0.14404881809062525