## Importing libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of `model` on (X, y).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns of dtype "category" or "object" are
        label-encoded with `factorize` on an internal copy, so the caller's
        frame is left untouched.
    y : array-like
        Target values, forwarded unchanged to `cross_val_score`.
    model : estimator, optional
        Estimator forwarded to `cross_val_score`. When omitted, a fresh
        `XGBRegressor()` is created for this call.

    Returns
    -------
    float
        Square root of the mean squared log error averaged over the folds
        (RMSLE, the metric of the Housing competition).
    """
    # Build the default estimator per call rather than once at definition
    # time, so no instance is shared between calls.
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the encoding below overwrites columns, and the caller
    # usually keeps using X for further feature engineering.
    X = X.copy()

    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # scikit-learn reports the negated MSLE; flip the sign before the root.
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * score.mean())



## Preparation of Data

In [4]:
# Prepare data
# Load the raw table once; `df` stays intact while X (features) and
# y (target) are split off from a copy of it.
df = pd.read_csv("ames.csv")
X = df.copy()
y = X["SalePrice"]
del X["SalePrice"]

## 1) Create Mathematical Transforms
Create the following features:

- `LivLotRatio`: the ratio of `GrLivArea` to `LotArea`
- `Spaciousness`: the sum of `FirstFlrSF` and `SecondFlrSF` divided by `TotRmsAbvGrd`
- `TotalOutsideSF`: the sum of `WoodDeckSF`, `OpenPorchSF`, `EnclosedPorch`, `Threeseasonporch`, and `ScreenPorch`

In [5]:
# New ratio/sum features, built column by column from the raw frame.
X_1 = pd.DataFrame(
    {
        # Living area relative to the size of the lot
        "LivLotRatio": df["GrLivArea"] / df["LotArea"],
        # Above-ground floor area per above-ground room
        "Spaciousness": (df["FirstFlrSF"] + df["SecondFlrSF"]) / df["TotRmsAbvGrd"],
        # Combined area of every deck/porch column
        "TotalOutsideSF": (
            df["WoodDeckSF"]
            + df["OpenPorchSF"]
            + df["EnclosedPorch"]
            + df["Threeseasonporch"]
            + df["ScreenPorch"]
        ),
    }
)

## 2) Interaction with a Categorical
We discovered an interaction between BldgType and GrLivArea in Exercise 2. Now create their interaction features.

In [6]:
# One-hot encode BldgType. Use `prefix="Bldg"` in `get_dummies`
# One-hot encode BldgType (columns prefixed with "Bldg"), then multiply every
# indicator column row-wise by GrLivArea to get the interaction features.
X_2 = pd.get_dummies(df["BldgType"], prefix="Bldg").mul(df["GrLivArea"], axis=0)


## 3) Count Feature
Let's try creating a feature that describes how many kinds of outdoor areas a dwelling has. Create a feature PorchTypes that counts how many of the following are greater than 0.0:

- WoodDeckSF
- OpenPorchSF
- EnclosedPorch
- Threeseasonporch
- ScreenPorch

In [7]:
# PorchTypes: for each row, how many of the outdoor-area columns exceed 0.0.
X_3 = pd.DataFrame(
    {
        "PorchTypes": (
            df[
                [
                    "WoodDeckSF",
                    "OpenPorchSF",
                    "EnclosedPorch",
                    "Threeseasonporch",
                    "ScreenPorch",
                ]
            ]
            > 0.0
        ).sum(axis=1)
    }
)


## 4) Break Down a Categorical Feature
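`MSSubClass` describes the type of a dwelling. Its categories are listed below; the first word of each one (the text before the first underscore) gives a coarser grouping, which is then extracted into a new `MSClass` feature: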

In [9]:
# Show the distinct dwelling-type labels.
df["MSSubClass"].unique()

array(['One_Story_1946_and_Newer_All_Styles', 'Two_Story_1946_and_Newer',
       'One_Story_PUD_1946_and_Newer',
       'One_and_Half_Story_Finished_All_Ages', 'Split_Foyer',
       'Two_Story_PUD_1946_and_Newer', 'Split_or_Multilevel',
       'One_Story_1945_and_Older', 'Duplex_All_Styles_and_Ages',
       'Two_Family_conversion_All_Styles_and_Ages',
       'One_and_Half_Story_Unfinished_All_Ages',
       'Two_Story_1945_and_Older', 'Two_and_Half_Story_All_Ages',
       'One_Story_with_Finished_Attic_All_Ages',
       'PUD_Multilevel_Split_Level_Foyer',
       'One_and_Half_Story_PUD_All_Ages'], dtype=object)

In [12]:
# MSClass keeps only the part of MSSubClass before its first underscore:
# split once, expand into columns, and take the first column.
X_4 = pd.DataFrame(
    {"MSClass": df["MSSubClass"].str.split("_", n=1, expand=True).loc[:, 0]}
)
X_4["MSClass"]

0         One
1         One
2         One
3         One
4         Two
        ...  
2925    Split
2926      One
2927    Split
2928      One
2929      Two
Name: MSClass, Length: 2930, dtype: object

## 5) Use a Grouped Transform

The value of a home often depends on how it compares to typical homes in its neighborhood. Create a feature `MedNhbdArea` that describes the *median* of `GrLivArea` grouped on `Neighborhood`.

In [14]:
# MedNhbdArea: every row receives the median GrLivArea of its Neighborhood.
X_5 = pd.DataFrame(
    {"MedNhbdArea": df["GrLivArea"].groupby(df["Neighborhood"]).transform("median")}
)
X_5["MedNhbdArea"]

0       1200.0
1       1200.0
2       1200.0
3       1200.0
4       1560.0
         ...  
2925    1282.0
2926    1282.0
2927    1282.0
2928    1282.0
2929    1282.0
Name: MedNhbdArea, Length: 2930, dtype: float64