## 1.0 Mutual Information (MI)

In [None]:
# import pandas as pd
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import cross_val_score

# df = pd.read_csv("../input/fe-course-data/concrete.csv")
# df.head()

In [None]:
# X = df.copy()
# y = X.pop("CompressiveStrength")

# # Train and score baseline model
# baseline = RandomForestRegressor(criterion="absolute_error", random_state=0)
# baseline_score = cross_val_score(
#     baseline, X, y, cv=5, scoring="neg_mean_absolute_error"
# )
# baseline_score = -1 * baseline_score.mean()

# print(f"MAE Baseline Score: {baseline_score:.4}")

In [None]:
# X = df.copy()
# y = X.pop("CompressiveStrength")

# # Create synthetic features
# X["FCRatio"] = X["FineAggregate"] / X["CoarseAggregate"]
# X["AggCmtRatio"] = (X["CoarseAggregate"] + X["FineAggregate"]) / X["Cement"]
# X["WtrCmtRatio"] = X["Water"] / X["Cement"]

# # Train and score model on dataset with additional ratio features
# model = RandomForestRegressor(criterion="absolute_error", random_state=0)
# score = cross_val_score(
#     model, X, y, cv=5, scoring="neg_mean_absolute_error"
# )
# score = -1 * score.mean()

# print(f"MAE Score with Ratio Features: {score:.4}")

First encountering a new dataset can sometimes feel overwhelming. You might be presented with hundreds or thousands of features without even a description to go by. Where do you even begin?

A great first step is to construct a ranking with a feature utility metric, a function measuring associations between a feature and the target. Then you can choose a smaller set of the most useful features to develop initially and have more confidence that your time will be well spent.

The metric we'll use is called "mutual information". Mutual information is a lot like correlation in that it measures a relationship between two quantities. The advantage of mutual information is that it can detect any kind of relationship, while correlation only detects linear relationships.

Mutual information is a great general-purpose metric and especially useful at the start of feature development when you might not know what model you'd like to use yet. It is:

- easy to use and interpret,
- computationally efficient,
- theoretically well-founded,
- resistant to overfitting, and,
- able to detect any kind of relationship

The scikit-learn algorithm for MI treats discrete features differently from continuous features. Consequently, you need to tell it which are which. As a rule of thumb, anything that must have a float dtype is not discrete. Categoricals (object or categorical dtype) can be treated as discrete by giving them a label encoding. (You can review label encodings in our Categorical Variables lesson.)

In [None]:
# X = df.copy()
# y = X.pop("price")

# # Label encoding for categoricals
# for colname in X.select_dtypes("object"):
#     X[colname], _ = X[colname].factorize()

# # All discrete features should now have integer dtypes (double-check this before using MI!)
# discrete_features = X.dtypes == int

Scikit-learn has two mutual information metrics in its feature_selection module: one for real-valued targets (mutual_info_regression) and one for categorical targets (mutual_info_classif). Our target, price, is real-valued. The next cell computes the MI scores for our features and wraps them up in a nice dataframe.

In [None]:
# from sklearn.feature_selection import mutual_info_regression

# def make_mi_scores(X, y, discrete_features):
#     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
#     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
#     mi_scores = mi_scores.sort_values(ascending=False)
#     return mi_scores

# mi_scores = make_mi_scores(X, y, discrete_features)
# mi_scores[::3]  # show a few features with their MI scores

In [None]:
# def plot_mi_scores(scores):
#     scores = scores.sort_values(ascending=True)
#     width = np.arange(len(scores))
#     ticks = list(scores.index)
#     plt.barh(width, scores)
#     plt.yticks(width, ticks)
#     plt.title("Mutual Information Scores")


# plt.figure(dpi=100, figsize=(8, 5))
# plot_mi_scores(mi_scores)

In [None]:
# sns.relplot(x="curb_weight", y="price", data=df);

In [None]:
# sns.lmplot(x="horsepower", y="price", hue="fuel_type", data=df);


In [None]:
# sns.catplot(x="BldgType", y="SalePrice", data=df, kind="boxen");

In [None]:
# OverallQual     0.581262
# Neighborhood    0.569813
# GrLivArea       0.496909
# YearBuilt       0.437939
# GarageArea      0.415014
# TotalBsmtSF     0.390280
# GarageCars      0.381467
# FirstFlrSF      0.368825
# BsmtQual        0.364779
# KitchenQual     0.326194
# Name: MI Scores, dtype: float64

## 2.0 Creating Features

Ratios

In [None]:
# autos["stroke_ratio"] = autos.stroke / autos.bore

# autos[["stroke", "bore", "stroke_ratio"]].head()

Counting Features (True and False)

In [None]:
# roadway_features = ["Amenity", "Bump", "Crossing", "GiveWay",
#     "Junction", "NoExit", "Railway", "Roundabout", "Station", "Stop",
#     "TrafficCalming", "TrafficSignal"]
# accidents["RoadwayFeatures"] = accidents[roadway_features].sum(axis=1)

# accidents[roadway_features + ["RoadwayFeatures"]].head(10)

Building-up or Breaking-down Features

In [None]:
# customer[["Type", "Level"]] = (  # Create two new features
#     customer["Policy"]           # from the Policy feature
#     .str                         # through the string accessor
#     .split(" ", expand=True)     # by splitting on " "
#                                  # and expanding the result into separate columns
# )

# customer[["Policy", "Type", "Level"]].head(10)

In [None]:
# autos["make_and_style"] = autos["make"] + "_" + autos["body_style"]
# autos[["make", "body_style", "make_and_style"]].head()

Group Transforms 
(groupby, average, min, median, var, std, count, frequency )

In [None]:
# customer["AverageIncome"] = (
#     customer.groupby("State")  # for each state
#     ["Income"]                 # select the income
#     .transform("mean")         # and compute its mean
# )

# customer[["State", "Income", "AverageIncome"]].head(10)

In [None]:
# customer["StateFreq"] = (
#     customer.groupby("State")
#     ["State"]
#     .transform("count")
#     / customer.State.count()
# )

# customer[["State", "StateFreq"]].head(10)

In [None]:
# # Create splits
# df_train = customer.sample(frac=0.5)
# df_valid = customer.drop(df_train.index)

# # Create the average claim amount by coverage type, on the training set
# df_train["AverageClaim"] = df_train.groupby("Coverage")["ClaimAmount"].transform("mean")

# # Merge the values into the validation set
# df_valid = df_valid.merge(
#     df_train[["Coverage", "AverageClaim"]].drop_duplicates(),
#     on="Coverage",
#     how="left",
# )

# df_valid[["Coverage", "AverageClaim"]].head(10)

In [None]:
# If you've discovered an interaction effect between a numeric feature and a categorical feature, you might want to model it explicitly using a one-hot encoding, like so:

# # One-hot encode Categorical feature, adding a column prefix "Cat"
# X_new = pd.get_dummies(df.Categorical, prefix="Cat")

# # Multiply row-by-row
# X_new = X_new.mul(df.Continuous, axis=0)

# # Join the new features to the feature set
# X = X.join(X_new)

## 3.0 Clustering with K-Means

Let's review how the k-means algorithm learns the clusters and what that means for feature engineering. We'll focus on three parameters from scikit-learn's implementation: n_clusters, max_iter, and n_init.

You may need to increase the max_iter for a large number of clusters or n_init for a complex dataset. Ordinarily, though, the only parameter you'll need to choose yourself is n_clusters (k, that is). The best partitioning for a set of features depends on the model you're using and what you're trying to predict, so it's best to tune it like any hyperparameter (through cross-validation, say).

In [None]:
# # Create cluster feature
# kmeans = KMeans(n_clusters=6)
# X["Cluster"] = kmeans.fit_predict(X)
# X["Cluster"] = X["Cluster"].astype("category")

# X.head()

In [None]:
# Plotting Clustering Scatter Plot

# sns.relplot(
#     x="Longitude", y="Latitude", hue="Cluster", data=X, height=6,
# );

In [None]:
# Plotting boxen plot

# X["MedHouseVal"] = df["MedHouseVal"]
# sns.catplot(x="MedHouseVal", y="Cluster", data=X, kind="boxen", height=6);

In [None]:
# Xy = X.copy()
# Xy["Cluster"] = Xy.Cluster.astype("category")
# Xy["SalePrice"] = y
# sns.relplot(
#     x="value", y="SalePrice", hue="Cluster", col="variable",
#     height=4, aspect=1, facet_kws={'sharex': False}, col_wrap=3,
#     data=Xy.melt(
#         value_vars=features, id_vars=["SalePrice", "Cluster"],
#     ),
# );

## 4.0 Principal Component Analysis (PCA)

There are two ways you could use PCA for feature engineering.

The first way is to use it as a descriptive technique. Since the components tell you about the variation, you could compute the MI scores for the components and see what kind of variation is most predictive of your target. That could give you ideas for kinds of features to create -- a product of 'Height' and 'Diameter' if 'Size' is important, say, or a ratio of 'Height' and 'Diameter' if 'Shape' is important. You could even try clustering on one or more of the high-scoring components.

The second way is to use the components themselves as features. Because the components expose the variational structure of the data directly, they can often be more informative than the original features. Here are some use-cases:

Dimensionality reduction: When your features are highly redundant (multicollinear, specifically), PCA will partition out the redundancy into one or more near-zero variance components, which you can then drop since they will contain little or no information.

Anomaly detection: Unusual variation, not apparent from the original features, will often show up in the low-variance components. These components could be highly informative in an anomaly or outlier detection task.

Noise reduction: A collection of sensor readings will often share some common background noise. PCA can sometimes collect the (informative) signal into a smaller number of features while leaving the noise alone, thus boosting the signal-to-noise ratio.

Decorrelation: Some ML algorithms struggle with highly-correlated features. PCA transforms correlated features into uncorrelated components, which could be easier for your algorithm to work with.

PCA basically gives you direct access to the correlational structure of your data. You'll no doubt come up with applications of your own!

PCA Best Practices
There are a few things to keep in mind when applying PCA:

PCA only works with numeric features, like continuous quantities or counts.

PCA is sensitive to scale. It's good practice to standardize your data before applying PCA, unless you know you have good reason not to.

Consider removing or constraining outliers, since they can have an undue influence on the results.

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import seaborn as sns
# from IPython.display import display
# from sklearn.feature_selection import mutual_info_regression


# plt.style.use("seaborn-v0_8-whitegrid")
# plt.rc("figure", autolayout=True)
# plt.rc(
#     "axes",
#     labelweight="bold",
#     labelsize="large",
#     titleweight="bold",
#     titlesize=14,
#     titlepad=10,
# )


# def plot_variance(pca, width=8, dpi=100):
#     # Create figure
#     fig, axs = plt.subplots(1, 2)
#     n = pca.n_components_
#     grid = np.arange(1, n + 1)
#     # Explained variance
#     evr = pca.explained_variance_ratio_
#     axs[0].bar(grid, evr)
#     axs[0].set(
#         xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
#     )
#     # Cumulative Variance
#     cv = np.cumsum(evr)
#     axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
#     axs[1].set(
#         xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
#     )
#     # Set up figure
#     fig.set(figwidth=8, dpi=100)
#     return axs

# def make_mi_scores(X, y, discrete_features):
#     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
#     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
#     mi_scores = mi_scores.sort_values(ascending=False)
#     return mi_scores

In [None]:
# features = ["highway_mpg", "engine_size", "horsepower", "curb_weight"]

# X = df.copy()
# y = X.pop('price')
# X = X.loc[:, features]

# # Standardize
# X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

In [None]:
# from sklearn.decomposition import PCA

# # Create principal components
# pca = PCA()
# X_pca = pca.fit_transform(X_scaled)

# # Convert to dataframe
# component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
# X_pca = pd.DataFrame(X_pca, columns=component_names)

# X_pca.head()

In [None]:
# loadings = pd.DataFrame(
#     pca.components_.T,  # transpose the matrix of loadings
#     columns=component_names,  # so the columns are the principal components
#     index=X.columns,  # and the rows are the original features
# )
# loadings

In [None]:
# # Look at explained variance
# plot_variance(pca);

In [None]:
# mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
# mi_scores

In [None]:
# # Show dataframe sorted by PC3
# idx = X_pca["PC3"].sort_values(ascending=False).index
# cols = ["make", "body_style", "horsepower", "curb_weight"]
# df.loc[idx, cols]

In [None]:
# df["sports_or_wagon"] = X.curb_weight / X.horsepower
# sns.regplot(x="sports_or_wagon", y='price', data=df, order=2);

In [None]:
# # Setup feedback system
# from learntools.core import binder
# binder.bind(globals())
# from learntools.feature_engineering_new.ex5 import *

# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import seaborn as sns
# from sklearn.decomposition import PCA
# from sklearn.feature_selection import mutual_info_regression
# from sklearn.model_selection import cross_val_score
# from xgboost import XGBRegressor

# # Set Matplotlib defaults
# plt.style.use("seaborn-v0_8-whitegrid")
# plt.rc("figure", autolayout=True)
# plt.rc(
#     "axes",
#     labelweight="bold",
#     labelsize="large",
#     titleweight="bold",
#     titlesize=14,
#     titlepad=10,
# )


# def apply_pca(X, standardize=True):
#     # Standardize
#     if standardize:
#         X = (X - X.mean(axis=0)) / X.std(axis=0)
#     # Create principal components
#     pca = PCA()
#     X_pca = pca.fit_transform(X)
#     # Convert to dataframe
#     component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
#     X_pca = pd.DataFrame(X_pca, columns=component_names)
#     # Create loadings
#     loadings = pd.DataFrame(
#         pca.components_.T,  # transpose the matrix of loadings
#         columns=component_names,  # so the columns are the principal components
#         index=X.columns,  # and the rows are the original features
#     )
#     return pca, X_pca, loadings


# def plot_variance(pca, width=8, dpi=100):
#     # Create figure
#     fig, axs = plt.subplots(1, 2)
#     n = pca.n_components_
#     grid = np.arange(1, n + 1)
#     # Explained variance
#     evr = pca.explained_variance_ratio_
#     axs[0].bar(grid, evr)
#     axs[0].set(
#         xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
#     )
#     # Cumulative Variance
#     cv = np.cumsum(evr)
#     axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
#     axs[1].set(
#         xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
#     )
#     # Set up figure
#     fig.set(figwidth=8, dpi=100)
#     return axs


# def make_mi_scores(X, y):
#     X = X.copy()
#     for colname in X.select_dtypes(["object", "category"]):
#         X[colname], _ = X[colname].factorize()
#     # All discrete features should now have integer dtypes
#     discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
#     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
#     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
#     mi_scores = mi_scores.sort_values(ascending=False)
#     return mi_scores


# def score_dataset(X, y, model=XGBRegressor()):
#     # Label encoding for categoricals
#     for colname in X.select_dtypes(["category", "object"]):
#         X[colname], _ = X[colname].factorize()
#     # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
#     score = cross_val_score(
#         model, X, y, cv=5, scoring="neg_mean_squared_log_error",
#     )
#     score = -1 * score.mean()
#     score = np.sqrt(score)
#     return score


# df = pd.read_csv("../input/fe-course-data/ames.csv")

Correct:

The first component, PC1, seems to be a kind of "size" component, similar to what we saw in the tutorial: all of the features have the same sign (positive), indicating that this component is describing a contrast between houses having large values and houses having small values for these features.

The interpretation of the third component PC3 is a little trickier. The features GarageArea and YearRemodAdd both have near-zero loadings, so let's ignore those. This component is mostly about TotalBsmtSF and GrLivArea. It describes a contrast between houses with a lot of living area but small (or nonexistent) basements, and the opposite: small houses with large basements.

## 5.0 Target Encoding

A target encoding is any kind of encoding that replaces a feature's categories with some number derived from the target.

A simple and effective version is to apply a group aggregation from Lesson 3, like the mean. Using the Automobiles dataset, this computes the average price of each vehicle's make:

In [None]:
# autos["make_encoded"] = autos.groupby("make")["price"].transform("mean")

This kind of target encoding is sometimes called a mean encoding. Applied to a binary target, it's also called bin counting. (Other names you might come across include: likelihood encoding, impact encoding, and leave-one-out encoding.)

#### Smoothing

An encoding like this presents a couple of problems, however. First are unknown categories. Target encodings create a special risk of overfitting, which means they need to be trained on an independent "encoding" split. When you join the encoding to future splits, Pandas will fill in missing values for any categories not present in the encoding split. These missing values you would have to impute somehow.

Second are rare categories. When a category only occurs a few times in the dataset, any statistics calculated on its group are unlikely to be very accurate. In the Automobiles dataset, the mercury make only occurs once. The "mean" price we calculated is just the price of that one vehicle, which might not be very representative of any Mercuries we might see in the future. Target encoding rare categories can make overfitting more likely.

A solution to these problems is to add smoothing. The idea is to blend the in-category average with the overall average. Rare categories get less weight on their category average, while missing categories just get the overall average.

In pseudocode:



In [None]:
# encoding = weight * in_category + (1 - weight) * overall

where weight is a value between 0 and 1 calculated from the category frequency.

An easy way to determine the value for weight is to compute an m-estimate:

In [None]:
# weight = n / (n + m)

where n is the total number of times that category occurs in the data. The parameter m determines the "smoothing factor". Larger values of m put more weight on the overall estimate.

In the Automobiles dataset there are three cars with the make chevrolet. If you chose m=2.0, then the chevrolet category would be encoded with 60% of the average Chevrolet price plus 40% of the overall average price.

In [None]:
# chevrolet = 0.6 * 6000.00 + 0.4 * 13285.03

When choosing a value for m, consider how noisy you expect the categories to be. Does the price of a vehicle vary a great deal within each make? Would you need a lot of data to get good estimates? If so, it could be better to choose a larger value for m; if the average price for each make were relatively stable, a smaller value could be okay.

#### Use Cases for Target Encoding
Target encoding is great for:

High-cardinality features: A feature with a large number of categories can be troublesome to encode: a one-hot encoding would generate too many features and alternatives, like a label encoding, might not be appropriate for that feature. A target encoding derives numbers for the categories using the feature's most important property: its relationship with the target.

Domain-motivated features: From prior experience, you might suspect that a categorical feature should be important even if it scored poorly with a feature metric. A target encoding can help reveal a feature's true informativeness.

In [None]:
# X = df.copy()
# y = X.pop('Rating')

# X_encode = X.sample(frac=0.25)
# y_encode = y[X_encode.index]
# X_pretrain = X.drop(X_encode.index)
# y_train = y[X_pretrain.index]

In [None]:
# from category_encoders import MEstimateEncoder

# # Create the encoder instance. Choose m to control noise.
# encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)

# # Fit the encoder on the encoding split.
# encoder.fit(X_encode, y_encode)

# # Encode the Zipcode column to create the final training data
# X_train = encoder.transform(X_pretrain)

In [None]:
# # Encoding split
# X_encode = df.sample(frac=0.20, random_state=0)
# y_encode = X_encode.pop("SalePrice")

# # Training split
# X_pretrain = df.drop(X_encode.index)
# y_train = X_pretrain.pop("SalePrice")

In [None]:
# X = df.copy()
# y = X.pop("SalePrice")
# score_base = score_dataset(X, y)
# score_new = score_dataset(X_train, y_train)

# print(f"Baseline Score: {score_base:.4f} RMSLE")
# print(f"Score with Encoding: {score_new:.4f} RMSLE")

Based on your understanding of how mean-encoding works, can you explain how XGBoost was able to get an almost perfect fit after mean-encoding the count feature?

Since Count never has any duplicate values, the mean-encoded Count is essentially an exact copy of the target. In other words, mean-encoding turned a completely meaningless feature into a perfect feature.

Now, the only reason this worked is because we trained XGBoost on the same set we used to train the encoder. If we had used a hold-out set instead, none of this "fake" encoding would have transferred to the training data.

The lesson is that when using a target encoder it's very important to use separate data sets for training the encoder and training the model. Otherwise the results can be very disappointing!