In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from category_encoders.binary import BinaryEncoder
from category_encoders.hashing import HashingEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
)

In [None]:
# Load features and labels, keyed by building_id so both frames share one index.
X = pd.read_csv("../data/train_values.csv").set_index("building_id")
y = pd.read_csv("../data/train_labels.csv").set_index("building_id")

# Apply seaborn's default theme to every plot produced below.
sns.set_theme()

In [None]:
from dmgpred.cleaning import clean

# Only the first element of clean()'s result is used here
# (presumably the cleaned feature frame -- confirm against dmgpred.cleaning).
cleaned_parts = clean(X)
X = cleaned_parts[0]

In [None]:
# Summarise the columns, dtypes and non-null counts of X after cleaning.
X.info()

When looking at the categoricals, there are different groups. There are the three geo_ids that have been converted to categoricals. There is count_families, which has also been converted to a categorical with the three groups 0, 1, 2+ (ordinal). And there are the columns legal_ownership_status, land_surface_condition, foundation_type, roof_type, ground_floor_type, other_floor_type, position, and plan_configuration, which were categorical from the beginning.

In [None]:
# Columns whose name ends in "id" (the geo-level IDs) are handled separately.
id_cols = [name for name in X.columns if name.endswith("id")]
# Remaining category-dtype columns, without the ID columns ...
cat_cols = [
    name
    for name in X.select_dtypes(include="category").columns
    if name not in id_cols
]
# ... and without count_families, which is encoded on its own.
cat_cols.remove("count_families")

To start with, I will only look at the initial categorical columns and not at geo_ids and count_families. 

In [None]:
# Keep the geo IDs as plain integers for the first experiments.
X[id_cols] = X[id_cols].astype("int")
# OrdinalEncoder expects a 2-D input of shape (n_samples, n_features).
# Passing a single-column DataFrame makes it see one feature with n_samples rows;
# wrapping the Series in a list would instead be read as one sample with
# n_samples features, and every value would be encoded as 0.
X_fam_encoded = OrdinalEncoder().fit_transform(X[["count_families"]])
X["count_families"] = X_fam_encoded[:, 0]

### Category Encoders

Currently: one-hot encoding for the categorical columns, except for count_families, which is ordinal-encoded.  
Reference: https://medium.com/@ranjanrgia/simplifying-encoder-choosing-for-categorical-variables-868bef970f13

In [None]:
def __plot_cat_encoder_importances(encoder, X, y, cols):
    """Plot random-forest feature importances after encoding ``cols``.

    The encoder is fitted on ``X[cols]`` without the target and the encoded
    columns replace the original ones. A small random forest is then trained
    on a stratified train/test split; its test accuracy is printed and two
    horizontal bar charts are shown: one for all features and one restricted
    to the encoded categorical features. The caller's ``X`` is not modified.

    Args:
        encoder: Unfitted encoder providing ``fit_transform``. If it does not
            return a DataFrame, it must also provide
            ``get_feature_names_out``.
        X: Feature DataFrame containing ``cols`` as well as the columns
            ``area_percentage`` and ``height_percentage``.
        y: Target aligned with ``X``; also used to stratify the split.
        cols: List of column names to encode.
    """
    X_cat_encoded = encoder.fit_transform(X[cols])
    if isinstance(X_cat_encoded, pd.DataFrame):
        X_cat_encoded_df = X_cat_encoded
    else:
        # Encoders may return a sparse matrix or a dense array.
        if hasattr(X_cat_encoded, "toarray"):
            X_cat_encoded = X_cat_encoded.toarray()
        # Reuse X's index: with a default RangeIndex the column assignment
        # below would align on labels and fill the encoded columns with
        # NaN / values belonging to other rows.
        X_cat_encoded_df = pd.DataFrame(
            X_cat_encoded,
            columns=encoder.get_feature_names_out(cols),
            index=X.index,
        )
    encoded_cols = list(X_cat_encoded_df.columns)

    # drop() returns a new frame, so the caller's X stays untouched.
    X_encoded = X.drop(cols, axis=1)
    X_encoded[encoded_cols] = X_cat_encoded_df

    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y, stratify=y, random_state=0
    )
    # Work on explicit copies so the assignments below do not write into
    # slices of X_encoded (avoids chained-assignment warnings).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Fit the power transform on the training part only and apply it to both.
    normalize_cols = ["area_percentage", "height_percentage"]
    normalizer = PowerTransformer("yeo-johnson").fit(X_train[normalize_cols])
    X_train[normalize_cols] = normalizer.transform(X_train[normalize_cols])
    X_test[normalize_cols] = normalizer.transform(X_test[normalize_cols])

    # Fixed seed so the printed score and the plots are reproducible.
    rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=0)
    rf.fit(X_train, y_train)
    print(rf.score(X_test, y_test))

    importances = pd.Series(
        rf.feature_importances_, index=rf.feature_names_in_
    ).sort_values(ascending=True)
    importances.plot.barh(figsize=(10, 10))
    plt.title("Overview of all feature importances")
    plt.show()
    importances[encoded_cols].plot.barh(figsize=(10, 5))
    plt.title("Overview of categorical feature importances")
    plt.show()

In [None]:
# Baseline: one-hot encode the original categorical columns.
oh_encoder = OneHotEncoder()
__plot_cat_encoder_importances(encoder=oh_encoder, X=X, y=y, cols=cat_cols)

It can be seen that with the current encoding the categorical values have an extremely low importance.

### Binary Encoding

Similar to one-hot encoding, but stores the categories as binary bit strings.  
For tree-based algorithms it is better than one-hot encoding.

In [None]:
# Binary encoding of the original categorical columns.
binary_encoder = BinaryEncoder()
__plot_cat_encoder_importances(encoder=binary_encoder, X=X, y=y, cols=cat_cols)

### Feature Hashing Encoding

Uses a hash function to represent categories  
Good for handling large datasets with high cardinality features

In [None]:
# Feature hashing with the encoder's default settings.
hashing_encoder = HashingEncoder()
__plot_cat_encoder_importances(encoder=hashing_encoder, X=X, y=y, cols=cat_cols)
# Show which input columns the fitted encoder saw.
hashing_encoder.get_feature_names_in()

Here the column roof_type gains importance (up to 0.2), as do land_surface_condition and legal_ownership_status (both slightly under 0.075).

### Target Encoder 

To use the target encoder, the function needs to be changed, because the target encoder also needs y in order to be fitted.

In [None]:
def __plot_target_encoder_importances(encoder, X, y, cols):
    """Plot random-forest feature importances after target-based encoding.

    The data is split first (stratified on ``y``); the encoder is then fitted
    on the training part only, using the training target, and applied to both
    parts. Each column in ``cols`` is replaced in place by its encoded
    version. A small random forest is trained, its test accuracy is printed,
    and two horizontal bar charts are shown: one for all features and one
    restricted to ``cols``. The caller's ``X`` is not modified.

    Args:
        encoder: Unfitted supervised encoder providing
            ``fit_transform(X, y)`` and ``transform(X)`` and returning one
            encoded column per input column.
        X: Feature DataFrame containing ``cols`` as well as the columns
            ``area_percentage`` and ``height_percentage``.
        y: Target aligned with ``X``; also used to stratify the split.
        cols: List of column names to encode.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=0
    )
    # Work on explicit copies so the assignments below do not write into
    # slices of X (avoids chained-assignment warnings).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Fit the power transform on the training part only and apply it to both.
    normalize_cols = ["area_percentage", "height_percentage"]
    normalizer = PowerTransformer("yeo-johnson").fit(X_train[normalize_cols])
    X_train[normalize_cols] = normalizer.transform(X_train[normalize_cols])
    X_test[normalize_cols] = normalizer.transform(X_test[normalize_cols])

    # Training rows must be encoded with the target available: encoders such
    # as LeaveOneOutEncoder only apply their train-time behaviour when y is
    # passed, which fit_transform(X, y) does. Test rows are encoded without y.
    X_train[cols] = encoder.fit_transform(X_train[cols], y_train)
    X_test[cols] = encoder.transform(X_test[cols])

    # Fixed seed so the printed score and the plots are reproducible.
    rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=0)
    rf.fit(X_train, y_train)
    print(rf.score(X_test, y_test))

    importances = pd.Series(
        rf.feature_importances_, index=rf.feature_names_in_
    ).sort_values(ascending=True)
    importances.plot.barh(figsize=(10, 10))
    plt.title("Overview of all feature importances")
    plt.show()
    # The encoded columns keep their original names, so ``cols`` selects them.
    importances[list(cols)].plot.barh(figsize=(10, 5))
    plt.title("Overview of categorical feature importances")
    plt.show()

In [None]:
# Target (mean) encoding of the original categorical columns.
te = TargetEncoder()
__plot_target_encoder_importances(encoder=te, X=X, y=y, cols=cat_cols)

Again the feature importance for some features increased notably.

### Leave One Out Encoder

Very similar to target encoding, but excludes the current row’s target when calculating the mean target for a level, to reduce the effect of outliers.

In [None]:
# Leave-one-out encoding of the original categorical columns.
loo = LeaveOneOutEncoder()
__plot_target_encoder_importances(encoder=loo, X=X, y=y, cols=cat_cols)

### James Stein Encoder

Useful in reducing overfitting in small datasets or categorical variables with many levels

Encoding is aimed to improve the estimation of the category’s mean target by shrinking them towards a more central average. The only hyperparameter in the formula is B — the power of shrinking.

In [None]:
# James-Stein encoding of the original categorical columns.
js = JamesSteinEncoder()
__plot_target_encoder_importances(encoder=js, X=X, y=y, cols=cat_cols)

Foundation type increases in feature importance (to about 0.28).

## Include Geo IDs

Now I want to include the Geo Ids for the case that we do not want to keep them as integers.

In [None]:
# Encode the geo IDs together with the original categorical columns.
cols = cat_cols + id_cols
# Turn the ID columns back into categories before encoding them.
X[id_cols] = X[id_cols].astype("category")
js = JamesSteinEncoder()
__plot_target_encoder_importances(encoder=js, X=X, y=y, cols=cols)

The geo IDs increased in importance. However, the classifier might overfit here, especially to geo_level_3_id, for which there are only very few buildings per ID.

In [None]:
# Feature hashing over categorical and geo-ID columns, using 11 output components.
he = HashingEncoder(n_components=11)
__plot_cat_encoder_importances(encoder=he, X=X, y=y, cols=cols)
# Show which input columns the fitted encoder saw.
he.get_feature_names_in()

Geo_level_2_id becomes the most important categorical feature (about 0.14) and other_floor_type the second most important categorical feature (about 0.13)