In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the training features and the training labels from the data folder
# (paths are relative to the notebook's location).
train_values_path = "../data/train_values.csv"
train_labels_path = "../data/train_labels.csv"

X_train = pd.read_csv(train_values_path)
y_train = pd.read_csv(train_labels_path)

## Only one value or very skewed distribution

In [None]:
# Summary statistics of the training features. With default arguments pandas
# reports only on the numeric columns of a mixed-dtype frame.
X_train.describe()

In [None]:
# Number of distinct (non-null) values per column; a count of 1 would flag a
# constant column. nunique() is called on the frame directly because the
# X_train[X_train.columns] self-selection only produced a redundant copy.
nu = X_train.nunique()
nu

### Binary columns - mostly one value occurring?

There is no column where just one value occurs. However, in the pie charts from the data exploration, some of the boolean values were displayed with a (rounded) percentage of 0%, so we should examine those more closely.

In [None]:
# Columns whose name starts with "has" are treated as the binary flag columns.
binary_cols = [col for col in X_train.columns if col.startswith("has")]

# Share of rows taken by the rarer of the two values (0 / 1) in each flag column.
# reindex([0, 1], fill_value=0.0) makes a value that never occurs count as a
# share of 0 instead of raising a KeyError on the missing label.
rare_counts = [
    X_train[col].value_counts(normalize=True).reindex([0, 1], fill_value=0.0).min()
    for col in binary_cols
]

# Build the frame from a dict so "proportion" stays numeric; stacking names and
# floats in a single NumPy array would coerce the floats to strings first.
rare_binary_values = pd.DataFrame(
    {"name": binary_cols, "proportion": rare_counts}
).astype({"proportion": float})

# Rarest flags first.
rare_binary_values.sort_values(by="proportion")

There are 8 features where the less frequent boolean value occurs in less than 1% of the rows. These could be candidates for removal.

### Categorical values

In [None]:
# One pie chart per categorical (object-dtype) column showing the relative
# frequency of each category.
cat_cols = X_train.select_dtypes(include="object").columns
n = len(cat_cols)

# Ceiling division so that no empty extra row is allocated when the number of
# columns is a multiple of the grid width; at least one row keeps
# plt.subplots valid even if there are no categorical columns.
n_plot_cols = 3
n_plot_rows = max(1, (n + n_plot_cols - 1) // n_plot_cols)
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(20, 20), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, cat_cols):
    # Draw on the explicit axes instead of relying on the implicit current axes.
    X_train[col].value_counts(normalize=True).plot.pie(autopct="%1.1f%%", ax=ax)
    ax.set_title(col)

# Hide grid slots that did not receive a chart.
for unused_ax in flat_axes[n:]:
    unused_ax.set_axis_off()

plan_configuration and legal_ownership_status have a very high proportion of the most frequent value.

In [None]:
cat_cols_investigate = ["plan_configuration", "legal_ownership_status"]
# Attach the labels to the features via the shared building_id key.
train_df = X_train.merge(y_train, on="building_id")

# Absolute category counts for the investigated columns, side by side,
# with the count written on top of each bar.
fig, axes = plt.subplots(
    1, len(cat_cols_investigate), figsize=(20, 10), squeeze=False
)
for ax, col in zip(axes.ravel(), cat_cols_investigate):
    sns.countplot(data=train_df, x=col, ax=ax)
    ax.bar_label(ax.containers[0])

In [None]:
# Category counts split by damage_grade, one stacked panel per investigated
# column; every bar group (one container per hue level) gets count labels.
fig, axes = plt.subplots(
    len(cat_cols_investigate), 1, figsize=(20, 20), squeeze=False
)
for ax, col in zip(axes.ravel(), cat_cols_investigate):
    sns.countplot(data=train_df, x=col, hue="damage_grade", ax=ax)
    for bars in ax.containers:
        ax.bar_label(bars)

## Duplicated/strongly correlated columns

Correlation between the height of a building and its number of floors: possibly remove one of those?

Correlation between `has_secondary_use` and `has_secondary_use_agriculture`: possibly remove one of those?

More correlations? **TODO:** extend the multivariate data exploration to find out.