In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's "ticks" theme to the plots drawn after this call.
sns.set_theme(style="ticks")

In [None]:
# Load the processed training set from its parquet file.
X = pd.read_parquet(path="../data/processed/train.parquet")

In [None]:
# Cast every column to 64-bit floats so all features share one numeric dtype.
X = X.astype(np.float64)

In [None]:
# Per-feature kurtosis (pandas reports excess kurtosis, i.e. 0 for a normal
# distribution), summarised as a horizontal letter-value plot and saved to disk.
kurt = X.kurt()
kurt_ax = sns.boxenplot(data=kurt, orient="h")
kurt_ax.set_title("Distribution of Kurtosis across features")
kurt_ax.figure.savefig("../output/kurtosis.svg")

In [None]:
# Select the feature with the highest kurtosis by its column *label*.
# `Series.argmax()` returns an integer position, which only matches the column
# name when columns happen to be named "0", "1", ... in positional order.
# `Series.idxmax()` returns the label itself, so the lookup is correct for any
# column naming (and gives the same column in the positional-name case).
max_kurt = X.loc[:, kurt.idxmax()]
sns.boxenplot(max_kurt, orient="h")

In [None]:
# Summary statistics of `max_kurt`, the feature picked out for its kurtosis.
max_kurt.describe()

In [None]:
# The ten features with the largest kurtosis, and their summary statistics.
top_kurt = kurt.nlargest(n=10)
top_kurt_columns = list(top_kurt.index)
top_kurt_features = X[top_kurt_columns]
top_kurt_features.describe()

In [None]:
# Absolute gap between median and mean for each of the top-kurtosis features.
(top_kurt_features.median() - top_kurt_features.mean()).abs()

In [None]:
# Histogram of the per-feature kurtosis values, using 50 bins.
sns.histplot(data=kurt, bins=50)

A normal distribution has a kurtosis of 3. Here the reference value is 0, because pandas uses Fisher's definition of kurtosis, i.e. the excess kurtosis relative to the normal distribution.

However, some features in the data have very high kurtosis. This suggests that those features are not normally distributed and contain outliers (heavy tails). The non-normality of the data was also confirmed by the Kolmogorov–Smirnov test (`kstest`) in notebook `1_missing_values.ipynb`.

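Because the mean and standard deviation used by the standard scaler are sensitive to outliers, it may be wiser to use a robust scaling method (e.g. one based on the median and interquartile range) instead.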

In [None]:
# Histogram of per-feature skewness, titled and saved to disk.
skew_ax = sns.histplot(data=X.skew())
skew_ax.set_title("Distribution of Skewness across features")
skew_ax.figure.savefig("../output/skewness.svg")

In [None]:
# Horizontal box plot of the per-feature skewness values.
sns.boxplot(data=X.skew(), orient="h")