In [None]:
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from scipy.stats import kstest

# Apply seaborn's default theme to the plots drawn in this notebook.
sns.set_theme()

In [None]:
# Read the raw training CSV with polars ("?" is treated as null): scan lazily, then materialise.
# NOTE(review): X is rebound to a pandas frame read from the same file a few cells below.
X = pl.scan_csv("../data/raw/train_data.data", null_values="?").collect()

In [None]:
# Load the ground-truth labels: skip the file's first row, use the first column
# as index and name the value column "target".
labels = pd.read_csv(
    "../data/raw/train_gt.csv", index_col=0, names=["target"], skiprows=1
)
# Encode the string labels as 0/1. Labels outside the mapping become NaN,
# which the integer cast rejects.
y = labels["target"].map({"inactive": 0, "active": 1}).astype(np.int8)
y.head()

In [None]:
# Class balance: relative frequency of each encoded label.
y.value_counts(normalize=True)

In [None]:
# Preview the first rows of the polars frame.
X.head()

In [None]:
# Persist the polars frame as parquet.
# NOTE(review): the pandas cells below write to this same path and overwrite this file.
X.write_parquet("../data/processed/train.parquet")

In [None]:
# Re-read the raw CSV with pandas: "?" marks missing values and the first column becomes the index.
# This rebinds X from the polars frame to a pandas DataFrame.
X = pd.read_csv("../data/raw/train_data.data", na_values="?", index_col=0)

In [None]:
# Write the unprocessed pandas frame to parquet (it is read back below).
X.to_parquet("../data/processed/train.parquet")

In [None]:
# Preview the first rows of the pandas frame.
X.head()

In [None]:
# Reload the frame from the parquet file written above.
# NOTE(review): the cleaned, labelled frame is later written to this same path, so
# re-running from this cell after a full run loads the cleaned data, not the raw data.
X = pd.read_parquet("../data/processed/train.parquet")

In [None]:
# Column dtypes and memory footprint of the reloaded frame.
X.info()

In [None]:
# Preview the reloaded frame.
X.head()

In [None]:
# Number of missing values per column.
X.isna().sum(axis=0)

In [None]:
# Drop column "5409".
# NOTE(review): presumably an empty trailing column — confirm against the per-column NaN counts above.
# Not idempotent: re-running this cell raises KeyError once the column is gone.
X = X.drop(columns="5409")

In [None]:
# Remove rows in which every column is NaN.
X = X.dropna(how="all")

In [None]:
# Count rows that exactly duplicate an earlier row.
X.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row.
X = X.drop_duplicates()

In [None]:
# Re-check shape and dtypes after removing empty and duplicated rows.
X.info()

In [None]:
# Cast every column to the Arrow-backed 32-bit float dtype.
X = X.astype("float32[pyarrow]")

In [None]:
# Attach the labels by index; the left join keeps every row of X.
# Not idempotent: re-running fails because the "target" column already exists.
X = X.join(y, how="left")

In [None]:
# Confirm that the "target" column was added.
X.info()

In [None]:
# Overwrite the parquet file with the cleaned, labelled frame.
X.to_parquet("../data/processed/train.parquet")

In [None]:
# Show at most 20 columns when displaying wide frames.
pd.set_option("display.max_columns", 20)

In [None]:
# Mask the rows that contain at least one NaN, then take that subset of X.
row_has_nan = X.isna().any(axis=1)
X_missing = X.loc[row_has_nan]

X_missing.head()

In [None]:
# Label distribution among the rows that have missing values.
X_missing.target.value_counts()

It seems that the missing values occur only in the first feature group (columns 0–4825), which are the 2D electrostatic and surface-based features.

Also, the target is inactive for all observations with missing values.

In [None]:
# Total number of NaN cells in the whole frame.
total_nans = X.isna().sum().sum()

In [None]:
# Every NaN in X must fall inside columns "0".."4825" of the rows with missing values.
nans_in_first_group = X.loc[X_missing.index, "0":"4825"].isna().sum().sum()
assert total_nans == nans_in_first_group

In [None]:
# True if any column from "4826" through the last column has a NaN in the rows with missing values.
X.loc[X_missing.index, "4826":].isna().sum().any()

In [None]:
# Flag rows with at least one NaN (adds a boolean column to X in place).
X["has_missing"] = X.isna().any(axis=1)

In [None]:
# Complete cases only: rows without any NaN.
X_no_missing = X.dropna(how="any")

In [None]:
# One-sample Kolmogorov-Smirnov test of every feature against N(0, 1).
# H0: the feature is standard normally distributed; a p-value BELOW alpha
# rejects H0. kstest already returns the two-sided p-value, so it is compared
# with the full 5% level quoted in the text (not alpha / 2).
alpha = 0.05
# "target" and "has_missing" are bookkeeping columns, not features, so they
# are excluded from the normality check.
feature_cols = X_no_missing.columns.drop(["target", "has_missing"], errors="ignore")
not_normal = [
    col for col in feature_cols if kstest(X_no_missing[col], "norm").pvalue < alpha
]
# One summary line instead of one printed line per column.
print(
    f"{len(not_normal)} of {len(feature_cols)} features are not standard "
    f"normally distributed (KS test, alpha={alpha})"
)

At the 5% significance level, we can reject the null hypothesis that the features follow a standard normal distribution.

In [None]:
# Two-sample KS test for each column "4826".."5406": rows with missing values
# versus complete rows.
# NOTE(review): the range stops at column "5406" — confirm that this is the
# last feature that should be tested.
ks_results = {
    col: kstest(X_missing[str(col)], X_no_missing[str(col)])
    for col in range(4826, 5407)
}
# Columns for which equality of the two distributions is rejected at the 5% level.
rejected = {col: res for col, res in ks_results.items() if res.pvalue < 0.05}
not_rejected = len(ks_results) - len(rejected)

In [None]:
# Number of columns for which the two-sample test rejected equal distributions.
len(rejected)

In [None]:
# Number of columns for which the test did not reject.
not_rejected

For about 100 features we can reject the null hypothesis that the two distributions are identical; for the remaining ~480 features we cannot reject it at the 5% level. Note that with 581 uncorrected tests, roughly 29 rejections would be expected by chance alone.

For us this means that missing-value imputation is an option, but we could also simply drop these observations.

In [None]:
# Column numbers for which the test rejected equal distributions.
rejected.keys()

In [None]:
# Histogram of column "4826", one panel per value of has_missing.
X.hist(by="has_missing", column="4826", bins=50)

In summary, since for most features we cannot reject the hypothesis that observations with and without missing values share the same distribution, it is probably safe to drop these observations or to impute them with a simple or marginal method.

However, we do not know what happened with these observations. Because there is a pattern in the missing values (all 2D-based features are missing), there must be a reason for it: for example, the sensor may have been broken or the data may not have been recorded.

In [None]:
# Subset of rows whose target equals 1.
is_active = X["target"].eq(1)
actives = X.loc[is_active]
actives.head()

In [None]:
# Pairwise correlation of the first 5408 columns among the active rows, drawn as a heatmap.
active_features = actives.iloc[:, :5408]
corr = active_features.corr()
sns.heatmap(data=corr, center=0, cmap="coolwarm", square=True)