In [None]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import umap
from sklearn.preprocessing import StandardScaler

# Notebook-wide plotting defaults: seaborn "notebook" context, viridis palette
# and a 14x10 default figure size.
sns.set_theme(
    context="notebook",
    palette="viridis",
    rc={"figure.figsize": (14, 10)},
)

In [None]:
# Scan the training feature file lazily and materialize it right away.
# "?" cells are parsed as nulls, and a row-number column named "index" is
# added so the features can later be joined with the labels.
X = (
    pl.scan_csv(
        "../data/train_data.data",
        null_values="?",
        row_index_name="index",
    )
    .collect()
)
X.head()

In [None]:
# Load the ground-truth file and encode the labels in a single pipeline:
# the string column "inactive" is mapped ("inactive" -> 0, "active" -> 1) into
# a new Int8 column called "replaced", and the string column is then dropped.
# NOTE(review): with skip_rows=1 the header ends up containing the name
# "inactive", which looks like a data value rather than a real column title --
# confirm that no sample is being consumed as the header row.
y = (
    pl.scan_csv("../data/train_gt.csv", skip_rows=1, row_index_name="index")
    .collect()
    .with_columns(
        replaced=pl.col("inactive").replace(
            {"inactive": 0, "active": 1}, return_dtype=pl.Int8
        )
    )
    .drop("inactive")
)
y.head()

In [None]:
# Attach the labels to the features by matching rows on the shared "index"
# column (inner join, which is also the polars default).
joined_data = X.join(y, on="index", how="inner")
joined_data

In [None]:
# Remove the two unwanted columns, then discard every row that still contains
# a null. The membership guard keeps the cell re-runnable: the previous
# in-place drops raised as soon as the columns had already been removed.
unwanted_columns = [
    name for name in ("5409", "0_right") if name in joined_data.columns
]
joined_data = joined_data.drop(unwanted_columns).drop_nulls()
# Show the shape of the cleaned frame; X is not modified by this cell, so its
# shape says nothing about the effect of the cleaning.
joined_data.shape

In [None]:
# Preview the first rows of the joined frame.
joined_data.head()

In [None]:
# Split the joined frame into two feature blocks purely by column position.
# NOTE(review): column 4828 sits between the two slices and lands in neither
# block -- confirm that skipping it is intentional.
# NOTE(review): the open-ended second slice also picks up the final column of
# joined_data, which the plotting cells use as the class label -- confirm the
# label is meant to travel with the 3D block.
first_2d_col, end_2d_col, first_3d_col = 2, 4828, 4829
X_2d_features = joined_data[:, first_2d_col:end_2d_col]
X_3d_features = joined_data[:, first_3d_col:]
print(f"shape of 2D features: {X_2d_features.shape}")
print(f"shape of 3D features: {X_3d_features.shape}")

In [None]:
# Display the 2D feature block.
X_2d_features

In [None]:
# Preview joined_data again (same call as the earlier preview cell).
joined_data.head()

In [None]:
# UMAP reducer with a fixed random_state; the same object is reused for both
# embeddings computed further down.
reducer = umap.UMAP(random_state=42)

In [None]:
# Standardize each feature block with its own scaler and wrap the resulting
# numpy array back into a polars frame.
X_2d_features_scaled = pl.from_numpy(
    StandardScaler().fit_transform(X_2d_features)
)
# X_3d_features was cut with an open-ended slice, so its last column is the
# class label (the same column the scatter plots use for colouring). Leave it
# out here so the label is neither rescaled nor fed into the embedding.
X_3d_features_scaled = pl.from_numpy(
    StandardScaler().fit_transform(X_3d_features[:, :-1])
)

In [None]:
# Preview the first rows of the standardized 2D feature block.
X_2d_features_scaled.head()

In [None]:
# Embed each standardized feature block with UMAP; the same reducer object is
# used for both fit_transform calls.
embedding_2D = reducer.fit_transform(X_2d_features_scaled.to_numpy())
print(f"shape of 2D embedding: {embedding_2D.shape}")
embedding_3D = reducer.fit_transform(X_3d_features_scaled.to_numpy())
# Message corrected: this line reports the embedding of the 3D feature block
# (it was previously a verbatim copy of the 2D message).
print(f"shape of 3D embedding: {embedding_3D.shape}")

In [None]:
# Side-by-side scatter plots of the two UMAP embeddings, coloured by the class
# label stored in the last column of joined_data.
point_labels = joined_data[:, -1].to_numpy()

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# One (title, embedding) pair per panel, so both panels are drawn by the same
# code instead of two copy-pasted blocks.
panels = (("2D features", embedding_2D), ("3D features", embedding_3D))
for ax, (panel_title, embedding) in zip(axes, panels):
    ax.scatter(embedding[:, 0], embedding[:, 1], c=point_labels)
    ax.set_aspect("equal", "datalim")
    ax.set_title(panel_title)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")

plt.tight_layout()
plt.show()

In [None]:
def draw_umap(
    data,
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    metric="euclidean",
    title="",
    labels=None,
):
    """Fit UMAP on ``data`` and scatter-plot the embedding coloured by class.

    Parameters
    ----------
    data
        Feature table supporting ``data[:, -1].to_numpy()`` (e.g. a polars
        DataFrame). When ``labels`` is None, the last column is taken as the
        class label (0 or 1): it colours the points and is excluded from the
        UMAP fit so the label cannot influence the embedding.
    n_neighbors, min_dist, n_components, metric
        Forwarded unchanged to ``umap.UMAP``. ``n_components`` must be 1, 2
        or 3 because only those layouts can be drawn.
    title : str
        Title placed on the plot.
    labels : array-like, optional
        Explicit class labels (0 or 1), one per row. When given, every column
        of ``data`` is used as a feature.

    Raises
    ------
    ValueError
        If ``n_components`` is not 1, 2 or 3.
    """
    # Validate before the fit: an unsupported dimensionality used to run the
    # whole embedding and then produce an empty figure.
    if n_components not in (1, 2, 3):
        raise ValueError(
            f"n_components must be 1, 2 or 3 to be plotted, got {n_components!r}"
        )

    if labels is None:
        # Legacy calling convention: the label travels as the last column.
        labels = data[:, -1].to_numpy()
        features = data[:, :-1]
    else:
        labels = np.asarray(labels)
        features = data

    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
    )
    u = fit.fit_transform(features)

    # Class 0 is drawn in green, class 1 in red.
    color_map = {0: "green", 1: "red"}
    color = np.vectorize(color_map.get)(labels)

    fig = plt.figure()
    if n_components == 1:
        # A 1-D embedding is spread along the y axis by row position.
        ax = fig.add_subplot(111)
        ax.scatter(u[:, 0], range(len(u)), c=color)
    elif n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:, 0], u[:, 1], c=color)
    else:
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(u[:, 0], u[:, 1], u[:, 2], c=color, s=100)
    ax.set_title(title, fontsize=18)

In [None]:
# Three-component UMAP view of the 3D feature block. X_3d_features still has
# the class label as its last column, which draw_umap reads to colour points.
# NOTE(review): the raw (unscaled) block is passed here, whereas the earlier
# embeddings were computed on standardized features -- confirm this is intended.
draw_umap(
    data=X_3d_features,
    title="3D projection of the dataset",
    n_components=3,
    min_dist=0.4,
    metric="euclidean",
)