In [None]:
import cudf
import cuml
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump

In [None]:
# Read the processed training set into a cuDF DataFrame.
train_parquet_path = "../data/processed/train.parquet"
X = cudf.read_parquet(train_parquet_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
X.head(n=5)

In [None]:
# Select the "target" column without mutating X.
# X.pop("target") removed the column in place, which made this cell fail with
# a KeyError when re-run and broke the later X.drop(columns="target") in the
# join cell. X is rebuilt from the joined frame further down, so leaving the
# column in place here is safe.
y = X["target"]

In [None]:
# Load the raw CSV (presumably the ground-truth labels, judging by the
# "train_gt" file name -- confirm) using its first column as the index.
# Note: this rebinds `y`, replacing whatever an earlier cell assigned to it.
ground_truth_csv = "../data/raw/train_gt.csv"
y = cudf.read_csv(ground_truth_csv, index_col=0, skiprows=0)

In [None]:
# Attach the columns of `y` to the features by index.
# errors="ignore": "target" may already have been removed from X by an earlier
# cell; without it a fresh Restart & Run All raises KeyError here.
df = X.drop(columns="target", errors="ignore").join(y)
df.head()

In [None]:
# Keep only complete rows, then split the cleaned frame back into X and y.
df = df.dropna()

# Re-derive y from the cleaned frame so it has exactly the same rows (and row
# order) as X. The y loaded from the CSV may still contain rows that the join
# or dropna() removed, which would misalign pipe.fit(X, y).
y = df[list(y.columns)]

# NOTE(review): assumes "5408" is the label column contributed by `y` -- confirm.
X = df.drop(columns="5408")

In [None]:
from cuml.decomposition import PCA
from cuml.preprocessing import SimpleImputer, StandardScaler
from imblearn.pipeline import Pipeline

# Preprocessing chain: mean imputation -> standardisation -> PCA to 100 components.
mean_imputer = SimpleImputer(strategy="mean")
standardizer = StandardScaler()
pca_reducer = PCA(n_components=100)

steps = [
    ("imputer", mean_imputer),
    ("scaler", standardizer),
    ("reducer", pca_reducer),
    # Optional under-sampling stage, currently disabled:
    # ("sampler", RandomUnderSampler()),
]

# imblearn's Pipeline is used here (presumably so a sampler step can be added).
pipe = Pipeline(steps=steps)

In [None]:
# Fit every pipeline step on the cleaned features and labels.
# NOTE(review): the UMAP cell below is fit on X directly, not on this
# pipeline's output -- confirm that is intended.
pipe.fit(X=X, y=y)

In [None]:
from cuml.manifold.umap import UMAP as cuUMAP

# Fixed seed so repeated runs give a consistent embedding (and therefore a
# consistent score and plot).
UMAP_SEED = 42

# Fit UMAP on the feature frame, then project the same rows into the embedding.
# NOTE(review): UMAP is fit on X directly, not on the output of `pipe` --
# confirm that is intended.
trained_UMAP = cuUMAP(n_neighbors=10, random_state=UMAP_SEED).fit(X)
X_embedded = trained_UMAP.transform(X)

# Trustworthiness measures how well local neighbourhoods of X are preserved
# in the embedding.
cu_score = cuml.metrics.trustworthiness(X, X_embedded)

print(" cuml's trustworthiness score : ", cu_score)

# Persist the fitted model in the current working directory.
dump(trained_UMAP, "UMAP.model")

# To reload the model, uncomment the lines below (`load` is not imported at
# the top of the notebook, only `dump` is):
# from joblib import load
# loaded_model = load("UMAP.model")

In [None]:
# Convert to host-side objects for plotting: the embedding becomes a NumPy
# array and the cleaned frame a pandas DataFrame (despite its `_np` suffix).
X_emb, df_np = X_embedded.to_numpy(), df.to_pandas()

In [None]:
# Plot the first two UMAP dimensions, coloured by the "5408" column of the
# cleaned frame. An explicit Axes is used so the figure carries its own title
# and axis labels and stands alone when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x=X_emb[:, 0], y=X_emb[:, 1], hue=df_np["5408"], ax=ax)
ax.set(
    title='UMAP embedding coloured by column "5408"',
    xlabel="UMAP dimension 1",
    ylabel="UMAP dimension 2",
);