# Dimensionality Reduction — PCA/t-SNE/UMAP

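Reduce the feature matrix to two dimensions for visualization (dimensionality reduction is also useful as a pre-processing step). Compare the 2D PCA projection with the t-SNE and UMAP embeddings, and report the fraction of variance explained by the leading PCA components.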

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, warnings

# Download the helper module next to the notebook so it can be imported below.
# (`!` is an IPython shell escape; `-q` runs wget quietly.)
!wget -q https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup_utils.py
import unsup_utils as utils
# Remote location of the dataset; it is handed to pd.read_csv below.
csv_path = "https://raw.githubusercontent.com/Jihun-ust/ust-mail-557/main/Unsupervised/unsup.csv"
# Silence every warning raised from this point on.
warnings.filterwarnings("ignore")

df = pd.read_csv(csv_path)
# NOTE(review): the meaning of the three return values is defined in unsup_utils.
# Presumably X is the numeric feature matrix, cols its column names and sc the
# fitted scaler, with use_emb=True adding embedding features — confirm there.
X, cols, sc = utils.feature_matrix(df, use_emb=True)

# PCA: fit 10 components, print the share of variance each one explains,
# then plot the data projected onto the two leading components.
from sklearn.decomposition import PCA

p = PCA(n_components=10, random_state=42)
p.fit(X)
explained_ratio = np.round(p.explained_variance_ratio_, 4)
print("PCA explained variance ratio (first 10):", explained_ratio)

# Project onto all fitted components, then keep only the first two columns for the 2D view.
X_projected = p.transform(X)
X2 = X_projected[:, :2]

# One integer code per row, derived from the doc_type categories of the full column.
pca_labels = pd.Categorical(df["doc_type"]).codes
utils.plot_xy(X2, labels=pca_labels, title="PCA 2D — colored by doc_type")

# t-SNE (subset for speed)
# n=2000 is forwarded to the helper (presumably the subset size — confirm in unsup_utils).
# NOTE(review): the labels below are paired with the embedding by position, which
# assumes tsne_2d embeds the FIRST len(Xts) rows of X in their original order.
# If the helper subsamples randomly, the colors would be misaligned — confirm.
Xts = utils.tsne_2d(X, n=2000)

# Encode doc_type on the full column first and slice afterwards. Encoding only the
# sliced rows would infer the categories from the subset alone, so a doc_type missing
# from the subset would shift the integer codes relative to the full-data PCA plot.
tsne_labels = pd.Categorical(df["doc_type"]).codes[: len(Xts)]
utils.plot_xy(Xts, title="t-SNE 2D — colored by doc_type", labels=tsne_labels)

# UMAP (if available)
# The helper returns a pair; a None embedding is treated as "UMAP could not be run".
# `name` is not used in this section but stays bound for any later cells.
Xu, name = utils.try_umap_2d(X, n=2000)
if Xu is None:
    print("umap-learn not available; skipping UMAP.")
else:
    # Encode doc_type on the full column first and slice afterwards, so each doc_type
    # keeps the same integer code as in the full-data encoding even when the subset
    # lacks some categories (encoding the sliced rows would renumber them).
    # NOTE(review): positional pairing assumes the helper embeds the FIRST len(Xu)
    # rows of X in order — confirm in unsup_utils.
    umap_labels = pd.Categorical(df["doc_type"]).codes[: len(Xu)]
    utils.plot_xy(Xu, title="UMAP 2D — colored by doc_type", labels=umap_labels)