**Problem 1: tSNE dim reduction**

part A) Run a t-SNE library/package on the MNIST and 20NG datasets to obtain a 2-dimensional or 3-dimensional representation, and visualize the data by plotting the datapoints with one color per label. Try different values for perplexity, such as 5, 20 and 100.

In [None]:
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import fetch_20newsgroups, fetch_openml
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# MNIST dataset: keep only the first 2000 samples so t-SNE stays tractable.
# as_frame=False makes fetch_openml return NumPy arrays instead of pandas
# objects, so positional slicing and array math downstream behave the same
# regardless of the installed scikit-learn/pandas versions.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X_mnist = mnist.data[:2000]
# OpenML delivers the digit labels as strings; convert them to integers.
y_mnist = mnist.target[:2000].astype(int)

# 20 Newsgroups dataset: bag-of-words counts over the 1000 most frequent
# terms of the first 2000 training documents, densified for t-SNE.
newsgroups = fetch_20newsgroups(subset='train')
vectorizer = CountVectorizer(max_features=1000)
X_20ng = vectorizer.fit_transform(newsgroups.data[:2000]).toarray()
y_20ng = newsgroups.target[:2000]

# t-SNE visualization function
def plot_tsne(X, y, dataset_name, perplexities=(5, 20, 100)):
    """Show a 2-D t-SNE scatter plot of ``X`` for each perplexity value.

    Args:
        X: Feature matrix passed to ``TSNE.fit_transform``.
        y: Per-sample values handed to ``plt.scatter`` as ``c`` to colour
            the points.
        dataset_name: Text interpolated into each figure title.
        perplexities: Iterable of perplexity values; one figure is produced
            per value. The default is a tuple so it cannot be mutated
            between calls.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    for perp in perplexities:
        # A fixed seed keeps the embedding reproducible between runs.
        tsne = TSNE(n_components=2, perplexity=perp, random_state=42)
        X_embedded = tsne.fit_transform(X)
        plt.figure(figsize=(8, 6))
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap='tab20', s=5)
        plt.colorbar()
        plt.title(f"{dataset_name} t-SNE with Perplexity={perp}")
        plt.show()

# Draw the per-perplexity t-SNE scatter plots for both datasets
plot_tsne(X=X_mnist, y=y_mnist, dataset_name='MNIST')
plot_tsne(X=X_20ng, y=y_20ng, dataset_name='20 Newsgroups')

part B) Run DBSCAN on the t-SNE embedding of MNIST with G=2,3,5 dimensions. This should work much better than DBSCAN on the original MNIST or on PCA-MNIST. You should be able to observe most datapoints "colored", with the colors roughly corresponding to image labels. We got the following confusion matrix with t-SNE into 3 dimensions and some trial-and-error with the DBSCAN parameters:

In [None]:
def run_dbscan_on_tsne(X, y, dims=(2, 3, 5)):
    """Embed ``X`` with t-SNE at several dimensionalities and cluster with DBSCAN.

    For every entry of ``dims`` the data is embedded with t-SNE, clustered
    with ``DBSCAN(eps=3, min_samples=5)``, and shown as a scatter plot of the
    first two embedding coordinates coloured by cluster label.

    Args:
        X: Feature matrix passed to ``TSNE.fit_transform``.
        y: Ground-truth labels. Accepted for interface compatibility; not
            used by this function.
        dims: Iterable of t-SNE output dimensionalities. The default is a
            tuple so it cannot be mutated between calls.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    for dim in dims:
        # scikit-learn's default Barnes-Hut solver only supports fewer than
        # 4 output dimensions and raises ValueError otherwise, so switch to
        # the exact solver for higher-dimensional embeddings.
        method = 'barnes_hut' if dim < 4 else 'exact'
        tsne = TSNE(n_components=dim, perplexity=30, method=method,
                    random_state=42)
        X_embedded = tsne.fit_transform(X)
        clustering = DBSCAN(eps=3, min_samples=5).fit(X_embedded)

        plt.figure(figsize=(8, 6))
        # Only the first two coordinates are drawn, whatever ``dim`` is.
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1],
                    c=clustering.labels_, cmap='tab20', s=5)
        plt.title(f"DBSCAN Clusters on MNIST t-SNE {dim}D")
        plt.colorbar()
        plt.show()

# Cluster the MNIST t-SNE embeddings using the default dimensionalities
run_dbscan_on_tsne(X=X_mnist, y=y_mnist)

**Problem 2: Implement t-SNE dim reduction, run on MNIST Dataset**

In [None]:
def custom_tsne(X, n_samples=1000, n_components=2, perplexity=30):
    """Standardize, PCA-reduce and t-SNE-embed the first ``n_samples`` rows of ``X``.

    NOTE(review): despite the function name and the plot title, the t-SNE
    step is delegated to scikit-learn's ``TSNE``; no t-SNE algorithm is
    implemented here.

    Args:
        X: 2-D feature matrix (anything ``np.asarray`` can convert after
            row-slicing).
        n_samples: Number of leading rows to use.
        n_components: Dimensionality of the t-SNE embedding. The plot shows
            the first two coordinates only.
        perplexity: Perplexity forwarded to ``TSNE``.

    Returns:
        None. The scatter plot is displayed with ``plt.show()``.

    Raises:
        ValueError: If the selected sample is not a non-empty 2-D matrix.
    """
    # Sample data
    X_sample = np.asarray(X[:n_samples], dtype=float)
    if X_sample.ndim != 2 or X_sample.shape[0] == 0:
        raise ValueError(
            "custom_tsne expects a non-empty 2-D feature matrix, got shape "
            f"{X_sample.shape}"
        )

    # Standardize each column. Constant columns (e.g. pixels that are blank
    # in every image) have zero standard deviation; dividing by it would
    # produce NaN and make PCA fail, so their scale is left at 1.
    column_mean = X_sample.mean(axis=0)
    column_std = X_sample.std(axis=0)
    column_std[column_std == 0] = 1.0
    X_sample = (X_sample - column_mean) / column_std

    # PCA to at most 50 dimensions; PCA cannot return more components than
    # min(n_rows, n_columns), so clamp for small inputs.
    n_pca = min(50, X_sample.shape[0], X_sample.shape[1])
    pca = PCA(n_components=n_pca, random_state=42)
    X_pca = pca.fit_transform(X_sample)

    # Run t-SNE. The default Barnes-Hut solver only supports fewer than 4
    # output dimensions, so use the exact solver beyond that.
    method = 'barnes_hut' if n_components < 4 else 'exact'
    tsne = TSNE(n_components=n_components, perplexity=perplexity,
                method=method, random_state=42)
    X_embedded = tsne.fit_transform(X_pca)

    plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=5)
    plt.title('Custom t-SNE implementation on MNIST')
    plt.show()

# Embed the MNIST sample using the default settings
custom_tsne(X=X_mnist)

**Problem 3: Pairwise Feature selection for text**

On 20NG, run feature selection using scikit-learn's built-in "chi2" criterion to select the top 200 features. Rerun a classification task and compare performance with HW3A-PB1. Then repeat the whole pipeline with the "mutual-information" criterion.

In [None]:
# Function for feature selection and classification
def feature_selection_and_classification(X, y, method='chi2', k=200,
                                         test_size=0.25, random_state=42):
    """Select the top-``k`` features, train a classifier and report held-out results.

    The data is split into train and test parts. The feature selector and
    the logistic-regression classifier are fitted on the training part only,
    and the printed classification report is computed on the test part, so
    the numbers are not inflated by scoring on data the model was fitted on.

    Args:
        X: Non-negative feature matrix (required by ``chi2``), one row per
            sample.
        y: Class labels, one per row of ``X``.
        method: ``'chi2'`` or ``'mutual_info'`` (``'mutual_info_classif'``
            is accepted as an alias).
        k: Number of features to keep; clamped to the number of columns.
        test_size: Fraction (or count) of samples held out for evaluation,
            forwarded to ``train_test_split``.
        random_state: Seed for the split and for mutual-information scoring.

    Returns:
        None. The classification report is printed.

    Raises:
        ValueError: If ``method`` is not recognised or ``k`` is less than 1.
    """
    # Resolve the scoring function explicitly; previously any unrecognised
    # name (including typos) silently selected mutual information.
    if method == 'chi2':
        score_func = chi2
    elif method in ('mutual_info', 'mutual_info_classif'):
        # Seeded so the feature ranking is reproducible between runs.
        score_func = partial(mutual_info_classif, random_state=random_state)
    else:
        raise ValueError(
            f"Unknown feature selection method {method!r}; "
            "expected 'chi2' or 'mutual_info'"
        )
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the selector on the training split only to avoid label leakage.
    n_features = np.shape(X_train)[1]
    selector = SelectKBest(score_func=score_func, k=min(k, n_features))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    clf = LogisticRegression(max_iter=500)
    clf.fit(X_train_selected, y_train)
    y_pred = clf.predict(X_test_selected)

    print(f"Classification Report using {method} feature selection:")
    print(classification_report(y_test, y_pred))

# Bag-of-words matrix and labels for the first 2000 training documents
X_text = vectorizer.fit_transform(newsgroups.data[:2000]).toarray()
y_text = newsgroups.target[:2000]

# Pipeline run with chi-squared feature ranking
feature_selection_and_classification(X_text, y_text, 'chi2')

# Same pipeline with mutual-information feature ranking
feature_selection_and_classification(X_text, y_text, 'mutual_info')

**Problem 4: L1 feature selection on text**

Run a strongly L1-regularized regression (library) on 20NG, and select 200 features (words) based on the absolute value of the regression coefficients. Then reconstruct the dataset with only these features, and rerun any of the classification tasks.

In [None]:
# L1-regularized logistic regression for feature selection
def l1_feature_selection(X, y, top_features=200, C=1.0, test_size=0.25,
                         random_state=42):
    """Pick features by L1-model coefficient magnitude and report held-out results.

    An L1-penalised logistic regression is fitted on a training split. The
    ``top_features`` columns with the largest summed absolute coefficients
    are kept, a fresh logistic regression is trained on those columns, and
    its classification report on the held-out split is printed.

    Args:
        X: Feature matrix supporting ``X[:, indices]`` column indexing.
        y: Class labels, one per row of ``X``.
        top_features: Number of features to keep (at least 1). If the L1
            model leaves fewer non-zero coefficients than this, the
            remainder is filled with zero-importance features.
        C: Inverse regularization strength of the L1 model; smaller values
            regularize more strongly.
        test_size: Fraction (or count) of samples held out for evaluation,
            forwarded to ``train_test_split``.
        random_state: Seed for the split and the liblinear solver.

    Returns:
        None. The classification report is printed.

    Raises:
        ValueError: If ``top_features`` is less than 1.
    """
    # Guard the tail slice below: ``[-0:]`` would keep every feature and a
    # negative count would select an unrelated subset.
    if top_features < 1:
        raise ValueError(f"top_features must be at least 1, got {top_features}")

    # Hold out data so the report is not computed on the samples used for
    # selecting features and fitting the classifier.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf = LogisticRegression(penalty='l1', solver='liblinear', C=C,
                             max_iter=500, random_state=random_state)
    clf.fit(X_train, y_train)

    # Select top features based on coefficient magnitude, summed over the
    # per-class coefficient rows.
    importance = np.abs(clf.coef_).sum(axis=0)
    top_feature_indices = np.argsort(importance)[-top_features:]

    X_train_selected = X_train[:, top_feature_indices]
    X_test_selected = X_test[:, top_feature_indices]

    # Re-run classification on the reduced feature set
    clf_selected = LogisticRegression(max_iter=500)
    clf_selected.fit(X_train_selected, y_train)
    y_pred = clf_selected.predict(X_test_selected)

    print("Classification report after L1 feature selection:")
    print(classification_report(y_test, y_pred))

# Run L1-based selection on the 20NG bag-of-words data
l1_feature_selection(X=X_text, y=y_text)