# Manifold Learning Techniques for NLP Embeddings
#### This notebook contains code to visualize BERT embeddings in 3D using manifold learning techniques which, in contrast to linear projections of the data, look at the non-linear structure of the embeddings.


##### More on: [Scikit-learn Manifold Learning Techniques](https://scikit-learn.org/1.5/modules/manifold.html)

In [None]:
# --- Data handling and model loading ---
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import warnings
# Silence FutureWarning messages for the whole notebook.
# NOTE(review): this also hides library deprecation notices — confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

from tqdm import tqdm
import numpy as np

# --- Plotting and dimensionality-reduction estimators ---
import plotly.express as px
from sklearn.decomposition import TruncatedSVD
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.manifold import (
    Isomap,
    LocallyLinearEmbedding,
    MDS,
    SpectralEmbedding,
    TSNE,
)
from sklearn.pipeline import make_pipeline
from sklearn.random_projection import SparseRandomProjection

# --- Plotly rendering setup: use the 'notebook' renderer and initialise offline notebook mode ---
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

In [2]:
# Read the NER reference CSV and keep only its first 109,894 rows.
df = pd.read_csv("./ner_datasetreference.csv", encoding="unicode_escape")
df = df.iloc[:109894]

# Distinct values of the POS column, as a 1-D array.
unique_POS = pd.unique(df["POS"].to_numpy())

In [None]:
# Display the distinct POS tags found in the dataset slice.
unique_POS

In [4]:
# Forward-fill the sentence id so that every row carries a "Sentence #" value.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained in-place mutation, which pandas deprecates
# and which does not update `df` under Copy-on-Write. `.ffill()` also replaces
# the deprecated `fillna(method='ffill')` spelling.
df['Sentence #'] = df['Sentence #'].ffill()

# Attach to each row the full text of its sentence: all non-null words of the
# same "Sentence #" group joined by single spaces.
df['Complete Sentence'] = df.groupby('Sentence #')['Word'].transform(lambda x: ' '.join(x.dropna()))

# Drop rows that have no word and renumber the index from 0.
df = df[df['Word'].notna()].reset_index(drop=True)

In [None]:
# Load the tokenizer and the token-classification model from the same checkpoint.
model_name = "vblagoje/bert-english-uncased-finetuned-pos"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    output_hidden_states=True,  # hidden states are read later to extract embeddings
)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

In [6]:
# Distinct sentence strings that will be fed to the model.
sentences = df["Complete Sentence"].unique()

# Map from label index (as a string, "0".."16") to POS tag name.
id2lbel = {
    str(index): tag
    for index, tag in enumerate(
        (
            "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
            "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
        )
    )
}

# Accumulates one [token, POS tag, embedding] record per token.
output_list = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every record to the results of a previous run.
output_list = []

for sentence in tqdm(sentences):
    # Tokenize and wrap the sentence with the [CLS] / [SEP] special tokens.
    tokens = tokenizer.tokenize(sentence)
    tokenized_text = ["[CLS]"] + tokens + ["[SEP]"]

    # Encode the tokens as a batch containing this single sentence.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Pass the input through the model without tracking gradients.
    with torch.no_grad():
        outputs = model(tokens_tensor)

    # Predicted tag index for every position (argmax over the label axis).
    predicted_tags = torch.argmax(outputs.logits, dim=2).squeeze(0)

    # Last-layer hidden states of the only sequence in the batch.
    last_layer_hidden_states = outputs.hidden_states[-1][0]

    # Drop the first and last positions, which belong to [CLS] and [SEP],
    # so tags and embeddings line up one-to-one with `tokens`.
    predicted_tags = predicted_tags[1:-1]
    embeddings = last_layer_hidden_states[1:-1]

    # One record per token: [token text, POS tag name, embedding as a list].
    for token, tag_idx, embedding in zip(tokens, predicted_tags, embeddings):
        output_list.append([token, id2lbel[str(tag_idx.item())], embedding.tolist()])

In [None]:
# One row per token: its text, its predicted POS tag and its embedding vector.
output_dataframe = pd.DataFrame(
    output_list,
    columns=["Token", "POS", "Embedding"],
)
output_dataframe

In [None]:
# Reduce the data before running the (slow) manifold learners.
# Alternative: keep only the first rows instead of filtering by token.
# smaller_chunk_df = output_dataframe[:10000]

# Keep only tokens that are potentially prone to bias.
# NOTE(review): the filter below compares against individual tokens produced by
# the tokenizer, so multi-word entries ("middle eastern", "middle class",
# "prime minister", "mental health") can never match — confirm whether they
# should be split or handled separately. "lawyer" is listed twice; the set()
# conversion below removes the duplicate.
bias_related_tokens = [
    # Race & Ethnicity
    "black", "white", "asian", "hispanic", "latino", "latina", "african", "european", "indian", "native",
    "indigenous", "middle eastern", "arab", "jewish", "gypsy", "caucasian",

    # Religion
    "muslim", "christian", "jew", "hindu", "buddhist", "sikh", "atheist", "agnostic", "catholic", "protestant",
    "orthodox", "islamic", "secular", "cleric", "rabbi", "imam", "monk", "priest", "pope", "missionary",

    # Crime & Violence
    "terrorism", "terrorist", "extremist", "radical", "bomb", "explosion", "attack", "war", "violence",
    "crime", "criminal", "thief", "robbery", "murder", "homicide", "assault", "gang", "drug", "cartel",
    "weapon", "gun", "knife", "arrest", "prison", "jail", "court", "lawyer", "judge", "felon",

    # Wealth & Social Class
    "rich", "poor", "wealthy", "middle class", "homeless", "privileged", "underprivileged", "elite",
    "billionaire", "millionaire", "poverty", "welfare", "donor", "charity",

    # Gender & Sexuality
    "man", "woman", "boy", "girl", "male", "female", "nonbinary", "transgender", "gay", "lesbian", "bisexual",
    "straight", "queer", "feminist", "misogyny", "patriarchy", "matriarchy",

    # Professions & Status
    "doctor", "nurse", "scientist", "engineer", "teacher", "professor", "researcher", "ceo", "manager",
    "politician", "president", "prime minister", "soldier", "activist", "journalist", "lawyer",

    # Immigration & Nationality
    "refugee", "immigrant", "migrant", "illegal", "citizen", "foreigner", "passport", "asylum", "visa",
    "border", "deportation",

    # Political Affiliation & Ideology
    "democrat", "republican", "liberal", "conservative", "socialist", "communist", "capitalist", "fascist",
    "anarchist", "dictator", "monarchy", "feminism", "nationalist", "globalist",

    # Other Societal Factors
    "education", "intelligence", "genius", "dumb", "success", "failure", "addiction", "mental health",
    "therapy", "disability", "autism", "depression", "anxiety", "bipolar", "schizophrenia"
]

# Case-insensitive membership test: lower-case both the word list and the tokens.
bias_related_tokens = set(map(str.lower, bias_related_tokens))
smaller_chunk_df = output_dataframe[output_dataframe['Token'].str.lower().isin(bias_related_tokens)]

# Last expression of the cell, so the notebook renders it as a table
# (instead of the plain-text dump produced by print()).
smaller_chunk_df.head()


In [10]:
# Per-token inputs shared by every projection below:
# token text (hover labels), embedding matrix X, and POS labels y (colours).
token = smaller_chunk_df["Token"]
y = smaller_chunk_df["POS"].to_numpy()
X = np.array(smaller_chunk_df["Embedding"].values.tolist())

# An empty selection yields a 1-D array; fail with a clear message instead of
# an opaque tuple-unpacking error.
if X.ndim != 2:
    raise ValueError(
        "No embeddings selected: smaller_chunk_df is empty or its 'Embedding' "
        "entries do not form a 2-D matrix."
    )
n_samples, n_features = X.shape

# Neighbourhood size for the neighbour-based manifold learners (Isomap, LLE).
# Capped below the number of samples because those estimators reject
# n_neighbors >= n_samples; with enough data this is the original value, 60.
n_neighbors = max(1, min(60, n_samples - 1))

# Token frequencies in the selection. Kept as the last expression so the table
# is actually displayed (mid-cell, its result was silently discarded).
smaller_chunk_df["Token"].value_counts().reset_index()

In [11]:
import time

#### Random Projection

In [None]:
# Time a 3-component sparse random projection of the embedding matrix.
start_time = time.time()

sp = SparseRandomProjection(random_state=42, n_components=3)
X_sp = sp.fit_transform(X)

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [13]:
# Plot data for the random projection: three coordinates plus colour and hover labels.
df_random = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_sp[:, 0], X_sp[:, 1], X_sp[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the random projection, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_random,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=600,
    size_max=2,
    title="Random Projection",
)
fig.show()

#### Isomap Embeddings

In [None]:
# Time a 3-component Isomap embedding using the shared neighbourhood size.
start_time = time.time()

iso = Isomap(n_components=3, n_neighbors=n_neighbors)
X_iso = iso.fit_transform(X)

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [19]:
# Plot data for the Isomap embedding: three coordinates plus colour and hover labels.
df_1 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_iso[:, 0], X_iso[:, 1], X_iso[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the Isomap embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_1,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="ISOmap",
)
fig.show()

#### Standard LLE Embeddings

In [None]:
# Time a 3-component standard Locally Linear Embedding of X.
start_time = time.time()
lle_s = LocallyLinearEmbedding(
        n_neighbors=n_neighbors,
        n_components=3,
        method="standard",
        # Seed the estimator so repeated runs give the same embedding; the seed
        # is used when the ARPACK eigen-solver is selected.
        random_state=42,
    )
X_lle_s = lle_s.fit_transform(X)
elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [22]:
# Plot data for the standard LLE embedding: three coordinates plus colour and hover labels.
df_2 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_lle_s[:, 0], X_lle_s[:, 1], X_lle_s[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the standard LLE embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_2,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="Standard LLE embedding",
)
fig.show()

#### Modified LLE Embeddings

In [None]:
# Time a 3-component modified Locally Linear Embedding of X.
start_time = time.time()
lle_m = LocallyLinearEmbedding(
        n_neighbors=n_neighbors,
        n_components=3,
        method="modified",
        # Seed the estimator so repeated runs give the same embedding; the seed
        # is used when the ARPACK eigen-solver is selected.
        random_state=42,
    )
X_lle_m = lle_m.fit_transform(X)
elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [25]:
# Plot data for the modified LLE embedding: three coordinates plus colour and hover labels.
df_3 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_lle_m[:, 0], X_lle_m[:, 1], X_lle_m[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the modified LLE embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_3,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="Modified LLE embedding",
)
fig.show()

#### MDS Embeddings

In [None]:
# Time a 3-component multidimensional scaling (MDS) embedding of X.
start_time = time.time()
mds = MDS(
        n_components=3,
        normalized_stress="auto",
        # MDS starts from a random configuration; without a seed the embedding
        # (and therefore the plot) changes on every run.
        random_state=42,
    )
X_mds = mds.fit_transform(X)
elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [28]:
# Plot data for the MDS embedding: three coordinates plus colour and hover labels.
df_4 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_mds[:, 0], X_mds[:, 1], X_mds[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the MDS embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_4,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="MDS embedding",
)
fig.show()

#### Random Trees embedding

In [None]:
# Time a random-trees embedding of X reduced to 3 components with truncated SVD.
start_time = time.time()
rte = make_pipeline(
        RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
        # Seed the SVD step as well: its default solver is randomized, so with
        # only the trees seeded the final coordinates still vary between runs.
        TruncatedSVD(n_components=3, random_state=0),
    )
X_rte = rte.fit_transform(X)
elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [31]:
# Plot data for the random-trees embedding: three coordinates plus colour and hover labels.
df_5 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_rte[:, 0], X_rte[:, 1], X_rte[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the random-trees embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_5,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="Random Trees embedding",
)
fig.show()

#### Spectral embedding

In [None]:
# Time a 3-component spectral embedding of X (ARPACK eigen-solver, fixed seed).
start_time = time.time()

spectral = SpectralEmbedding(eigen_solver="arpack", random_state=0, n_components=3)
X_spectral = spectral.fit_transform(X)

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [34]:
# Plot data for the spectral embedding: three coordinates plus colour and hover labels.
df_6 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_spectral[:, 0], X_spectral[:, 1], X_spectral[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the spectral embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_6,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="Spectral embedding",
)
fig.show()

#### T-SNE

In [None]:
# Time a 3-component t-SNE embedding of X with a fixed seed.
start_time = time.time()

tsne = TSNE(random_state=42, n_components=3)
X_tsne = tsne.fit_transform(X)

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [37]:
# Plot data for the t-SNE embedding: three coordinates plus colour and hover labels.
df_7 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension", "colors", "word"),
        (X_tsne[:, 0], X_tsne[:, 1], X_tsne[:, 2], y, token),
    )
)

In [None]:
# 3-D scatter of the t-SNE embedding, coloured by POS tag, token shown on hover.
# NOTE(review): per the Plotly Express docs `size_max` only applies together
# with a `size` argument — confirm whether a marker size was meant to be set.
fig = px.scatter_3d(
    df_7,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    size_max=2,
    title="T-SNE",
)
fig.show()