# Manifold Learning Techniques for NLP Embeddings
#### This notebook contains code to visualize BERT embeddings in 3D using manifold learning techniques, which, in contrast to linear projections of the data, capture non-linear structure in the embeddings.


##### More on: [Scikit-learn Manifold Learning Techniques](https://scikit-learn.org/1.5/modules/manifold.html)

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from tqdm import tqdm
import numpy as np

import plotly.express as px
from sklearn.decomposition import TruncatedSVD
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.manifold import (
    Isomap,
    LocallyLinearEmbedding,
    MDS,
    SpectralEmbedding,
    TSNE,
)
from sklearn.pipeline import make_pipeline
from sklearn.random_projection import SparseRandomProjection

import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

In [2]:
# Read the annotated corpus and keep only its first 109,894 rows to bound
# the processing time of the later cells.
df = pd.read_csv("./ner_datasetreference.csv", encoding= 'unicode_escape')
# .copy() detaches the truncated frame from the full one, so columns added to
# `df` later are written to an independent object rather than to a slice.
df = df.iloc[:109894].copy()
# Distinct values of the POS column in the retained rows.
unique_POS = pd.unique(df[['POS']].values.ravel())

In [3]:
# Show the distinct POS tags found in the retained rows.
unique_POS

array(['NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC',
       'JJ', '.', 'VBD', 'WP', '``', 'CD', 'PRP', 'VBZ', 'POS', 'VBG',
       'RB', ',', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', ':', 'JJS', 'WP$',
       'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'LRB', 'RRB', '$', 'RBR', ';',
       'UH'], dtype=object)

In [4]:
# Work on an independent copy so the column assignments below never write
# into a slice of another frame.
df = df.copy()

# Only some rows carry a sentence id; propagate it forward to the rows that
# follow. `Series.ffill()` replaces the deprecated
# `fillna(method='ffill', inplace=True)` on a column selection, which does not
# reliably update the parent frame.
df['Sentence #'] = df['Sentence #'].ffill()

# Rebuild the full sentence text for every row by joining the non-null words
# that share its sentence id.
df['Complete Sentence'] = df.groupby('Sentence #')['Word'].transform(
    lambda words: ' '.join(words.dropna())
)

# Drop rows without a word and renumber the index from zero.
df = df[df['Word'].notna()].reset_index(drop=True)

In [5]:
# Fetch the POS-tagging checkpoint together with its matching tokenizer.
model_name = "vblagoje/bert-english-uncased-finetuned-pos"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    output_hidden_states=True,  # the embeddings are read from outputs.hidden_states later
)
# Put the network in evaluation mode; as the cell's last expression this
# also echoes the module structure.
model.eval()

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [6]:
# Distinct sentence strings reconstructed in the previous step.
sentences = df["Complete Sentence"].unique()

# Maps a predicted class index (as a string key) to its POS tag name.
# NOTE(review): presumably mirrors the checkpoint's own label mapping -
# confirm against the model config.
id2lbel = {
    str(class_idx): tag_name
    for class_idx, tag_name in enumerate(
        (
            "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
            "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
        )
    )
}

# Accumulates one [token, POS tag, embedding] record per token.
output_list = list()

In [7]:
# Start from an empty accumulator so that re-running this cell does not
# append a second copy of every token record.
output_list = []

for sentence in tqdm(sentences):
    # Tokenise the sentence and add BERT's boundary tokens by hand.
    tokens = tokenizer.tokenize(sentence)
    tokenized_text = ["[CLS]"] + tokens + ["[SEP]"]

    # Encode the tokens as a batch containing a single sequence.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(tokens_tensor)

    # Highest-scoring class per position, with the batch dimension removed.
    predicted_tags = torch.argmax(outputs.logits, dim=2).squeeze(0)

    # Last entry of hidden_states, first (only) sequence of the batch.
    last_layer_hidden_states = outputs.hidden_states[-1][0]

    # Drop the first and last positions, which belong to [CLS] and [SEP].
    tokens = tokenized_text[1:-1]
    predicted_tags = predicted_tags[1:-1]
    embeddings = last_layer_hidden_states[1:-1]

    # Convert each tensor to plain Python values once per sentence instead of
    # calling .item()/.tolist() once per token.
    for token, tag_idx, embedding in zip(tokens, predicted_tags.tolist(), embeddings.tolist()):
        output_list.append([token, id2lbel[str(tag_idx)], embedding])

100%|██████████| 4996/4996 [03:38<00:00, 22.84it/s]


In [8]:
# One row per token: its text, its predicted POS tag and its embedding vector.
output_dataframe = pd.DataFrame(
    data=output_list,
    columns=['Token', 'POS', 'Embedding'],
)
output_dataframe

Unnamed: 0,Token,POS,Embedding
0,thousands,NOUN,"[0.648224413394928, 0.07481403648853302, 0.013..."
1,of,ADP,"[-0.3064924478530884, -0.1777660846710205, 0.7..."
2,demonstrators,NOUN,"[0.2297319918870926, 0.037552088499069214, -0...."
3,have,AUX,"[1.1666436195373535, 0.008594166487455368, -0...."
4,marched,VERB,"[-0.08344964683055878, 0.19849278032779694, -0..."
...,...,...,...
123338,town,NOUN,"[0.32227596640586853, -0.1483808010816574, -0...."
123339,of,ADP,"[-0.5095762014389038, -0.16979153454303741, 0...."
123340,bai,PROPN,"[0.5318405628204346, 0.23350577056407928, -0.3..."
123341,##ji,PROPN,"[0.2193199247121811, -0.014787234365940094, -0..."


In [12]:
# Reduce the data before running the manifold learners, which are slow on the
# full token table. Instead of a plain row cap (e.g. the first 10,000 rows),
# keep only tokens that are potentially prone to bias.
#
# NOTE(review): entries containing a space ("middle eastern", "middle class",
# "prime minister", "mental health") are compared against single tokens, so
# they presumably never match - confirm whether phrase matching was intended.
bias_related_tokens = {
    term.lower()
    for term in (
        # Race & Ethnicity
        "black", "white", "asian", "hispanic", "latino", "latina", "african", "european", "indian", "native",
        "indigenous", "middle eastern", "arab", "jewish", "gypsy", "caucasian",

        # Religion
        "muslim", "christian", "jew", "hindu", "buddhist", "sikh", "atheist", "agnostic", "catholic", "protestant",
        "orthodox", "islamic", "secular", "cleric", "rabbi", "imam", "monk", "priest", "pope", "missionary",

        # Crime & Violence
        "terrorism", "terrorist", "extremist", "radical", "bomb", "explosion", "attack", "war", "violence",
        "crime", "criminal", "thief", "robbery", "murder", "homicide", "assault", "gang", "drug", "cartel",
        "weapon", "gun", "knife", "arrest", "prison", "jail", "court", "lawyer", "judge", "felon",

        # Wealth & Social Class
        "rich", "poor", "wealthy", "middle class", "homeless", "privileged", "underprivileged", "elite",
        "billionaire", "millionaire", "poverty", "welfare", "donor", "charity",

        # Gender & Sexuality
        "man", "woman", "boy", "girl", "male", "female", "nonbinary", "transgender", "gay", "lesbian", "bisexual",
        "straight", "queer", "feminist", "misogyny", "patriarchy", "matriarchy",

        # Professions & Status
        "doctor", "nurse", "scientist", "engineer", "teacher", "professor", "researcher", "ceo", "manager",
        "politician", "president", "prime minister", "soldier", "activist", "journalist", "lawyer",

        # Immigration & Nationality
        "refugee", "immigrant", "migrant", "illegal", "citizen", "foreigner", "passport", "asylum", "visa",
        "border", "deportation",

        # Political Affiliation & Ideology
        "democrat", "republican", "liberal", "conservative", "socialist", "communist", "capitalist", "fascist",
        "anarchist", "dictator", "monarchy", "feminism", "nationalist", "globalist",

        # Other Societal Factors
        "education", "intelligence", "genius", "dumb", "success", "failure", "addiction", "mental health",
        "therapy", "disability", "autism", "depression", "anxiety", "bipolar", "schizophrenia",
    )
}

# Keep the rows whose lower-cased token appears in the vocabulary above.
smaller_chunk_df = output_dataframe.loc[
    output_dataframe['Token'].str.lower().isin(bias_related_tokens)
]
print(smaller_chunk_df.head())


         Token    POS                                          Embedding
10         war   NOUN  [0.5291779041290283, 0.2582028806209564, 0.203...
46   terrorist   NOUN  [0.30027148127555847, 0.6429552435874939, -0.9...
152        war   NOUN  [0.6740910410881042, -0.0490100160241127, -0.3...
255   european  PROPN  [0.8492451310157776, 0.5001541376113892, -0.24...
300  president  PROPN  [0.25401070713996887, 0.5593031048774719, -0.9...


In [13]:
# Frequency of each retained token. The result is kept in a variable and
# echoed at the end of the cell; as a bare statement on the first line it was
# computed and then thrown away.
token_counts = smaller_chunk_df["Token"].value_counts().reset_index()

# An empty selection would make the shape unpacking below fail with an
# unhelpful error, so fail early with a clear message instead.
assert not smaller_chunk_df.empty, "no tokens matched the filter; nothing to embed"

# Hover labels, feature matrix and colour labels for the plots.
token = smaller_chunk_df["Token"]
X = np.array(smaller_chunk_df["Embedding"].values.tolist())
y = smaller_chunk_df["POS"].to_numpy()
n_samples, n_features = X.shape

n_neighbors = 60 # hyperparameter for manifold learning techniques
# The neighbour-graph based learners need more samples than neighbours.
assert n_samples > n_neighbors, (
    f"n_neighbors={n_neighbors} requires more than {n_neighbors} samples, got {n_samples}"
)

token_counts

In [14]:
import time

#### Random Projection

In [15]:
# Time a sparse random projection of the embeddings down to 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can be too coarse for a fit
# this short.
start_time = time.perf_counter()

## Random projection embedding
sp = SparseRandomProjection(
        n_components=3, random_state=42
    )
X_sp = sp.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.02000141143798828 seconds


In [16]:
# Column-wise payload for the 3D scatter plot: the three projected
# coordinates plus, per point, the POS label (colour) and the token (hover).
df_random = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_sp[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [17]:
# Interactive 3D scatter of the random-projection coordinates, coloured by
# the "colors" entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_random,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=600,
    title="Random Projection",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### Isomap Embeddings

In [18]:
# Time an Isomap embedding of the token vectors into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## Isomap embedding
iso = Isomap(n_neighbors=n_neighbors, n_components=3)
X_iso = iso.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 4.743452548980713 seconds


In [19]:
# Column-wise payload for the 3D scatter plot: the three Isomap coordinates
# plus, per point, the POS label (colour) and the token (hover).
df_1 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_iso[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [20]:
# Interactive 3D scatter of the Isomap coordinates, coloured by the "colors"
# entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_1,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="ISOmap",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### Standard LLE Embeddings

In [21]:
# Time a standard locally linear embedding into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## Standard LLE embedding
# random_state seeds the eigen-solver when ARPACK is selected, so repeated
# runs produce the same embedding, as with the other seeded estimators.
lle_s = LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=3, method="standard", random_state=42
    )
X_lle_s = lle_s.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 1.10599946975708 seconds


In [22]:
# Column-wise payload for the 3D scatter plot: the three standard-LLE
# coordinates plus, per point, the POS label (colour) and the token (hover).
df_2 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_lle_s[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [23]:
# Interactive 3D scatter of the standard-LLE coordinates, coloured by the
# "colors" entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_2,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="Standard LLE embedding",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### Modified LLE Embeddings

In [24]:
# Time a modified locally linear embedding into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## Modified LLE embedding
# random_state seeds the eigen-solver when ARPACK is selected, so repeated
# runs produce the same embedding, as with the other seeded estimators.
lle_m = LocallyLinearEmbedding(
        n_neighbors=n_neighbors, n_components=3, method="modified", random_state=42
    )
X_lle_m = lle_m.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 2.325000047683716 seconds


In [25]:
# Column-wise payload for the 3D scatter plot: the three modified-LLE
# coordinates plus, per point, the POS label (colour) and the token (hover).
df_3 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_lle_m[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [26]:
# Interactive 3D scatter of the modified-LLE coordinates, coloured by the
# "colors" entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_3,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="Modified LLE embedding",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### MDS Embeddings

In [27]:
# Time a multidimensional-scaling embedding into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## MDS embedding
# MDS starts from a random configuration; a fixed random_state makes the
# embedding (and therefore the plot) repeatable, as with the other seeded
# estimators.
mds = MDS(
        n_components=3, normalized_stress="auto", random_state=42
    )
X_mds = mds.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 126.1089940071106 seconds


In [28]:
# Column-wise payload for the 3D scatter plot: the three MDS coordinates
# plus, per point, the POS label (colour) and the token (hover).
df_4 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_mds[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [29]:
# Interactive 3D scatter of the MDS coordinates, coloured by the "colors"
# entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_4,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="MDS embedding",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### Random Trees embedding

In [30]:
# Time a random-trees embedding followed by truncated SVD down to 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## Random Trees embedding
# Seed the SVD step as well as the forest: TruncatedSVD's default solver is
# randomized, so without a random_state the pipeline output can vary between
# runs even though the forest is seeded.
rte = make_pipeline(
        RandomTreesEmbedding(n_estimators=200, max_depth=5, random_state=0),
        TruncatedSVD(n_components=3, random_state=0),
    )
X_rte = rte.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.41901302337646484 seconds


In [31]:
# Column-wise payload for the 3D scatter plot: the three random-trees
# coordinates plus, per point, the POS label (colour) and the token (hover).
df_5 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_rte[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [32]:
# Interactive 3D scatter of the random-trees coordinates, coloured by the
# "colors" entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_5,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="Random Trees embedding",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### Spectral embedding

In [33]:
# Time a spectral embedding of the token vectors into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## Spectral embedding
# NOTE(review): n_neighbors is not passed here, so the estimator uses its own
# default neighbourhood size - confirm that this is intended.
spectral = SpectralEmbedding(
        n_components=3, random_state=0, eigen_solver="arpack"
    )
X_spectral = spectral.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 2.052999496459961 seconds


In [34]:
# Column-wise payload for the 3D scatter plot: the three spectral
# coordinates plus, per point, the POS label (colour) and the token (hover).
df_6 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_spectral[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [35]:
# Interactive 3D scatter of the spectral coordinates, coloured by the
# "colors" entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_6,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="Spectral embedding",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()

#### T-SNE

In [36]:
# Time a t-SNE embedding of the token vectors into 3 components.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()
## T-SNE
tsne = TSNE(n_components=3, random_state=42)
X_tsne = tsne.fit_transform(X)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 59.79841470718384 seconds


In [37]:
# Column-wise payload for the 3D scatter plot: the three t-SNE coordinates
# plus, per point, the POS label (colour) and the token (hover).
df_7 = dict(
    zip(
        ("First dimension", "Second dimension", "Third dimension"),
        (X_tsne[:, axis] for axis in range(3)),
    ),
    colors=y,
    word=token,
)

In [38]:
# Interactive 3D scatter of the t-SNE coordinates, coloured by the "colors"
# entry and showing the "word" entry on hover.
fig = px.scatter_3d(
    df_7,
    x="First dimension",
    y="Second dimension",
    z="Third dimension",
    color="colors",
    hover_name="word",
    height=800,
    title="T-SNE",
)
# `size_max` only takes effect when a `size=` column is mapped, so it cannot
# shrink the markers here; set the marker size on the traces directly.
fig.update_traces(marker_size=2)
fig.show()