## Install Dependencies 

In [1]:
%pip install torch transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Import Dependencies 

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Define model and tokenizer

In [3]:
# Checkpoint identifier shared by the model and the tokenizer, so both are
# always loaded from the same pretrained checkpoint.
model_name = "bert-base-uncased"

# Load the pretrained model and its matching tokenizer from that checkpoint.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Sample Sentences and Tokenization

In [4]:
# Example sentences to embed; each one is tokenized and encoded separately
# further down.
sample_text = [
    "Sometimes I get lost in thought and have to write my ideas down in order to keep a level of creativity within my work",
    "Music is such a fun and expressive medium through which I can communicate abstract ideas",
    "Deep learning was a very difficult class but I learned a lot",
    "Traveling can be fun if you know who to travel with",
    "My favorite food is pizza and I'm excited because this weekend I get to eat tons of it",
]

In [5]:
# Tokenize every sentence with its own tokenizer call, asking for PyTorch
# tensors ("pt") and truncating anything longer than 512 tokens. The result
# is a list with one tokenizer output per entry of `sample_text`.
inputs = [
    tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    for sentence in sample_text
]

## Generate Embeddings

In [6]:
# Run the model once per tokenized sentence, with gradient tracking disabled
# for the whole pass.
with torch.no_grad():
    outputs = [model(**encoded) for encoded in inputs]

# Keep element 0 of every model output: the last hidden states
# (token-level embeddings).
embeddings = [result[0] for result in outputs]

# Embeddings of the first sample; shown as the cell's output.
embeddings1 = embeddings[0]
embeddings1

tensor([[[ 0.3864,  0.2770, -0.3731,  ..., -0.1788,  0.0870,  0.2748],
         [ 0.0174,  0.3331, -0.0200,  ..., -0.2398,  0.5042,  0.1241],
         [ 0.6898,  0.1359, -0.1013,  ..., -0.3224,  0.2585, -0.0607],
         ...,
         [ 0.5960,  0.5854,  0.3574,  ..., -0.4544,  0.0283, -0.1506],
         [ 0.5079,  0.6034, -0.9316,  ..., -0.1446,  0.4718, -0.4244],
         [ 0.6415,  0.0545, -0.4683,  ...,  0.0541, -0.4921, -0.1414]]])

In [7]:
%pip install umap-learn matplotlib


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


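## Prepare Your Data
UMAP operates on NumPy arrays, so the PyTorch tensors must be converted first. Starting from `embeddings` (the list of per-sentence tensors produced above), take the first-token ([CLS]) embedding of each sentence and stack the results into a single NumPy array: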

In [8]:
# NOTE(review): this cell is entirely commented out, so `embeddings_combined`
# is never defined when the notebook runs. Uncommented, it would take index 0
# along the second axis of each tensor in `embeddings` (the [CLS] position),
# stack those slices vertically and convert the result to a NumPy array --
# presumably one row per sentence; confirm the shape when re-enabling.

# import numpy as np

# cls_embeddings = [e[:, 0, :]
#                   for e in embeddings]  # Extracting the [CLS] token embedding
# embeddings_combined = torch.vstack(cls_embeddings).numpy()

## Apply UMAP:
UMAP will reduce your high-dimensional embeddings to 2D.

In [None]:
# NOTE(review): this cell is entirely commented out and has never been
# executed. It depends on `embeddings_combined`, which is only defined by the
# (also commented-out) data-preparation cell. `n_neighbors` is capped at
# len(embeddings_combined) - 1 -- presumably so it stays below the number of
# samples on this tiny dataset; confirm when re-enabling.

# import umap

# reducer = umap.UMAP(n_neighbors=min(5, len(embeddings_combined)-1))

# embeddings_2d = reducer.fit_transform(embeddings_combined)

## Plotting 

In [None]:
# NOTE(review): this cell is entirely commented out and has never been
# executed. It depends on `embeddings_2d` from the (also commented-out) UMAP
# cell and on `sample_text`. Uncommented, it would scatter the two columns of
# `embeddings_2d` and annotate point i with the full text of sample_text[i].

# import matplotlib.pyplot as plt

# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])

# for i, text in enumerate(sample_text):
#     plt.annotate(text, (embeddings_2d[i, 0], embeddings_2d[i, 1]))

# plt.title('2D UMAP Projection of Embeddings')
# plt.xlabel('UMAP Dimension 1')
# plt.ylabel('UMAP Dimension 2')
# plt.show()