In [1]:
import os
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

from tqdm.notebook import tqdm

DATA_DIR = "data" # This may need to be changed on different machines
# Make sure we're in the correct directory and make sure the data directory exists.
# If DATA_DIR is not visible from the current working directory, fall back to the
# directory three levels up ("../../..").
# NOTE(review): the previous comment said "two directories" (notebook in src/nb) while the
# code climbed three; the three-level climb is kept as-is — confirm the notebook's depth.
if not os.path.exists(DATA_DIR):
    _candidate_root = os.path.abspath(os.path.join("..", "..", ".."))
    # Verify *before* changing directory: a failed lookup must not leave the kernel in a
    # different working directory, otherwise re-running this cell would climb even further.
    if not os.path.exists(os.path.join(_candidate_root, DATA_DIR)):
        raise FileNotFoundError(f"ERROR: DATA_DIR={DATA_DIR} not found")
    os.chdir(_candidate_root)

In [2]:
# Hugging Face Hub identifiers of the two sentence-transformers checkpoints this
# notebook can choose between (see `model_name` in the parameters cell).
small_model = "sentence-transformers/all-MiniLM-L6-v2"
big_model = "sentence-transformers/all-mpnet-base-v2"

# Parameters for the notebook

In [3]:
# Tunable inputs: which CSV to read, which column to embed, and which checkpoint to use.
text_type = "paraphrase"  # column to embed: "paraphrase" or "text"
dataset_balanced = True  # True -> "balanced" subdirectory, False -> "unbalanced"
dataset_prefix = "train"  # CSV file stem: dev, train or test
model_name = small_model  # any Hugging Face model identifier

In [4]:
# Resolve <DATA_DIR>/decoded_cds/<balanced|unbalanced>/<dataset_prefix>.csv
balance_dir = "balanced" if dataset_balanced else "unbalanced"
dataset_path = os.path.join(DATA_DIR, "decoded_cds", balance_dir, f"{dataset_prefix}.csv")

# Read the CSV into a Hugging Face `datasets` Dataset with explicit column names.
# The loader returns a dict of splits; "train" is the key it uses here regardless of
# `dataset_prefix` (presumably the default split name for a single path — TODO confirm).
column_names = ["label", "text", "paraphrase"]
dataset_splits = load_dataset("csv", data_files=dataset_path, names=column_names)
dataset = dataset_splits["train"]
print(dataset)

Using custom data configuration default-b043f26e1a2a855c
Reusing dataset csv (/home/bill/.cache/huggingface/datasets/csv/default-b043f26e1a2a855c/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['label', 'text', 'paraphrase', '__index_level_0__'],
    num_rows: 273373
})


In [5]:
# Run on the GPU when torch can see one, otherwise on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Load the tokenizer and the encoder for `model_name` from the HuggingFace Hub and
# place the encoder on the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [6]:
def tokenize(batch):
    """Run the notebook-level `tokenizer` over `batch`.

    The tokenizer is called with padding and truncation enabled and asked to
    return PyTorch tensors (`return_tensors="pt"`).

    Args:
        batch: the texts to tokenize, passed to the tokenizer unchanged.

    Returns:
        Whatever `tokenizer(...)` returns for these options.
    """
    tokenizer_options = {"padding": True, "truncation": True, "return_tensors": "pt"}
    return tokenizer(batch, **tokenizer_options)

# Tokenize the whole selected text column in a single call, then display the encoding.
# The result keeps the name `encoded_paraphrase` even when `text_type` is "text".
texts_to_encode = dataset[text_type]
encoded_paraphrase = tokenize(texts_to_encode)
encoded_paraphrase

{'input_ids': tensor([[  101, 11498,  8458,  ...,     0,     0,     0],
        [  101,  2002,  2987,  ...,     0,     0,     0],
        [  101,  2002,  1005,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  2214,  ...,     0,     0,     0],
        [  101,  2002,  8451,  ...,     0,     0,     0],
        [  101,  2296,  2305,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, counting only unmasked positions.

    Args:
        model_output: indexable model output whose element 0 holds the token embeddings.
        attention_mask: mask with one entry per token; it is unsqueezed on the last axis
            and expanded to the shape of the token embeddings.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over dimension 1.
        The divisor is clamped to at least 1e-9, so a fully masked row yields zeros
        instead of a division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts

# Compute sentence embeddings in mini-batches: each batch is moved to `device`, encoded,
# mean-pooled, L2-normalised and moved back to the CPU before the next batch is processed.
all_sentence_embeddings = []
batch_size = 512
num_texts = len(encoded_paraphrase["input_ids"])
with torch.no_grad():  # inference only, no autograd graph is needed
    for start in tqdm(range(0, num_texts, batch_size)):
        stop = start + batch_size
        input_ids = encoded_paraphrase["input_ids"][start:stop].to(device)
        attention_mask = encoded_paraphrase["attention_mask"][start:stop].to(device)

        # Pass by keyword: `model_name` may be any Hugging Face model, and not every
        # architecture takes the attention mask as its second positional argument.
        batch_output = model(input_ids=input_ids, attention_mask=attention_mask)

        # Perform pooling
        batch_embeddings = mean_pooling(batch_output, attention_mask)

        # Normalize embeddings
        batch_embeddings = F.normalize(batch_embeddings, p=2, dim=1)

        # Take the batch off the GPU before storing it
        all_sentence_embeddings.append(batch_embeddings.cpu())

# Concatenate all embeddings
sentence_embeddings = torch.cat(all_sentence_embeddings, dim=0)

# Embedding Shape
print(f"Sentence Embeddings Shape: {sentence_embeddings.shape}")

In [None]:
# Save the embeddings as <DATA_DIR>/embedded_cds/<balanced|unbalanced>/<prefix>_<model>.npy,
# where <model> is the last path component of `model_name`.
embedding_save_file = os.path.join(
    DATA_DIR,
    "embedded_cds",
    "balanced" if dataset_balanced else "unbalanced",
    f"{dataset_prefix}_{model_name.split('/')[-1]}.npy"
)

# np.save does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(embedding_save_file), exist_ok=True)

# np.asarray accepts both the CPU torch tensor produced by the embedding cell and an
# ndarray (e.g. when this cell is re-run after the embeddings were reloaded from disk).
np.save(embedding_save_file, np.asarray(sentence_embeddings))

In [None]:
# Load the embeddings back from the file written by the save cell. From here on
# `sentence_embeddings` is a NumPy array (the training cells call torch.from_numpy on it).
# NOTE(review): this used to read the hard-coded "data/paraphrase_embeddings.npy", which
# nothing in this notebook writes — confirm that reloading `embedding_save_file` is intended.
sentence_embeddings = np.load(embedding_save_file)

In [None]:
# Project the sentence embeddings to 2D with t-SNE (fixed random_state for repeatability).
tsne = TSNE(n_components=2, random_state=42)
# np.asarray works whether `sentence_embeddings` is still the CPU torch tensor or the
# ndarray returned by np.load; calling `.numpy()` on an ndarray raises AttributeError.
tsne_embeddings = tsne.fit_transform(np.asarray(sentence_embeddings))

# visualize the embeddings
plt.figure(figsize=(16, 10))
plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1])

# save the plot next to the other artefacts in DATA_DIR
plt.savefig(os.path.join(DATA_DIR, "sentence_embeddings.png"))

plt.show()


In [None]:
# Use PCA to reduce the dimensionality of the embeddings to 2D.
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(np.asarray(sentence_embeddings))

# save pca embeddings
np.save(os.path.join(DATA_DIR, "pca_embeddings.npy"), pca_embeddings)

# Labels come from `dataset`, whose rows are the ones that were embedded above; the
# previously referenced `cds` object is never defined in this notebook.
# NOTE(review): `hexbin_plot` is not defined or imported anywhere in this notebook —
# it must be provided before this cell can run.
hexbin_plot(pca_embeddings[:, 0], pca_embeddings[:, 1], np.array(dataset["label"]))
plt.savefig(os.path.join(DATA_DIR, "pca_hexbin.png"), facecolor="white")
plt.show()

In [None]:
# Save tsne_embeddings as npy inside DATA_DIR (instead of a hard-coded "data/" prefix,
# which breaks as soon as DATA_DIR is changed for another machine).
np.save(os.path.join(DATA_DIR, "tsne_embeddings.npy"), tsne_embeddings)

In [None]:
# Display the shape of the t-SNE projection (TSNE was built with n_components=2).
tsne_embeddings.shape

In [None]:

plt.figure(figsize=(16, 10))
# Scatter a random subsample of the 2D PCA projection, one random colour per label.
# Labels come from `dataset` (the rows that were embedded); `cds` is never defined here.
all_labels = np.array(dataset["label"])
unique_labels = np.unique(all_labels)  # sorted, so the legend order is deterministic

# Random RGB colour per label, stored as a tuple so matplotlib reads it as one colour.
labels_to_color = {label: tuple(np.random.random(size=3)) for label in unique_labels}

# Never ask for more points than exist: choice(..., replace=False) raises otherwise.
sample_size = min(100000, len(pca_embeddings))
random_inds = np.random.choice(len(pca_embeddings), size=sample_size, replace=False)
random_embeddings = pca_embeddings[random_inds]
random_labels = all_labels[random_inds]

for label in unique_labels:
    inds = np.where(random_labels == label)[0]
    plt.scatter(
        random_embeddings[inds, 0],
        random_embeddings[inds, 1],
        color=labels_to_color[label],
        label=label,
    )

plt.legend()

# Save under its own name: "sentence_embeddings.png" is already used for the t-SNE
# figure and would be overwritten by this plot.
plt.savefig(os.path.join(DATA_DIR, "pca_scatter_by_label.png"))

plt.show()

In [None]:


# Rescale every embedding feature with MinMaxScaler, then fit a 2D UMAP projection
# using the cosine metric.
scaled_embeddings = MinMaxScaler().fit_transform(np.asarray(sentence_embeddings))
mapper = UMAP(n_components=2, metric="cosine").fit(scaled_embeddings)

# One row per embedded text: 2D coordinates plus its label. Labels come from `dataset`
# (the rows that were embedded); the previously referenced `cds` is never defined here.
df_emb = pd.DataFrame(mapper.embedding_, columns=["x", "y"])
df_emb["label"] = dataset["label"]
df_emb.head()

In [None]:
# Plot the UMAP projection by label and save the figure inside DATA_DIR.
# NOTE(review): `hexbin_plot` is not defined or imported anywhere in this notebook —
# it must be provided before this cell can run.
hexbin_plot(df_emb["x"], df_emb["y"], df_emb["label"])
plt.savefig(os.path.join(DATA_DIR, "umap_hexbin.png"), facecolor="white")
plt.show()

In [None]:
# Train a linear classifier on the embeddings: set up labels, split, model and optimizer.
embedding_dim = sentence_embeddings.shape[1]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Labels come from `dataset` (the rows that were embedded); `cds` is never defined here.
string_labels = np.array(dataset["label"])
# Map each distinct label to an integer id; ids follow the set's (arbitrary) iteration order.
label_to_int = {k: i for i, k in enumerate(set(string_labels))}
# Integer labels, pinned to int64 because F.cross_entropy expects long class indices.
int_labels = np.array([label_to_int[label] for label in string_labels], dtype=np.int64)

# Size the output layer from the data instead of hard-coding the number of classes.
linear_layer = nn.Linear(embedding_dim, len(label_to_int)).to(device)

# Random 80/20 train/validation split over row indices.
shuffled_inds = np.random.permutation(len(int_labels))
split_point = int(len(int_labels) * 0.8)
train_inds = shuffled_inds[:split_point]
val_inds = shuffled_inds[split_point:]

optimizer = torch.optim.Adam(linear_layer.parameters(), lr=0.001)

batch_size = 512


In [None]:
# Train the linear layer for 25 epochs over mini-batches of the training indices.
for epoch in range(25):
    # Exponentially weighted moving average of the batch loss, shown in the progress bar.
    # Kept as a plain float: accumulating the loss *tensor* would chain every batch's
    # autograd graph together and keep them all alive for the whole epoch.
    loss_ewma = 0.0
    pbar = tqdm(range(0, len(train_inds), batch_size))
    for batch in pbar:
        inds = train_inds[batch:batch+batch_size]

        # torch.as_tensor accepts the ndarray slices without copying them on the CPU;
        # .long() guarantees the integer dtype F.cross_entropy requires for class indices.
        batch_embeddings = torch.as_tensor(sentence_embeddings[inds]).to(device)
        batch_labels = torch.as_tensor(int_labels[inds]).long().to(device)

        batch_output = linear_layer(batch_embeddings)
        loss = F.cross_entropy(batch_output, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update pbar
        loss_ewma = loss_ewma * 0.9 + loss.item() * 0.1
        pbar.set_description(f"Epoch {epoch} Loss: {loss_ewma:.4f}")

In [None]:
# Evaluate the model on the held-out validation indices.
# No gradients are needed here, so skip building an autograd graph over the whole
# validation set.
with torch.no_grad():
    val_embeddings = torch.as_tensor(sentence_embeddings[val_inds]).to(device)
    val_labels = torch.as_tensor(int_labels[val_inds]).to(device)
    preds = linear_layer(val_embeddings).argmax(dim=1)

# Fraction of validation rows whose predicted class id equals the true class id.
acc = (preds == val_labels).float().mean()
print(f"Validation accuracy: {acc.item():.4f}")


In [None]:
# Plot a row-normalised confusion matrix for the validation predictions.

y_true = val_labels.cpu().numpy()
y_pred = preds.cpu().numpy()

# Class ids and their names both come from `label_to_int`, so row/column i is always
# labelled with the string that maps to id i — even if some class is absent from the
# validation split.
class_names = [name for name, _ in sorted(label_to_int.items(), key=lambda item: item[1])]
classes = list(range(len(class_names)))

# conf_mat[r, c] = number of samples with true class r predicted as class c.
conf_mat = np.zeros((len(classes), len(classes)))
np.add.at(conf_mat, (y_true, y_pred), 1)

# Normalise each row to fractions of that true class; rows without any sample stay at
# zero instead of turning into NaN through a division by zero.
row_totals = conf_mat.sum(axis=1, keepdims=True)
conf_mat = np.divide(conf_mat, row_totals, out=np.zeros_like(conf_mat), where=row_totals > 0)

# plot the matrix
plt.figure(figsize=(16, 10))
plt.imshow(conf_mat)

# Draw the values inside the matrix
for i in range(len(classes)):
    for j in range(len(classes)):
        plt.text(j, i, f"{conf_mat[i, j]:.2f}", ha="center", va="center", color="red")


plt.xticks(np.arange(len(classes)), class_names, rotation=45)
plt.yticks(np.arange(len(classes)), class_names)
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.title("Confusion matrix for validation set")
plt.savefig(os.path.join(DATA_DIR, "confusion_matrix.png"), facecolor="white")
plt.show()

In [None]:
# Print the total label counts for each class.
# np.unique yields the distinct labels (sorted) together with their counts in one pass,
# so the printout, the bars and the tick labels are guaranteed to share the same order.
unique_labels, label_counts = np.unique(string_labels, return_counts=True)
for label, count in zip(unique_labels, label_counts):
    print(f"{label}: {count}")

# Make a bar graph of the label fractions. Bar positions are derived from the labels
# themselves rather than from `classes`, which belongs to the confusion-matrix cell.
bar_positions = np.arange(len(unique_labels))
plt.figure(figsize=(16, 10))
plt.bar(bar_positions, label_counts / len(string_labels))
plt.xticks(bar_positions, unique_labels, rotation=45)
plt.ylabel("Fraction of samples")
plt.xlabel("Label")
plt.title("Fraction of samples for each label")
plt.savefig(os.path.join(DATA_DIR, "label_counts.png"), facecolor="white")
plt.show()
