# Task 2: Visualize hidden representations of a model

## Part 1: Extract and save representations

In [3]:
import h5py

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [4]:
!pip install datasets

# import dependencies
import torch

from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, get_dataset_config_names
from transformers import XGLMTokenizer, XGLMTokenizerFast, XGLMForCausalLM, AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from torch.utils.data import DataLoader

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [5]:
# --- Notebook configuration -------------------------------------------------
# Hugging Face hub identifiers for the model and the dataset.
MODEL_NAME = "facebook/xglm-564M"
DATASET_NAME = "facebook/flores"

# Number of sentences per DataLoader batch.
BATCH_SIZE = 2
# Number of sentences sampled per language.
RANDOM_SUBSET_SIZE = 2

# Dataset configurations to load, one per language.
# The codes look like "<language>_<script>" -- presumably FLORES-200 codes.
LANGUAGES = ["eng_Latn", "spa_Latn", "deu_Latn", "arb_Arab", "tam_Taml", "quy_Latn"]

In [6]:
# Load one dataset configuration per language, using the dataset id from the
# configuration cell instead of a second hard-coded copy of it.
flores_dataset = {}
flores_dataset_builder = {}  # never filled in this cell; kept in case another cell reads it
for language in LANGUAGES:
    dataset = load_dataset(DATASET_NAME, language)
    flores_dataset[language] = dataset

# Limit each language to a random subset of RANDOM_SUBSET_SIZE sentences from its 'dev' split.
# The same seed is used for every language, so the same row positions are selected
# in each one (assumes the 'dev' splits are aligned across languages -- TODO confirm).
random_subset = {
    lang: flores_dataset[lang]['dev'].shuffle(seed=42).select(range(RANDOM_SUBSET_SIZE))
    for lang in LANGUAGES
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

In [7]:
# Tokenization
#
# Fetch the pre-trained tokenizer that matches MODEL_NAME from the Hugging Face hub.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# gpt2 ships without a padding token, so one is registered by hand (the unk token is reused).
if MODEL_NAME == "gpt2":
    tokenizer.add_special_tokens({'pad_token': tokenizer.unk_token})


def tokenization(example):
    """Tokenize the 'sentence' field, padding/truncating every sequence to 42 tokens."""
    return tokenizer(
        example['sentence'],
        padding='max_length',
        truncation=True,
        max_length=42,
    )


# Apply the tokenizer to every language's random subset.
tokenized_datasets = {
    language: dataset.map(tokenization, batched=True)
    for language, dataset in random_subset.items()
}

tokenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.03M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [8]:
# Switch every tokenized dataset to torch output and wrap it in a DataLoader.
data_loaders = {}
for language, dataset in tokenized_datasets.items():
    dataset.set_format(type='torch')
    data_loaders[language] = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )

In [9]:
# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the causal LM configured to return hidden states, and put it in evaluation mode.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, output_hidden_states=True)
model.eval()

# Last expression of the cell: moves the model and displays its architecture.
model.to(device)

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

In [20]:
# TODO: your code goes here

# Function to extract hidden representations
def extract_hidden_representations(example, model):
    """Run one batch through the model and return its hidden states on the CPU.

    Args:
        example: batch mapping with 'input_ids' and 'attention_mask' tensors.
        model: a torch module whose output exposes `hidden_states`
            (the notebook loads it with `output_hidden_states=True`).

    Returns:
        Tuple of CPU tensors, one entry per element of `outputs.hidden_states`.
        They are detached and on the CPU so callers can call `.numpy()` on them
        whether or not the model runs on a GPU.
    """
    # Use the device the model actually lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        input_ids = example['input_ids'].to(model_device)
        attention_mask = example['attention_mask'].to(model_device)

        # No labels are passed: only the hidden states are needed, so there is
        # no reason to make the model compute a loss.
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get hidden states from all layers and bring them back to host memory.
    return tuple(layer_states.detach().cpu() for layer_states in outputs.hidden_states)

# Extract hidden representations for every sampled sentence and store them in an HDF5 file.
#
# Layout written for each sentence:
#   {lang}/Sample_{id}/sentence                                  original text
#   {lang}/Sample_{id}/input_ids                                 ids of the kept tokens
#   {lang}/Sample_{id}/layers/layer_{k}/token_{position}         one vector per kept token
#   {lang}/Sample_{id}/layers/layer_{k}/mean_pooled_sentence     mean over the kept tokens
#
# "Kept" tokens are those with attention_mask == 1, except position 0, which holds
# the start-of-sentence token. The ids, the per-token vectors and the mean-pooled
# vector all use this same selection, so padding no longer leaks into the sentence mean.
with h5py.File("representations.h5", "w") as f:
    for lang, lang_dataset in data_loaders.items():
        print(f'Processing {lang}....')
        for example in lang_dataset:
            # One tensor per layer, indexed as [sentence, token, feature].
            hidden_states = extract_hidden_representations(example, model)

            batch_input_ids = example['input_ids'].cpu()
            batch_attention_mask = example['attention_mask'].cpu()

            # Handle every sentence of the batch, so each one gets its text and ids saved
            # (previously only the first sentence of a batch did).
            for sentence_index in range(batch_input_ids.shape[0]):
                sample_path = f"{lang}/Sample_{example['id'][sentence_index]}"

                # Select real tokens: drop padding and the start-of-sentence token at position 0.
                keep_mask = batch_attention_mask[sentence_index].bool().clone()
                keep_mask[0] = False
                kept_positions = keep_mask.nonzero().flatten().tolist()

                # save sentence details in file (once per sentence, not once per layer)
                f.create_dataset(
                    f"{sample_path}/sentence",
                    data=example['sentence'][sentence_index]
                )
                f.create_dataset(
                    f"{sample_path}/input_ids",
                    data=batch_input_ids[sentence_index][keep_mask].numpy()
                )

                for layer, layer_hidden_states in enumerate(hidden_states):
                    # .cpu() is a no-op for CPU tensors and makes .numpy() safe on GPU runs.
                    sentence_states = layer_hidden_states[sentence_index].detach().cpu().numpy()
                    layer_path = f"{sample_path}/layers/layer_{layer}"

                    # Save hidden representation for each kept token
                    for token_index in kept_positions:
                        f.create_dataset(
                            f"{layer_path}/token_{token_index}",
                            data=sentence_states[token_index]
                        )

                    # Save mean-pooled sentence representation (mean over kept tokens only)
                    f.create_dataset(
                        f"{layer_path}/mean_pooled_sentence",
                        data=sentence_states[kept_positions].mean(axis=0)
                    )

        print('Done.')
        print()



Processing eng_Latn....
Done.

Processing spa_Latn....
Done.

Processing deu_Latn....
Done.

Processing arb_Arab....
Done.

Processing tam_Taml....
Done.

Processing quy_Latn....
Done.



## Part 2: Visualize representations

In [17]:
import altair as alt
import pandas as pd

In [61]:
import os
import shutil

# Output folders for the charts produced below.
paths = [
    'visualizations/one_sentence_one_lang/pca',
    'visualizations/one_sentence_one_lang/tsne',
    'visualizations/all_sentence_all_lang/pca',
    'visualizations/all_sentence_all_lang/tsne',
]


def _recreate_directory(directory):
    """Delete `directory` (with everything in it) if present, then create it empty."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


# Start from empty folders so charts from earlier runs do not linger.
for path in paths:
    _recreate_directory(path)
    print("Directory created successfully at:", path)


Directory created successfully at: visualizations/one_sentence_one_lang/pca
Directory created successfully at: visualizations/one_sentence_one_lang/tsne
Directory created successfully at: visualizations/all_sentence_all_lang/pca
Directory created successfully at: visualizations/all_sentence_all_lang/tsne


### Visualizations, 1 sentence on each layer for 1 language

In [48]:
def pca_visualize_one_sentence_one_layer_one_language(hidden_representations, lang, sample_id, layer_id, visualized_sentence, sentence_input_ids):
    """Project one sentence's vectors at one layer to 2D with PCA and save an interactive chart.

    Args:
        hidden_representations: mapping with one 'mean_pooled_sentence' entry and
            'token_<position>' entries, each holding a 1-D vector (an HDF5 group
            in this notebook).
        lang, sample_id, layer_id: only used to build the output file name.
        visualized_sentence: scalar HDF5 dataset holding the sentence as UTF-8 bytes.
        sentence_input_ids: token ids of the token entries, in sentence order.

    Raises:
        ValueError: if 'mean_pooled_sentence' is missing, or if the number of
            token entries differs from the number of token ids.
    """
    mean_pooled_key = 'mean_pooled_sentence'
    row_names = list(hidden_representations.keys())
    mean_pooled_sentence_index = row_names.index(mean_pooled_key)

    # convert to 2D array for pca input (one row per entry, in key order)
    pca_input = np.vstack([np.asarray(hidden_representations[name]) for name in row_names])

    # Apply PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(pca_input)

    # The rows follow the mapping's key order, which for an HDF5 group is by name
    # (token_1, token_10, token_11, ..., token_2), not by position in the sentence.
    # Sort the token names numerically so each row is matched with its own token id.
    token_names_in_sentence_order = sorted(
        (name for name in row_names if name != mean_pooled_key),
        key=lambda name: int(name.rsplit('_', 1)[1]),
    )
    token_ids = np.asarray(sentence_input_ids)
    if len(token_names_in_sentence_order) != len(token_ids):
        raise ValueError(
            f"{len(token_names_in_sentence_order)} token vectors but {len(token_ids)} token ids "
            f"for {lang}/{sample_id}/{layer_id}"
        )
    token_text = {
        name: tokenizer.decode(int(token_id))
        for name, token_id in zip(token_names_in_sentence_order, token_ids)
    }
    sentence_text = visualized_sentence[()].decode('utf-8')

    # Create a DataFrame from the projected points
    df = pd.DataFrame(pca_result, columns=['PC 1', 'PC 2'])
    df['label'] = 'Other Tokens'
    df.loc[mean_pooled_sentence_index, 'label'] = 'Mean Pooled Sentence'
    df['description'] = [
        sentence_text if name == mean_pooled_key else token_text[name]
        for name in row_names
    ]

    chart = alt.Chart(df).mark_circle(size=60).encode(
        x='PC 1',
        y='PC 2',
        color='label',
        tooltip=['PC 1', 'PC 2', 'label', 'description']
    ).properties(
        width=700,
        height=500
    ).interactive()

    chart.save(f'visualizations/one_sentence_one_lang/pca/pca-{lang}-{sample_id}-{layer_id}.html')


In [54]:
def tsne_visualize_one_sentence_one_layer_one_language(hidden_representations, lang, sample_id, layer_id, visualized_sentence, sentence_input_ids):
    """Project one sentence's vectors at one layer to 2D with t-SNE and save an interactive chart.

    Args:
        hidden_representations: mapping with one 'mean_pooled_sentence' entry and
            'token_<position>' entries, each holding a 1-D vector (an HDF5 group
            in this notebook).
        lang, sample_id, layer_id: only used to build the output file name.
        visualized_sentence: scalar HDF5 dataset holding the sentence as UTF-8 bytes.
        sentence_input_ids: token ids of the token entries, in sentence order.

    Raises:
        ValueError: if 'mean_pooled_sentence' is missing, or if the number of
            token entries differs from the number of token ids.
    """
    mean_pooled_key = 'mean_pooled_sentence'
    row_names = list(hidden_representations.keys())
    mean_pooled_sentence_index = row_names.index(mean_pooled_key)

    # convert to 2D array for tsne input (one row per entry, in key order)
    tsne_input = np.vstack([np.asarray(hidden_representations[name]) for name in row_names])

    # set perplexity according to sample_size (it has to stay below the number of rows)
    perplexity_value = min(tsne_input.shape[0] - 1, 30)

    # Apply t-SNE. The iteration count is left at the library default of 1000, which
    # is what the old `n_iter=1000` argument asked for; that keyword was renamed to
    # `max_iter` in newer scikit-learn releases, so omitting it works on both.
    tsne = TSNE(n_components=2, perplexity=perplexity_value, random_state=42)
    tsne_result = tsne.fit_transform(tsne_input)

    # The rows follow the mapping's key order, which for an HDF5 group is by name
    # (token_1, token_10, token_11, ..., token_2), not by position in the sentence.
    # Sort the token names numerically so each row is matched with its own token id.
    token_names_in_sentence_order = sorted(
        (name for name in row_names if name != mean_pooled_key),
        key=lambda name: int(name.rsplit('_', 1)[1]),
    )
    token_ids = np.asarray(sentence_input_ids)
    if len(token_names_in_sentence_order) != len(token_ids):
        raise ValueError(
            f"{len(token_names_in_sentence_order)} token vectors but {len(token_ids)} token ids "
            f"for {lang}/{sample_id}/{layer_id}"
        )
    token_text = {
        name: tokenizer.decode(int(token_id))
        for name, token_id in zip(token_names_in_sentence_order, token_ids)
    }
    sentence_text = visualized_sentence[()].decode('utf-8')

    # Create a DataFrame from the projected points
    df = pd.DataFrame(tsne_result, columns=['tSNE Component 1', 'tSNE Component 2'])
    df['label'] = 'Other Tokens'
    df.loc[mean_pooled_sentence_index, 'label'] = 'Mean Pooled Sentence'
    df['description'] = [
        sentence_text if name == mean_pooled_key else token_text[name]
        for name in row_names
    ]

    chart = alt.Chart(df).mark_circle(size=60).encode(
        x='tSNE Component 1',
        y='tSNE Component 2',
        color='label',
        tooltip=['tSNE Component 1', 'tSNE Component 2', 'label', 'description']
    ).properties(
        width=700,
        height=500
    ).interactive()

    chart.save(f'visualizations/one_sentence_one_lang/tsne/tsne-{lang}-{sample_id}-{layer_id}.html')

In [62]:
FILE_PATH = "representations.h5"
with h5py.File(FILE_PATH, "r") as f:
    # Discover what the file contains: languages, then the samples and layers
    # listed under the first language / first sample.
    list_langs = list(f.keys())
    list_sample_ids = list(f[list_langs[0]].keys())
    list_layer_ids = list(f[list_langs[0]][list_sample_ids[0]]['layers'].keys())

    # Which sentence to visualize (token reps and mean-pooled sentence), on every layer.
    lang_index = 2
    sample_index = 0

    chosen_lang = list_langs[lang_index]
    chosen_sample = list_sample_ids[sample_index]

    for layer_index, chosen_layer in enumerate(list_layer_ids):
        visualized_sentence = f[f"{chosen_lang}/{chosen_sample}/sentence"]
        sentence_input_ids = f[f"{chosen_lang}/{chosen_sample}/input_ids"]

        # PCA chart first, then the t-SNE chart, both from the same layer group.
        for render_chart in (
            pca_visualize_one_sentence_one_layer_one_language,
            tsne_visualize_one_sentence_one_layer_one_language,
        ):
            render_chart(
                f[f"{chosen_lang}/{chosen_sample}/layers/{chosen_layer}"],
                chosen_lang,
                chosen_sample,
                chosen_layer,
                visualized_sentence,
                sentence_input_ids,
            )

    print('Visualization charts stored in visualizations/one_sentence_one_lang. Open to view in browser.')


Visualization charts stored in visualizations/one_sentence_one_lang. Open to view in browser.


### Visualizations, all sentences on 1 layer for all languages