In [None]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import bokeh
import time

# Dimension reduction algorithms
#from cuml.manifold import TSNE
from sklearn.manifold import TSNE
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM'
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

%matplotlib inline
output_notebook()

# Loading the Model, the Tokenizer and the Theme, Viepoint and F-Term Descriptions

In [None]:
"""
The Paths to important folders have to be changed for your system.
"""

# Name of this experiment
model_name = 'gal_125_aug_1'
checkpoint = 148000 

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
base_model_name = 'mini'

# All new Torch-objects will be by default in this dtype
# if default_type = float16 fp16 must be False
default_dtype = torch.float32
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False


###########################
# Loading the Model
###########################
device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available
             
model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loadede Tokenizer from serialized instance!')    
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')


###########################
# Loading Descriptions
###########################
with open(f'{dump_dir}/themes_descriptions.pk', 'rb') as f:
    theme_dict = pk.load(f)
with open(f'{dump_dir}/viewpoints_descriptions.pk', 'rb') as f:
    viewpoint_dict = pk.load(f)
with open(f'{dump_dir}/numbers_descriptions.pk', 'rb') as f:
    number_dict = pk.load(f)
with open(f'{dump_dir}/full_descriptions.pk', 'rb') as f:
        full_descriptions_dict = pk.load(f)

###########################
# Embeddings
###########################
# Output Embeddings
out_emb = model.get_output_embeddings()
out_emb = next(out_emb.parameters()).detach().numpy()[2:]
# Input Embeddings
inp_emb = model.get_input_embeddings()
inp_emb = inp_emb(torch.arange(len(tokenizer)))

# Calculating and Plotting a Single T-SNE Plot For a Models Output Embeddings

In [3]:
# Extracting the matching F_terms for the weights and creating lists with the defintions
tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_tokens = tokens[50002:]


def try_split(token):
    """
    This funktion tries to split a token into Theme, viewpoint and number 
    and returns them as a list
    if it is not possible just theme and viepoint or just the theme are returned"""
    token = token[:-1]
    split_token = token.split('/')
    theme = split_token[0] 
    
    if len(split_token) == 1: 
        return [theme]

    if len(split_token[1]) ==2:
        viewpoint = theme + '/' + split_token[1] 
        return [theme, viewpoint]

    viewpoint = theme + '/' + split_token[1][:2]
    number = token 
    return [theme, viewpoint, number]


theme_descriptions = []
viewpoint_descriptions = []
number_descriptions = []


for i, token in enumerate(f_term_tokens):
    split = try_split(token)
    try: 
        theme_descriptions.append(theme_dict[split[0]])
    except (KeyError, IndexError):
        theme_descriptions.append('no definition')

    try: 
        viewpoint_descriptions.append(viewpoint_dict[split[1]])
    except (KeyError, IndexError):
        viewpoint_descriptions.append('no definition')

    try: 
        number_descriptions.append(number_dict[split[2]])
    except (KeyError, IndexError):
        number_descriptions.append('no definition')


In [4]:
# Calculating the TSNE Representation of all output embeddigns
if os.path.isfile(f'{model_folder}/output_tsne_rep_{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_{checkpoint}.pk', 'rb') as f:
        tsne_rep = pk.load(f)
else:
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep = tsne.fit_transform(out_emb)
    with open(f'{model_folder}/output_tsne_rep_{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep, f)

In [5]:
# Plotting the TSNE Representation

# Generating the colors representing the themes
themes = []
t_dict = {}
for f_term in f_term_tokens:
    theme = f_term.split('/')[0]
    try: 
        i = t_dict[theme]
    except KeyError:
        i = len(t_dict)+1
        t_dict[theme] = i
    themes.append(i)


bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette
    
colors = [color_palette[theme%256] for theme in themes]

# Splitting  the t-sne representation, the colors and the definitions to theme, viewpoint and f-term

theme_tokens = []
theme_tsne = []
theme_colors = []
theme_desc = []

viewpoint_tokens = []
viewpoint_tsne = []
viewpoint_colors = []
viewpoint_theme = []
viewpoint_desc = []

number_tokens = []
number_tsne = []
number_colors = []
number_theme = []
number_viewpoint = []
number_desc = []


for i, token in enumerate(f_term_tokens):
    split = try_split(token)

    if len(split) == 3:
        number_tokens.append(token)
        number_tsne.append(tsne_rep[i])
        number_colors.append(colors[i])
        number_theme.append(theme_descriptions[i])
        number_viewpoint.append(viewpoint_descriptions[i])
        number_desc.append(number_descriptions[i])
        continue

    if len(split) == 2:
        viewpoint_tokens.append(token)
        viewpoint_tsne.append(tsne_rep[i])
        viewpoint_colors.append(colors[i])
        viewpoint_theme.append(theme_descriptions[i])
        viewpoint_desc.append(viewpoint_descriptions[i])
        continue

    if len(split) ==1:
        theme_tokens.append(token)
        theme_tsne.append(tsne_rep[i])
        theme_colors.append(colors[i])
        theme_desc.append(theme_descriptions[i])
        continue

number_tsne = np.stack(number_tsne, axis=0)
viewpoint_tsne = np.stack(viewpoint_tsne, axis=0)
theme_tsne = np.stack(theme_tsne, axis=0)

# Genrating the datasources 
datasource_themes = ColumnDataSource(
        data=dict(
            x = theme_tsne[:,0],
            y = theme_tsne[:,1],
            tokens = theme_tokens,
            theme = theme_desc,
            colors = theme_colors
        )
    )

datasource_viewpoints = ColumnDataSource(
        data=dict(
            x = viewpoint_tsne[:,0],
            y = viewpoint_tsne[:,1],
            tokens = viewpoint_tokens,
            theme = viewpoint_theme,
            viewpoint = viewpoint_desc,
            colors = viewpoint_colors
        )
    )

datasource_numbers = ColumnDataSource(
        data=dict(
            x = number_tsne[:,0],
            y = number_tsne[:,1],
            tokens = number_tokens,
            theme = number_theme,
            viewpoint = number_viewpoint,
            number = number_desc,
            colors = number_colors
        )
    )


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>Theme:</b> @theme<br><b>viewpoint:</b> @viewpoint<br><b>Number:</b> @number</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Output Embeddings')
    
plot_tsne.square('x', 'y', size=20, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_themes, name="Themes")
    
plot_tsne.circle('x', 'y', size=15, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_viewpoints, name="Viewpoints")

plot_tsne.diamond('x', 'y', size=10, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_numbers, name="Numbers / F-Terms")

show(plot_tsne)


ValueError: need at least one array to stack

# Calculating the T-SNE Representations Individually for Themes Viewpoints and Numbers

In [None]:
# Splitting the Embeddings into Themes, viewpoints and Numbers


themes = []
viewpoints = []
numbers = []

theme_out_emb = []
viewpoint_out_emb = []
number_out_emb = []

for emb_vector, f_term_token in zip(out_emb, f_term_tokens):
    if len(f_term_token) == 6:
        theme_out_emb.append(emb_vector)
        themes.append(f_term_token)
        
    if len(f_term_token) == 9:
        viewpoint_out_emb.append(emb_vector)
        viewpoints.append(f_term_token)
        
    if len(f_term_token) == 11:
        number_out_emb.append(emb_vector)
        numbers.append(f_term_token)
    
theme_out_emb = np.stack(theme_out_emb, 0)
viewpoint_out_emb = np.stack(viewpoint_out_emb, 0)
number_out_emb = np.stack(number_out_emb, 0)

theme_out_emb.shape, viewpoint_out_emb.shape, number_out_emb.shape

In [None]:
# Calculating the T-SNE Represenations

if os.path.isfile(f'{model_folder}/output_tsne_rep_themes{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_themes{checkpoint}.pk', 'rb') as f:
        tsne_rep_theme = pk.load(f)
else:
    print('Calculating the T-SNE Representations of the Themes')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_theme = tsne.fit_transform(theme_out_emb)
    
    with open(f'{model_folder}/output_tsne_rep_themes{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep_theme, f)

if os.path.isfile(f'{model_folder}/output_tsne_rep_viewpoints{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_viewpoints{checkpoint}.pk', 'rb') as f:
        tsne_rep_viewpoint = pk.load(f)
else:
    print('Calculating the T-SNE Representations of the Viewpoints')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_viewpoint = tsne.fit_transform(viewpoint_out_emb)
    
    with open(f'{model_folder}/output_tsne_rep_viewpoints{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep_viewpoint, f)

if os.path.isfile(f'{model_folder}/output_tsne_rep_numbers{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_numbers{checkpoint}.pk', 'rb') as f:
        tsne_rep_number = pk.load(f)
else:
    print('Calculating the T-SNE Representations of the numbers')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_number = tsne.fit_transform(number_out_emb)
    
    with open(f'{model_folder}/output_tsne_rep_numbers{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep_number, f)

# Plotting the theme T-SNE Representations

In [None]:
# Theme T-SNE

datasource_themes = ColumnDataSource(
        data=dict(
            x = tsne_rep_theme[:,0],
            y = tsne_rep_theme[:,1],
            tokens = themes,
            theme = [theme_dict[theme[:-1]] for theme in themes]
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>Theme:</b> @theme</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Theme Output Embeddings')
    
plot_tsne.square('x', 'y', size=20, fill_color='blue', 
                     alpha=0.7, line_width=0, source=datasource_themes, name="Themes")

show(plot_tsne)

In [None]:
# Viewpoint T-SNE

colors = []
colors_dict = {theme[:-1]: i for i, theme in enumerate(themes)}
desc = []

for viewpoint in viewpoints:
    theme = viewpoint.split('/')[0]
    colors.append(colors_dict[theme])
    try: 
        d = viewpoint_dict[viewpoint[:-1]]
        desc.append(d)
    except KeyError:
        desc.append('description not found')


bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette

colors = [color_palette[c%256] for c in colors]


datasource_viewpoints = ColumnDataSource(
        data=dict(
            x = tsne_rep_viewpoint[:,0],
            y = tsne_rep_viewpoint[:,1],
            tokens = viewpoints,
            viewpoint = desc,
            colors = colors
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>viewpoint:</b> @viewpoint</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Viewpoints Output Embeddings')
    
plot_tsne.circle('x', 'y', size=15, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_viewpoints, name="Viewpoints")

show(plot_tsne)

In [None]:
# Number T-SNE

colors = []
colors_dict = {theme[:-1]: i for i, theme in enumerate(themes)}
desc = []
viewpoint_desc = []
theme_desc = []

for number in numbers:
    theme = number.split('/')[0]
    theme_desc.append(theme_dict[theme])
    try:
        viewpoint_desc.append(viewpoint_dict[number[:-3]])
    except KeyError:
        viewpoint_desc.append('description not found')
    colors.append(colors_dict[theme])
    try: 
        d = number_dict[number[:-1]]
        desc.append(d)
    except KeyError:
        desc.append('description not found')

bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette

colors = [color_palette[c%256] for c in colors]


datasource_numbers = ColumnDataSource(
        data=dict(
            x = tsne_rep_number[:,0],
            y = tsne_rep_number[:,1],
            tokens = numbers,
            theme = theme_desc,
            viewpoint = viewpoint_desc,
            desc = desc,
            colors = colors
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>Theme:</b> @theme<br><b>viewpoint:</b> @viewpoint<br><b>Number:</b> @desc</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Nubers Output Embeddings')
    
plot_tsne.diamond('x', 'y', size=8, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_numbers, name="Numbers")

show(plot_tsne)

# Subtracting Themes from viewpoints and plotting the residual vectors

In [None]:
# Splitting the Embeddings into Themes, viewpoints and Numbers


themes = []
viewpoints = []
numbers = []

theme_out_emb = []
viewpoint_out_emb = []
number_out_emb = []

for emb_vector, f_term_token in zip(out_emb, f_term_tokens):
    if len(f_term_token) == 6:
        theme_out_emb.append(emb_vector)
        themes.append(f_term_token)
        
    if len(f_term_token) == 9:
        viewpoint_out_emb.append(emb_vector)
        viewpoints.append(f_term_token)
        
    if len(f_term_token) == 11:
        number_out_emb.append(emb_vector)
        numbers.append(f_term_token)

theme_out_dict = {theme[:-1]: vec for theme, vec in zip(themes, theme_out_emb)}

clean_viewpoint_emb = []

for viewpoint, vec in zip(viewpoints, viewpoint_out_emb):
    theme = viewpoint.split('/')[0]
    theme_vec = theme_out_dict[theme]
    vec -= theme_vec
    clean_viewpoint_emb.append(vec)

clean_viewpoint_emb = np.stack(clean_viewpoint_emb, 0)

if os.path.isfile(f'{model_folder}/output_tsne_rep_viewpoints_clean{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_viewpoints_clean{checkpoint}.pk', 'rb') as f:
        tsne_rep_viewpoint_clean = pk.load(f)
else:
    # Calculating T-SNE
    print('Calculating Viewpoint T-SNE Representations')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_viewpoint_clean = tsne.fit_transform(clean_viewpoint_emb)
    
    with open(f'{model_folder}/output_tsne_rep_viewpoints_clean{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep_viewpoint_clean, f)

# Plotting 

colors = []
colors_dict = {theme[:-1]: i for i, theme in enumerate(themes)}
desc = []
themes = []

for viewpoint in viewpoints:
    theme = viewpoint.split('/')[0]
    #
    theme_desc = theme_dict[theme]
    themes.append(theme_desc)
    colors.append(colors_dict[theme])
    try: 
        d = viewpoint_dict[viewpoint[:-1]]
        desc.append(d)
    except KeyError:
        desc.append('description not found')


bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette

colors = [color_palette[c%256] for c in colors]


datasource_viewpoints = ColumnDataSource(
        data=dict(
            x = tsne_rep_viewpoint_clean[:,0],
            y = tsne_rep_viewpoint_clean[:,1],
            tokens = viewpoints,
            themes = themes, 
            
            viewpoint = desc,
            colors = colors
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>Theme: </b> @themes<br><b>viewpoint:</b> @viewpoint</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='viewpoints Output Embeddings No Themes')
    
plot_tsne.circle('x', 'y', size=15, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_viewpoints, name="Viewpoints no Themes")

show(plot_tsne)

# Substracting Themes and Viewpoints from Numbers

In [None]:
# Splitting the Embeddings into Themes, viewpoints and Numbers

themes = []
viewpoints = []
numbers = []

theme_out_emb = []
viewpoint_out_emb = []
number_out_emb = []

for emb_vector, f_term_token in zip(out_emb, f_term_tokens):
    if len(f_term_token) == 6:
        theme_out_emb.append(emb_vector)
        themes.append(f_term_token)
        
    if len(f_term_token) == 9:
        viewpoint_out_emb.append(emb_vector)
        viewpoints.append(f_term_token)
        
    if len(f_term_token) == 11:
        number_out_emb.append(emb_vector)
        numbers.append(f_term_token)

# creating dictionarys to look up the embedding vectors for the individual themes and viewpoints
theme_out_dict = {theme[:-1]: vec for theme, vec in zip(themes, theme_out_emb)}
viewpoint_out_dict = {viewpoint[:-1]: vec for viewpoint, vec in zip(viewpoints, viewpoint_out_emb)}

clean_number_emb = []

themes_for_plot = []
viewpoints_for_plot = []

for number, vec in zip(numbers, number_out_emb):
    theme, vp_nr = number.split('/')
    vp = vp_nr[:2]
    themes_for_plot.append(theme)
    viewpoints_for_plot.append(theme+'/'+vp)
    theme_vec = theme_out_dict[theme]
    viewpoint_vec = viewpoint_out_dict[theme+'/'+vp]
    vec -= theme_vec
    vec -= viewpoint_vec
    clean_number_emb.append(vec)


clean_number_emb = np.stack(clean_number_emb, 0)

if os.path.isfile(f'{model_folder}/output_tsne_rep_numbers_clean{checkpoint}.pk'):
    with open(f'{model_folder}/output_tsne_rep_numbers_clean{checkpoint}.pk', 'rb') as f:
        tsne_rep_number_clean = pk.load(f)
else:
    # Calculating T-SNE
    print('Calculating Viewpoint T-SNE Representations')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_number_clean = tsne.fit_transform(clean_number_emb)
    
    with open(f'{model_folder}/output_tsne_rep_numbers_clean{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep_number_clean, f)

# Plotting 
colors = []
colors_dict = {viewpoint[:-1]: i for i, viewpoint in enumerate(viewpoints)}
desc = []

for number in numbers:
    theme, vpnr = number.split('/')
    vp = theme +'/' + vpnr[:2]
    colors.append(colors_dict[vp])
    try: 
        d = number_dict[number[:-1]]
        desc.append(d)
    except KeyError:
        desc.append('description not found')


bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette

colors = [color_palette[c%256] for c in colors]

theme_desc = []
for theme in themes_for_plot:
    try:
        desc = theme_dict[theme]
    except KeyError:
        desc = 'description not found'
    theme_desc.append(desc)

vp_desc =  []
for vp in viewpoints_for_plot:
    try: 
        desc = viewpoint_dict[vp]
    except KeyError:
        desc = 'descritpion not found'
    vp_desc.append( desc)

desc = []

for number in numbers:
    try: 
        desc.append(number_dict[number[:-1]])
    except KeyError:
        desc.append('description not found')

datasource_viewpoints = ColumnDataSource(
        data=dict(
            x = tsne_rep_number_clean[:,0],
            y = tsne_rep_number_clean[:,1],
            tokens = numbers,
            theme = theme_desc,
            viewpoint = vp_desc,
            number = desc,
            colors = colors
        )
    )

hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Token:</b> @tokens<br><b>Theme:</b> @theme<br><b>viewpoint:</b> @viewpoint<br><b>Number:</b> @number</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Numbers Output Embeddings No Themes')
    
plot_tsne.circle('x', 'y', size=15, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_viewpoints, name="Numbers no Themes")

show(plot_tsne)

# Calculating TSNE Plots

In [None]:
for checkpoint in range(0, 300000, 500):
    print('Processing checkpoint', checkpoint)
    device_map=None
    max_memory = {}
    if num_gpus > 0:
        # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
        for i in range(num_gpus):
             _ = torch.tensor([0], device=i)
        for i in range(num_gpus):
            max_memory[i] = torch.cuda.mem_get_info(i)[0]
        device_map = "auto"
    max_memory["cpu"] = psutil.virtual_memory().available
             
    try:
        model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-{checkpoint}', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)
    except Exception:
        print('failed to load the model continuing to the next one')
        continue
        
    emb = model.get_input_embeddings()
    out_emb = model.get_output_embeddings()
    
    with open(f'{dump_dir}/full_descriptions.pk', 'rb') as f:
        full_descriptions_dict = pk.load(f)

    # Embeddins for text and f-term tokens
    
    tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
    f_term_definition = []
    for i in range(len(tokenizer)):
        if i%1000 ==0:
            print(i, end='\r')
        try:
            definition = full_descriptions_dict[tokenizer.decode(i)[:-1]]
            f_term_definition.append(definition)
        except KeyError:
            f_term_definition.append('-')
    emb_vectors = emb(torch.arange(len(tokenizer)))

    print('Calculating output TSNE')
    tsne = TSNE(n_components=2, verbose=0, random_state=69) 
    tsne_rep_out_emb = tsne.fit_transform(next(out_emb.parameters()).detach().numpy())
    
    with open(f'{model_folder}/output_emb_tsne_{checkpoint}.pk', 'wb') as f: 
        pk.dump(tsne_rep_out_emb, f)

    # Debug remove later
    continue
    
    print('Calculating input TSNE')
    tsne = TSNE(n_components=2, verbose=0, random_state=69)
    tsne_rep = tsne.fit_transform(emb_vectors.detach().numpy())
    
    with open(f'{model_folder}/input_emb_tsne_{checkpoint}.pk', 'wb') as f:
        pk.dump(tsne_rep, f)
    
    

In [None]:
show_plot = True

for checkpoint in range(10000, 300000, 10000):
    checkpoint=140000
    print('Processing checkpoint', checkpoint)

    try: 
        with open(f'{model_folder}/output_emb_tsne_{checkpoint}.pk', 'rb') as f:
            tsne_rep_out_emb = pk.load(f)
    except FileNotFoundError:
        print(f'No saved output embeddings TSNE representation found for {model_folder}/output_emb_tsne_{checkpoint}.pk!')
        continue
    
    try:
        with open(f'{model_folder}/input_emb_tsne_{checkpoint}.pk', 'rb') as f:
            tsne_rep = pk.load(f)
    except FileNotFoundError:
        print(f'No saved TSNE representation found for {model_folder}/input_emb_tsne_{checkpoint}.pk!')
        

    

    # Generating the metadata for the plots
    with open(f'{dump_dir}/full_descriptions.pk', 'rb') as f:
        full_descriptions_dict = pk.load(f)

    tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
    f_term_definition = []
    for i in range(len(tokenizer)):
        if i%1000 ==0:
            print(i, end='\r')
        try:
            definition = full_descriptions_dict[tokenizer.decode(i)[:-1]]
            f_term_definition.append(definition)
        except KeyError:
            f_term_definition.append('-')


    f_terms = tokens[50000:]

    themes = []
    theme_dict = {}
    for f_term in f_terms:
        theme = f_term.split('/')[0]
        try: 
            i = theme_dict[theme]
        except KeyError:
            i = len(theme_dict)+1
            theme_dict[theme] = i
        themes.append(i)


    bokeh_palette = bokeh.palettes.Turbo256
    color_palette = bokeh_palette
    
    colors = [color_palette[theme%256] for theme in themes]
    len(colors), colors[-1]

    # Generating the output TSNE repräsentation
    datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep_out_emb[:,0],
            y = tsne_rep_out_emb[:,1],
            desc = tsne_rep_out_emb[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:],
            colors = colors
            )
        )
    
    
    
    
    hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions</div>', mode='mouse')
    tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
    plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Output Embeddings')
    
    plot_tsne.square('x', 'y', size=5, fill_color='colors', 
                     alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

    # Saving the Image
    if show_plot:
        show(plot_tsne)
    else:
        export_png(plot_tsne, filename=f'{model_folder}/Output_TSNE_{checkpoint}.png')
    
    # Generating the input embeddings
    datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep[50000:,0],
            y = tsne_rep[50000:,1],
            desc = tsne_rep[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )
    
    datasource_text_tokens = ColumnDataSource(
            data=dict(
                x = tsne_rep[:50000,0],
                y = tsne_rep[:50000,1],
                desc = tsne_rep[:50000,1],
                Tokens = tokens[:50000],
                
            )
        )
    
    
    hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions<br><b>Token:</b> @Tokens</div>', mode='mouse')
    tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
    plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Embeddings')
    
    plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                     alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")
    
    plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                     alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

    if show_plot:
        
        show(plot_tsne)
        break
    else:
        export_png(plot_tsne, filename=f'{model_folder}/Input_TSNE_{checkpoint}.png')
    
    

In [None]:
device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
         _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available
         

model = OPTForCausalLM.from_pretrained(f'{model_folder}/checkpoint-30000', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                           device_map=device_map, max_memory=max_memory)



In [None]:
emb = model.get_input_embeddings()
out_emb = model.get_output_embeddings()


In [None]:
# Embeddins for text and f-term tokens

with open(f'{dump_dir}/full_descriptions.pk', 'rb') as f:
    full_descriptions_dict = pk.load(f)

tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_definition = []
for i in range(len(tokenizer)):
    if i%1000 ==0:
        print(i, end='\r')
    try:
        definition = full_descriptions_dict[tokenizer.decode(i)[:-1]]
        f_term_definition.append(definition)
    except KeyError:
        f_term_definition.append('-')
emb_vectors = emb(torch.arange(len(tokenizer)))

In [None]:
with open(f'{model_folder}/input_emb_tsne.pk', 'rb') as f:
    tsne_rep = pk.load(f)

with open(f'{model_folder}/output_emb_tsne.pk', 'rb') as f:
   tsne_rep_out_emb = pk.load(f)

tsne = TSNE(n_components=2, verbose=0, random_state=69)
tsne_rep_out_emb = tsne.fit_transform(next(out_emb.parameters()).detach().numpy())

tsne = TSNE(n_components=2, verbose=0, random_state=69)
tsne_rep = tsne.fit_transform(emb_vectors.detach().numpy())

with open(f'{model_folder}/input_emb_tsne.pk', 'wb') as f:
    pk.dump(tsne_rep, f)

with open(f'{model_folder}/output_emb_tsne.pk', 'wb') as f:
    pk.dump(tsne_rep_out_emb, f)

In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep_out_emb[:,0],
            y = tsne_rep_out_emb[:,1],
            desc = tsne_rep[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )




hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Output Embeddings')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

show(plot_tsne)

In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep[50000:,0],
            y = tsne_rep[50000:,1],
            desc = tsne_rep[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )

datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep[:50000,0],
            y = tsne_rep[:50000,1],
            desc = tsne_rep[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions<br><b>Token:</b> @Tokens</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Embeddings')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)

In [None]:
datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep[:50000,0],
            y = tsne_rep[:50000,1],
            desc = tsne_rep[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )



hover_tsne = HoverTool(tooltips='<div stype="font-size: 15px;"><b>Token:</b> @Tokens</div>')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Text Embeddings')

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)

In [None]:
# Name of this experiment
model_name = 'gal_125_1'

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

with open(f'{model_folder}/input_emb_tsne.pk', 'rb') as f:
    tsne_rep1 = pk.load(f)

with open(f'{model_folder}/output_emb_tsne.pk', 'rb') as f:
   tsne_rep_out_emb1 = pk.load(f)


tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print('Loadede Tokenizer from serialized instance!')    
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')
# Embeddins for text and f-term tokens

tokens = [tokenizer.decode(i) for i in range(len(tokenizer))]
f_term_definition = []
for i in range(len(tokenizer)):
    if i%1000 ==0:
        print(i, end='\r')
    try:
        definition = full_descriptions_dict[tokenizer.decode(i)[:-1]]
        f_term_definition.append(definition)
    except KeyError:
        f_term_definition.append('-')


In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep_out_emb1[:,0],
            y = tsne_rep_out_emb1[:,1],
            desc = tsne_rep_out_emb1[:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )




hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Output Embeddings Model 1')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

show(plot_tsne)

In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep1[50000:,0],
            y = tsne_rep1[50000:,1],
            desc = tsne_rep1[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )

datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep1[:50000,0],
            y = tsne_rep1[:50000,1],
            desc = tsne_rep1[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions<br><b>Token:</b> @Tokens</div>', mode='mouse')

tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Embeddings Model 1')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)

In [None]:
datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep1[:50000,0],
            y = tsne_rep1[:50000,1],
            desc = tsne_rep1[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )



hover_tsne = HoverTool(tooltips='<div stype="font-size: 12px;"><b>Token:</b> @Tokens</div>')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Text Embeddings Model 1')

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)

In [None]:
for c1, c2 in zip(tsne_rep1, tsne_rep)[:100]:
    print(c1, c2, c1-c2)

In [None]:
model_name = 'gal_125_untrained'
model_folder = f'data/models/{model_name}'
print(model_folder)

with open(f'{model_folder}/input_emb_tsne.pk', 'rb') as f:
    tsne_rep_untrained = pk.load(f)

with open(f'{model_folder}/output_emb_tsne.pk', 'rb') as f:
   tsne_rep_out_emb_untrained = pk.load(f)

In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep_out_emb_untrained[:,0],
            y = tsne_rep_out_emb_untrained[:,1],
            desc = tsne_rep_out_emb_untrained[:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )




hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Output Embeddings Model untrained')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

show(plot_tsne)

In [None]:
datasource_f_terms = ColumnDataSource(
        data=dict(
            x = tsne_rep_untrained[50000:,0],
            y = tsne_rep_untrained[50000:,1],
            desc = tsne_rep_untrained[50000:,1],
            F_Terms = tokens[50000:],
            Definitions = f_term_definition[50000:]
        )
    )

datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep_untrained[:50000,0],
            y = tsne_rep_untrained[:50000,1],
            desc = tsne_rep_untrained[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )


hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>F-Term:</b> @F_Terms<br><b>Definition:</b> @Definitions<br><b>Token:</b> @Tokens</div>', mode='mouse')

tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Embeddings Model untrained')

plot_tsne.square('x', 'y', size=5, fill_color='orange', 
                 alpha=0.7, line_width=0, source=datasource_f_terms, name="F-Terms")

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)

In [None]:
datasource_text_tokens = ColumnDataSource(
        data=dict(
            x = tsne_rep_untrained[:50000,0],
            y = tsne_rep_untrained[:50000,1],
            desc = tsne_rep_untrained[:50000,1],
            Tokens = tokens[:50000],
            
        )
    )



hover_tsne = HoverTool(tooltips='<div stype="font-size: 12px;"><b>Token:</b> @Tokens</div>')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Input Text Embeddings Model untrained')

plot_tsne.circle('x', 'y', size=5, fill_color='blue', 
                 alpha=0.7, line_width=0, source=datasource_text_tokens, name="Text Tokens")

show(plot_tsne)