In [None]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import bokeh

# Dimension reduction algorithms
from cuml.manifold import TSNE as cuml_tsne
from sklearn.manifold import TSNE
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import export_png

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

%matplotlib inline
output_notebook()

# Loading the Model and Tokenizer

In [None]:
# ----------------------------------------------------------------------
# Parameters
#
# The paths to important folders have to be changed for your system.
# ----------------------------------------------------------------------

# Name of this experiment
model_name = 'gal_125_1'

# Number of the `checkpoint-<n>` sub-folder (inside the model folder) to load
checkpoint = 140000

# Folder with one txt file per sample. It is created and filled once the
# Pytorch Dataset Notebook has been run.
dataset_folder = f'data/dataset_samples'

# Folder in which the model is saved (created here if it is missing)
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)

# Folder in which the tokenizer is saved (created here if it is missing)
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder in which all pickle files are stored. It is fixed for this project
# and should not be changed.
dump_dir = r'PK_DUMP'

# Model parameters -- available base model sizes:
#   mini      125 M
#   base      1.3 B
#   standard  6.7 B
#   large     30 B
#   huge      120 B
base_model_name = 'mini'

# All new torch objects are created with this dtype by default.
# If default_dtype is float16, fp16 must be False.
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)
torch.backends.cuda.matmul.allow_tf32 = True

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to.
# It is forced to zero whenever the default device is the CPU.
num_gpus = 0 if default_device == 'cpu' else 1

tensor_parallel = False



###########################
# Loading the Model
###########################
# `max_memory` maps a device (GPU index or "cpu") to the memory budget that is
# handed to `from_pretrained`. GPU entries only exist when GPUs are requested.
device_map = None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    gpu_ids = list(range(num_gpus))
    # Touch every GPU with a tiny tensor before querying its memory
    # (presumably to make sure the CUDA context exists -- see the link above).
    for gpu_id in gpu_ids:
        _ = torch.tensor([0], device=gpu_id)
    # Element 0 of mem_get_info is used as the per-GPU budget.
    max_memory.update((gpu_id, torch.cuda.mem_get_info(gpu_id)[0]) for gpu_id in gpu_ids)
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

model = OPTForCausalLM.from_pretrained(
    f'{model_folder}/checkpoint-{checkpoint}',
    torch_dtype=default_dtype,
    low_cpu_mem_usage=True,
    device_map=device_map,
    max_memory=max_memory,
)

###########################
# Loading the Tokenizer
###########################
tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)

# Every token beyond the tokenizer's base vocabulary is counted as an F-Term.
n_f_terms = len(tokenizer) - tokenizer.vocab_size

print(
    'Loadede Tokenizer from serialized instance!',
    f'There are {n_f_terms} different F-Terms in the whole Dataset!',
    sep='\n',
)


###########################
# Loading Descriptions
###########################
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pk.load(handle)


# The description lookups are read from the project's pickle dump folder.
theme_dict = _load_pickle(f'{dump_dir}/themes_descriptions.pk')
viewpoint_dict = _load_pickle(f'{dump_dir}/viewpoints_descriptions.pk')
number_dict = _load_pickle(f'{dump_dir}/numbers_descriptions.pk')
full_descriptions_dict = _load_pickle(f'{dump_dir}/full_descriptions.pk')

# Extracting the Embeddings

In [None]:
# Extract the input and output embedding matrices of the model as NumPy arrays.
#
# * The lookup indices are created on the same device as the embedding weights
#   and the results are moved to the CPU before calling `.numpy()`, so the cell
#   also works when the model was loaded onto a GPU (num_gpus > 0).
# * `out_emb` is copied: `.detach().numpy()` on a CPU parameter returns an array
#   that shares memory with the model weights, so without the copy any in-place
#   edit of `out_emb` further down would silently modify the model itself.
with torch.no_grad():
    input_embedding_layer = model.get_input_embeddings()
    embedding_device = next(input_embedding_layer.parameters()).device
    token_ids = torch.arange(len(tokenizer), device=embedding_device)
    # One embedding vector per token id known to the tokenizer
    emb = input_embedding_layer(token_ids).detach().cpu().numpy()

    output_embedding_layer = model.get_output_embeddings()
    # The first parameter of the output layer is taken as the embedding matrix
    out_emb = next(output_embedding_layer.parameters()).detach().cpu().numpy().copy()

print(emb.shape, out_emb.shape)

# Sorting the F-Terms by Theme

In [None]:
# Decode every token id once; all tokens from index 50001 onwards are treated
# as F-Terms.
# NOTE(review): 50001 and the `out_emb[1:]` offset below are hard-coded --
# confirm that they line up for the tokenizer/model in use.
all_tokens = [tokenizer.decode(token_id) for token_id in range(len(tokenizer))]
f_terms = all_tokens[50001:]

# Generating Theme Clusters: the part of an F-Term in front of the first '/'
# is its theme; the output-embedding rows are grouped by that theme.
theme_clusters = {}
for row, term in zip(out_emb[1:], f_terms):
    theme_clusters.setdefault(term.split('/')[0], []).append(row)

print(f'Number of Themes:{len(theme_clusters)}')

# Averaging the vectors of each cluster yields the vector corresponding to the
# overarching theme.
mean_theme_vectors = {
    theme: np.mean(cluster, axis=0)
    for theme, cluster in theme_clusters.items()
}
    

# Plotting the Theme Vectors

In [None]:
# Theme names and their mean vectors, kept in the same (dict) order
themes = list(mean_theme_vectors)
vectors = np.stack([mean_theme_vectors[theme] for theme in themes], axis=0)

# Calculating a TSNE Representation (2 components, fixed random state)
print('Calculating T-SNE Representation')
tsne = TSNE(n_components=2, verbose=0, random_state=69)
tsne_rep = tsne.fit_transform(vectors)

# Plotting the Themes
print('Plotting')
theme_plot_columns = {
    'x': tsne_rep[:, 0],
    'y': tsne_rep[:, 1],
    'themes': themes,
    'descriptions': [theme_dict[theme] for theme in themes],
}
datasource_themes = ColumnDataSource(data=theme_plot_columns)

# Hovering over a point shows the theme and its description
hover_tsne = HoverTool(
    tooltips='<div style="font-size: 12px;"><b>Theme:</b> @themes<br><b>Description:</b> @descriptions</div>',
    mode='mouse',
)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='Theme Vectors')

plot_tsne.square(
    'x',
    'y',
    size=5,
    fill_color='blue',
    alpha=0.7,
    line_width=0,
    source=datasource_themes,
    name="F-Terms",
)

show(plot_tsne)

# Plotting F-Term Vectors with Their Theme Vectors Subtracted

In [None]:
# Subtracting the theme vector from the F-Terms
#
# The subtraction deliberately creates a NEW array per F-Term. The rows yielded
# by `out_emb[1:]` are views, so an in-place `-=` would write the result back
# into `out_emb` (and into any memory it shares with the model weights). That
# would make this cell non-idempotent and change the theme means on a re-run.
out_emb_no_theme = []
themes = []

for emb_vector, f_term in zip(out_emb[1:], f_terms):
    # The theme is the part of the F-Term in front of the first '/'
    theme = f_term.split('/')[0]
    themes.append(theme)
    out_emb_no_theme.append(emb_vector - mean_theme_vectors[theme])

out_emb_no_theme = np.stack(out_emb_no_theme, axis=0)

# Calculating the T-SNE Representation
print('Calculating T-SNE Representation')
tsne = TSNE(n_components=2, verbose=0, random_state=69)
tsne_rep = tsne.fit_transform(out_emb_no_theme)

# The calculation takes really long. For this reason the result is stored on
# disk right away and the plotting is done in a later cell, so the plot can be
# changed rapidly without recalculating the T-SNE representation.
with open(f'{model_folder}/No Theme F-Term Emb TSNE.pk', 'wb') as f:
    pk.dump(tsne_rep, f)


In [None]:
# Reload the stored T-SNE representation of the theme-free F-Term vectors.
# The file may not exist yet (e.g. on the very first run); in that case the
# `tsne_rep` that is already in memory is kept instead of failing with a
# FileNotFoundError.
# NOTE: pickle can execute arbitrary code -- only load files you created yourself.
tsne_cache_path = f'{model_folder}/No Theme F-Term Emb TSNE.pk'
if os.path.exists(tsne_cache_path):
    with open(tsne_cache_path, 'rb') as f:
        tsne_rep = pk.load(f)
else:
    print(f'No stored T-SNE representation at {tsne_cache_path}; using the one in memory.')

# Show the shape and only the first few rows to keep the cell output small
tsne_rep.shape, tsne_rep[:5]

In [None]:
# Color palette to make the plot prettier: every theme gets one palette colour,
# wrapping around when there are more themes than colours.
# The themes are sorted before colours are assigned; iterating a bare `set` of
# strings has no stable order between kernel restarts, which would re-colour
# the plot on every run.
bokeh_palette = bokeh.palettes.Turbo256
color_palette = bokeh_palette
color_theme_dict = {
    theme: color_palette[i % len(color_palette)]
    for i, theme in enumerate(sorted(set(themes)))
}

print(len(themes))
colors = [color_theme_dict[theme] for theme in themes]

# Number of points to plot. `None` plots every point; set it to a positive
# integer to plot only the first `x` points. (A value of -1 would silently
# drop the last F-Term, because `[:-1]` excludes the final element.)
x = None

# Plotting the Themes
print('Plotting', colors[:2], themes[:2], len(colors))
datasource_themes = ColumnDataSource(
    data=dict(
        x=tsne_rep[:x, 0],
        y=tsne_rep[:x, 1],
        themes=themes[:x],
        colors=colors[:x],
        # The last character of each F-Term token is stripped before the lookup
        # (presumably a separator that is not part of the dictionary key -- TODO confirm).
        descriptions=[full_descriptions_dict[f_term[:-1]] for f_term in f_terms[:x]],
    )
)

# Hovering over a point shows its theme and the full description
hover_tsne = HoverTool(tooltips='<div style="font-size: 12px;"><b>Theme:</b> @themes<br><b>Description:</b> @descriptions</div>', mode='mouse')
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(width=1500, height=1500, tools=tools_tsne, title='F Terms with no Theme Vectors')

plot_tsne.square('x', 'y', size=5, fill_color='colors',
                 alpha=0.7, line_width=0, source=datasource_themes, name="F-Terms")

show(plot_tsne)

# Store the plotted T-SNE representation so it can be reloaded without recalculation
with open(f'{model_folder}/No Theme F-Term Emb TSNE.pk', 'wb') as f:
    pk.dump(tsne_rep, f)